# Pre procesamiento

En este *notebook* se aplicará el pre-procesamiento a cada comentario de reddit. El resultado se guardará en un archivo que es similar al archivo origen, con la única diferencia que el comentario estará conformado por *strings* procesados.

Se realizan los siguientes pre-procesamientos:
1. Eliminación de *stop words*
2. Lematización utilizando Spacy
3. Eliminación de las palabras menos frecuentes
4. Conversión de los lemas a minúscula
5. Eliminación de palabras no alfanuméricas
6. Solo se consideran palabras cuyo *part-of-speech* son un nombre propio, un sustantivo o un pronombre. [Ver *Universal POS tags*](https://universaldependencies.org/docs/u/pos/)

### Fuente

- [Twitter Topic Modeling](https://towardsdatascience.com/twitter-topic-modeling-e0e3315b12e2)


In [13]:
import pandas as pd
import re, nltk, spacy, gensim
import emoji
from spacy.tokenizer import Tokenizer
import string
import pickle

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

%matplotlib inline

TEXT_FILE_READ = 'docs/reddit_data.csv'	# Text to be processed
TEXT_SAVE_FILE = 'docs/preprocessing_reddit_data.csv'
FILENAME_PICKLE = "docs/tmpreddit.pickle"

In [2]:
nlp = spacy.load('es_core_news_sm', disable=['ner', 'parser']) # disabling Named Entity Recognition for speed
reddit = pd.read_csv(TEXT_FILE_READ)
tmpreddit = pd.read_csv(TEXT_FILE_READ)

In [4]:
def give_emoji_free_text(text):
    """
    Removes emoji's from tweets
    Accepts:
        Text (tweets)
    Returns:
        Text (emoji free tweets)
    """
    emoji_list = [c for c in text if c in emoji.UNICODE_EMOJI]
    clean_text = ' '.join([str for str in text.split() if not any(i in str for i in emoji_list)])
    return clean_text

def url_free_text(text):
    '''
    Cleans text from urls
    '''
    text = re.sub(r'http\S+', '', text)
    return text


def email_free_text(text):
    '''
    Cleans text from emails
    '''
    text = re.sub('\S*@\S*\s?', '', text)
    return text

def quotes_free_text(text):
    '''
    Cleans text from quotes
    '''
    text = re.sub("\'", "", text)
    return text


# Apply the function above and get tweets free of emoji's
call_emoji_free = lambda x: give_emoji_free_text(x)

# Apply `call_emoji_free` which calls the function to remove all emoji's
tmpreddit['emails_free'] = tmpreddit['body'].apply(email_free_text)

#Create a new column with url free tweets
tmpreddit['quotes_free'] = tmpreddit['emails_free'].apply(quotes_free_text)

# Apply `call_emoji_free` which calls the function to remove all emoji's
tmpreddit['emoji_free'] = tmpreddit['quotes_free'].apply(call_emoji_free)

#Create a new column with url free tweets
tmpreddit['url_free'] = tmpreddit['emoji_free'].apply(url_free_text)

#print(tmpreddit[:1])

In [6]:
# Tokenizer
tokenizer = Tokenizer(nlp.vocab)

tokens = []

for doc in tokenizer.pipe(tmpreddit['url_free'], batch_size=500):
    doc_tokens = []
    for token in doc:
        if token.text.lower() not in nlp.Defaults.stop_words:
            doc_tokens.append(token.text.lower())
    tokens.append(doc_tokens)

# Makes tokens column
tmpreddit['tokens'] = tokens


In [7]:
# Make tokens a string again
tmpreddit['tokens_back_to_text'] = [' '.join(map(str, l)) for l in tmpreddit['tokens']]

def get_lemmas(text):
    '''Used to lemmatize the processed tweets'''
    lemmas = []

    doc = nlp(text)

    # Something goes here :P
    for token in doc:
        if ((token.is_stop == False) and (token.is_punct == False)) and (token.pos_ in ['NOUN', 'ADJ', 'VERB', 'ADV', 'PROPN', 'PROPN']):
            lemmas.append(token.lemma_)

    return lemmas

tmpreddit['lemmas'] = tmpreddit['tokens_back_to_text'].apply(get_lemmas)


In [8]:
# Make lemmas a string again
tmpreddit['lemmas_back_to_text'] = [' '.join(map(str, l)) for l in tmpreddit['lemmas']]

# Tokenizer function
def tokenize(text):
    """
    Parses a string into a list of semantic units (words)
    Args:
        text (str): The string that the function will tokenize.
    Returns:
        list: tokens parsed out
    """
    # Removing url's
    pattern = r"http\S+"

    tokens = re.sub(pattern, "", text) # https://www.youtube.com/watch?v=O2onA4r5UaY
    tokens = re.sub('[^a-zA-Z 0-9]', '', text)
    tokens = re.sub('[%s]' % re.escape(string.punctuation), '', text) # Remove punctuation
    tokens = re.sub('\w*\d\w*', '', text) # Remove words containing numbers
    tokens = re.sub('@*!*\$*', '', text) # Remove @ ! $
    tokens = tokens.strip(',') # TESTING THIS LINE
    tokens = tokens.strip('?') # TESTING THIS LINE
    tokens = tokens.strip('!') # TESTING THIS LINE
    tokens = tokens.strip("'") # TESTING THIS LINE
    tokens = tokens.strip(".") # TESTING THIS LINE

    tokens = tokens.lower().split() # Make text lowercase and split it

    return tokens

# Apply tokenizer
tmpreddit['lemma_tokens'] = tmpreddit['lemmas_back_to_text'].apply(tokenize)



In [9]:
for index,row in enumerate(reddit['body']):
    reddit['body'][index] = tmpreddit['lemmas_back_to_text'][index]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reddit['body'][index] = tmpreddit['lemmas_back_to_text'][index]


In [10]:
reddit.to_csv(TEXT_SAVE_FILE, index=False)

fileObj = open(FILENAME_PICKLE, 'wb')
pickle.dump(tmpreddit, fileObj)
fileObj.close()