# Text Processing 

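This notebook provides a function that cleans the tweets received in the dataset: it replaces smileys with words describing their meaning and removes unnecessary tokens such as stop words. (The punctuation filter in `preprocess2` is currently commented out, so punctuation is kept.)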

## 1. Imports

In [1]:
#nltk.download()


In [2]:
# In this file I run a lot of tests.
# I analyze the smileys.

import numpy as np
import pandas as pd
import re
from our_functionsv3 import read_data
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize, TweetTokenizer

In [3]:
# Load the autoreload extension and use mode 2, so that edits made to imported
# modules (e.g. our_functionsv3) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline, below the cell that produces them.
%matplotlib inline

## 2. Load data

In [99]:
# Read the positive and negative training tweets with the project helper read_data
# (presumably it returns one entry per tweet -- confirm in our_functionsv3).
train_pos = read_data("twitter-datasets/train_pos.txt")
train_neg = read_data("twitter-datasets/train_neg.txt")

# Wrap each collection in a one-column DataFrame; the column is named "tweet",
# which is the column every preprocessing function below works on.
pos = pd.DataFrame(train_pos, columns=["tweet"])
neg = pd.DataFrame(train_neg, columns=["tweet"])

In [100]:
# Read the test tweets with the same helper and wrap them in a DataFrame whose
# single column is named "tweet", like the training frames.
test = read_data("twitter-datasets/test_data.txt")
test_pd = pd.DataFrame(test, columns=["tweet"])

In [5]:
#train_pos_full = read_data("twitter-datasets/train_pos_full.txt")
#train_neg_full = read_data("twitter-datasets/train_neg_full.txt")

#pos_full = pd.DataFrame(train_pos_full, columns=["tweet"])
#neg_full = pd.DataFrame(train_neg_full, columns=["tweet"])

## 3. Obtain stop words

In this case, the stop words are taken from the English list only, and some words that can help to tell positive tweets from negative ones, such as "not", are deliberately kept (i.e. they are not treated as stop words).

In [6]:
# English stop words as shipped with the NLTK stopwords corpus.
# NOTE: nltk_words is reassigned to a hand-edited list in the next cell, so the
# value computed here is only a starting point and is not the one used by preprocess2.
#stop_words = list(get_stop_words('en'))         
nltk_words = list(stopwords.words('english')) 
#stop_words.extend(nltk_words)

In [7]:
# Hand-edited stop-word list used by preprocess2 (ten entries per row).
# Negation and contrast words such as 'no', 'nor', 'not', 'but' and 'too' are
# absent from it on purpose, so they survive the stop-word filtering.
nltk_words = [
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're",
    "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his',
    'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself',
    'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this',
    'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be',
    'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
    'a', 'an', 'the', 'and', 'if', 'or', 'because', 'as', 'while', 'of',
    'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during',
    'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out',
    'on', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when',
    'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most',
    'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'very', 's',
    't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd',
    'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn',
    "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven',
    "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't",
    'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't",
    'wouldn', "wouldn't",
]

### 4.2 `pre_process_tweets` function

In [93]:
def pre_process_tweets(data):
    """Run the whole cleaning pipeline on the ``"tweet"`` column of ``data``.

    The steps are applied in this order:

    1. ``n't`` is expanded to `` not`` so the negation is not lost later on;
    2. smileys are replaced by words (``pre_process_smiley``);
    3. every tweet is split into lower-cased tokens (``preprocess1``);
    4. hashtags split by step 3 (``"# tag"``) are glued back together (``"#tag"``);
    5. stop words are removed (``preprocess2``).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the raw tweets in a column named ``"tweet"``.
        It is left untouched: the work is done on a copy.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` whose ``"tweet"`` column holds the cleaned tweets.
    """
    # Work on a copy so the raw tweets stay available for comparison.
    cleaned = data.copy()

    # Keep the negation: without this, words ending in n't would simply be dropped.
    cleaned["tweet"] = cleaned["tweet"].str.replace("n't", " not")

    # Turn smileys into words before the tokenizer breaks them apart.
    cleaned = pre_process_smiley(cleaned)

    # Separate words and punctuation, then rebuild the hashtags.
    tokenized = cleaned["tweet"].apply(preprocess1)
    tokenized = tokenized.str.replace("# ", "#")

    # Final pass: tweet-aware tokenization and stop-word removal.
    cleaned["tweet"] = tokenized.apply(preprocess2)

    return cleaned

In [94]:
def preprocess1(tweet):
    """Separate the words and punctuation marks of a tweet.

    The text is tokenized with ``nltk.word_tokenize``; each token is stripped
    of surrounding whitespace and lower-cased, then the tokens are joined
    back into one string with single spaces between them.
    """
    tokens = nltk.word_tokenize(tweet)
    normalised = map(lambda token: token.strip().lower(), tokens)
    return " ".join(normalised)

In [95]:
def preprocess2(tweet):
    """Tokenize a tweet with ``TweetTokenizer`` and drop the stop words.

    Tokens are stripped and lower-cased; every token found in the
    module-level ``nltk_words`` list is discarded and the remaining ones are
    joined with single spaces.
    """
    tokenizer = TweetTokenizer(strip_handles=True)

    # A filter keeping only alphabetic words and hashtags used to run here;
    # it is disabled, so punctuation tokens are kept.
    kept = []
    for token in tokenizer.tokenize(tweet):
        token = token.strip().lower()
        # Skip the stop words defined earlier in the notebook.
        if token not in nltk_words:
            kept.append(token)

    return " ".join(kept)

In [96]:
def pre_process_smiley(data):
    """Replace the smileys of the ``"tweet"`` column by words giving their meaning.

    Examples::

        <3  --> love
        =)  --> happy
        :(  --> sad

    The patterns are regular expressions, applied one after the other in the
    order of the table below. The order matters: a long smiley has to be
    replaced before the shorter smileys it contains (``:*(`` before ``:*``).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a string column named ``"tweet"``. The column is
        overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The same object as ``data``, with smileys replaced in ``"tweet"``.
    """
    # Key words, padded with spaces so they never stick to the neighbouring text.
    love = ' love '
    kiss = ' kiss '
    happy = ' happy '
    sad = ' sad '

    #===================================================================
    # Watch out : the order in which we do the replacements matters !!!!
    #===================================================================
    # Raw strings keep the regex escapes (\*, \(, ...) intact and avoid the
    # "invalid escape sequence" warning raised for plain string literals.
    replacements = [
        # special characters
        (r'<3', love),

        # smiley kiss
        (r':\*\( \{ \} \)', kiss),  # or maybe nothing
        (r':\*\(', sad),
        (r':\*\)', ' happy kiss '),
        (r':\*\{ \}', kiss),
        (r':\*p', kiss),
        (r':\*', kiss),
        (r': \*', kiss),

        # smiley happy
        (r":'\)", happy),
        (r'8\)', happy),
        (r' 8d ', happy),  # ' 8d ' and not '8d' because of "8days"...
        (r':\}', happy),
        (r'\{:', happy),
        (r': \)', happy),
        (r'\( :', happy),
        (r':\)', happy),
        (r'\(:', happy),
        (r'=\)', happy),  # important one
        (r'\(=', happy),

        # smiley sad
        (r'=\(', sad),  # important one
        (r'\)=', sad),
        (r':c', sad),
        (r':\(', sad),
        (r'p:', sad),
    ]

    tweets = data["tweet"]
    for pattern, word in replacements:
        # regex=True must be explicit: since pandas 2.0 the default of
        # Series.str.replace is literal matching, which would leave every
        # escaped pattern above unmatched.
        tweets = tweets.str.replace(pattern, word, regex=True)
    data["tweet"] = tweets

    return data

In [91]:
#=======================================================================================
# Just to check that the smiley handling works.
# The examples live in their own small DataFrame, so the training tweets in `pos`
# are never overwritten and this cell can be run at any time without affecting
# the preprocessed data files.
#=======================================================================================

smiley_examples = pd.DataFrame({"tweet": [
    # to check the kiss smiley
    "A is :*(     B is :')     C is <3",
    "D is :*)     E is :*{ }   F is :*( { } )",
    "G is :*p     H is :*      I is : *",

    # to check the happy smiley
    "A is :')'     B is 8)      C is 8d ",
    "D is : )      E is :)      F is ( :",
    "G is (:       H is LOL     I is :}",
    "J is {:       K is =)      L is (=",

    # to check the sad smiley
    "SAD : A is =(     B is )=     C is :c",
    "SAD : D is :(     E is p:     F is NOTHING",
]})

# Only the examples go through the pipeline here; the full training sets are
# processed in the computation cell.
smiley_examples_preprocessed = pre_process_tweets(smiley_examples)

smiley_examples_preprocessed

In [101]:
#================================================
# COMPUTATION : CREATE OUR PRE_PROCESS FILES !!!
#================================================

# Run the full cleaning pipeline on the positive and negative training tweets.
# pre_process_tweets works on a copy, so `pos` and `neg` are left unchanged.
train_pos_preprocessed = pre_process_tweets(pos)
train_neg_preprocessed = pre_process_tweets(neg)

# Write one cleaned tweet per row: no header row, no index column, tab as separator.
train_pos_preprocessed.to_csv('twitter-datasets/train_pos_preprocessed_smiley_hashtag.txt', header=None, index=False, sep='\t')
train_neg_preprocessed.to_csv('twitter-datasets/train_neg_preprocessed_smiley_hashtag.txt', header=None, index=False, sep='\t')

In [102]:
# Apply the same pipeline to the test tweets and export them in the same format
# as the training files (no header row, no index column, tab as separator).
test_preprocessed = pre_process_tweets(test_pd)
test_preprocessed.to_csv('twitter-datasets/test_preprocessed_smiley_hashtag.txt', header=None, index=False, sep='\t')

In [1]:
#train_pos_preprocessed

NameError: name 'train_pos_preprocessed' is not defined