# Text Processing 

This notebook provides a function that removes uninformative tokens, such as stop words and punctuation, from the tweets in the dataset.

## 1. Imports

In [12]:
import numpy as np
import pandas as pd
import re
from our_functionsv3 import read_data
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize, TweetTokenizer

In [13]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 2. Load data

In [14]:
# Read each training file and wrap its contents in a DataFrame with a single
# "tweet" column, which is the layout pre_process_tweets works on.
train_pos = read_data("twitter-datasets/train_pos.txt")
pos = pd.DataFrame(data=train_pos, columns=["tweet"])

train_neg = read_data("twitter-datasets/train_neg.txt")
neg = pd.DataFrame(data=train_neg, columns=["tweet"])

In [15]:
# Test tweets get the same single-column "tweet" layout as the training sets.
test = read_data("twitter-datasets/test_data.txt")
test_pd = pd.DataFrame(data=test, columns=["tweet"])

In [5]:
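# Disabled: loads the "*_full.txt" training files (presumably the complete training set).
# Uncomment the lines below to build the corresponding DataFrames.
#train_pos_full = read_data("twitter-datasets/train_pos_full.txt")
#train_neg_full = read_data("twitter-datasets/train_neg_full.txt")

#pos_full = pd.DataFrame(train_pos_full, columns=["tweet"])
#neg_full = pd.DataFrame(train_neg_full, columns=["tweet"])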

## 3. Obtain stop words

The stop words are taken from the English stop-word list only. A few words that help to tell positive tweets from negative ones, such as "not", are deliberately left out of the list so that they are kept in the tweets.

In [16]:
# English stop words as shipped in the NLTK "stopwords" corpus (the corpus must
# be available locally for this call to succeed).
# NOTE: the next cell reassigns `nltk_words` to a hand-edited literal list, so
# the value fetched here is only useful as a reference for comparison.
nltk_words = stopwords.words('english')

In [17]:
# Hand-edited stop-word list used by preprocess2. It is spelled out literally so
# that words worth keeping for sentiment detection, such as "not", can be left
# out of it and therefore survive the stop-word filtering.
nltk_words = [
    # pronouns and determiners
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
    'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself',
    'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
    'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those',
    # auxiliary verbs
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
    # articles, conjunctions and prepositions
    'a', 'an', 'the', 'and', 'if', 'or', 'because', 'as', 'while',
    'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
    'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
    'in', 'out', 'on', 'over', 'under', 'again', 'further', 'then', 'once',
    # adverbs and quantifiers
    'here', 'there', 'when', 'where', 'why', 'how',
    'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
    'only', 'own', 'same', 'so', 'than', 'very',
    # modal verbs and contraction fragments
    's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now',
    'd', 'll', 'm', 'o', 're', 've', 'y',
    'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't",
    'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't",
    'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't",
    'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't",
    'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't",
]

## 4. `pre_process_tweets` function

In [18]:
def pre_process_tweets(data):
    """Return a cleaned copy of ``data`` with its "tweet" column normalised.

    The input frame is not modified, so it can still be compared with the
    result afterwards. The "tweet" column goes through these steps, in order:

    1. Negative contractions are expanded ("n't" becomes " not") so that the
       negation survives the later filtering; otherwise tokens containing
       "n't" would simply be erased.
    2. ``preprocess1`` separates words and punctuation and lower-cases them.
    3. Hashtags split by step 2 ("# word") are glued back together ("#word").
    4. ``preprocess2`` drops punctuation, non-alphabetic tokens and stop words
       while keeping hashtags.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a string column named "tweet".

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` whose "tweet" column holds the cleaned text.
    """
    # Contractions whose stem is not a word once "n't" is cut off: the generic
    # rule alone would turn "can't" into "ca not" and "won't" into "wo not",
    # leaving the junk tokens "ca"/"wo"/"sha" in the output. They must be
    # expanded before the generic rule runs.
    irregular_negations = (
        ("can't", "can not"),
        ("won't", "will not"),
        ("shan't", "shall not"),
    )

    cleaned = data.copy()
    tweets = cleaned["tweet"]

    # All patterns are plain text, so regex matching is switched off explicitly
    # (the default of `regex` differs between pandas versions).
    for contraction, expansion in irregular_negations:
        tweets = tweets.str.replace(contraction, expansion, regex=False)
    tweets = tweets.str.replace("n't", " not", regex=False)

    # Separate every word and punctuation mark.
    tweets = tweets.apply(preprocess1)

    # The previous step detaches "#" from the word that follows it; re-attach it.
    tweets = tweets.str.replace("# ", "#", regex=False)

    # Remove punctuation, non-alphabetic tokens and stop words.
    tweets = tweets.apply(preprocess2)

    cleaned["tweet"] = tweets
    return cleaned

In [19]:
def preprocess1(tweet):
    """Split ``tweet`` into tokens and return them lower-cased, space-separated.

    ``nltk.word_tokenize`` does the splitting, so words and punctuation marks
    come back as separate tokens; each token is stripped of surrounding
    whitespace and lower-cased before the tokens are joined with single spaces.
    """
    tokens = nltk.word_tokenize(tweet)
    return " ".join(token.strip().lower() for token in tokens)

In [20]:
def preprocess2(tweet, stop_words=None):
    """Keep only the alphabetic words and hashtags of ``tweet`` that are not stop words.

    Parameters
    ----------
    tweet : str
        Text to filter.
    stop_words : iterable of str, optional
        Words to remove. Defaults to the module-level ``nltk_words`` list.

    Returns
    -------
    str
        The surviving tokens, lower-cased and joined by single spaces.
    """
    # Set membership is constant-time, whereas the default list would be
    # scanned linearly for every token.
    excluded = frozenset(nltk_words if stop_words is None else stop_words)

    # strip_handles=True asks the tokenizer to drop @-style user handles.
    tokenizer = TweetTokenizer(strip_handles=True)

    kept = []
    for raw_token in tokenizer.tokenize(tweet):
        token = raw_token.strip().lower()

        # A hashtag needs some text after the leading "#": a bare "#" is plain
        # punctuation and must be erased like every other sign.
        is_hashtag = token.startswith("#") and token.lstrip("#") != ""

        # Erase tokens containing punctuation, digits or other special signs,
        # except hashtags, then erase the stop words.
        if (token.isalpha() or is_hashtag) and token not in excluded:
            kept.append(token)

    return " ".join(kept)

In [21]:
# Clean both training sets first, then write each one out with no header row
# and no index column.
# NOTE(review): tweets that end up empty after preprocessing are presumably
# written as a quoted empty field ("") by the CSV writer; confirm that the
# downstream readers expect that.
train_pos_preprocessed = pre_process_tweets(pos)
train_neg_preprocessed = pre_process_tweets(neg)

train_pos_preprocessed.to_csv(
    'twitter-datasets/train_pos_preprocessed_withhastags.txt',
    sep='\t',
    index=False,
    header=None,
)
train_neg_preprocessed.to_csv(
    'twitter-datasets/train_neg_preprocessed_withhastags.txt',
    sep='\t',
    index=False,
    header=None,
)

In [22]:
# Run the test tweets through the same cleaning and save them with the same
# CSV options as the training sets (no header row, no index column).
test_preprocessed = pre_process_tweets(test_pd)
test_preprocessed.to_csv(
    'twitter-datasets/test_preprocessed_withhastags.txt',
    sep='\t',
    index=False,
    header=None,
)