# Text Processing 

This notebook provides a function that removes unnecessary tokens, such as stop words and punctuation, from the tweets in the dataset.

## 1. Imports

In [101]:
import numpy as np
import pandas as pd
import nltk
from stop_words import get_stop_words
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

## 2. Load data

In [102]:
def _read_tweets(path):
    r"""Load a text file holding one tweet per line into a DataFrame.

    The file is read line by line rather than through ``pd.read_csv``: the
    tweets are free text, not delimited records, and a CSV parser gives
    special meaning to quote characters and to NA-like words. In addition,
    newer pandas releases reject the line terminator as ``sep`` and no
    longer accept the ``error_bad_lines`` argument.

    Parameters
    ----------
    path : str
        Location of the text file.

    Returns
    -------
    pandas.DataFrame
        A single ``tweet`` column with one row per non-empty line.
    """
    with open(path, encoding="utf-8") as handle:
        lines = [line.rstrip("\n") for line in handle]
    # Empty lines are dropped, as read_csv does by default.
    return pd.DataFrame({"tweet": [line for line in lines if line]})


train_pos = _read_tweets("datasets/train_pos.txt")
train_neg = _read_tweets("datasets/train_neg.txt")

In [103]:
# Preview the first five positive tweets (five rows is the default of head)
train_pos.head()

Unnamed: 0,tweet
0,<user> i dunno justin read my mention or not ....
1,"because your logic is so dumb , i won't even c..."
2,<user> just put casper in a box ! looved the...
3,<user> <user> thanks sir > > don't trip lil ma...
4,visiting my brother tmr is the bestest birthda...


## 3. Obtain stop words

In this case, the stop words are limited to the English language.

In [76]:
# Combine the English stop words of the `stop_words` package with NLTK's list.
try:
    nltk_words = list(stopwords.words('english'))
except LookupError:
    # The NLTK stop-word corpus is not installed yet: fetch it, then retry.
    nltk.download('stopwords')
    nltk_words = list(stopwords.words('english'))

# The two sources overlap. dict.fromkeys keeps the first occurrence of every
# word in its original order, so each word is stored only once and the
# result is still a list that can be sliced.
stop_words = list(dict.fromkeys([*get_stop_words('en'), *nltk_words]))

The first 10 stop words:

In [116]:
# Peek at the beginning of the combined stop-word list
print(stop_words[0:10])

['a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and']


## 4. Tweet Preprocessing

### 4.1 Basic example

In [87]:
# Tokenize the first positive tweet: the text is split into words and
# standalone symbols (e.g. "<", "#").
# Every token is then stripped of surrounding whitespace and lower-cased.
words = list(map(lambda token: token.strip().lower(), nltk.word_tokenize(train_pos["tweet"][0])))
print(words)

['<', 'user', '>', 'i', 'dunno', 'justin', 'read', 'my', 'mention', 'or', 'not', '.', 'only', 'justin', 'and', 'god', 'knows', 'about', 'that', ',', 'but', 'i', 'hope', 'you', 'will', 'follow', 'me', '#', 'believe', '15']


In [88]:
# Keep only the tokens that consist exclusively of alphabetic characters
words = list(filter(str.isalpha, words))
print(words)

['user', 'i', 'dunno', 'justin', 'read', 'my', 'mention', 'or', 'not', 'only', 'justin', 'and', 'god', 'knows', 'about', 'that', 'but', 'i', 'hope', 'you', 'will', 'follow', 'me', 'believe']


In [90]:
# Drop every word that appears in the stop-word list
output = list(filter(lambda word: word not in stop_words, words))
print(output)

['user', 'dunno', 'justin', 'read', 'mention', 'justin', 'god', 'knows', 'hope', 'follow', 'believe']


In [91]:
# Put the remaining words back together as one space-separated sentence
print(*output, sep=" ")

user dunno justin read mention justin god knows hope follow believe


### 4.2 PreProcessTweets function

In [92]:
def Preprocess(tweet):
    """Clean a single tweet and return it as a space-separated string.

    The tweet is tokenized with NLTK, every token is stripped and lower-cased,
    tokens that are not purely alphabetic (punctuation, numbers, symbols) are
    discarded, and the words found in the global ``stop_words`` are removed.

    Parameters
    ----------
    tweet : str
        Raw text of the tweet. Any non-string value (for example the ``NaN``
        pandas stores for a missing cell) is treated as an empty tweet.

    Returns
    -------
    str
        The remaining words joined by single spaces; ``""`` when nothing
        is left.
    """
    # Missing cells arrive as float NaN or None; passing them to the
    # tokenizer would raise, so they are mapped to an empty result instead.
    if not isinstance(tweet, str):
        return ""

    # A hashed copy turns each membership test into a constant-time lookup
    # instead of a scan through the whole stop-word list.
    blocked = frozenset(stop_words)

    tokens = (token.strip().lower() for token in nltk.word_tokenize(tweet))
    kept = [token for token in tokens if token.isalpha() and token not in blocked]

    return " ".join(kept)

In [111]:
def PreProcessTweets(data):
    """Return a copy of ``data`` whose ``tweet`` column has been cleaned.

    Every entry of the ``tweet`` column is passed through ``Preprocess``.
    The input frame itself is left untouched, so the raw tweets stay
    available and re-running the calling cell always starts from the same
    data.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``tweet`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index, where ``tweet`` holds
        the preprocessed text.
    """
    # assign() builds a new DataFrame instead of writing into the caller's.
    return data.assign(tweet=data["tweet"].apply(Preprocess))

In [117]:
# Run the positive and the negative tweets through the same cleaning step
train_pos_preprocessed = train_pos.pipe(PreProcessTweets)
train_neg_preprocessed = train_neg.pipe(PreProcessTweets)

In [118]:
# Save both cleaned sets as tab-separated text, without header row or index column
train_pos_preprocessed.to_csv(path_or_buf='train_pos_preprocessed.txt', sep='\t', index=False, header=None)
train_neg_preprocessed.to_csv(path_or_buf='train_neg_preprocessed.txt', sep='\t', index=False, header=None)