# Text Processing 

This notebook provides a function that removes unnecessary characters such as stop words or punctuation from the tweets received in the dataset.

## 1. Importations

In [69]:
import numpy as np
import pandas as pd
import nltk
#from stop_words import get_stop_words
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

### 2. Load data

In [70]:
def read_data(data):
    with open(data, "r") as file:
        tweets = str()
        for _,line in enumerate(file):
            tweets += line
        tweets = tweets.split('\n')
        del tweets[-1]
    return tweets

In [79]:
train_pos = read_data("twitter-datasets/train_pos.txt")

In [48]:
train_neg = read_data("twitter-datasets/train_neg.txt")

In [None]:
train_pos_full = read_data("twitter-datasets/train_pos_full.txt")

In [78]:
train_neg_full = read_data("twitter-datasets/train_neg_full.txt")

UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 7072: character maps to <undefined>

In [49]:
pos = pd.DataFrame(train_pos, columns=["tweet"])
neg = pd.DataFrame(train_neg, columns=["tweet"])

In [None]:
pos_full = pd.DataFrame(train_pos_full, columns=["tweet"])

In [None]:
neg_full = pd.DataFrame(train_neg_full, columns=["tweet"])

In [71]:
test = read_data("twitter-datasets/test_data.txt")
test_pd = pd.DataFrame(test, columns=["tweet"])

In [30]:
len(pos)

100000

In [29]:
len(neg)

100000

In [56]:
len(train_neg_preprocessed)

100000

In [57]:
len(train_pos_preprocessed)

100000

In [4]:
train_pos.head(5)

Unnamed: 0,tweet
0,<user> i dunno justin read my mention or not ....
1,"because your logic is so dumb , i won't even c..."
2,<user> just put casper in a box ! looved the...
3,<user> <user> thanks sir > > don't trip lil ma...
4,visiting my brother tmr is the bestest birthda...


## 3. Obtain stop words

In this case, the stop words are only limited to the english dictionnary.

In [72]:
#stop_words = list(get_stop_words('en'))         
nltk_words = list(stopwords.words('english')) 
#stop_words.extend(nltk_words)

Example of the first 10 stop words

In [116]:
#print(stop_words[:10])

['a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and']


## 4. Tweet Preprocessing

### 4.1 Basic example

In [87]:
# Tokenization of the data (separation of the tweets into words or characters, e.g. "<", "#")
# Removal of space charaters and set to lower casing
words = [x.strip().lower() for x in nltk.word_tokenize(train_pos["tweet"][0])]
print(words)

['<', 'user', '>', 'i', 'dunno', 'justin', 'read', 'my', 'mention', 'or', 'not', '.', 'only', 'justin', 'and', 'god', 'knows', 'about', 'that', ',', 'but', 'i', 'hope', 'you', 'will', 'follow', 'me', '#', 'believe', '15']


In [88]:
# Keep only the words that are only contains characters that are part of the alphabet
words = [word for word in words if word.isalpha()]
print(words)

['user', 'i', 'dunno', 'justin', 'read', 'my', 'mention', 'or', 'not', 'only', 'justin', 'and', 'god', 'knows', 'about', 'that', 'but', 'i', 'hope', 'you', 'will', 'follow', 'me', 'believe']


In [90]:
# Remove stop words
output = [w for w in words if not w in stop_words]
print(output)

['user', 'dunno', 'justin', 'read', 'mention', 'justin', 'god', 'knows', 'hope', 'follow', 'believe']


In [91]:
# Recombine the strings to form a sentence
print(" ".join(output))

user dunno justin read mention justin god knows hope follow believe


### 4.2 PreProcessTweets function

In [73]:
def preprocess(tweet):
    
    words = [x.strip().lower() for x in nltk.word_tokenize(tweet)]
    words = [word for word in words if word.isalpha()]
    output = [w for w in words if not w in nltk_words]
    
    return " ".join(output)

In [74]:
def pre_process_tweets(data):
    
    data["tweet"] = data["tweet"].apply(lambda x: preprocess(x))
    
    return data

In [63]:
train_pos_preprocessed = pre_process_tweets(pos)
train_neg_preprocessed = pre_process_tweets(neg)

In [64]:
train_pos_preprocessed.to_csv('twitter-datasets/train_pos_preprocessed.txt', header=None, index=False, sep='\t')
train_neg_preprocessed.to_csv('twitter-datasets/train_neg_preprocessed.txt', header=None, index=False, sep='\t')

In [None]:
train_pos_preprocessed_full = pre_process_tweets(pos_full)

In [None]:
train_neg_preprocessed_full = pre_process_tweets(neg_full)

In [None]:
train_pos_preprocessed_full.to_csv('twitter-datasets/train_pos_preprocessed_full.txt', header=None, index=False, sep='\t')

In [None]:
train_neg_preprocessed_full.to_csv('twitter-datasets/train_neg_preprocessed_full.txt', header=None, index=False, sep='\t')

In [75]:
test_preprocessed = pre_process_tweets(test_pd)
test_preprocessed.to_csv('twitter-datasets/test_preprocessed.txt', header=None, index=False, sep='\t')