## Data Processing

The data processing includes:
- Remove @user mentions
- Remove non-alphabetic characters + spaces + apostrophe
- Remove links
- Remove single characters
- Remove stopwords
- Lemmatize words
- Stem words

In [1]:
import re
import nltk
import numpy as np
import pandas as pd
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
from termcolor import colored
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to /Users/sophie/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sophie/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
# Import dataset
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [5]:
# Set stopwords
STOPWORDS = set(stopwords.words('english'))
STOPWORDS.remove("not")
STOPWORDS

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 're',
 's'

### Define function to expand tweet

In [7]:
def expand_tweet(tweet):
    expanded_tweet = []
    for word in tweet:
        if re.search("n't", word):
            expanded_tweet.append(word.split("n't")[0])
            expanded_tweet.append("not")
        else:
            expanded_tweet.append(word)
    return expanded_tweet

### Define function to process tweet

In [8]:
def clean_tweet(data, wordNetLemmatizer, porterStemmer):
    data['Clean_tweet'] = data['Tweet']
    print(colored("Removing user handles starting with @", "yellow"))
    data['Clean_tweet'] = data['Clean_tweet'].str.replace("@[\w]*","")
    
    print(colored("Removing numbers and special characters", "yellow"))
    data['Clean_tweet'] = data['Clean_tweet'].str.replace("[^a-zA-Z' ]","")
    
    print(colored("Removing urls", "yellow"))
    data['Clean_tweet'] = data['Clean_tweet'].replace(re.compile(r"((www\.[^\s]+)|(https?://[^\s]+))"), "")
    
    print(colored("Removing single characters", "yellow"))
    data['Clean_tweet'] = data['Clean_tweet'].replace(re.compile(r"(^| ).( |$)"), " ")
    
    print(colored("Tokenizing", "yellow"))
    data['Clean_tweet'] = data['Clean_tweet'].str.split()
    
    print(colored("Removing stopwords", "yellow"))
    data['Clean_tweet'] = data['Clean_tweet'].apply(lambda tweet: 
                                                    [word for word in tweet if word not in STOPWORDS])
    print(colored("Expanding not words", "yellow"))
    data['Clean_tweet'] = data['Clean_tweet'].apply(lambda tweet: expand_tweet(tweet))
    
    print(colored("Lemmatizing the words", "yellow"))
    data['Clean_tweet'] = data['Clean_tweet'].apply(lambda tweet: 
                                                    [wordNetLemmatizer.lemmatize(word) for word in tweet])
    print(colored("Stemming the words", "yellow"))
    data['Clean_tweet'] = data['Clean_tweet'].apply(lambda tweet: 
                                                    [porterStemmer.stem(word) for word in tweet])
    print(colored("Combining words back to tweets", "yellow"))
    data['Clean_tweet'] = data['Clean_tweet'].apply(lambda tweet: ' '.join(tweet))
    
    return data
    

In [9]:
# Define processing methods
wordNetLemmatizer = WordNetLemmatizer()
porterStemmer = PorterStemmer()

In [10]:
# Preprocessing the tweets
print(colored("Processing train data", "green"))
train_data = clean_tweet(train_data, wordNetLemmatizer, porterStemmer)
train_data.to_csv('clean_train.csv', index = False)
print(colored("Train data processed and saved to clean_train.csv", "green"))

print(colored("Processing test data", "green"))
test_data = clean_tweet(test_data, wordNetLemmatizer, porterStemmer)
test_data.to_csv('clean_test.csv', index = False)
print(colored("Test data processed and saved to clean_test.csv", "green"))

[32mProcessing train data[0m
[33mRemoving user handles starting with @[0m
[33mRemoving numbers and special characters[0m
[33mRemoving urls[0m
[33mRemoving single characters[0m
[33mTokenizing[0m
[33mRemoving stopwords[0m
[33mExpanding not words[0m
[33mLemmatizing the words[0m
[33mStemming the words[0m
[33mCombining words back to tweets[0m
[32mTrain data processed and saved to clean_train.csv[0m
[32mProcessing test data[0m
[33mRemoving user handles starting with @[0m
[33mRemoving numbers and special characters[0m
[33mRemoving urls[0m
[33mRemoving single characters[0m
[33mTokenizing[0m
[33mRemoving stopwords[0m
[33mExpanding not words[0m
[33mLemmatizing the words[0m
[33mStemming the words[0m
[33mCombining words back to tweets[0m
[32mTest data processed and saved to clean_test.csv[0m
