# 1.Load dataset

In [42]:
import pandas as pd
df = pd.read_csv('Sentiment Analysis Dataset.csv')

df_pos = df[df['Sentiment']==1]
df_neg = df[df['Sentiment']==0]

In [43]:
df.head()

Unnamed: 0,ItemID,Sentiment,SentimentSource,SentimentText
0,1,0,Sentiment140,is so sad for my APL frie...
1,2,0,Sentiment140,I missed the New Moon trail...
2,3,1,Sentiment140,omg its already 7:30 :O
3,4,0,Sentiment140,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,Sentiment140,i think mi bf is cheating on me!!! ...


In [44]:
pos_docs = df_pos['SentimentText'].values
print('There are {} positive sentences'.format(len(pos_docs)))
neg_docs = df_neg['SentimentText'].values
print('There are {} negative sentences'.format(len(neg_docs)))

There are 790185 positive sentences
There are 788440 negative sentences


# 2.Preprocess sentiment text

## 2.1 Removing punctuation

In [2]:
import re
# function to remove special characters
def remove_special_characters(text):
    # define the pattern to keep
    pat = r'[^a-zA-z\s]' 
    return re.sub(pat, '', text)


## 2.2 Lemmatization

In [3]:
import spacy
from spacy.lang.en import English
# Load English tokenizer, tagger, parser, NER and word vectors
nlp = English()

# function to remove special characters
def get_lem(text):
    text = nlp(text)
    text = [word.lemma_.lower() if word.lemma_ != '-PRON-' else word.text for word in text]
    text_without_space = [word for word in text if not word.isspace()]

    return text_without_space


## 2.3 Removing stopwords

### make use of stopwors form nltk and update some words observed in dataset as stopwords as well.

In [14]:
import nltk
from nltk.tokenize import ToktokTokenizer
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
# custom: removing words from list
stopword_list.remove('not')
stopword_list.append('u')
stopword_list.append('day')
stopword_list.append('today')
stopword_list.append('go')
stopword_list.append('going')
stopword_list.append('get')
stopword_list.append('got')
# function to remove stopwords
def remove_stopwords(tokens):
    # convert sentence into token of words
    tokens = [token.strip() for token in tokens]
    # check in lowercase 
    token_without_stopwords = [token for token in tokens if token.lower() not in stopword_list]
  
    return token_without_stopwords


## 2.4 Text cleaning pipeline

### combine three functions as a pipeline, return a list of tokens for each sentence in dataset

In [15]:
def cleanup(text):
    text = remove_special_characters(text)
    tokens = get_lem(text)
    tokens_without_stopwords = remove_stopwords(tokens)

    return tokens_without_stopwords


In [37]:
df_pos['Tokens'] = df_pos['SentimentText'].apply(cleanup)
df_neg['Tokens'] = df_neg['SentimentText'].apply(cleanup)

In [19]:
df_pos.head()

Unnamed: 0,ItemID,Sentiment,SentimentSource,SentimentText,Tokens
2,3,1,Sentiment140,omg its already 7:30 :O,"[omg, already]"
6,7,1,Sentiment140,Juuuuuuuuuuuuuuuuussssst Chillin!!,"[juuuuuuuuuuuuuuuuussssst, chillin]"
8,9,1,Sentiment140,handed in my uniform today . i miss you ...,"[handed, uniform, miss, already]"
9,10,1,Sentiment140,hmmmm.... i wonder how she my number @-),"[hmmmm, wonder, number]"
11,12,1,Sentiment140,thanks to all the haters up in my face a...,"[thanks, haters, face]"


In [20]:
df_neg.head()

Unnamed: 0,ItemID,Sentiment,SentimentSource,SentimentText,Tokens
0,1,0,Sentiment140,is so sad for my APL frie...,"[sad, apl, friend]"
1,2,0,Sentiment140,I missed the New Moon trail...,"[missed, new, moon, trailer]"
3,4,0,Sentiment140,.. Omgaga. Im sooo im gunna CRy. I'...,"[omgaga, sooo, gunna, cry, dentist, since, sup..."
4,5,0,Sentiment140,i think mi bf is cheating on me!!! ...,"[think, mi, bf, cheating, t_t]"
5,6,0,Sentiment140,or i just worry too much?,"[worry, much]"


# 3 List the top words

## 3.1 Top positive words

### preprocess text in 'df_pos' dataset, collect all tokens and count the frequency in a word-frequency dataset 'df_top_pos'

In [None]:
pos_words_list = []
pos_words_dict = {}
for text in df_pos['SentimentText'].values:
    tokens = cleanup(text)
    pos_words_list += tokens
    for token in tokens:
        if token in pos_words_dict.keys():
            pos_words_dict[token] += 1
        else: pos_words_dict[token] = 1

In [39]:
df_top_pos = pd.DataFrame(pos_words_dict.items(), columns=['Word','Frequency'])
df_top_pos.sort_values(by='Frequency', ascending=False, ignore_index=True)[:10]

Unnamed: 0,Word,Frequency
0,not,102010
1,good,59541
2,love,47366
3,like,37044
4,thanks,33642
5,lol,33356
6,time,28990
7,new,26383
8,one,25650
9,see,25327


## 3.2 Top negative words

### preprocess text in 'df_neg' dataset, collect all tokens and count the frequency in a word-frequency dataset 'df_top_neg'

In [None]:
neg_words_list = []
neg_words_dict = {}
for text in df_neg['SentimentText'].values:
    tokens = cleanup(text)
    neg_words_list += tokens
    for token in tokens:
        if token in neg_words_dict.keys():
            neg_words_dict[token] += 1
        else: neg_words_dict[token] = 1

In [41]:
df_top_neg = pd.DataFrame(neg_words_dict.items(), columns=['Word','Frequency'])
df_top_neg.sort_values(by='Frequency', ascending=False, ignore_index=True)[:10]

Unnamed: 0,Word,Frequency
0,not,236021
1,work,42703
2,like,40500
3,back,32047
4,really,30906
5,miss,30007
6,want,29337
7,still,28594
8,good,28300
9,sad,27015
