In [1]:
import re

import pandas as pd
import nltk
# Download the 'punkt' package by id; the recorded output of this cell shows it
# being fetched into the local nltk_data directory.
# NOTE(review): presumably needed by nltk.word_tokenize, which later cells call — confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/nikesh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Import only the collocation classes the notebook uses instead of `import *`,
# so it is clear where each name comes from.
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

bigram_measures = BigramAssocMeasures()

# NOTE(review): `sample` is not assigned before this cell (its only visible assignment is
# in the Wordcloud section further down), so on a fresh kernel this cell raises NameError.
# Where it is assigned, `sample` is one long string; from_words() needs word tokens,
# otherwise it iterates character by character, so the text is tokenized first.
finder = BigramCollocationFinder.from_words(nltk.word_tokenize(sample))

# only bigrams that appear 2+ times
finder.apply_freq_filter(2)

# return the 10 n-grams with the highest PMI
print(finder.nbest(bigram_measures.pmi, 10))

NameError: name 'sample' is not defined

In [None]:
def common_bigrams(tokenized_text, min_freq, top_n):
    """Return the ``top_n`` bigrams with the highest PMI score.

    Args:
        tokenized_text: Sequence of word tokens. A raw string is also accepted
            and is split with ``nltk.word_tokenize`` first; without that step
            ``from_words`` would iterate over the string character by character
            and rank letter pairs instead of word pairs.
        min_freq: Bigrams that occur fewer than this many times are discarded.
        top_n: Maximum number of bigrams to return.

    Returns:
        A list of at most ``top_n`` bigram tuples, highest PMI first.
    """
    if isinstance(tokenized_text, str):
        tokenized_text = nltk.word_tokenize(tokenized_text)
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    # Fully qualified so the function does not depend on a star import elsewhere.
    finder = nltk.collocations.BigramCollocationFinder.from_words(tokenized_text)
    finder.apply_freq_filter(min_freq)
    # Rank once; the earlier version scored the bigrams twice and threw the first result away.
    return finder.nbest(bigram_measures.pmi, top_n)

# NOTE(review): `all_text` is not assigned anywhere in the visible part of the notebook,
# so this call raises NameError on a fresh kernel — confirm where it should come from.
print(common_bigrams(all_text, 10, 10)) # top 10 bigrams

Wordcloud

In [None]:
# Install wordcloud by invoking pip through sys.executable, i.e. the Python
# interpreter that is running this kernel, so the package is installed for that interpreter.
import sys
!{sys.executable} -m pip install wordcloud

In [None]:
import wordcloud
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

# Join the `Reply` text of the first 10,000 rows into one string, separated by '. '.
# NOTE(review): `comments` is not defined in the visible part of the notebook — confirm
# where it is loaded (a later cell reads 'df_comments.csv' into `df`).
sample = comments.iloc[:10000,:].Reply.str.cat(sep='. ')

In [None]:
import random
def grey_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Colour callback for ``WordCloud.recolor``: dark grey with a random lightness.

    ``word``, ``font_size``, ``position`` and ``orientation`` are accepted only
    to satisfy the callback signature; they do not influence the colour.

    Args:
        random_state: Optional ``random.Random`` instance. When supplied, the
            lightness is drawn from it, so a seeded call such as
            ``recolor(random_state=3)`` gives the same colours on every run.
            Otherwise the global ``random`` module is used.

    Returns:
        An HSL colour string of the form ``"hsl(0, 0%, L%)"`` with ``L``
        between 1 and 20 inclusive.
    """
    # Previously the supplied generator was ignored, which made seeded recolouring
    # non-reproducible.
    rng = random_state if isinstance(random_state, random.Random) else random
    return "hsl(0, 0%%, %d%%)" % rng.randint(1, 20)

# Training the Classifier

In [None]:
data = pd.read_csv('train.csv')

In [None]:
data.head()

In [None]:
data.shape

In [None]:
#remove any rows with empty values in the text column
data.dropna(subset=['text'], inplace=True)

#check the shape
data.shape

In [None]:
#download the stopwords package
nltk.download('stopwords')

In [None]:
#remove punctuation 
data.text = data.text.apply(lambda x: re.sub(r'[^\w\s]', '', x))

In [None]:
#Remove stopwords from the text which cannot provide any information since we cannot infer their sentiment
# The corpus reader is imported under an alias so that the `stopwords` collection built
# below does not shadow it; later cells only use `stopwords` for membership tests.
from nltk.corpus import stopwords as nltk_stopwords
newStopWords = ['im', 'the', 'got', 'go', 'want', 'oh', 'week', 'hour', 'see', 'still', 'say', 'today', 'day', 'going',
               'one', 'right', 'twitter', 'tomorrow']
# A set gives constant-time lookups for the per-word filter (the list was scanned
# linearly for every token of every row).
stopwords = set(nltk_stopwords.words('english'))
stopwords.update(newStopWords)
data.text = data.text.apply(lambda x: ' '.join([word for word in nltk.word_tokenize(x) if word.lower() not in stopwords]))

In [None]:
#remove any supplied links to text columns
data.text = data.text.apply(lambda x: re.sub(r'\(?http\S+', '', x))

In [None]:
# Replace every run of non-ASCII characters (anything outside \x00-\x7F) with a single space.
data.text = data.text.apply(lambda x: re.sub(r'[^\x00-\x7F]+',' ', x))

In [None]:
#remove any escape sequences
data.text = data.text.apply(lambda x: re.sub(r'[\n\r\t]', '', x))

In [None]:
# Remove leading and trailing spaces and tabs.
data.text = data.text.apply(lambda x: re.sub(r'^[ \t]+|[ \t]+$', '', x) )

In [None]:
# Since we are removing words from rows, we should check again for any
# rows that consisted solely of links or stopwords and are now empty.
data.shape

In [None]:
#remove any rows with empty values in the text column
data.dropna(subset=['text'], inplace=True)

# Rows whose text consisted only of links, stopwords or punctuation are now empty
# strings rather than NaN, so dropna() alone leaves them in place; filter them out too.
data = data[data.text.str.strip() != ''].copy()

#check the shape
data.shape

Stemming and lemmatization both reduce words to their root forms.
Lemmatization is favoured over stemming here, since stemming may create non-existent words.
Lemmatization is slower, but it uses a dictionary-based approach and we have the time.

In [None]:
# Download the 'wordnet' package (a data download, not a Python import).
# NOTE(review): presumably required by the WordNetLemmatizer used in the next cell — confirm.
nltk.download('wordnet')

In [None]:
lemmatizer = nltk.WordNetLemmatizer()


def _lemmatize_text(text):
    """Tokenize ``text``, lemmatize every token, and join the results with single spaces."""
    tokens = nltk.word_tokenize(text)
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)


# Run the lemmatizer over each row of the text column.
data.text = data.text.apply(_lemmatize_text)

Use bigrams to add stopwords for our wordcloud

In [None]:
data_string = data.iloc[:10000,:].text.str.cat(sep='. ')
# The collocation finder needs word tokens: handing it the raw string makes it iterate
# character by character and rank letter pairs instead of word pairs.
print(common_bigrams(nltk.word_tokenize(data_string), 20, 20)) # top 20 bigrams that occur at least 20 times

In [None]:
# Concatenate the text of the first 10,000 rows into one string for the word cloud.
data_string = data.iloc[:10000,:].text.str.cat(sep='. ')

# Work on a copy of wordcloud's STOPWORDS so the additions below stay local to `stp`.
stp = STOPWORDS.copy()
stp.add('Im') #add stopwords to remove from the plot
wc = WordCloud(background_color="white", max_words=200,  stopwords=stp)
# generate word cloud
wc.generate(data_string)

plt.figure(figsize=(50,50))
# Recolour the cloud with grey_color_func, then draw it as an image.
plt.imshow(wc.recolor(color_func=grey_color_func, random_state=3))
plt.show()

In [None]:
# NOTE(review): with a negative n, head() returns every row except the last 10.
# If a short preview was intended, head(10) or tail(10) is probably what was meant.
data.text.head(-10)

## Training a sentiment classifier

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn import metrics

We use CountVectorizer to help us create a matrix for machine learning

In [None]:
# Build the feature matrix from the cleaned text with a CountVectorizer.
vectorizer = CountVectorizer()
# Alternative that was tried: vectorizer = TfidfVectorizer()
# Drop rows with missing text once more before fitting.
data.dropna(subset=['text'], inplace=True)
# Fit the vocabulary on the training text and transform it in one step.
X = vectorizer.fit_transform(data.text)

In [None]:
#create a validation set
train_x, val_x, train_y, val_y = train_test_split(X,data.sentiment,test_size=0.2,random_state = 10)

We instantiate a multinomial naive bayes model, then fit it

In [None]:
nb = MultinomialNB()

# Fit on the training split only (train_x / train_y), not on the full X,
# so the validation split is not seen during fitting.
nb.fit(train_x, train_y)

In [None]:
predicted = nb.predict(val_x)
# accuracy_score's documented signature is (y_true, y_pred). Accuracy is symmetric, so
# the value is unchanged, but the documented order prevents a silent error if this call
# is later swapped for an asymmetric metric such as precision or recall.
accuracy = metrics.accuracy_score(val_y, predicted)
accuracy

To assign every prediction to the sentiment column of the submission csv we must:
1. load in the test and submission CSV files
2. Perform the exact same preprocessing as we did for the training set
3. Predict the sentiment for the whole test set and put them in the corresponding row in the submission CSV
 ### Assumption that the rows are lined up in the same order

In [None]:
testing = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

In [None]:
testing.head(3)

In [None]:
testing.shape

In [None]:
submission.head(3)

In [None]:
submission.shape

In [None]:
def clean_text(text):
    """Apply to one string the same cleaning steps, in the same order, as the training text.

    Relies on `stopwords` and `lemmatizer` created in the earlier training cells.
    """
    text = re.sub(r'[^\w\s]', '', text)            # remove punctuation
    text = ' '.join(word for word in nltk.word_tokenize(text)
                    if word.lower() not in stopwords)  # remove stopwords
    text = re.sub(r'\(?http\S+', '', text)         # remove any supplied links
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)     # replace non-ASCII runs with a space
    text = re.sub(r'[\n\r\t]', '', text)           # remove escape sequences (was missing for the test set)
    text = re.sub(r'^[ \t]+|[ \t]+$', '', text)    # remove starting or trailing whitespace
    # Lemmatization
    return ' '.join(lemmatizer.lemmatize(word) for word in nltk.word_tokenize(text))


# The earlier version of this cell re-cleaned `data.text` (the training frame) in every
# step and only lemmatized `testing.text`, so the test set was vectorised without the
# preprocessing the model was trained on. Every step now targets the test set.
#
# Missing text is replaced with an empty string instead of dropping the row: the
# predictions are later written into the submission frame by position, so the test
# set must keep exactly one row per submission row.
testing.text = testing.text.fillna('').apply(clean_text)

In [None]:
testing.shape

In [None]:
#vectorise our text
test_vect = vectorizer.transform(testing.text)

In [None]:
#generate the predictions from the test set
test_predict = nb.predict(test_vect)
test_predict

In [None]:
#assign the predictions to the submission file under the sentiment column
# Item assignment works whether or not the column already exists; attribute assignment
# (submission.sentiment = ...) only updates an existing column and otherwise just sets
# a plain Python attribute without adding a column to the frame.
submission['sentiment'] = test_predict

In [None]:
# NOTE(review): with a negative n, head() returns every row except the last 5.
# If a short preview was intended, head(5) or tail(5) is probably what was meant.
submission.head(-5)

In [None]:
#convert the submission file to a csv
#submission.to_csv('sub.csv', index=False)

### Compare differences between package and the trained model from Kaggle

In [None]:
df = pd.read_csv('df_comments.csv')

In [None]:
# Vectorise the Reply column with the already-fitted vectorizer and predict a label per row.
# astype('U') casts every value to a unicode string first.
# NOTE(review): unlike the train and test text, these replies are not passed through the
# punctuation / stopword / link / lemmatization steps before vectorising — confirm this is intended.
comments_vect = vectorizer.transform(df.Reply.values.astype('U'))
comment_sentiment = nb.predict(comments_vect)

In [None]:
df['sentiment'] = comment_sentiment
df.head()

In [None]:
df.shape

In [None]:
# Rows whose predicted sentiment label contains "positive".
great = df[df.sentiment.str.contains("positive")]
great.shape

In [None]:
# Positive-labelled rows whose `compound` value is <= 0.
# NOTE(review): `compound` is presumably the sentiment package's score stored in
# df_comments.csv, making these the rows where the two methods disagree — confirm.
pos = df[df.sentiment.str.contains("positive")].loc[df['compound'] <= 0.0]
pos

In [None]:
# Share of positive-labelled rows that are NOT in `pos` (i.e. have compound > 0).
# Guarded so that an empty selection yields NaN instead of raising ZeroDivisionError.
pos_acc = (great.shape[0]-pos.shape[0])/great.shape[0] if great.shape[0] else float('nan')
pos_acc

In [None]:
# Rows whose predicted sentiment label contains "negative".
# NOTE(review): `great` is reused from the positive comparison; from here on it holds
# the negative-labelled rows.
great = df[df.sentiment.str.contains("negative")]
great.shape

In [None]:
# Negative-labelled rows whose `compound` value is >= 0.
# NOTE(review): presumably the rows where the package score disagrees with the
# model's negative label — confirm the meaning of `compound`.
neg = df[df.sentiment.str.contains("negative")].loc[df['compound'] >= 0.0]
neg

In [None]:
# Share of negative-labelled rows that are NOT in `neg` (i.e. have compound < 0).
# Guarded so that an empty selection yields NaN instead of raising ZeroDivisionError.
neg_acc = (great.shape[0]-neg.shape[0])/great.shape[0] if great.shape[0] else float('nan')
neg_acc

In [None]:
# Rows whose predicted sentiment label contains "neutral".
# NOTE(review): `great` is reused again; from here on it holds the neutral-labelled rows.
great = df[df.sentiment.str.contains("neutral")]
great.shape

In [None]:
# Neutral-labelled rows whose `compound` value is anything other than exactly 0.
# NOTE(review): presumably the rows where the package score disagrees with the
# model's neutral label — confirm that an exact-zero test is the intended criterion.
neu = df[df.sentiment.str.contains("neutral")].loc[df['compound'] != 0.0]
neu

In [None]:
# Share of neutral-labelled rows that are NOT in `neu` (i.e. have compound == 0).
# Guarded so that an empty selection yields NaN instead of raising ZeroDivisionError.
neu_acc = (great.shape[0]-neu.shape[0])/great.shape[0] if great.shape[0] else float('nan')
neu_acc