# Unsupervised Sentiment Analysis - KMeans Sparse

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import re

In [2]:
# Relative path to the IMDB reviews CSV. A forward slash works on every OS and
# avoids the invalid "\I" escape sequence that the backslash form contained.
address = './IMDB_Dataset.csv'

In [3]:
# Load the CSV located at `address` into a DataFrame.
imdb = pd.read_csv(address)

In [5]:
# Preview the first rows of the raw data (review text and sentiment label).
imdb.head(15)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


## Preprocessing the data

### Some of the data cleaning steps
* removing html tags
* removing or replacing punctuation, and expanding common contractions (e.g. "can't" becomes "cannot")

In [6]:
def text_to_word_list(text):
    """Normalise a raw review into a lower-case, space-separated string.

    Despite its name the function returns a single string, not a list of
    tokens; callers that need tokens can call ``.split()`` on the result.

    Steps, applied in order:
      1. coerce to ``str`` and lower-case;
      2. replace HTML tags with a space;
      3. blank every character other than letters, digits and ``^ , ! . / ' + - = :``;
      4. expand common English contractions;
      5. drop or space-pad the remaining punctuation and apply a few
         token-level rewrites (``10k`` -> ``10000``, ``e - mail`` -> ``email`` ...);
      6. collapse runs of whitespace into one space.

    Parameters
    ----------
    text : object
        The raw review; any non-string value is converted with ``str()``.

    Returns
    -------
    str
        The cleaned text. Leading/trailing single spaces are not stripped.
    """
    text = str(text).lower()

    # The rules are order-sensitive: specific contractions must run before
    # the generic ones ("what's" before "'s", "can't"/"won't" before "n't").
    rules = (
        # Tags become a space so "great<br />movie" is not fused into one word.
        (r"<[^<]+?>", " "),
        # The hyphen is escaped: an unescaped "+-=" is a character *range*
        # that also lets ';' and '<' through. ':' is kept on purpose because
        # it is space-padded by a later rule.
        (r"[^a-z0-9^,!.\/'+\-=:]", " "),
        (r"what's", "what is "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "cannot "),
        # Without this rule "won't" would be turned into the junk token "wo not".
        (r"won't", "will not "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        (r",", " "),
        (r"\.", " "),
        (r"!", " ! "),
        (r"\/", " "),
        (r"\^", " ^ "),
        (r"\+", " + "),
        (r"\-", " - "),
        (r"\=", " = "),
        (r"'", " "),
        # Word boundaries keep "5km" from being rewritten to "5000m".
        (r"\b(\d+)k\b", r"\g<1>000"),
        (r":", " : "),
        (r" e g ", " eg "),
        (r" b g ", " bg "),
        (r" u s ", " american "),
        # Keep the surrounding spaces so neighbouring words are not glued on.
        (r" 9 11 ", " 911 "),
        (r"e - mail", "email"),
        # Only join a stand-alone "j k"; leave names such as "raj kumar" alone.
        (r"\bj k\b", "jk"),
        (r"\s{2,}", " "),
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)
    return text

In [7]:
# Build the working frame: discard rows with missing values, then duplicate
# rows, and renumber the index from 0 so it is contiguous again.
imdb_trimmed = (
    imdb
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)

In [8]:
# Run every review through the text_to_word_list cleaner.
imdb_trimmed['review'] = imdb_trimmed['review'].apply(text_to_word_list)

In [9]:
# Spot-check the cleaned review text.
imdb_trimmed.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there a family where a little boy ja...,negative
4,petter mattei love in the time of money is a v...,positive


In [10]:
# Encode the string labels as integers: 'positive' -> 1, 'negative' -> 0.
# NOTE: this overwrites the column in place. Series.map yields NaN for any
# value missing from the dict, so running this cell a second time (when the
# column already holds 1/0) turns every label into NaN.
imdb_trimmed['sentiment'] = imdb_trimmed['sentiment'].map({'positive':1,'negative':0})
imdb_trimmed.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production the filming tech...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically there a family where a little boy ja...,0
4,petter mattei love in the time of money is a v...,1


# Modeling - Applying KMeans Clustering 

## Vectorizing and applying PCA to reduce the dimensionality

Extract features from text using bag of words - CountVectorizer

In [19]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn.metrics import pairwise_distances
from sklearn.feature_extraction.text import CountVectorizer

# Fixed seed so the PCA projection and the KMeans initialisation (and hence
# the reported scores) are reproducible across runs.
RANDOM_STATE = 42

# Bag-of-words counts, English stop words removed, vocabulary capped at 20000 terms.
vectorizer = CountVectorizer(stop_words = 'english',max_features=20000)
vec = vectorizer.fit_transform(imdb_trimmed.review)

# Scale each review vector to unit norm so review length does not dominate.
vec_norm = normalize(vec)
# NOTE: toarray() materialises a dense (n_reviews x 20000) float matrix, which
# needs several GB of RAM for the full data set.
vec_arr = vec_norm.toarray()

# Project onto the first two principal components.
vec_pca = PCA(n_components = 2, random_state=RANDOM_STATE)
vec_tranformed = vec_pca.fit_transform(vec_arr)

# algorithm='auto' was deprecated in scikit-learn 1.1 and removed in 1.3, so
# the default algorithm is used instead. n_init is pinned to the historical
# default of 10 so the result does not depend on the installed version.
kmeans = KMeans(n_clusters=2, max_iter=1000, n_init=10, random_state=RANDOM_STATE)

fitted = kmeans.fit(vec_tranformed)
prediction = kmeans.predict(vec_tranformed)

In [20]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,f1_score

# KMeans cluster ids are arbitrary: cluster 0 is not guaranteed to correspond
# to the 'negative' label (0). Align them with the ground truth before scoring;
# with two clusters that means flipping the ids when agreement is below 50%.
cluster_labels = np.asarray(prediction)
if accuracy_score(imdb_trimmed['sentiment'], cluster_labels) < 0.5:
    cluster_labels = 1 - cluster_labels

imdb_trimmed['sentiment_pred'] = cluster_labels
f1 = f1_score(imdb_trimmed['sentiment'],imdb_trimmed['sentiment_pred'],pos_label=1)
acc = accuracy_score(imdb_trimmed['sentiment'],imdb_trimmed['sentiment_pred'])

print("F1 Score : {:.2f} and Accuracy {:.2f}".format(f1,acc))

F1 Score : 0.62 and Accuracy 0.55


In [29]:
# Visualise a 10 x 10 window of the sparse count matrix:
# rows j..j+9 (reviews) by columns i..i+9 (vocabulary terms).
i = 10000
j = 10
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older installations.
get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
words = list(get_names()[i:i+10])
# toarray() returns a plain ndarray (todense() returns the legacy np.matrix).
pd.DataFrame(vec[j:j+10,i:i+10].toarray(), columns=words)

Unnamed: 0,killers,killian,killing,killings,killjoy,kills,kilmer,kilter,kim,kimberly
0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0


With an accuracy of 0.55, the clustering assigns the correct sentiment only slightly more often than a coin flip (50%) would.

In [37]:
# Number of reviews assigned to each predicted class.
imdb_trimmed['sentiment_pred'].value_counts()

1    32778
0    16804
Name: sentiment_pred, dtype: int64

In [39]:
# Compare the true label with the predicted one for the first rows.
imdb_trimmed.head(15)

Unnamed: 0,review,sentiment,sentiment_pred
0,one of the other reviewers has mentioned that ...,1,1
1,a wonderful little production the filming tech...,1,1
2,i thought this was a wonderful way to spend ti...,1,1
3,basically there a family where a little boy ja...,0,0
4,petter mattei love in the time of money is a v...,1,1
5,probably my all - time favorite movie a story ...,1,0
6,i sure would like to see a resurrection of a u...,1,1
7,this show was an amazing fresh innovative idea...,0,1
8,encouraged by the positive comments about this...,0,1
9,if you like original gut wrenching laughter yo...,1,0
