
- IMDB movie reviews dataset acquired from http://ai.stanford.edu/~amaas/data/sentiment
- Contains 25000 positive and 25000 negative reviews
- Contains at most 300 reviews per movie
- At least 7 stars out of 10 $\rightarrow$ positive (label = 1)
- At most 4 stars out of 10 $\rightarrow$ negative (label = 0)
- 50/50 train/test split
- Evaluation accuracy

In [16]:
#Loading the dataset

import pandas as pd

df= pd.read_csv("C:\\Users\\Administrator\\Desktop\\movie_data.csv")
df.head(10)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0
5,Leave it to Braik to put on a good show. Final...,1
6,Nathan Detroit (Frank Sinatra) is the manager ...,1
7,"To understand ""Crash Course"" in the right cont...",1
8,I've been impressed with Chavez's stance again...,1
9,This movie is directed by Renny Harlin the fin...,1


In [17]:
#Transforming reviews into feature vectors

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

reviews=np.array(['The sun is shining',
'The weather is sweet',
'The sun is shining, the weather is sweet, and one and one is two'])

count=CountVectorizer()

bag=count.fit_transform(reviews)
print(count.vocabulary_)




{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}


In [18]:
# Word relevancy using term frequency-inverse document frequency

from sklearn.feature_extraction.text import TfidfTransformer

tfidf= TfidfTransformer(use_idf=True,norm='l2',smooth_idf=True)
tfidf.fit_transform(bag).toarray()




array([[0.        , 0.43370786, 0.        , 0.55847784, 0.55847784,
        0.        , 0.43370786, 0.        , 0.        ],
       [0.        , 0.43370786, 0.        , 0.        , 0.        ,
        0.55847784, 0.43370786, 0.        , 0.55847784],
       [0.50238645, 0.44507629, 0.50238645, 0.19103892, 0.19103892,
        0.19103892, 0.29671753, 0.25119322, 0.19103892]])

In [26]:
#Data Preparation

import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    return text

df['review']=df['review'].apply(preprocessor)


In [27]:
#Tokenization of documents

from nltk.stem.porter import PorterStemmer
Porter=PorterStemmer()

def tokenizer(text):
   return text.split()

def tokenizer_stemmer(text):
    return [Porter.stem(word) for word in text.split()]


import nltk
nltk.download('stopwords')

stop=stopwords.words('English')
[w for w in tokenizer_stemmer("This is just a sample sentence")[-10:]]




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


NameError: name 'stopwords' is not defined

In [22]:
#Transform Text Data into TF-IDF Vectors

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None, tokenizer=tokenizer_stemmer, use_idf=True, norm='l2', smooth_idf=True)

X=tfidf.fit_transform(df.review)
y=df.sentiment.values

In [23]:
#Document Classification using Logistic Regression

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=4)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

from sklearn.linear_model import LogisticRegression
LR = LogisticRegression(C=0.01, solver='liblinear').fit(X_train,y_train)
yhat = LR.predict(X_test)

Train set: (40000, 73814) (40000,)
Test set: (10000, 73814) (10000,)


In [24]:
#Model Evaluation

from sklearn.metrics import jaccard_similarity_score
jaccard_similarity_score(y_test, yhat)



0.806