In [1]:
import numpy as np
import pandas as pd

# Naive implementation

In [8]:
#import necessary libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from string import punctuation
import re

STOPWORDS = set(stopwords.words('english'))

def lowercase(text):
    lowercase_chars = [char.lower() if char.isalpha()  else " " for char in text]
    return "".join(lowercase_chars)

def preprocess(text):
    text = lowercase(text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in STOPWORDS]
    text = "  ".join(tokens)
    return text

#Feature Extraction
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
def extract_features(text):
    vectorizer = CountVectorizer()
    features = vectorizer.fit_transform([text])
    return features
def train_classification_model(clf, text, assigned_label):
    text1= preprocess(text)
    features = extract_features(text1)
    features
    clf.fit(features, [assigned_label])
    return clf

def test_model(clf, test):
    return clf.predict(extract_features(preprocess(test)))

clf=MultinomialNB()
train_classification_model(clf, "I love this movie! The acting was superb and the storyline was captivating.", 'Positive')
train_classification_model(clf, "I hated the movie, the acting was bad and the storyline was boring.", 'Negative')
train_classification_model(clf, "The movie was okay, average in terms of acting and story.", 'Neutral')

test_model(clf, "The movie was average  in terms of everything. Could be better")

array(['Neutral'], dtype='<U7')

# Sentiment Analysis using NLTK

In [2]:
df = pd.read_csv("C://Users//Ambarish Deb//Downloads//IMDB Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## Preprocessing

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from string import punctuation
from nltk.stem import *
import re

STOPWORDS = set(stopwords.words('english'))

def lowercase(text):
    lowercase_chars = [char.lower() if char.isalpha()  else " " for char in text]
    return "".join(lowercase_chars)

def preprocess(text):
    cleaner = re.compile('<.*?>')
    text = re.sub(cleaner, '', text)
    text = lowercase(text)
    text= ' '.join([PorterStemmer().stem(word) for word in text.split()])
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in STOPWORDS]
    text = "  ".join(tokens)
    return text

df['review']=df['review'].apply(lambda x: preprocess(x))
df.head()

Unnamed: 0,review,sentiment
0,one review ha mention watch oz episod h...,positive
1,wonder littl product film techniqu veri ...,positive
2,thought thi wa wonder way spend time ho...,positive
3,basic famili littl boy jake think zombi ...,negative
4,petter mattei love time money visual stu...,positive


In [4]:
train_reviews=df.review[:40000]
train_sentiments=df.sentiment[:40000]
#test dataset
test_reviews=df.review[40000:]
test_sentiments=df.sentiment[40000:]
print(train_reviews.shape,train_sentiments.shape)
print(test_reviews.shape,test_sentiments.shape)

(40000,) (40000,)
(10000,) (10000,)


### Feature Extraction

#### Bag Of Words

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
#Count vectorizer for bag of words
cv=CountVectorizer(min_df=0,max_df=1,binary=False,ngram_range=(1,3))
#transformed train reviews
cv_train_reviews=cv.fit_transform(train_reviews)
#transformed test reviews
cv_test_reviews=cv.transform(test_reviews)

print('BOW_cv_train:',cv_train_reviews.shape)
print('BOW_cv_test:',cv_test_reviews.shape)
#vocab=cv.get_feature_names()-toget feature names

BOW_cv_train: (40000, 5836829)
BOW_cv_test: (10000, 5836829)


#### TF-IDF Vectorizer

In [6]:
#Tfidf vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
tv=TfidfVectorizer(min_df=0,max_df=1,use_idf=True,ngram_range=(1,3))
#transformed train reviews
tv_train_reviews=tv.fit_transform(train_reviews)
#transformed test reviews
tv_test_reviews=tv.transform(test_reviews)
print('Tfidf_train:',tv_train_reviews.shape)
print('Tfidf_test:',tv_test_reviews.shape)

Tfidf_train: (40000, 5836829)
Tfidf_test: (10000, 5836829)


### Model Training & Performance Metrics
We'll train two separate models, one for the bag of words approach and the other ffor the TF-IDF approach.

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
lr=LogisticRegression(random_state=42)
#Fitting the model for Bag of words
lr_bow=lr.fit(cv_train_reviews,train_sentiments)
test_sentiments_pred = lr_bow.predict(cv_test_reviews)

target_names = ['positive','negative'] # target values

# Print classification report after a train/test split:
print(classification_report(test_sentiments, test_sentiments_pred , target_names=target_names))

              precision    recall  f1-score   support

    positive       0.57      0.97      0.71      4993
    negative       0.89      0.26      0.40      5007

    accuracy                           0.61     10000
   macro avg       0.73      0.61      0.56     10000
weighted avg       0.73      0.61      0.56     10000



In [12]:
from sklearn.metrics import confusion_matrix
confusion_matrix(test_sentiments, test_sentiments_pred)

array([[4840,  153],
       [3718, 1289]], dtype=int64)

In [13]:

lr=LogisticRegression(random_state=100)
#Fitting the model for TFIDF
lr_tv=lr.fit(tv_train_reviews,train_sentiments)
test_sentiments_pred = lr_tv.predict(tv_test_reviews)

target_names = ['positive','negative'] # target values

# Print classification report after a train/test split:
print(confusion_matrix(test_sentiments, test_sentiments_pred))
print(classification_report(test_sentiments, test_sentiments_pred , target_names=target_names))

[[3827 1166]
 [1325 3682]]
              precision    recall  f1-score   support

    positive       0.74      0.77      0.75      4993
    negative       0.76      0.74      0.75      5007

    accuracy                           0.75     10000
   macro avg       0.75      0.75      0.75     10000
weighted avg       0.75      0.75      0.75     10000



## Sentiment Analysis using sentiment lexicon AFINN-165

In [14]:
afinn = pd.read_csv("C://Users//Ambarish Deb//Desktop//AFINN-165.txt", sep='\t', header=None, names=['word', 'score'], index_col='word')
afinn.head()

Unnamed: 0_level_0,score
word,Unnamed: 1_level_1
abandon,-2
abandoned,-2
abandons,-2
abducted,-2
abduction,-2


In [15]:
def sentiment_score(text):
    words = nltk.word_tokenize(text.lower())
    score = 0
    for word in words:
        if word in afinn.index:
            score += afinn.loc[word]['score']
    return score

def sentiment(text):
    if sentiment_score(text)>0:
        return "Positive Sentiment; Score is : "+str(sentiment_score(text))
    elif sentiment_score(text)==0:
        return "Neutral Sentiment; Score is : "+str(sentiment_score(text))
    else:
        return "Negative Sentiment; Score is : "+str(sentiment_score(text))

In [16]:
sentiment("This a fantastic movie of three prisoners who become famous. One of the actors is george clooney and I\'m not a fan but this roll is not bad. Another good thing about the movie is the soundtrack (The man of constant sorrow). I recommand this movie to everybody. Greetings Bart")

'Positive Sentiment; Score is : 7'