In [1]:
import pandas as pd
import re

In [186]:
df = pd.read_csv('data/IMDB Dataset.csv')

In [187]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [188]:
df.shape

(50000, 2)

In [192]:
# Missing Values
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [194]:
# Duplicate values
df.duplicated().sum()

418

In [196]:
df.drop_duplicates(inplace=True)

In [198]:
df['sentiment'].unique()

array(['positive', 'negative'], dtype=object)

In [200]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [202]:
df['review'][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [204]:
def clean_html(text):
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

In [206]:
# Cleaning html tags
df['review'] = df['review'].apply(clean_html)

In [286]:
df['review'][0]

'one review mention watch 1 oz episod hook right exactli happen first thing struck oz brutal unflinch scene violenc set right word go trust show faint heart timid show pull punch regard drug sex violenc hardcor classic use call oz nicknam given oswald maximum secur state penitentari focus mainli emerald citi experiment section prison cell glass front face inward privaci high agenda em citi home mani aryan muslim gangsta latino christian italian irish scuffl death stare dodgi deal shadi agreement never far would say main appeal show due fact goe show would dare forget pretti pictur paint mainstream audienc forget charm forget romanc oz mess around first episod ever saw struck nasti surreal could say readi watch develop tast oz got accustom high level graphic violenc violenc injustic crook guard sold nickel inmat kill order get away well manner middl class inmat turn prison bitch due lack street skill prison experi watch oz may becom comfort uncomfort view that get touch darker side'

In [288]:
df.shape

(49582, 2)

In [91]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [93]:
def text_transformer(text):
    text = text.lower()
    text = nltk.word_tokenize(text)
    alnum = []
    for i in text:
        if i.isalnum():
            alnum.append(i)
    alnum_stop = []
    for i in alnum:
        if i not in stopwords.words('english'):
            alnum_stop.append(i)
    final = []
    for i in alnum_stop:
        final.append(ps.stem(i))
    return " ".join(final)

In [282]:
df['review']= df['review'].apply(text_transformer)

In [292]:
df.to_csv('data/review_transformed.csv')

In [5]:
df = pd.read_csv('data/review_transformed.csv')

In [8]:
df.head()

Unnamed: 0.1,Unnamed: 0,review,sentiment
0,0,one review mention watch 1 oz episod hook righ...,positive
1,1,wonder littl product film techniqu fashion giv...,positive
2,2,thought wonder way spend time hot summer weeke...,positive
3,3,basic famili littl boy jake think zombi closet...,negative
4,4,petter mattei love time money visual stun film...,positive


In [10]:
# Encode the Sentiment column
df['sentiment'] = df['sentiment'].replace({'positive':1, 'negative':0})

  df['sentiment'] = df['sentiment'].replace({'positive':1, 'negative':0})


In [12]:
df.head()

Unnamed: 0.1,Unnamed: 0,review,sentiment
0,0,one review mention watch 1 oz episod hook righ...,1
1,1,wonder littl product film techniqu fashion giv...,1
2,2,thought wonder way spend time hot summer weeke...,1
3,3,basic famili littl boy jake think zombi closet...,0
4,4,petter mattei love time money visual stun film...,1


In [14]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [105]:
tfidf = TfidfVectorizer(max_features=3500)
X = tfidf.fit_transform(df['review']).toarray()

In [65]:
# cv = CountVectorizer(max_features=1000)
# X = cv.fit_transform(df['review']).toarray()

In [78]:
y = df['sentiment'].values

In [79]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [52]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.metrics import accuracy_score, precision_score

gnb = GaussianNB()
bnb = BernoulliNB()
mnb = MultinomialNB()

In [53]:
gnb.fit(X_train, y_train)
y_pred1 = gnb.predict(X_test)
print(accuracy_score(y_test, y_pred1))
print(precision_score(y_test, y_pred1))

0.8061913885247555
0.8031293325410972


In [21]:
bnb.fit(X_train, y_train)
y_pred2 = bnb.predict(X_test)
print(accuracy_score(y_test, y_pred2))
print(precision_score(y_test, y_pred2))

0.8266612886961783
0.812284730195178


In [22]:
mnb.fit(X_train, y_train)
y_pred3 = mnb.predict(X_test)
print(accuracy_score(y_test, y_pred3))
print(precision_score(y_test, y_pred3))

0.8280730059493798
0.8222571765631145


In [80]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

In [81]:
# ✅✅✅✅
lr.fit(X_train, y_train)
y_pred4 = lr.predict(X_test)
print(accuracy_score(y_test, y_pred4))
print(precision_score(y_test, y_pred4))

0.8804073812644954
0.8695736811368503


In [97]:
test_review = 'The movie is wonderfull, enjoyable'


In [125]:
transformed_review = text_transformer(test_review)
review_vec = tfidf.transform([transformed_review]).toarray()
lr.predict(review_vec)

array([1], dtype=int64)

In [111]:
review_vec

array([[0., 0., 0., ..., 0., 0., 0.]])

In [87]:
import pickle
pickle.dump(tfidf, open('data/tfidf.pkl', 'wb'))
pickle.dump(lr, open('data/lr.pkl', 'wb'))

In [None]:
import pickle
pickle.dump(tfidf,open('vectorizer.pkl','wb'))
pickle.dump(mnb,open('model.pkl','wb'))

In [363]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [365]:
svc = SVC(kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth=5)
lrc = LogisticRegression(solver='liblinear', penalty='l1')
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
abc = AdaBoostClassifier(n_estimators=50, random_state=2)
bc = BaggingClassifier(n_estimators=50, random_state=2)
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
gbdt = GradientBoostingClassifier(n_estimators=50,random_state=2)
xgb = XGBClassifier(n_estimators=50,random_state=2)

clfs = {
    'SVC' : svc,
    'KN' : knc, 
    'NB': mnb, 
    'DT': dtc, 
    'LR': lrc, 
    'RF': rfc, 
    'AdaBoost': abc, 
    'BgC': bc, 
    'ETC': etc,
    'GBDT':gbdt,
    'xgb':xgb
}

In [367]:
def train_classifier(clf,X_train,y_train,X_test,y_test):
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test,y_pred)
    precision = precision_score(y_test,y_pred)
    
    return accuracy,precision

In [None]:
accuracy_scores = []
precision_scores = []

for name,clf in clfs.items():
    
    current_accuracy,current_precision = train_classifier(clf, X_train,y_train,X_test,y_test)
    
    print("For ",name)
    print("Accuracy - ",current_accuracy)
    print("Precision - ",current_precision)
    
    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)

For  SVC
Accuracy -  0.8758697186649188
Precision -  0.8665494333724111
For  KN
Accuracy -  0.7485126550368055
Precision -  0.7182950359586038
For  NB
Accuracy -  0.8450136129877988
Precision -  0.8353444920202413
For  DT
Accuracy -  0.7022284965211253
Precision -  0.6487397420867527
For  LR
Accuracy -  0.8804073812644954
Precision -  0.8674279078769112
For  RF
Accuracy -  0.8341232227488151
Precision -  0.8411692559280458
For  AdaBoost
Accuracy -  0.762528990622164
Precision -  0.7310939148786493


In [None]:
performance_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy':accuracy_scores,'Precision':precision_scores}).sort_values('Precision',ascending=False)
performance_df