In [42]:
import numpy as np

In [43]:
import pandas as pd

In [44]:
data = pd.read_csv(r'IMDB Dataset.csv')


In [45]:
data.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [46]:
data.isna().sum()

review       0
sentiment    0
dtype: int64

In [47]:
data.duplicated().sum()

418

In [48]:
data=data.drop_duplicates()

In [49]:
data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


<h1>data cleaning

In [50]:
#data cleaning
data.loc[248]['review']

"Meryl Streep is such a genius. Well, at least as an actress. I know she's been made fun of for doing a lot of roles with accents, but she nails the accent every time. Her performance as Lindy Chamberlain was inspiring. Mrs. Chamberlain, as portrayed here, was not particularly likable, nor all that smart. But that just makes Streep's work all the more remarkable. I think she is worth all 10 or so of her Oscar nominations. About the film, well, there were a couple of interesting things. I don't know much about Australia, but the theme of religious bigotry among the general public played a big part in the story. I had largely missed this when I first saw the film some years ago, but it came through loud and clear yesterday. And it seems the Australian press is just as accomplished at misery-inducing pursuit and overkill as their American colleagues. A pretty good film. A bit different. Grade: B"

In [51]:
import re

In [52]:
def clean(text):
    text = re.sub(r'<.*?>','',text)
    text = re.sub(r'[^\w\s]','',text)
    text = re.sub(r'\d+','',text)
    return text.lower()

In [53]:
data.review=data.review.apply(clean)

In [54]:
data.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


In [60]:
import nltk

In [61]:
from nltk.tokenize import word_tokenize

In [63]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [65]:
data['review_tokenize']=data.review.apply(word_tokenize)

In [66]:
data.head()

Unnamed: 0,review,sentiment,review_tokenize
0,one of the other reviewers has mentioned that ...,positive,"[one, of, the, other, reviewers, has, mentione..."
1,a wonderful little production the filming tech...,positive,"[a, wonderful, little, production, the, filmin..."
2,i thought this was a wonderful way to spend ti...,positive,"[i, thought, this, was, a, wonderful, way, to,..."
3,basically theres a family where a little boy j...,negative,"[basically, theres, a, family, where, a, littl..."
4,petter matteis love in the time of money is a ...,positive,"[petter, matteis, love, in, the, time, of, mon..."


<h1>remove stop words

In [67]:
from nltk.corpus import stopwords

In [68]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [74]:
stop_words = set(stopwords.words('english'))

In [75]:
def remove_stopwords(text):
    wrd=[wrd for wrd in text if wrd not in stop_words]
    return wrd

In [77]:
data['review_sw_removed']=data.review_tokenize.apply(remove_stopwords)

In [78]:
data.head()

Unnamed: 0,review,sentiment,review_tokenize,review_sw_removed
0,one of the other reviewers has mentioned that ...,positive,"[one, of, the, other, reviewers, has, mentione...","[one, reviewers, mentioned, watching, oz, epis..."
1,a wonderful little production the filming tech...,positive,"[a, wonderful, little, production, the, filmin...","[wonderful, little, production, filming, techn..."
2,i thought this was a wonderful way to spend ti...,positive,"[i, thought, this, was, a, wonderful, way, to,...","[thought, wonderful, way, spend, time, hot, su..."
3,basically theres a family where a little boy j...,negative,"[basically, theres, a, family, where, a, littl...","[basically, theres, family, little, boy, jake,..."
4,petter matteis love in the time of money is a ...,positive,"[petter, matteis, love, in, the, time, of, mon...","[petter, matteis, love, time, money, visually,..."


<h1>Stemming and Lemmetization

In [79]:
from nltk.stem import WordNetLemmatizer

In [80]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...


True

In [81]:
lem=WordNetLemmatizer()

In [82]:
def lemmatize_word(text):
    ret = [lem.lemmatize(word) for word in text]
    return ret

In [83]:
data['review_sw_removed'].apply(lemmatize_word)

0        [one, reviewer, mentioned, watching, oz, episo...
1        [wonderful, little, production, filming, techn...
2        [thought, wonderful, way, spend, time, hot, su...
3        [basically, there, family, little, boy, jake, ...
4        [petter, matteis, love, time, money, visually,...
                               ...                        
49995    [thought, movie, right, good, job, wasnt, crea...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [catholic, taught, parochial, elementary, scho...
49998    [im, going, disagree, previous, comment, side,...
49999    [one, expects, star, trek, movie, high, art, f...
Name: review_sw_removed, Length: 49582, dtype: object

In [84]:
data['review_lemmatize']=data['review_sw_removed'].apply(lemmatize_word)

In [85]:
data.head()

Unnamed: 0,review,sentiment,review_tokenize,review_sw_removed,review_lemmatize
0,one of the other reviewers has mentioned that ...,positive,"[one, of, the, other, reviewers, has, mentione...","[one, reviewers, mentioned, watching, oz, epis...","[one, reviewer, mentioned, watching, oz, episo..."
1,a wonderful little production the filming tech...,positive,"[a, wonderful, little, production, the, filmin...","[wonderful, little, production, filming, techn...","[wonderful, little, production, filming, techn..."
2,i thought this was a wonderful way to spend ti...,positive,"[i, thought, this, was, a, wonderful, way, to,...","[thought, wonderful, way, spend, time, hot, su...","[thought, wonderful, way, spend, time, hot, su..."
3,basically theres a family where a little boy j...,negative,"[basically, theres, a, family, where, a, littl...","[basically, theres, family, little, boy, jake,...","[basically, there, family, little, boy, jake, ..."
4,petter matteis love in the time of money is a ...,positive,"[petter, matteis, love, in, the, time, of, mon...","[petter, matteis, love, time, money, visually,...","[petter, matteis, love, time, money, visually,..."


In [86]:
data[['sentiment','review_lemmatize']]

Unnamed: 0,sentiment,review_lemmatize
0,positive,"[one, reviewer, mentioned, watching, oz, episo..."
1,positive,"[wonderful, little, production, filming, techn..."
2,positive,"[thought, wonderful, way, spend, time, hot, su..."
3,negative,"[basically, there, family, little, boy, jake, ..."
4,positive,"[petter, matteis, love, time, money, visually,..."
...,...,...
49995,positive,"[thought, movie, right, good, job, wasnt, crea..."
49996,negative,"[bad, plot, bad, dialogue, bad, acting, idioti..."
49997,negative,"[catholic, taught, parochial, elementary, scho..."
49998,negative,"[im, going, disagree, previous, comment, side,..."


In [95]:
data['final_review']=data.review_lemmatize.apply(lambda x: ' '.join(x))

In [96]:
data['final_review']

0        one reviewer mentioned watching oz episode you...
1        wonderful little production filming technique ...
2        thought wonderful way spend time hot summer we...
3        basically there family little boy jake think t...
4        petter matteis love time money visually stunni...
                               ...                        
49995    thought movie right good job wasnt creative or...
49996    bad plot bad dialogue bad acting idiotic direc...
49997    catholic taught parochial elementary school nu...
49998    im going disagree previous comment side maltin...
49999    one expects star trek movie high art fan expec...
Name: final_review, Length: 49582, dtype: object

In [97]:
data[['sentiment','final_review']]

Unnamed: 0,sentiment,final_review
0,positive,one reviewer mentioned watching oz episode you...
1,positive,wonderful little production filming technique ...
2,positive,thought wonderful way spend time hot summer we...
3,negative,basically there family little boy jake think t...
4,positive,petter matteis love time money visually stunni...
...,...,...
49995,positive,thought movie right good job wasnt creative or...
49996,negative,bad plot bad dialogue bad acting idiotic direc...
49997,negative,catholic taught parochial elementary school nu...
49998,negative,im going disagree previous comment side maltin...


In [100]:
data.sentiment.unique()

array(['positive', 'negative'], dtype=object)

In [102]:
enc={'negative':0,'positive':1}

In [103]:
data.sentiment=data.sentiment.map(enc)

In [105]:
data[['review','sentiment']]

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production the filming tech...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically theres a family where a little boy j...,0
4,petter matteis love in the time of money is a ...,1
...,...,...
49995,i thought this movie did a down right good job...,1
49996,bad plot bad dialogue bad acting idiotic direc...,0
49997,i am a catholic taught in parochial elementary...,0
49998,im going to have to disagree with the previous...,0


In [106]:
#to save as csv file data.to_csv('ready_to_train.csv')

In [138]:
#splitting x and y
X= data['final_review']
y=data['sentiment']

In [139]:
X

0        one reviewer mentioned watching oz episode you...
1        wonderful little production filming technique ...
2        thought wonderful way spend time hot summer we...
3        basically there family little boy jake think t...
4        petter matteis love time money visually stunni...
                               ...                        
49995    thought movie right good job wasnt creative or...
49996    bad plot bad dialogue bad acting idiotic direc...
49997    catholic taught parochial elementary school nu...
49998    im going disagree previous comment side maltin...
49999    one expects star trek movie high art fan expec...
Name: final_review, Length: 49582, dtype: object

In [140]:
y

0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 49582, dtype: int64

In [141]:
from sklearn.model_selection import train_test_split

In [142]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

<h1>Making Pipeline</h1>

In [143]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [144]:
#know diff betwnn pipe and make_pipeline

In [145]:
pipeline = Pipeline([
    ('tfidf',TfidfVectorizer()),
    ('lrg',LogisticRegression())
])

In [146]:
pipeline.fit(X_train,y_train)

In [147]:
pipeline.predict(["This is a good book"])

array([0], dtype=int64)

In [148]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [149]:
vectorizer = TfidfVectorizer(max_features=3000)

In [150]:
import joblib

In [151]:
joblib.dump(pipeline,'nlp_kathford.joblib')

['nlp_kathford.joblib']

In [152]:
pipeline.predict(X_test)

array([0, 1, 1, ..., 0, 0, 0], dtype=int64)

In [153]:
user_data='i dont like to eat meat'
pipeline.predict([user_data])

array([0], dtype=int64)