In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#NLTK
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.classify.scikitlearn import SklearnClassifier

#scikit learn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB,GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [2]:
data=pd.read_csv("IMDB_dataset.csv")

In [3]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### Preprocessing

In [4]:
data.columns

Index(['review', 'sentiment'], dtype='object')

In [5]:
data['sentiment'].value_counts()

negative    25000
positive    25000
Name: sentiment, dtype: int64

In [6]:
data.isnull().sum()

review       0
sentiment    0
dtype: int64

In [7]:
encoder=LabelEncoder()
y=encoder.fit_transform(data['sentiment'])
y[:10]

array([1, 1, 1, 0, 1, 1, 1, 0, 0, 1])

##### 1=POSITIVE, 0=NEGATIVE

In [8]:
print(data.head())
y[:5]

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


array([1, 1, 1, 0, 1])

In [9]:
text=data['review']

In [10]:
text.head()

0    One of the other reviewers has mentioned that ...
1    A wonderful little production. <br /><br />The...
2    I thought this was a wonderful way to spend ti...
3    Basically there's a family where a little boy ...
4    Petter Mattei's "Love in the Time of Money" is...
Name: review, dtype: object

In [11]:
#REGULAR EXPRESSIONS
# Replace email addresses with 'email'
processed = text.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$',
                                 'emailaddress')

# Replace URLs with 'webaddress'
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
                                  'webaddress')

# Replace money symbols with 'moneysymb' (£ can by typed with ALT key + 156)
processed = processed.str.replace(r'£|\$', 'moneysymb')
    
# Replace 10 digit phone numbers (formats include paranthesis, spaces, no spaces, dashes) with 'phonenumber'
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',
                                  'phonenumbr')
    
# Replace numbers with 'numbr'
processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')

# Remove punctuation
processed = processed.str.replace(r'[^\w\d\s]', ' ')

# Replace whitespace between terms with a single space
processed = processed.str.replace(r'\s+', ' ')

# Remove leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '')

# change words to lower case - Hello, HELLO, hello are all the same word
processed = processed.str.lower()

In [12]:
processed[0]

'one of the other reviewers has mentioned that after watching just numbr oz episode you ll be hooked they are right as this is exactly what happened with me br br the first thing that struck me about oz was its brutality and unflinching scenes of violence which set in right from the word go trust me this is not a show for the faint hearted or timid this show pulls no punches with regards to drugs sex or violence its is hardcore in the classic use of the word br br it is called oz as that is the nickname given to the oswald maximum security state penitentary it focuses mainly on emerald city an experimental section of the prison where all the cells have glass fronts and face inwards so privacy is not high on the agenda em city is home to many aryans muslims gangstas latinos christians italians irish and more so scuffles death stares dodgy dealings and shady agreements are never far away br br i would say the main appeal of the show is due to the fact that it goes where other shows would

In [13]:
stop_words=set(stopwords.words('english'))

processed=processed.apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words ))

In [14]:
processed[0]

'one reviewers mentioned watching numbr oz episode hooked right exactly happened br br first thing struck oz brutality unflinching scenes violence set right word go trust show faint hearted timid show pulls punches regards drugs sex violence hardcore classic use word br br called oz nickname given oswald maximum security state penitentary focuses mainly emerald city experimental section prison cells glass fronts face inwards privacy high agenda em city home many aryans muslims gangstas latinos christians italians irish scuffles death stares dodgy dealings shady agreements never far away br br would say main appeal show due fact goes shows dare forget pretty pictures painted mainstream audiences forget charm forget romance oz mess around first episode ever saw struck nasty surreal say ready watched developed taste oz got accustomed high levels graphic violence violence injustice crooked guards sold nickel inmates kill order get away well mannered middle class inmates turned prison bitch

In [15]:
%%time
snowstem=SnowballStemmer('english')

processed=processed.apply(lambda x : ' '.join(snowstem.stem(term) for term in x.split()))

Wall time: 2min 30s


In [16]:
processed[0]

'one review mention watch numbr oz episod hook right exact happen br br first thing struck oz brutal unflinch scene violenc set right word go trust show faint heart timid show pull punch regard drug sex violenc hardcor classic use word br br call oz nicknam given oswald maximum secur state penitentari focus main emerald citi experiment section prison cell glass front face inward privaci high agenda em citi home mani aryan muslim gangsta latino christian italian irish scuffl death stare dodgi deal shadi agreement never far away br br would say main appeal show due fact goe show dare forget pretti pictur paint mainstream audienc forget charm forget romanc oz mess around first episod ever saw struck nasti surreal say readi watch develop tast oz got accustom high level graphic violenc violenc injustic crook guard sold nickel inmat kill order get away well manner middl class inmat turn prison bitch due lack street skill prison experi watch oz may becom comfort uncomfort view that get touch 

### Tokenization

In [17]:
bag_of_words=[]

for t in processed:
    words=word_tokenize(t)
    for w in words:
        bag_of_words.append(w)


In [18]:
bag_of_words=nltk.FreqDist(bag_of_words)

In [19]:
bag_of_words.most_common(10)

[('br', 201875),
 ('movi', 103229),
 ('film', 95869),
 ('numbr', 70480),
 ('one', 55433),
 ('like', 45200),
 ('time', 31958),
 ('good', 30194),
 ('make', 30020),
 ('charact', 28361)]

In [20]:
len(bag_of_words.keys())

69350

### Feature Extraction

In [21]:
bow=dict(bag_of_words.most_common(1500))

In [22]:
for_features=list(bow.keys())

In [23]:
for_features[:10]

['br',
 'movi',
 'film',
 'numbr',
 'one',
 'like',
 'time',
 'good',
 'make',
 'charact']

In [24]:
def find_features(message):
    words = word_tokenize(message)
    features = {}
    for word in for_features:
        features[word] = (word in words)

    return features

In [25]:
%%time
messages = zip(processed, y)
featuresets = [(find_features(text), label) for (text, label) in messages]

Wall time: 4min 1s


### split the data into train and test sets

In [26]:
train,test=train_test_split(featuresets,test_size=0.25,random_state=42)

In [27]:
len(train)

37500

In [28]:
len(test)

12500

In [29]:
'''%%time
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", 
         "SGD Classifier","Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = zip(names, classifiers)

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(train)
    accuracy = nltk.classify.accuracy(nltk_model, test)*100
    print("{} Accuracy: {}".format(name, accuracy))'''

'%%time\nnames = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", \n         "SGD Classifier","Naive Bayes", "SVM Linear"]\n\nclassifiers = [\n    KNeighborsClassifier(),\n    DecisionTreeClassifier(),\n    RandomForestClassifier(),\n    LogisticRegression(),\n    SGDClassifier(max_iter = 100),\n    MultinomialNB(),\n    SVC(kernel = \'linear\')\n]\n\nmodels = zip(names, classifiers)\n\nfor name, model in models:\n    nltk_model = SklearnClassifier(model)\n    nltk_model.train(train)\n    accuracy = nltk.classify.accuracy(nltk_model, test)*100\n    print("{} Accuracy: {}".format(name, accuracy))'

In [30]:
txt_features,label=zip(*test)

In [31]:
%%time
classifier=SklearnClassifier(MultinomialNB())
classifier.train(train)
print("Accuracy: {}".format(nltk.classify.accuracy(classifier,test)))

Accuracy: 0.83864
Wall time: 1min 22s


In [32]:
predict=classifier.classify_many(txt_features)

### Classification report and confusion matrix

In [33]:
print(classification_report(label,predict))

              precision    recall  f1-score   support

           0       0.84      0.84      0.84      6157
           1       0.84      0.84      0.84      6343

    accuracy                           0.84     12500
   macro avg       0.84      0.84      0.84     12500
weighted avg       0.84      0.84      0.84     12500



In [34]:
confusion_matrix(label,predict)

array([[5146, 1011],
       [1006, 5337]], dtype=int64)

In [35]:
example = find_features("An amazing movie.A must watch. It has a great script and cast. Worth it.")
classifier.classify_many(example)

[1]

In [36]:
#1:positive 0:NEGATIVE