# Importing Dependencies

In [1]:
import pandas as pd
import re
#from nltk.corpus import stopwords
#from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import svm

# Data Collection & Preprocessing

In [2]:
review_dataset = pd.read_csv("C:/Users/pc/Machine Learning Workspace/Datasets/IMDB Dataset.csv")

In [3]:
review_dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
review_dataset.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [5]:
review_dataset.isnull().sum() # checking for missing values

review       0
sentiment    0
dtype: int64

In [6]:
review_dataset['sentiment'].value_counts() # checking if dataset is imbalanced

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

# Label Encoding

In [7]:
review_dataset.replace({'sentiment' : {'negative' : 0, 'positive' : 1}}, inplace=True)

  review_dataset.replace({'sentiment' : {'negative' : 0, 'positive' : 1}}, inplace=True)


In [8]:
review_dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


# Clean dataset

In [9]:
def clean(content):
    cleaned = re.sub('[^a-zA-Z]', ' ', content)
    return cleaned

In [10]:
review_dataset['review'] = review_dataset['review'].apply(clean)

# Splitting the data into training & testing data

In [11]:
X = review_dataset['review'].values
Y = review_dataset['sentiment'].values

In [12]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, stratify=Y, random_state=7)

# Feature Extraction

In [13]:
feature_extract = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

In [14]:
X_train_features = feature_extract.fit_transform(X_train)

X_test_features = feature_extract.transform(X_test)

In [15]:
print(X_train_features)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 3086932 stored elements and shape (35000, 86050)>
  Coords	Values
  (0, 72605)	0.30178373640822886
  (0, 74421)	0.2984916385788464
  (0, 50360)	0.0699705003959291
  (0, 84898)	0.18196029955667423
  (0, 15005)	0.2485242738984212
  (0, 77296)	0.1972218549770501
  (0, 57159)	0.2776203678812933
  (0, 50382)	0.11891565328599324
  (0, 48151)	0.4288810873102957
  (0, 57582)	0.5042322094124514
  (0, 27306)	0.07447488991435869
  (0, 54228)	0.382378743201172
  (1, 50360)	0.032272225216210954
  (1, 50382)	0.027423505069014104
  (1, 27306)	0.08587441874320709
  (1, 61774)	0.051674528607325125
  (1, 81990)	0.04990963277027563
  (1, 57670)	0.05870566442494265
  (1, 83058)	0.0817651359592195
  (1, 34837)	0.046104934999941664
  (1, 14591)	0.03616646230814407
  (1, 5046)	0.4055258584978662
  (1, 55553)	0.2676201630895033
  (1, 51883)	0.033375358394830835
  (1, 26692)	0.04506112980363043
  :	:
  (34999, 28796)	0.08454950639307884
  (34999, 11

In [16]:
print(X_test_features)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1291227 stored elements and shape (15000, 86050)>
  Coords	Values
  (0, 659)	0.05552641674211046
  (0, 2658)	0.10948129801162533
  (0, 3331)	0.1008676616395651
  (0, 8956)	0.11583620334757926
  (0, 10791)	0.07213195577617823
  (0, 13745)	0.08449576870250865
  (0, 13940)	0.07771063770739764
  (0, 14563)	0.11260080445845719
  (0, 15807)	0.09497396102446988
  (0, 15809)	0.12755402474182145
  (0, 15894)	0.12174161437092029
  (0, 16156)	0.26138736462937673
  (0, 19825)	0.20341070702580757
  (0, 20339)	0.1608909757618692
  (0, 20479)	0.11163554953359027
  (0, 23269)	0.189887338828892
  (0, 23299)	0.10564972564295169
  (0, 24867)	0.09282704798295177
  (0, 25658)	0.16191213335682114
  (0, 26340)	0.08724541162187634
  (0, 27306)	0.12017471101896493
  (0, 29533)	0.08597997918498894
  (0, 29540)	0.12672187187443498
  (0, 30445)	0.08773494610546702
  (0, 30941)	0.07100824508870779
  :	:
  (14999, 56324)	0.060731219571589425
  (14999, 59

# Training the model

In [17]:
classifier = svm.SVC(kernel='linear')

In [18]:
classifier.fit(X_train_features, Y_train)

In [19]:
prediction = classifier.predict(X_train_features)

In [20]:
training_accuracy = accuracy_score(prediction, Y_train)

In [26]:
print("Accuracy on training data: ", training_accuracy)

Accuracy on training data:  0.9643142857142857


# Testing the model

In [22]:
test_prediction = classifier.predict(X_test_features)

In [23]:
testing_accuracy = accuracy_score(test_prediction, Y_test)

In [24]:
print("Accuracy on testing data: ", testing_accuracy)

Accuracy on testing data:  0.8958666666666667


# Predictive System

In [85]:
input_data = ["meh, i've seen better"]

In [86]:
vectorized_input = feature_extract.transform(input_data)

In [87]:
input_prediction = classifier.predict(vectorized_input)

In [88]:
print(input_prediction)

if(input_prediction[0]==0):
    print("Negative Review")
else:
    print("Positive Review")

[0]
Negative Review
