In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('imdb_master.csv', encoding = "ISO-8859-1")

In [3]:
del df['Unnamed: 0']

In [4]:
del df['file']

In [5]:
df = df.loc[0:49999]

In [6]:
del df['type']
df = df.sample(frac=1).reset_index(drop=True)

In [7]:
train = df.loc[0:34999]
train.shape

(35000, 2)

In [8]:
test = df.loc[35000:]
test.shape

(15000, 2)

In [9]:
train_data = train['review']
test_data = test['review']
train_labels = train['label']
test_labels = test['label']
test_labels

35000    neg
35001    neg
35002    pos
35003    neg
35004    pos
35005    pos
35006    neg
35007    neg
35008    neg
35009    pos
35010    neg
35011    pos
35012    neg
35013    neg
35014    pos
35015    neg
35016    pos
35017    neg
35018    neg
35019    pos
35020    neg
35021    neg
35022    neg
35023    pos
35024    neg
35025    neg
35026    pos
35027    neg
35028    neg
35029    pos
        ... 
49970    neg
49971    neg
49972    pos
49973    pos
49974    neg
49975    neg
49976    pos
49977    neg
49978    pos
49979    neg
49980    pos
49981    pos
49982    pos
49983    pos
49984    neg
49985    neg
49986    neg
49987    pos
49988    pos
49989    neg
49990    neg
49991    neg
49992    pos
49993    pos
49994    pos
49995    pos
49996    neg
49997    pos
49998    pos
49999    pos
Name: label, Length: 15000, dtype: object

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(ngram_range=(1,3), binary=True)

In [11]:
X_train_dtm = vect.fit_transform(train_data)
X_test_dtm = vect.transform(test_data)

In [12]:
train_labels = train_labels.map({'pos': 1, 'neg': 0})

In [13]:
test_labels = test_labels.map({'pos': 1, 'neg': 0})
test_labels

35000    0
35001    0
35002    1
35003    0
35004    1
35005    1
35006    0
35007    0
35008    0
35009    1
35010    0
35011    1
35012    0
35013    0
35014    1
35015    0
35016    1
35017    0
35018    0
35019    1
35020    0
35021    0
35022    0
35023    1
35024    0
35025    0
35026    1
35027    0
35028    0
35029    1
        ..
49970    0
49971    0
49972    1
49973    1
49974    0
49975    0
49976    1
49977    0
49978    1
49979    0
49980    1
49981    1
49982    1
49983    1
49984    0
49985    0
49986    0
49987    1
49988    1
49989    0
49990    0
49991    0
49992    1
49993    1
49994    1
49995    1
49996    0
49997    1
49998    1
49999    1
Name: label, Length: 15000, dtype: int64

In [14]:
from sklearn.svm import LinearSVC
svm = LinearSVC()

In [15]:
svm.fit(X_train_dtm, train_labels)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [16]:
y_pred_class = svm.predict(X_test_dtm)

In [17]:
from sklearn import metrics
metrics.accuracy_score(test_labels, y_pred_class)

0.9079333333333334

In [18]:
metrics.confusion_matrix(test_labels, y_pred_class)

array([[6757,  729],
       [ 652, 6862]])

In [19]:
#Bad reviews classified as Good reviews
test_data[test_labels < y_pred_class].head(10)

35012    I find it disconcerting that in an era when sa...
35018    What a truly moronic movie, all I can say is t...
35033    Kevin Spacey is my favorite actor of all time,...
35102    When I first saw this movie, it was titled TER...
35136    The female cast of this movie is terrific: you...
35172    This movie has made me upset! When I think of ...
35178    Intriguing premise should have been a 20 minut...
35184    Director Warren Beatty's intention to turn Che...
35198    I just saw this movie last night, and after re...
35225    I watched the pilot and noticed more than a fe...
Name: review, dtype: object

In [20]:
test_data[35007]

"This movie is AWFUL. I haven't laughed so hard at a movie that was unintentionally funny in a long time. Leno should've stuck to stand up and late night tv. The cars in the movie were cool, but the movie by itself is the dumbest movie I've ever seen. it's pathetic, the acting is horrible, and the plot could've been written by a 4 year old. don't get me wrong, jay leno is hilarious, but not in this movie!"

In [21]:
#Good reviews classified as Bad reviews
test_data[test_labels > y_pred_class].head(10)

35004    In the opening scenes of this movie a man shot...
35005    From a bare description of THE TOLL GATE's maj...
35031    This movie is really not all that bad. But the...
35056    Man, if anyone was expecting a great zombie mo...
35116    This was very funny, even if it fell apart a l...
35156    Don't get me wrong. "GoldenEye" was revolution...
35164    I agree with "Jerry." It's a very underrated s...
35219    THE BRAIN THAT WOULDN'T DIE was considered so ...
35221    Sheba Baby is always underrated most likely be...
35268    Shtrafbat is the story only Russians could tel...
Name: review, dtype: object

In [22]:
test_data[35009]

"Having discovered the Ring trilogy, I have been greedily gobbling up all those other Japanese and Korean films that are either on or following the bandwagon.<br /><br />I don't have an easy definition of horror, but this film certainly pushed some of my buttons, even though I can't claim that the film makes a lot of sense. I'm squeamish so there were several points in the film when I just didn't want to watch what was happening on screen. The film unnerved me so I became apprehensive of seeing things that I thought I was going to see.<br /><br />It's an imaginative film offering a great deal visually. It also provides food for thought. And plenty of material to argue about when the film is over.<br /><br />The characters are well-defined to say the least. Could they make films like this in the West?<br /><br />So it doesn't make sense in the end, but when one has an appetite for the occult, the supernatural, the bizarre, the otherworldly, then no film is going to deliver a final all-e