In [1]:
import pandas as pd
from nltk.tokenize import TreebankWordTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the review table from a CSV in the notebook's working directory.
# Later cells read its 'review' and 'sentiment' columns.
reviews = pd.read_csv('IMDB Dataset.csv')
# NOTE(review): no cell visible in this notebook references `tokenizer` —
# confirm whether it is still needed.
tokenizer = TreebankWordTokenizer()

In [3]:
# Training text: the first 12,500 entries of the 'review' column, taken in
# file order and held as a plain Python list.
reviews_train = list(reviews['review'].iloc[:12500])


In [4]:
## Clean Reviews

import re

# Punctuation that is deleted outright ("it's" -> "its", "good." -> "good").
# Raw string literals keep regex escapes such as \[ and \s out of Python's own
# string-escape processing, which warns on sequences it does not recognise.
REPLACE_NO_SPACE = re.compile(r"[.;:!'?,\"()\[\]]")
# Doubled HTML line breaks, hyphens and slashes sit between words, so they are
# replaced by a single space instead of being removed.
REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(-)|(/)")

def preprocess_reviews(reviews):
    """Lower-case each review and normalise its punctuation.

    For every string, in order:
      1. convert to lower case;
      2. delete the characters matched by REPLACE_NO_SPACE;
      3. replace each REPLACE_WITH_SPACE match with one space.

    Args:
        reviews: iterable of review strings.

    Returns:
        A new list of cleaned strings, in the same order as the input.
    """
    return [
        REPLACE_WITH_SPACE.sub(" ", REPLACE_NO_SPACE.sub("", line.lower()))
        for line in reviews
    ]

# Clean the training reviews; this rebinds reviews_train to the cleaned list.
reviews_train = preprocess_reviews(reviews_train)

In [7]:
# Unigram TF-IDF features. Per scikit-learn's TfidfVectorizer parameters,
# min_df=2 ignores terms found in fewer than two documents and max_df=.5
# ignores terms found in more than half of them.
tfidf = TfidfVectorizer(min_df=2, max_df=.5, ngram_range=(1,1))
# NOTE(review): the vectorizer is fitted on every row of reviews_train, and a
# later cell splits `features` into train/validation — so validation rows
# contribute to the vocabulary and IDF weights. Confirm this is acceptable.
features = tfidf.fit_transform(reviews_train)

In [8]:
# One binary label per row of the 'sentiment' column:
# 1 when the value is 'positive', 0 for anything else.
target = [1 if label == 'positive' else 0 for label in reviews['sentiment']]

# Labels for the first 12,500 rows.
target_train = target[:12500]

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


# Hold out 25% of the rows for validation. A fixed random_state makes the
# shuffle -- and therefore the accuracies printed below -- repeatable across
# kernel restarts; without it every run produces a different split.
X_train, X_val, y_train, y_val = train_test_split(
    features, target_train, train_size=0.75, random_state=42
)

# Fit one logistic regression per candidate C and report its accuracy on the
# held-out validation rows.
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print("Accuracy for C=%s: %s"
          % (c, accuracy_score(y_val, lr.predict(X_val))))



Accuracy for C=0.01: 0.80544
Accuracy for C=0.05: 0.82816
Accuracy for C=0.25: 0.86144
Accuracy for C=0.5: 0.8704
Accuracy for C=1: 0.87936


In [10]:
## Train the Best Model
# Fit a C=1 logistic regression on the full `features` matrix
# (fit() returns the estimator itself, so the call can be chained).
final_model = LogisticRegression(C=1).fit(features, target_train)

# NOTE(review): the predictions scored here are for the same rows the model
# was just fitted on, so the printed figure is a training-set accuracy.
print("Final Accuracy: %s" % accuracy_score(target_train, final_model.predict(features)))

Final Accuracy: 0.94208


In [11]:
## Get Best and Worst Features
# scikit-learn 1.0 added get_feature_names_out() and 1.2 removed the older
# get_feature_names(); prefer the new accessor and fall back on old installs.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# Map each vocabulary term to its coefficient in the fitted model. Values are
# converted to built-in str/float so the printed tuples stay plain
# (e.g. ('great', 6.46...)) regardless of the installed NumPy version.
feature_to_coef = {
    str(word): float(coef)
    for word, coef in zip(feature_names, final_model.coef_[0])
}

## Most Positive Features (largest coefficients first)
print ('Best')
for best_positive in sorted(
    feature_to_coef.items(),
    key=lambda x: x[1],
    reverse=True)[:5]:
    print (best_positive)


## Most Negative Features (smallest coefficients first)
print ('\nWorst')
for best_negative in sorted(
    feature_to_coef.items(),
    key=lambda x: x[1])[:5]:
    print (best_negative)

Best
('great', 6.463609182200631)
('excellent', 5.037633643756097)
('best', 4.358091237051494)
('wonderful', 3.957346347641883)
('perfect', 3.7641061024018407)

Worst
('bad', -7.3275624242798685)
('worst', -6.462223097242445)
('awful', -5.198608349522306)
('waste', -5.056911963916621)
('boring', -4.787526761390323)
