In [2]:
import pandas as pd
from nltk.tokenize import TreebankWordTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
reviews = pd.read_csv('IMDB Dataset.csv')
tokenizer = TreebankWordTokenizer()

In [4]:
val = 12500

reviews_train = []

for review in reviews['review']:
    reviews_train.append(review)

reviews_test = reviews_train[val:]    
reviews_train = reviews_train[:val]

In [5]:
## Clean Reviews

import re

REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()\[\]]")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")

def preprocess_reviews(reviews):
    reviews = [REPLACE_NO_SPACE.sub("", line.lower()) for line in reviews]
    reviews = [REPLACE_WITH_SPACE.sub(" ", line) for line in reviews]
    
    return reviews

reviews_train = preprocess_reviews(reviews_train)
reviews_test = preprocess_reviews(reviews_test)

In [6]:
from nltk.corpus import stopwords

english_stop_words = stopwords.words('english')
def remove_stop_words(corpus):
    removed_stop_words = []
    for review in corpus:
        removed_stop_words.append(
            ' '.join([word for word in review.split() 
                      if word not in english_stop_words])
        )
    return removed_stop_words

reviews_train = remove_stop_words(reviews_train)
reviews_test = remove_stop_words(reviews_test)

## Normalization

In [7]:
## Stemmed Reviews

def get_stemmed_text(corpus):
    from nltk.stem.porter import PorterStemmer
    stemmer = PorterStemmer()
    return [' '.join([stemmer.stem(word) for word in review.split()]) for review in corpus]

stemmed_reviews_train = get_stemmed_text(reviews_train)
stemmed_reviews_test = get_stemmed_text(reviews_test)

In [8]:
## Lemmatized Reviews

def get_lemmatized_text(corpus):
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    return [' '.join([lemmatizer.lemmatize(word) for word in review.split()]) for review in corpus]

lemmatized_reviews_train = get_lemmatized_text(reviews_train)
lemmatized_reviews_test = get_lemmatized_text(reviews_test)

## TF-IDF Vectorizer

In [9]:
tfidf = TfidfVectorizer(min_df=5, ngram_range=(1,2))

## Without Normalisation
tfidf.fit(reviews_train)
features_train = tfidf.transform(reviews_train)
features_test = tfidf.transform(reviews_test)

## Stemmed Features
tfidf.fit(stemmed_reviews_train)
stemmed_features_train = tfidf.transform(stemmed_reviews_train)
stemmed_features_test = tfidf.transform(stemmed_reviews_test)

'''
ngrams_stemmed = pd.DataFrame(
             stemmed_features.todense(),
             columns=tfidf.get_feature_names())
'''

## lemmatized Features
tfidf.fit(lemmatized_reviews_train)
lemmatized_features_train = tfidf.transform(lemmatized_reviews_train)
lemmatized_features_test = tfidf.transform(lemmatized_reviews_test)

'''
ngrams_lemmatized = pd.DataFrame(
             lemmatized_features.todense(),
             columns=tfidf.get_feature_names())
'''


'\nngrams_lemmatized = pd.DataFrame(\n             lemmatized_features.todense(),\n             columns=tfidf.get_feature_names())\n'

In [10]:
## Sentiments
target = []
for sentiment in reviews['sentiment'] :
    if sentiment == 'positive' :
        target.append(1)
    else :
        target.append(0)

target_train = target[:val]
target_test = target[val:]

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

print ('\nWithout Normalisation')
X_train, X_val, y_train, y_val = train_test_split(
   features_train , target_train, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))


print ('\nAfter Stemming')
X_train, X_val, y_train, y_val = train_test_split(
    stemmed_features_train, target_train, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))

print ('\nAfter Lemmatizing')
X_train, X_val, y_train, y_val = train_test_split(
    lemmatized_features_train, target_train, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))
    



Without Normalisation
Accuracy for C=0.01: 0.80416
Accuracy for C=0.05: 0.85376




Accuracy for C=0.25: 0.8736
Accuracy for C=0.5: 0.88192
Accuracy for C=1: 0.89056

After Stemming
Accuracy for C=0.01: 0.77184
Accuracy for C=0.05: 0.82848
Accuracy for C=0.25: 0.85568
Accuracy for C=0.5: 0.86496
Accuracy for C=1: 0.87104

After Lemmatizing
Accuracy for C=0.01: 0.82464
Accuracy for C=0.05: 0.83776
Accuracy for C=0.25: 0.8624
Accuracy for C=0.5: 0.86912
Accuracy for C=1: 0.8752


In [12]:
## Final Model for Without Normalization

final_ngram = LogisticRegression(C=1)
final_ngram.fit(features_train, target_train)
print ("Final Accuracy: Un-Normalized : %s" 
       % accuracy_score(target_test, final_ngram.predict(features_test)))

## Final Model for Stemmed Features
final_ngram = LogisticRegression(C=1)
final_ngram.fit(stemmed_features_train, target_train)
print ("Final Accuracy: After Stemming : %s" 
       % accuracy_score(target_test, final_ngram.predict(stemmed_features_test)))

## Final Model for Lemaatized Features
final_ngram = LogisticRegression(C=1)
final_ngram.fit(lemmatized_features_train, target_train)
print ("Final Accuracy: After Lemmatizing : %s" 
       % accuracy_score(target_test, final_ngram.predict(lemmatized_features_test)))


Final Accuracy: Un-Normalized : 0.8854133333333334
Final Accuracy: After Stemming : 0.8831466666666666
Final Accuracy: After Lemmatizing : 0.88424


In [13]:

## Get Best and Worst Features
feature_to_coef = {
    word: coef for word, coef in zip(
        tfidf.get_feature_names(), final_ngram.coef_[0]
    )
}

## Best Reviews
print ('Best')
for best_positive in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1], 
    reverse=True)[:5]:
    print (best_positive)

    
## Worst Reiviews 
print ('\nWorst')
for best_negative in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1])[:5]:
    print (best_negative)


Best
('great', 6.337722195219969)
('excellent', 4.826326775322904)
('best', 3.9339454568666197)
('wonderful', 3.8203331514044727)
('love', 3.7307311970448076)

Worst
('bad', -7.525844348565642)
('worst', -5.975479130380973)
('awful', -5.158488033413389)
('waste', -4.689547560583586)
('boring', -4.674836702585414)


## Note: Try different Models and check accuracy in them to see which is better