In [3]:
## https://towardsdatascience.com/sentiment-analysis-with-python-part-2-4f71e7bde59a

In [None]:
import numpy as np
import pandas as pd
import re

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

In [5]:
import nltk
# Prevent future/deprecation warnings from showing in output.
# Only those categories are silenced; a blanket filterwarnings('ignore')
# would also hide every other warning raised later in the notebook.
import warnings
for _category in (FutureWarning, DeprecationWarning, PendingDeprecationWarning):
    warnings.filterwarnings(action='ignore', category=_category)

In [6]:
# Read the training reviews, one review per line, stripping surrounding
# whitespace. The context manager closes the file handle once reading is done
# (the bare open() in a for-loop left it open).
with open('movie_data/full_train.txt', encoding="utf8") as train_file:
    reviews_train = [line.strip() for line in train_file]
    

In [8]:
reviews_train[2]

'Brilliant over-acting by Lesley Ann Warren. Best dramatic hobo lady I have ever seen, and love scenes in clothes warehouse are second to none. The corn on face is a classic, as good as anything in Blazing Saddles. The take on lawyers is also superb. After being accused of being a turncoat, selling out his boss, and being dishonest the lawyer of Pepto Bolt shrugs indifferently "I\'m a lawyer" he says. Three funny words. Jeffrey Tambor, a favorite from the later Larry Sanders show, is fantastic here too as a mad millionaire who wants to crush the ghetto. His character is more malevolent than usual. The hospital scene, and the scene where the homeless invade a demolition site, are all-time classics. Look for the legs scene and the two big diggers fighting (one bleeds). This movie gets better each time I see it (which is quite often).'

In [9]:
reviews_train [24999]

"This is one of the dumbest films, I've ever seen. It rips off nearly ever type of thriller and manages to make a mess of them all.<br /><br />There's not a single good line or character in the whole mess. If there was a plot, it was an afterthought and as far as acting goes, there's nothing good to say so Ill say nothing. I honestly cant understand how this type of nonsense gets produced and actually released, does somebody somewhere not at some stage think, 'Oh my god this really is a load of shite' and call it a day. Its crap like this that has people downloading illegally, the trailer looks like a completely different film, at least if you have download it, you haven't wasted your time or money Don't waste your time, this is painful."

In [10]:
print(len(reviews_train))

25000


In [12]:
# Read the test reviews, one review per line, stripping surrounding
# whitespace. The context manager closes the file handle once reading is done
# (the bare open() in a for-loop left it open).
with open('movie_data/full_test.txt', encoding="utf8") as test_file:
    reviews_test = [line.strip() for line in test_file]
    

In [13]:
print(len(reviews_test))

25000


In [14]:
# Labels are rebuilt from file order alone, which only works because of the
# special structure of this dataset: train and test each hold 25000 reviews,
# with the first 12500 positive and the last 12500 negative.
# Do not reuse this shortcut on other data.

target = [1] * 12500 + [0] * 12500

In [15]:
import re

# Raw string literals keep the regex escapes (\d, \s, \[ ...) out of Python's
# own string-escape processing; in ordinary literals they are invalid escape
# sequences that newer interpreters warn about.

# Punctuation characters and digit runs that are deleted outright.
REPLACE_NO_SPACE = re.compile(r"(\.)|(\;)|(\:)|(\!)|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\d+)")
# Doubled HTML line breaks, hyphens and slashes that become a single space.
REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")
NO_SPACE = ""
SPACE = " "

def preprocess_reviews(reviews):
    """Lower-case each review and strip punctuation, digits and break markup.

    Args:
        reviews: iterable of review strings.

    Returns:
        A new list of cleaned strings, in the same order as the input.
        Matches of REPLACE_NO_SPACE are removed first, then matches of
        REPLACE_WITH_SPACE are replaced by one space each; whitespace is
        not collapsed afterwards.
    """
    cleaned = []
    for line in reviews:
        without_punctuation = REPLACE_NO_SPACE.sub(NO_SPACE, line.lower())
        cleaned.append(REPLACE_WITH_SPACE.sub(SPACE, without_punctuation))
    return cleaned

# Clean both splits with the same preprocessing function.
reviews_train_clean, reviews_test_clean = map(
    preprocess_reviews, (reviews_train, reviews_test)
)

# Baseline

In [None]:
# Bag-of-words baseline; binary=False is passed explicitly.
baseline_vectorizer = CountVectorizer(binary=False)
# fit_transform fits the vectorizer and encodes the training reviews in one call.
X_baseline = baseline_vectorizer.fit_transform(reviews_train_clean)

# Note the difference between fit_transform and transform: the test reviews
# are only passed through transform, using the vectorizer fitted above.
X_test_baseline = baseline_vectorizer.transform(reviews_test_clean)

In [None]:
print(baseline_vectorizer)

In [None]:
#print (reviews_train_clean[0])

In [None]:
#review0_baseline = baseline_vectorizer.transform([reviews_train_clean[0]])

In [None]:
#print (review0_baseline)

In [None]:
print ('Shape of Sparse Matrix: ',X_baseline.shape)
print ('Amount of Non-Zero occurences: ', X_baseline.nnz)
# Sparsity as a percentage of zero-valued cells: 100 * (1 - nnz / (rows * cols)).
# The previous expression subtracted a percentage from the fraction 1.
print ('sparsity: %s' % (100.0 * (1 - X_baseline.nnz /
                                  (X_baseline.shape[0] * X_baseline.shape[1]))))

In [None]:
# Fit the baseline classifier on the training matrix, then score it on the
# test matrix; `target` serves as the label list for both splits.
final_model = LogisticRegression().fit(X_baseline, target)
print("Baseline Accuracy: %s" % accuracy_score(target, final_model.predict(X_test_baseline)))

In [None]:
# AUC is computed from column 1 of predict_proba on the test matrix.
print("Baseline AUC: %s" % roc_auc_score(target, final_model.predict_proba(X_test_baseline)[:, 1]))

# Remove Stop Words

In [None]:
from nltk.corpus import stopwords

# English stop-word list from NLTK's stopwords corpus.
# NOTE(review): no nltk.download('stopwords') call is visible in this notebook,
# so the corpus is presumably expected to be installed already — confirm for
# fresh environments.
english_stop_words = stopwords.words('english')
def remove_stop_words(corpus, stop_words=None):
    """Drop stop words from every review in ``corpus``.

    Args:
        corpus: iterable of review strings; each is split on whitespace.
        stop_words: optional iterable of words to remove. When omitted, the
            module-level ``english_stop_words`` list is used.

    Returns:
        A list of strings, one per review, containing the remaining words
        joined by single spaces (an empty string if nothing remains).
    """
    if stop_words is None:
        stop_words = english_stop_words
    # A set gives constant-time membership tests; scanning the list for every
    # word of every review was needlessly slow.
    stop_word_set = set(stop_words)
    return [
        ' '.join(word for word in review.split() if word not in stop_word_set)
        for review in corpus
    ]

In [None]:
# Strip stop words from both cleaned splits.
no_stop_words_train, no_stop_words_test = map(
    remove_stop_words, (reviews_train_clean, reviews_test_clean)
)

In [None]:
# Binary (presence/absence) bag-of-words over the stop-word-free reviews.
# NOTE: `cv`, `X` and `X_test` are rebound by later sections of this notebook,
# so the cells that use them depend on execution order.
cv = CountVectorizer(binary=True)
X = cv.fit_transform(no_stop_words_train)
X_test = cv.transform(no_stop_words_test)



In [None]:
# Fit on the current training matrix and report accuracy on the test matrix;
# `target` serves as the label list for both splits.
lr = LogisticRegression().fit(X, target)
print("Remove Stopwords Accuracy : %s" % accuracy_score(target, lr.predict(X_test)))

In [None]:
# AUC is computed from column 1 of predict_proba on the test matrix.
print("Remove Stopwords AUC: %s" % roc_auc_score(target, lr.predict_proba(X_test)[:, 1]))

# Stemming

In [None]:
#Skip during demo
def get_stemmed_text(corpus, stemmer=None):
    """Stem every whitespace-separated word of every review.

    Args:
        corpus: iterable of review strings.
        stemmer: optional object exposing ``stem(word) -> str``. When omitted,
            an NLTK ``PorterStemmer`` is created.

    Returns:
        A list of strings, one per review, with the stemmed words joined by
        single spaces.
    """
    if stemmer is None:
        # Imported lazily, as before, so NLTK's stemmer module is only loaded
        # when the default stemmer is actually needed.
        from nltk.stem.porter import PorterStemmer
        stemmer = PorterStemmer()
    return [' '.join(stemmer.stem(word) for word in review.split()) for review in corpus]

In [None]:
#takes 5 minutes
#Skip during demo
# Stem both cleaned splits with the default stemmer.
stemmed_reviews_train, stemmed_reviews_test = (
    get_stemmed_text(split) for split in (reviews_train_clean, reviews_test_clean)
)

In [None]:
import pickle
from joblib import dump, load

In [None]:
#Skip during demo
# Cache the stemmed corpora on disk. The files are pickles even though they
# carry a .txt suffix.
for cache_path, stemmed_reviews in (
    ("stemmed_reviews_train.txt", stemmed_reviews_train),
    ("stemmed_reviews_test.txt", stemmed_reviews_test),
):
    with open(cache_path, "wb") as fp:
        pickle.dump(stemmed_reviews, fp)

In [None]:
# Reload the cached stemmed corpora. These are pickle files; pickle.load should
# only ever be pointed at files this notebook wrote itself.
with open("stemmed_reviews_train.txt", "rb") as train_cache:
    stemmed_reviews_train_loaded = pickle.load(train_cache)
with open("stemmed_reviews_test.txt", "rb") as test_cache:
    stemmed_reviews_test_loaded = pickle.load(test_cache)

In [None]:
# Binary bag-of-words over the stemmed reviews reloaded from disk.
# Rebinds `cv`, `X` and `X_test` from the previous section.
cv = CountVectorizer(binary=True)

X = cv.fit_transform(stemmed_reviews_train_loaded)
X_test = cv.transform(stemmed_reviews_test_loaded)
   

In [None]:
# Fit on the stemmed training matrix and report accuracy on the test matrix;
# `target` serves as the label list for both splits.
final_stemmed = LogisticRegression().fit(X, target)
print("Stemmed Accuracy: %s" % accuracy_score(target, final_stemmed.predict(X_test)))

In [None]:
# AUC is computed from column 1 of predict_proba on the test matrix.
print("Stemming AUC: %s" % roc_auc_score(target, final_stemmed.predict_proba(X_test)[:, 1]))

In [None]:
# Persist the fitted stemmed-text classifier to 'final_stemmed.joblib' via joblib.
dump(final_stemmed, 'final_stemmed.joblib') 

In [None]:
# Reload the persisted classifier and score it on the current X_test to check
# that the saved model predicts as expected.
final_stemmed_loaded = load('final_stemmed.joblib')
print("Final Stemming: %s" % accuracy_score(target, final_stemmed_loaded.predict(X_test)))

# Lemmatization

In [None]:
nltk.download('wordnet')
from nltk import WordNetLemmatizer

In [None]:
def get_lemmatized_text(corpus, lemmatizer=None):
    """Lemmatize every whitespace-separated word of every review.

    Args:
        corpus: iterable of review strings.
        lemmatizer: optional object exposing ``lemmatize(word) -> str``. When
            omitted, an NLTK ``WordNetLemmatizer`` is created.

    Returns:
        A list of strings, one per review, with the lemmatized words joined by
        single spaces.
    """
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()
    return [' '.join(lemmatizer.lemmatize(word) for word in review.split()) for review in corpus]

In [None]:
#takes 5 minutes
#Skip during demo
# Lemmatize both cleaned splits with the default lemmatizer.
lemmatized_reviews_train, lemmatized_reviews_test = (
    get_lemmatized_text(split) for split in (reviews_train_clean, reviews_test_clean)
)

In [None]:
#Skip during demo
# Cache the lemmatized corpora on disk. The files are pickles even though they
# carry a .txt suffix.
for cache_path, lemmatized_reviews in (
    ("lemmatized_reviews_train.txt", lemmatized_reviews_train),
    ("lemmatized_reviews_test.txt", lemmatized_reviews_test),
):
    with open(cache_path, "wb") as fp:
        pickle.dump(lemmatized_reviews, fp)

In [None]:
# Reload the cached *lemmatized* corpora. This cell previously re-read the
# stemmed pickles (copy-paste from the Stemming section), so the lemmatized
# cache was never used and the vectorizer cell below needed the slow
# lemmatization cell to have been run in the same session.
# These are pickle files; pickle.load should only ever be pointed at files this
# notebook wrote itself.
with open("lemmatized_reviews_train.txt", "rb") as fp:   # Unpickling
    lemmatized_reviews_train = pickle.load(fp)
with open("lemmatized_reviews_test.txt", "rb") as fp:   # Unpickling
    lemmatized_reviews_test = pickle.load(fp)

In [None]:
cv = CountVectorizer(binary=True)

# Encode the lemmatized reviews with the binary vectorizer created above.
# Rebinds `X` and `X_test` from the previous section.
X = cv.fit_transform(lemmatized_reviews_train)
X_test = cv.transform(lemmatized_reviews_test)


In [None]:
# Fit on the lemmatized training matrix and report accuracy on the test matrix;
# `target` serves as the label list for both splits.
final_lemmatized = LogisticRegression().fit(X, target)
print("Final Lemmatized Accuracy: %s" % accuracy_score(target, final_lemmatized.predict(X_test)))

In [None]:
# AUC is computed from column 1 of predict_proba on the test matrix.
print("Lemmatized AUC: %s" % roc_auc_score(target, final_lemmatized.predict_proba(X_test)[:, 1]))

# n-grams

In [None]:
# Binary presence features over unigrams and bigrams.
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))

# fit_transform fits and encodes the training reviews in one call, instead of
# fit() followed by a second full pass with transform(); this matches how the
# later sections of the notebook build their matrices.
X = ngram_vectorizer.fit_transform(reviews_train_clean)
X_test = ngram_vectorizer.transform(reviews_test_clean)


In [None]:
    
# Fit on the uni-/bi-gram training matrix and report accuracy on the test
# matrix; `target` serves as the label list for both splits.
final_ngram = LogisticRegression().fit(X, target)
print("Accuracy uni- bi grams: %s" % accuracy_score(target, final_ngram.predict(X_test)))

In [None]:
# AUC is computed from column 1 of predict_proba on the test matrix.
print("Uni- bi grams AUC: %s" % roc_auc_score(target, final_ngram.predict_proba(X_test)[:, 1]))

# Word Counts

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Count features (binary=False): the same vectorizer settings as
# baseline_vectorizer earlier in the notebook.
wc_vectorizer = CountVectorizer(binary=False)


In [None]:
# Fit on the cleaned training reviews, then encode the test reviews with the
# fitted vectorizer. Rebinds `X` and `X_test` from the previous section.
X = wc_vectorizer.fit_transform(reviews_train_clean)
X_test = wc_vectorizer.transform(reviews_test_clean)



In [None]:
# Fit on the word-count training matrix and report accuracy on the test matrix;
# `target` serves as the label list for both splits.
final_wc = LogisticRegression().fit(X, target)
print("WordCount Accuracy: %s" % accuracy_score(target, final_wc.predict(X_test)))


In [None]:
# AUC is computed from column 1 of predict_proba on the test matrix.
print("WordCount AUC: %s" % roc_auc_score(target, final_wc.predict_proba(X_test)[:, 1]))

# TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# TF-IDF vectorizer constructed with its default settings.
tfidf_vectorizer = TfidfVectorizer()

In [None]:
# Fit on the cleaned training reviews, then encode the test reviews with the
# fitted vectorizer. Rebinds `X` and `X_test` from the previous section.
X = tfidf_vectorizer.fit_transform(reviews_train_clean)
X_test = tfidf_vectorizer.transform(reviews_test_clean)


In [None]:
# Fit on the TF-IDF training matrix and report accuracy on the test matrix;
# `target` serves as the label list for both splits.
final_tfidf = LogisticRegression().fit(X, target)
print("TF_IDF Accuracy: %s" % accuracy_score(target, final_tfidf.predict(X_test)))

In [None]:
# AUC is computed from column 1 of predict_proba on the test matrix.
print("TF_IDF AUC: %s" % roc_auc_score(target, final_tfidf.predict_proba(X_test)[:, 1]))

# Support Vector Machines (SVM)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
# Binary presence features over unigrams and bigrams; rebinds the
# `ngram_vectorizer` name used in the n-grams section.
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))


In [None]:
# Fit on the cleaned training reviews, then encode the test reviews with the
# fitted vectorizer. Rebinds `X` and `X_test` from the previous section.
X = ngram_vectorizer.fit_transform(reviews_train_clean)
X_test = ngram_vectorizer.transform(reviews_test_clean)

In [None]:
# Fit a linear SVM on the uni-/bi-gram training matrix and report accuracy on
# the test matrix; `target` serves as the label list for both splits.
final_svm_ngram = LinearSVC().fit(X, target)
print("SVC Accuracy: %s" % accuracy_score(target, final_svm_ngram.predict(X_test)))

# Final Model

In [None]:
# Short hand-picked stop-word list handed to the vectorizer below.
stop_words = ['in', 'of', 'at', 'a', 'the']
# Binary presence features over unigrams, bigrams and trigrams; rebinds the
# `ngram_vectorizer` name used in earlier sections.
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 3), stop_words=stop_words)


In [None]:
# Fit on the cleaned training reviews, then encode the test reviews with the
# fitted vectorizer. Rebinds `X` and `X_test` from the previous section.
X = ngram_vectorizer.fit_transform(reviews_train_clean)
X_test = ngram_vectorizer.transform(reviews_test_clean)

In [None]:
# Fit the final linear SVM on the 1-3 gram training matrix and report accuracy
# on the test matrix; `target` serves as the label list for both splits.
final = LinearSVC().fit(X, target)
print("Final Accuracy: %s" % accuracy_score(target, final.predict(X_test)))


# Top Positive and Negative Features

In [None]:
# Map every vocabulary entry of the final vectorizer to its coefficient in the
# final linear model.
# get_feature_names() is gone from recent scikit-learn releases; prefer
# get_feature_names_out() and fall back to the old name on older installs.
_feature_names = getattr(
    ngram_vectorizer, "get_feature_names_out", None
) or ngram_vectorizer.get_feature_names

# float() keeps the printed tuples as plain numbers regardless of how the
# installed numpy version renders its scalar types.
feature_to_coef = {
    word: float(coef) for word, coef in zip(_feature_names(), final.coef_[0])
}

# Ten features with the largest coefficients.
for best_positive in sorted(
    feature_to_coef.items(),
    key=lambda x: x[1],
    reverse=True)[:10]:
    print (best_positive)

print("\n\n")
# Ten features with the smallest (most negative) coefficients.
for best_negative in sorted(
    feature_to_coef.items(),
    key=lambda x: x[1])[:10]:
    print (best_negative)