In [49]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/imdb-sentiment/full_test_imdb.txt
/kaggle/input/imdb-sentiment/full_train_imdb.txt


# Taking in input

In [50]:
reviews_train = []
for line in open('/kaggle/input/imdb-sentiment/full_test_imdb.txt', 'r', encoding="utf8"):
    reviews_train.append(line.strip())
    
reviews_test = []
for line in open('/kaggle/input/imdb-sentiment/full_train_imdb.txt', 'r', encoding="utf8"):
    reviews_test.append(line.strip())

# Data Cleaning and Preprocessing

In [51]:
import re

REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()\[\]]")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")

def preprocess_reviews(reviews):
    reviews = [REPLACE_NO_SPACE.sub("", line.lower()) for line in reviews]
    reviews = [REPLACE_WITH_SPACE.sub(" ", line) for line in reviews]
    
    return reviews

reviews_train_clean = preprocess_reviews(reviews_train)
reviews_test_clean = preprocess_reviews(reviews_test)

## Vectorization

In [52]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(binary=True)
cv.fit(reviews_train_clean)
X = cv.transform(reviews_train_clean)
X_test = cv.transform(reviews_test_clean)

## Starting baseline classifier model

In [53]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

target = [1 if i < 12500 else 0 for i in range(25000)]

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c, solver='lbfgs', max_iter=5000)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))

Accuracy for C=0.01: 0.87712
Accuracy for C=0.05: 0.89104
Accuracy for C=0.25: 0.89408
Accuracy for C=0.5: 0.8928
Accuracy for C=1: 0.8904


## Finalizing the model

In [54]:
final_model = LogisticRegression(C=0.05, solver='lbfgs', max_iter=5000)
final_model.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_model.predict(X_test)))

Final Accuracy: 0.87968


### Running a sanity test

In [55]:
feature_to_coef = {
    word: coef for word, coef in zip(
        cv.get_feature_names_out(), final_model.coef_[0]
    )
}
for best_positive in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1], 
    reverse=True)[:5]:
    print (best_positive)
        
for best_negative in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1])[:5]:
    print (best_negative)

('excellent', 0.9039371419166145)
('hilarious', 0.7869229666130453)
('amazing', 0.7716903255364735)
('great', 0.7121650652025706)
('perfect', 0.7045421508983317)
('worst', -1.4331252589518797)
('waste', -1.251499765671806)
('awful', -1.0855116112674923)
('terrible', -0.9759964494662143)
('boring', -0.9010423701456618)


# Text Processing

### Removing stop words with the help of NLTK (Natural Language Toolkit)

In [56]:
import nltk
nltk.download('wordnet')
from nltk.corpus import stopwords

english_stop_words = stopwords.words('english')
def remove_stop_words(corpus):
    removed_stop_words = []
    for review in corpus:
        removed_stop_words.append(
            ' '.join([word for word in review.split() 
                      if word not in english_stop_words])
        )
    return removed_stop_words

no_stop_words = remove_stop_words(reviews_train_clean)

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Normalization with Stemming

In [57]:
def get_stemmed_text(corpus):
    from nltk.stem.porter import PorterStemmer
    stemmer = PorterStemmer()
    return [' '.join([stemmer.stem(word) for word in review.split()]) for review in corpus]

stemmed_reviews = get_stemmed_text(reviews_train_clean)

### Normalization with Lemmatization

In [58]:
#!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/
def get_lemmatized_text(corpus):
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    return [' '.join([lemmatizer.lemmatize(word) for word in review.split()]) for review in corpus]

lemmatized_reviews = get_lemmatized_text(reviews_train_clean)

#### Original

In [59]:
reviews_train_clean[0:3]

['i went and saw this movie last night after being coaxed to by a few friends of mine ill admit that i was reluctant to see it because from what i knew of ashton kutcher he was only able to do comedy i was wrong kutcher played the character of jake fischer very well and kevin costner played ben randall with such professionalism the sign of a good movie is that it can toy with our emotions this one did exactly that the entire theater which was sold out was overcome by laughter during the first half of the movie and were moved to tears during the second half while exiting the theater i not only saw many women in tears but many full grown men as well trying desperately not to let anyone see them crying this movie was great and i suggest that you go see it before you judge',
 'actor turned director bill paxton follows up his promising debut the gothic horror frailty with this family friendly sports drama about the 1913 us open where a young american caddy rises from his humble background t

#### Stemmed

In [60]:
stemmed_reviews[0:3]

['i went and saw thi movi last night after be coax to by a few friend of mine ill admit that i wa reluct to see it becaus from what i knew of ashton kutcher he wa onli abl to do comedi i wa wrong kutcher play the charact of jake fischer veri well and kevin costner play ben randal with such profession the sign of a good movi is that it can toy with our emot thi one did exactli that the entir theater which wa sold out wa overcom by laughter dure the first half of the movi and were move to tear dure the second half while exit the theater i not onli saw mani women in tear but mani full grown men as well tri desper not to let anyon see them cri thi movi wa great and i suggest that you go see it befor you judg',
 'actor turn director bill paxton follow up hi promis debut the gothic horror frailti with thi famili friendli sport drama about the 1913 us open where a young american caddi rise from hi humbl background to play against hi bristish idol in what wa dub as the greatest game ever play 

#### Lemmatized

In [61]:
lemmatized_reviews[0:3]

['i went and saw this movie last night after being coaxed to by a few friend of mine ill admit that i wa reluctant to see it because from what i knew of ashton kutcher he wa only able to do comedy i wa wrong kutcher played the character of jake fischer very well and kevin costner played ben randall with such professionalism the sign of a good movie is that it can toy with our emotion this one did exactly that the entire theater which wa sold out wa overcome by laughter during the first half of the movie and were moved to tear during the second half while exiting the theater i not only saw many woman in tear but many full grown men a well trying desperately not to let anyone see them cry this movie wa great and i suggest that you go see it before you judge',
 'actor turned director bill paxton follows up his promising debut the gothic horror frailty with this family friendly sport drama about the 1913 u open where a young american caddy rise from his humble background to play against hi

### Ngram

In [62]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
ngram_vectorizer.fit(reviews_train_clean)
X = ngram_vectorizer.transform(reviews_train_clean)
X_test = ngram_vectorizer.transform(reviews_test_clean)

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c, solver='lbfgs', max_iter=5000)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))
        
final_ngram = LogisticRegression(C=0.5, solver='lbfgs', max_iter=5000)
final_ngram.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_ngram.predict(X_test)))

Accuracy for C=0.01: 0.88896
Accuracy for C=0.05: 0.89568
Accuracy for C=0.25: 0.89872
Accuracy for C=0.5: 0.89888
Accuracy for C=1: 0.89776
Final Accuracy: 0.89556


## Building up Vectors

#### Word count

In [63]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

wc_vectorizer = CountVectorizer(binary=False)
wc_vectorizer.fit(reviews_train_clean)
X = wc_vectorizer.transform(reviews_train_clean)
X_test = wc_vectorizer.transform(reviews_test_clean)

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75, 
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c, solver='lbfgs', max_iter=5000)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))
        
final_wc = LogisticRegression(C=0.05, solver='lbfgs', max_iter=5000)
final_wc.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_wc.predict(X_test)))

Accuracy for C=0.01: 0.88896
Accuracy for C=0.05: 0.89424
Accuracy for C=0.25: 0.89072
Accuracy for C=0.5: 0.8904
Accuracy for C=1: 0.89056
Final Accuracy: 0.87848


#### TF-IDF

In [64]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(reviews_train_clean)
X = tfidf_vectorizer.transform(reviews_train_clean)
X_test = tfidf_vectorizer.transform(reviews_test_clean)

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c, solver='lbfgs', max_iter=5000)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))
 
final_tfidf = LogisticRegression(C=1, solver='lbfgs', max_iter=5000)
final_tfidf.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_tfidf.predict(X_test)))

Accuracy for C=0.01: 0.7984
Accuracy for C=0.05: 0.8384
Accuracy for C=0.25: 0.87696
Accuracy for C=0.5: 0.88736
Accuracy for C=1: 0.89216
Final Accuracy: 0.87944


### Trying model using Support Vector Machines (SVM)


In [65]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
ngram_vectorizer.fit(reviews_train_clean)
X = ngram_vectorizer.transform(reviews_train_clean)
X_test = ngram_vectorizer.transform(reviews_test_clean)

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    svm = LinearSVC(C=c, max_iter=5000)
    svm.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, svm.predict(X_val))))
        
final_svm_ngram = LinearSVC(C=0.01, max_iter=5000)
final_svm_ngram.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_svm_ngram.predict(X_test)))

Accuracy for C=0.01: 0.89728
Accuracy for C=0.05: 0.8944
Accuracy for C=0.25: 0.8928
Accuracy for C=0.5: 0.89248
Accuracy for C=1: 0.89248
Final Accuracy: 0.89512


## Final Model using GridCV and ensemble of models

In [77]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score

stop_words = ['in', 'of', 'at', 'a', 'the', 'is', 'and', 'it', 'to', 'this', 'that', 'with', 'for', 'on', 'you', 'was']
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3), stop_words=stop_words, 
                                   sublinear_tf=True, max_features=500000)
tfidf_vectorizer.fit(reviews_train_clean)
X = tfidf_vectorizer.transform(reviews_train_clean)
X_test = tfidf_vectorizer.transform(reviews_test_clean)

X_train, X_val, y_train, y_val = train_test_split(X, target, train_size=0.75, stratify=target, random_state=42)

param_grid = {'C': [0.001, 0.005, 0.01, 0.05, 0.1, 1, 5, 10, 20, 50]}
grid_search = GridSearchCV(LinearSVC(max_iter=5000, dual=False), param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)
best_svm = grid_search.best_estimator_
print(f"Best C for LinearSVC: {grid_search.best_params_['C']}")

sgd_svm = SGDClassifier(loss='hinge', max_iter=2000, tol=1e-4, n_jobs=-1)
sgd_svm.fit(X_train, y_train)

logreg = LogisticRegression(penalty='l2', solver='saga', max_iter=2000, n_jobs=-1)
logreg.fit(X_train, y_train)

stacking = StackingClassifier(estimators=[('svm', best_svm), ('sgd', sgd_svm), ('logreg', logreg)], 
                              final_estimator=LogisticRegression(max_iter=2000, n_jobs=-1), n_jobs=-1, verbose=1)
stacking.fit(X_train, y_train)

models = {'LinearSVC': best_svm, 'SGD SVM': sgd_svm, 'LogReg': logreg, 'Stacking': stacking}

for name, model in models.items():
    y_pred = model.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    print(f"{name} Validation Accuracy: {acc:.5f}")

final_accuracy = accuracy_score(target, stacking.predict(X_test))
print(f"Final Accuracy (Stacking Model): {final_accuracy:.5f}")


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best C for LinearSVC: 1
LinearSVC Validation Accuracy: 0.92384
SGD SVM Validation Accuracy: 0.91648
LogReg Validation Accuracy: 0.90640
Stacking Validation Accuracy: 0.92480
Final Accuracy (Stacking Model): 0.90112
