# Importations

In [1]:
import nltk
import pandas as pd
import chardet
from nltk.tokenize import word_tokenize
from xgboost import XGBClassifier
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
from nltk import pos_tag
import pickle
import os
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
import re
import numpy as np


In [2]:
nltk.download("all")

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /usr/share/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /usr/share/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /usr/share/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /usr/share/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /usr/share/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron

True

# Functions

In [3]:
def summary(dataframe):
    print("\t Head")
    print(dataframe.head())
    print("\t Description")
    print(dataframe.describe())

def null_checker(dataframe):
    print(dataframe.isnull().sum())

def duplicated_checker(dataframe):
    print(dataframe.loc[dataframe.duplicated(),:])


def reader(path):
    with open(path, 'rb') as f:
        result = chardet.detect(f.read())
    reviews = pd.read_csv(path, encoding=result['encoding'])    
    return reviews    

def tokenizer(dataset):
    new_dataset = dataset.copy(deep=True)
    new_dataset["review"] = new_dataset.review.map(lambda x: word_tokenize(x.lower() if isinstance(x, str) else str(x) ))
    return new_dataset

def stopwords_remover(dataset):    
    with open("/kaggle/input/imdb-reviews/stopwords.txt") as file:
        custom_stopwords = file.read().split(",")
        
    stop_words = set(stopwords.words('english') + custom_stopwords + ["footnote", "sidenote", "project", "gutenberg"])

    regex = r"^\w+$"

    dataset["review"] = dataset.review.map(lambda x: [word for word in x if (word not in stop_words and re.match(regex, word))])
    return dataset

def punctuation_remover(dataset):
    punctuation = string.punctuation + "``" + "''" + "--" + "_" + "(" + ")" + '""' + "|" + "“" + "”" + "’" + "‘" + "___"
    dataset["review"] = dataset.review.map(lambda x: [word for word in x if word not in punctuation])
    return dataset

def pos_tagger(dataset):
    dataset["review"] = dataset.review.map(lambda x: [tagged for tagged in pos_tag(x,tagset='universal') if tagged[1] not in ["NUM"] ])
    return dataset

def lemmatizer(dataset):
    lem = WordNetLemmatizer()
    dataset["review"] = dataset.review.map(lambda row: [ lem.lemmatize(word[0], pos = get_pos_tag(word[1])) for word in row ])
    return dataset    

def get_pos_tag(pos):    
    match pos:
        case "NOUN":
            result = "n"
        case "VERB":
            result = "v"
        case "ADJ":
            result = "a"
        case "ADV":
            result = "r"
        case _:
            result = "s"
    return result

def tf_idf_calculator(title, content):

    vectorizer = TfidfVectorizer(strip_accents = "ascii", max_df = 0.6)
    m = vectorizer.fit_transform(content).transpose() 
       
    return  (pd.DataFrame( data = m.toarray(), index = vectorizer.vocabulary_, columns = title), vectorizer)

In [4]:
try:
    dataset = reader("/kaggle/input/imdb-reviews/imdb/imdb.csv")
except Exception as e:
    print(f"Something when wrong while reading the file: \n{e}")

In [5]:
dataset[dataset.duplicated()]

Unnamed: 0.1,Unnamed: 0,type,review,label,file


In [6]:
null_checker(dataset)

Unnamed: 0    0
type          0
review        0
label         0
file          0
dtype: int64


In [7]:
dataset.label.value_counts()

label
unsup    50000
neg      25000
pos      25000
Name: count, dtype: int64

In [8]:
dataset.drop(["Unnamed: 0","file"], axis=1, inplace=True)
dataset.head()

Unnamed: 0,type,review,label
0,test,Once again Mr. Costner has dragged out a movie...,neg
1,test,This is an example of why the majority of acti...,neg
2,test,"First of all I hate those moronic rappers, who...",neg
3,test,Not even the Beatles could write songs everyon...,neg
4,test,Brass pictures (movies is not a fitting word f...,neg


In [9]:
rows_to_drop = dataset[dataset["label"] == 2].index
dataset.drop(rows_to_drop, inplace=True)

In [10]:
dataset["label"] = dataset.label.apply(lambda label: 0 if label == "neg" else 1)

In [11]:
tokenized_reviews = tokenizer(dataset)
tokenized_reviews

Unnamed: 0,type,review,label
0,test,"[once, again, mr., costner, has, dragged, out,...",0
1,test,"[this, is, an, example, of, why, the, majority...",0
2,test,"[first, of, all, i, hate, those, moronic, rapp...",0
3,test,"[not, even, the, beatles, could, write, songs,...",0
4,test,"[brass, pictures, (, movies, is, not, a, fitti...",0
...,...,...,...
99995,train,"[delightfully, awful, !, made, by, david, gian...",1
99996,train,"[watching, time, chasers, ,, it, obvious, that...",1
99997,train,"[at, the, beginning, we, can, see, members, of...",1
99998,train,"[the, movie, was, incredible, ,, ever, since, ...",1


In [12]:
# without_stopwords = stopwords_remover(tokenized_reviews)
# without_stopwords

In [13]:
without_punctuation = punctuation_remover(tokenized_reviews)
without_punctuation

Unnamed: 0,type,review,label
0,test,"[once, again, mr., costner, has, dragged, out,...",0
1,test,"[this, is, an, example, of, why, the, majority...",0
2,test,"[first, of, all, i, hate, those, moronic, rapp...",0
3,test,"[not, even, the, beatles, could, write, songs,...",0
4,test,"[brass, pictures, movies, is, not, a, fitting,...",0
...,...,...,...
99995,train,"[delightfully, awful, made, by, david, giancol...",1
99996,train,"[watching, time, chasers, it, obvious, that, i...",1
99997,train,"[at, the, beginning, we, can, see, members, of...",1
99998,train,"[the, movie, was, incredible, ever, since, i, ...",1


In [14]:
tagged_words = pos_tagger(without_punctuation)
tagged_words

Unnamed: 0,type,review,label
0,test,"[(once, ADV), (again, ADV), (mr., ADJ), (costn...",0
1,test,"[(this, DET), (is, VERB), (an, DET), (example,...",0
2,test,"[(first, ADV), (of, ADP), (all, DET), (i, ADJ)...",0
3,test,"[(not, ADV), (even, ADV), (the, DET), (beatles...",0
4,test,"[(brass, NOUN), (pictures, NOUN), (movies, NOU...",0
...,...,...,...
99995,train,"[(delightfully, ADV), (awful, ADJ), (made, VER...",1
99996,train,"[(watching, VERB), (time, NOUN), (chasers, NOU...",1
99997,train,"[(at, ADP), (the, DET), (beginning, NOUN), (we...",1
99998,train,"[(the, DET), (movie, NOUN), (was, VERB), (incr...",1


In [15]:
lemmatized_words = lemmatizer(tagged_words)
lemmatized_words

Unnamed: 0,type,review,label
0,test,"[once, again, mr., costner, have, drag, out, a...",0
1,test,"[this, be, an, example, of, why, the, majority...",0
2,test,"[first, of, all, i, hate, those, moronic, rapp...",0
3,test,"[not, even, the, beatles, could, write, song, ...",0
4,test,"[brass, picture, movie, be, not, a, fitting, w...",0
...,...,...,...
99995,train,"[delightfully, awful, make, by, david, giancol...",1
99996,train,"[watch, time, chaser, it, obvious, that, it, b...",1
99997,train,"[at, the, beginning, we, can, see, member, of,...",1
99998,train,"[the, movie, be, incredible, ever, since, i, s...",1


In [16]:
lemmatized_words.to_pickle("lemmatized2.pickle")

In [17]:
# lemfile = open('/kaggle/input/lemmatized/lemmatized.pickle', 'rb')
# lemmatized_words = pickle.load(lemfile)

In [18]:
lemmatized_words

Unnamed: 0,type,review,label
0,test,"[once, again, mr., costner, have, drag, out, a...",0
1,test,"[this, be, an, example, of, why, the, majority...",0
2,test,"[first, of, all, i, hate, those, moronic, rapp...",0
3,test,"[not, even, the, beatles, could, write, song, ...",0
4,test,"[brass, picture, movie, be, not, a, fitting, w...",0
...,...,...,...
99995,train,"[delightfully, awful, make, by, david, giancol...",1
99996,train,"[watch, time, chaser, it, obvious, that, it, b...",1
99997,train,"[at, the, beginning, we, can, see, member, of,...",1
99998,train,"[the, movie, be, incredible, ever, since, i, s...",1


In [19]:
cp_lemmatized_words = lemmatized_words.copy(deep=True)

In [20]:
cp_lemmatized_words["review"] = cp_lemmatized_words.review.map( lambda x: " ".join(x) )
reviews = cp_lemmatized_words.review.tolist()

vectorized = CountVectorizer(max_df=0.9, min_df=0.2)
fitted = vectorized.fit_transform(reviews)
fitted

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 3848283 stored elements and shape (100000, 103)>

In [21]:
X = fitted
y = cp_lemmatized_words["label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3)

0.869

In [22]:
xgb = XGBClassifier(max_depth=5, n_estimators=1000).fit(X_train, y_train)

pred = xgb.predict(X_test)

accuracy_score(y_test, pred)

0.7329666666666667

0.8699

In [23]:
xgb = XGBClassifier(max_depth=3, n_estimators=1000).fit(X_train, y_train)

pred = xgb.predict(X_test)

accuracy_score(y_test, pred)

0.7416

In [24]:
xgb = XGBClassifier(max_depth=3, n_estimators=1000, objective = 'binary:logistic' ).fit(X_train, y_train)

pred = xgb.predict(X_test)

accuracy_score(y_test, pred)

0.7416

0.851

In [25]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(X_train, y_train)
pred_rfc = model.predict(X_test)
accuracy_score(y_test, pred_rfc)

0.7507333333333334

0.8656

In [26]:
from sklearn.linear_model import SGDClassifier

model = SGDClassifier(max_iter=10000, tol=1e-3)
model.fit(X_train, y_train)
pred_rfc = model.predict(X_test)
accuracy_score(y_test, pred_rfc)

0.7484666666666666

0.8632

In [27]:
model = SGDClassifier(max_iter=10000, loss='hinge', tol=1e-4, alpha=0.0001)
model.fit(X_train, y_train)
pred_rfc = model.predict(X_test)
accuracy_score(y_test, pred_rfc)

0.75

0.8352

In [28]:
from sklearn.naive_bayes import BernoulliNB
model = BernoulliNB()
model.fit(X_train, y_train)
pred_rfc = model.predict(X_test)
accuracy_score(y_test, pred_rfc)

0.7162666666666667

0.8556

In [29]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train, y_train)
pred_rfc = model.predict(X_test)
accuracy_score(y_test, pred_rfc)

0.7036666666666667

In [30]:
from sklearn.kernel_approximation import AdditiveChi2Sampler
from sklearn.linear_model import SGDClassifier

chi2sampler = AdditiveChi2Sampler(sample_steps=2)
X_transformed = chi2sampler.fit_transform(X_train, y_train)

model = SGDClassifier(max_iter=50, random_state=0, tol=1e-3)
model.fit(X_transformed, y_train)
pred_rfc = model.predict(chi2sampler.fit_transform(X_test))
accuracy_score(y_test, pred_rfc)

0.75

In [34]:
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import MultinomialNB
bagging_classifier = BaggingClassifier(estimator=MultinomialNB(), n_estimators=5)
bagging_classifier.fit(X_train, y_train)
predictions = bagging_classifier.predict(X_test)
accuracy_score(y_test, predictions)

0.7041333333333334

In [36]:
from sklearn.ensemble import BaggingClassifier
from xgboost import XGBClassifier
bagging_classifier = BaggingClassifier(estimator=XGBClassifier(max_depth=3, n_estimators=1000), n_estimators=5)
bagging_classifier.fit(X_train, y_train)
predictions = bagging_classifier.predict(X_test)
accuracy_score(y_test, predictions)

0.7433666666666666

In [37]:
import pickle
# Save the trained model as a pickle string.

with open("model.pkl", "wb") as f:
    pickle.dump(bagging_classifier, f)