# Imports

In [None]:
import nltk
import pandas as pd
import chardet
from nltk.tokenize import word_tokenize
from xgboost import XGBClassifier
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
import pickle
import os
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

import re
import numpy as np


In [None]:
# Fetch NLTK data via the "all" identifier so the tokenizer, stop-word list, POS tagger
# and WordNet lemmatizer imported above can find their resources.
# NOTE(review): "all" presumably pulls every NLTK package, far more than this notebook
# uses - consider listing only the resources that are actually required.
nltk.download("all")

# Functions

In [40]:
def summary(dataframe):
    """Print the first rows of ``dataframe`` followed by its descriptive statistics.

    Args:
        dataframe: Object exposing ``head()`` and ``describe()`` (e.g. a pandas DataFrame).

    Returns:
        None. Everything is written to stdout.
    """
    sections = (("\t Head", "head"), ("\t Description", "describe"))
    for heading, method_name in sections:
        print(heading)
        # Resolve and call the method only after its heading has been printed.
        print(getattr(dataframe, method_name)())

def null_checker(dataframe):
    """Print the number of null values found in each column of ``dataframe``.

    Returns:
        None. The per-column counts are written to stdout.
    """
    nulls_per_column = dataframe.isnull().sum()
    print(nulls_per_column)

def duplicated_checker(dataframe):
    """Print the rows of ``dataframe`` that ``duplicated()`` flags as repeats.

    Returns:
        None. The matching rows are written to stdout.
    """
    is_repeat = dataframe.duplicated()
    repeated_rows = dataframe.loc[is_repeat, :]
    print(repeated_rows)


def reader(path):
    """Load a CSV file into a DataFrame using the encoding guessed by chardet.

    Args:
        path: Location of the CSV file.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with the detected encoding.
    """
    # The whole file is read as bytes so chardet can guess its encoding.
    with open(path, 'rb') as raw_file:
        raw_bytes = raw_file.read()
    detection = chardet.detect(raw_bytes)
    return pd.read_csv(path, encoding=detection['encoding'])

def tokenizer(dataset):
    """Return a copy of ``dataset`` whose ``review`` column holds word tokens.

    String reviews are lower-cased before tokenization; any other value is
    converted with ``str()`` and tokenized as is. The input frame is not modified.

    Args:
        dataset: DataFrame with a ``review`` column.

    Returns:
        A deep copy of ``dataset`` with ``review`` replaced by lists of tokens.
    """
    def _to_tokens(raw_review):
        if isinstance(raw_review, str):
            text = raw_review.lower()
        else:
            text = str(raw_review)
        return word_tokenize(text)

    tokenized = dataset.copy(deep=True)
    tokenized["review"] = tokenized.review.map(_to_tokens)
    return tokenized

def stopwords_remover(dataset, stopwords_path="/kaggle/input/imdb-reviews/stopwords.txt"):
    """Return a copy of ``dataset`` with stop words and non-word tokens removed.

    A token is kept only when it is not a stop word and consists solely of
    word characters (``^\\w+$``). The stop-word set is the union of NLTK's English
    list, the comma-separated entries of ``stopwords_path`` and a few fixed extras.

    Args:
        dataset: DataFrame whose ``review`` column holds lists of tokens.
        stopwords_path: Comma-separated stop-word file. Defaults to the path the
            notebook has always used.

    Returns:
        A new DataFrame; ``dataset`` itself is left untouched so the calling cell
        can be re-run safely.
    """
    with open(stopwords_path) as file:
        # Strip surrounding whitespace/newlines so "a, b\n" yields "a" and "b",
        # and drop empty entries produced by trailing commas.
        custom_stopwords = [entry.strip() for entry in file.read().split(",")]

    stop_words = set(stopwords.words('english'))
    stop_words.update(entry for entry in custom_stopwords if entry)
    stop_words.update(("footnote", "sidenote", "project", "gutenberg"))

    # Compiled once instead of once per token.
    word_pattern = re.compile(r"^\w+$")

    def _filter(tokens):
        return [word for word in tokens if word not in stop_words and word_pattern.match(word)]

    return dataset.assign(review=dataset["review"].map(_filter))

def punctuation_remover(dataset):
    """Return a copy of ``dataset`` without tokens made only of punctuation.

    The previous implementation tested ``word not in <long string>``, i.e. a
    substring test, so only tokens that happened to be a contiguous slice of that
    string were dropped. Tokens are now dropped when every character belongs to the
    punctuation set, which covers everything the old test removed plus runs such as
    ``...`` or ``!!``.

    Args:
        dataset: DataFrame whose ``review`` column holds lists of string tokens.

    Returns:
        A new DataFrame; ``dataset`` itself is left untouched.
    """
    # ASCII punctuation plus the typographic quotes the original list included.
    punctuation_chars = frozenset(string.punctuation + "“”’‘")

    def _is_punctuation(token):
        # An empty token also counts as punctuation (the old substring test dropped it too).
        return all(char in punctuation_chars for char in token)

    def _filter(tokens):
        return [token for token in tokens if not _is_punctuation(token)]

    return dataset.assign(review=dataset["review"].map(_filter))

def pos_tagger(dataset):
    """Return a copy of ``dataset`` whose tokens carry universal POS tags.

    Each review becomes a list of ``(token, tag)`` tuples as produced by
    ``pos_tag(..., tagset='universal')``; tuples tagged ``"NUM"`` are discarded.

    Args:
        dataset: DataFrame whose ``review`` column holds lists of tokens.

    Returns:
        A new DataFrame; ``dataset`` itself is left untouched, so re-running the
        calling cell does not try to tag already-tagged tuples.
    """
    def _tag(tokens):
        return [tagged for tagged in pos_tag(tokens, tagset='universal') if tagged[1] != "NUM"]

    return dataset.assign(review=dataset["review"].map(_tag))

def lemmatizer(dataset):
    """Return a copy of ``dataset`` with every ``(token, tag)`` pair lemmatized.

    Args:
        dataset: DataFrame whose ``review`` column holds lists of
            ``(token, universal_tag)`` tuples.

    Returns:
        A new DataFrame whose ``review`` column holds lists of lemma strings;
        ``dataset`` itself is left untouched.
    """
    lem = WordNetLemmatizer()

    def _lemmatize(tagged_tokens):
        # Unpacking each pair (rather than indexing [0]/[1]) makes a row of plain
        # strings fail loudly in most cases instead of silently lemmatizing single characters.
        return [lem.lemmatize(token, pos=get_pos_tag(tag)) for token, tag in tagged_tokens]

    return dataset.assign(review=dataset["review"].map(_lemmatize))

def get_pos_tag(pos):
    """Translate a universal POS tag into the one-letter code passed to the lemmatizer.

    Args:
        pos: Tag such as ``"NOUN"``, ``"VERB"``, ``"ADJ"`` or ``"ADV"``.

    Returns:
        ``"n"``, ``"v"``, ``"a"`` or ``"r"`` for the four tags above and ``"s"``
        for anything else.
    """
    # NOTE(review): "s" is presumably WordNet's satellite-adjective code - confirm
    # that this is the intended fallback for every other tag.
    if pos == "NOUN":
        return "n"
    if pos == "VERB":
        return "v"
    if pos == "ADJ":
        return "a"
    if pos == "ADV":
        return "r"
    return "s"

def tf_idf_calculator(title, content):
    """Build a term-by-document TF-IDF table.

    Args:
        title: Column labels for the result, one per document in ``content``.
        content: Iterable of raw document strings.

    Returns:
        A ``(table, vectorizer)`` tuple. ``table`` is a dense DataFrame with one row
        per term and one column per document; ``vectorizer`` is the fitted
        ``TfidfVectorizer``.
    """
    vectorizer = TfidfVectorizer(strip_accents="ascii", max_df=0.6)
    term_document = vectorizer.fit_transform(content).transpose()

    # Row labels must follow the matrix's feature order. ``vocabulary_`` is a dict
    # whose iteration order is not that order, so using it as the index attached
    # terms to the wrong rows.
    terms = vectorizer.get_feature_names_out()

    table = pd.DataFrame(data=term_document.toarray(), index=terms, columns=title)
    return (table, vectorizer)

In [125]:
try:
    dataset = reader("/kaggle/input/imdb-reviews/imdb/imdb.csv")
except Exception as e:
    # Report the problem, then re-raise: without `dataset` every later cell would
    # only fail with an unrelated NameError.
    print(f"Something went wrong while reading the file: \n{e}")
    raise

In [42]:
# Show fully duplicated rows, if any.
dataset.loc[dataset.duplicated()]

Unnamed: 0.1,Unnamed: 0,type,review,label,file


In [43]:
# Print the number of null values per column (see null_checker).
null_checker(dataset)

Unnamed: 0    0
type          0
review        0
label         0
file          0
dtype: int64


In [126]:
# Class balance of the raw labels.
dataset["label"].value_counts()

label
unsup    50000
neg      25000
pos      25000
Name: count, dtype: int64

In [127]:
# Remove the two columns the analysis does not use. Rebinding instead of
# `inplace=True`, together with errors="ignore", lets the cell be re-run without
# a KeyError once the columns are already gone.
dataset = dataset.drop(columns=["Unnamed: 0", "file"], errors="ignore")
dataset.head()

Unnamed: 0,type,review,label
0,test,Once again Mr. Costner has dragged out a movie...,neg
1,test,This is an example of why the majority of acti...,neg
2,test,"First of all I hate those moronic rappers, who...",neg
3,test,Not even the Beatles could write songs everyon...,neg
4,test,Brass pictures (movies is not a fitting word f...,neg


In [128]:
# Encode the sentiment label as an integer: "neg" -> 0, "pos" -> 1, anything else
# (here "unsup") -> 2. Already-encoded 0/1 values map to themselves, so re-running
# the cell no longer turns every label into 2.
label_codes = {"neg": 0, "pos": 1, 0: 0, 1: 1}
dataset["label"] = dataset["label"].apply(lambda label: label_codes.get(label, 2))
dataset

Unnamed: 0,type,review,label
0,test,Once again Mr. Costner has dragged out a movie...,0
1,test,This is an example of why the majority of acti...,0
2,test,"First of all I hate those moronic rappers, who...",0
3,test,Not even the Beatles could write songs everyon...,0
4,test,Brass pictures (movies is not a fitting word f...,0
...,...,...,...
99995,train,"Delightfully awful! Made by David Giancola, a ...",2
99996,train,"Watching Time Chasers, it obvious that it was ...",2
99997,train,At the beginning we can see members of Troma t...,2
99998,train,"The movie was incredible, ever since I saw it ...",2


In [129]:
# Discard the reviews whose label is 2 so that only labels 0 and 1 remain.
rows_to_drop = dataset.index[dataset["label"].eq(2).to_numpy()]
dataset.drop(index=rows_to_drop, inplace=True)
dataset

Unnamed: 0,type,review,label
0,test,Once again Mr. Costner has dragged out a movie...,0
1,test,This is an example of why the majority of acti...,0
2,test,"First of all I hate those moronic rappers, who...",0
3,test,Not even the Beatles could write songs everyon...,0
4,test,Brass pictures (movies is not a fitting word f...,0
...,...,...,...
49995,train,"Seeing as the vote average was pretty low, and...",1
49996,train,"The plot had some wretched, unbelievable twist...",1
49997,train,I am amazed at how this movie(and most others ...,1
49998,train,A Christmas Together actually came before my t...,1


In [130]:
# Tokenize every review (string reviews are lower-cased first). tokenizer works on
# a copy, so `dataset` keeps the raw text.
tokenized_reviews = tokenizer(dataset)
tokenized_reviews

Unnamed: 0,type,review,label
0,test,"[once, again, mr., costner, has, dragged, out,...",0
1,test,"[this, is, an, example, of, why, the, majority...",0
2,test,"[first, of, all, i, hate, those, moronic, rapp...",0
3,test,"[not, even, the, beatles, could, write, songs,...",0
4,test,"[brass, pictures, (, movies, is, not, a, fitti...",0
...,...,...,...
49995,train,"[seeing, as, the, vote, average, was, pretty, ...",1
49996,train,"[the, plot, had, some, wretched, ,, unbelievab...",1
49997,train,"[i, am, amazed, at, how, this, movie, (, and, ...",1
49998,train,"[a, christmas, together, actually, came, befor...",1


In [131]:
# Drop stop words and every token that is not made purely of word characters.
without_stopwords = stopwords_remover(tokenized_reviews)
without_stopwords

Unnamed: 0,type,review,label
0,test,"[costner, dragged, movie, longer, terrific, se...",0
1,test,"[majority, action, films, generic, boring, wor...",0
2,test,"[hate, moronic, rappers, gun, pressed, forehea...",0
3,test,"[beatles, write, songs, walter, hill, thought,...",0
4,test,"[brass, pictures, movies, fitting, word, brass...",0
...,...,...,...
49995,train,"[vote, average, pretty, low, fact, clerk, vide...",1
49996,train,"[plot, wretched, unbelievable, twists, chemist...",1
49997,train,"[amazed, movie, average, 5, stars, lower, crap...",1
49998,train,"[christmas, time, raised, john, denver, songs,...",1


In [132]:
# Remove the punctuation tokens that survived the previous step.
without_punctuation = punctuation_remover(without_stopwords)
without_punctuation

Unnamed: 0,type,review,label
0,test,"[costner, dragged, movie, longer, terrific, se...",0
1,test,"[majority, action, films, generic, boring, wor...",0
2,test,"[hate, moronic, rappers, gun, pressed, forehea...",0
3,test,"[beatles, write, songs, walter, hill, thought,...",0
4,test,"[brass, pictures, movies, fitting, word, brass...",0
...,...,...,...
49995,train,"[vote, average, pretty, low, fact, clerk, vide...",1
49996,train,"[plot, wretched, unbelievable, twists, chemist...",1
49997,train,"[amazed, movie, average, 5, stars, lower, crap...",1
49998,train,"[christmas, time, raised, john, denver, songs,...",1


In [133]:
# Attach a universal POS tag to each token; tokens tagged NUM are discarded.
tagged_words = pos_tagger(without_punctuation)
tagged_words

Unnamed: 0,type,review,label
0,test,"[(costner, NOUN), (dragged, VERB), (movie, NOU...",0
1,test,"[(majority, NOUN), (action, NOUN), (films, NOU...",0
2,test,"[(hate, NOUN), (moronic, ADJ), (rappers, NOUN)...",0
3,test,"[(beatles, NOUN), (write, VERB), (songs, NOUN)...",0
4,test,"[(brass, NOUN), (pictures, NOUN), (movies, NOU...",0
...,...,...,...
49995,train,"[(vote, NOUN), (average, ADJ), (pretty, ADV), ...",1
49996,train,"[(plot, NOUN), (wretched, VERB), (unbelievable...",1
49997,train,"[(amazed, ADJ), (movie, NOUN), (average, NOUN)...",1
49998,train,"[(christmas, NOUN), (time, NOUN), (raised, VER...",1


In [134]:
# Reduce every (token, tag) pair to its lemma, using the tag to pick the lemmatizer's POS.
lemmatized_words = lemmatizer(tagged_words)
lemmatized_words

Unnamed: 0,type,review,label
0,test,"[costner, drag, movie, longer, terrific, sea, ...",0
1,test,"[majority, action, film, generic, boring, wort...",0
2,test,"[hate, moronic, rapper, gun, press, forehead, ...",0
3,test,"[beatles, write, song, walter, hill, think, pr...",0
4,test,"[brass, picture, movie, fit, word, brassy, all...",0
...,...,...,...
49995,train,"[vote, average, pretty, low, fact, clerk, vide...",1
49996,train,"[plot, wretched, unbelievable, twist, chemistr...",1
49997,train,"[amazed, movie, average, star, low, crappy, mo...",1
49998,train,"[christmas, time, raise, john, denver, song, s...",1


In [135]:
# Write the lemmatized frame to disk - presumably so the preprocessing above can be
# skipped on later runs (see the commented-out load cell that follows).
lemmatized_words.to_pickle("lemmatized2.pickle")

In [54]:
# lemfile = open('/kaggle/input/lemmatized/lemmatized.pickle', 'rb')
# lemmatized_words = pickle.load(lemfile)

In [136]:
# Display the lemmatized frame that the vectorization step starts from.
lemmatized_words

Unnamed: 0,type,review,label
0,test,"[costner, drag, movie, longer, terrific, sea, ...",0
1,test,"[majority, action, film, generic, boring, wort...",0
2,test,"[hate, moronic, rapper, gun, press, forehead, ...",0
3,test,"[beatles, write, song, walter, hill, think, pr...",0
4,test,"[brass, picture, movie, fit, word, brassy, all...",0
...,...,...,...
49995,train,"[vote, average, pretty, low, fact, clerk, vide...",1
49996,train,"[plot, wretched, unbelievable, twist, chemistr...",1
49997,train,"[amazed, movie, average, star, low, crappy, mo...",1
49998,train,"[christmas, time, raise, john, denver, song, s...",1


In [137]:
# Work on a copy so that joining the tokens back into text (next cell) does not
# overwrite the token lists held in `lemmatized_words`.
cp_lemmatized_words = lemmatized_words.copy(deep=True)

In [138]:
# Re-join each token list into one whitespace-separated document. Rows that are
# already strings are left alone: re-running this cell used to join the individual
# characters of the previously joined text.
cp_lemmatized_words["review"] = cp_lemmatized_words["review"].map(
    lambda tokens: tokens if isinstance(tokens, str) else " ".join(tokens)
)
reviews = cp_lemmatized_words["review"].tolist()

# Bag-of-words counts over the whole corpus; `vectorized` is reused later to
# transform new text, `fitted` is the document-term matrix.
vectorized = CountVectorizer()
fitted = vectorized.fit_transform(reviews)
fitted

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 3416365 stored elements and shape (50000, 86923)>

In [114]:
# NOTE(review): the output recorded under this cell carries an older execution count
# than the surrounding cells and shows rows/labels that the current pipeline no longer
# produces - re-run the notebook top to bottom to refresh it.
cp_lemmatized_words

Unnamed: 0,type,review,label
0,test,once again mr. costner have drag out a movie f...,1
1,test,this be an example of why the majority of acti...,1
2,test,first of all i hate those moronic rapper who c...,1
3,test,not even the beatles could write song everyone...,1
4,test,brass picture movie be not a fitting word for ...,1
...,...,...,...
99995,train,delightfully awful make by david giancola a gu...,1
99996,train,watch time chaser it obvious that it be make b...,1
99997,train,at the beginning we can see member of troma te...,1
99998,train,the movie be incredible ever since i saw it in...,1


In [139]:
# Stratified 70/30 split of the document-term matrix and the labels. The split is
# seeded so the accuracies reported below are reproducible across runs.
X = fitted
y = cp_lemmatized_words["label"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=0
)

0.869

In [140]:
# Gradient-boosted trees: depth 5, 1000 estimators.
xgb = XGBClassifier(max_depth=5, n_estimators=1000)
xgb.fit(X_train, y_train)

pred = xgb.predict(X_test)

accuracy_score(y_test, pred)

0.8678666666666667

0.8699

In [142]:
# Same model with shallower trees (depth 3) for comparison.
xgb = XGBClassifier(max_depth=3, n_estimators=1000)
xgb.fit(X_train, y_train)

pred = xgb.predict(X_test)

accuracy_score(y_test, pred)

0.8678666666666667

In [143]:
from sklearn.svm import LinearSVC

# Linear SVM baseline; seeded so the reported accuracy can be reproduced.
model = LinearSVC(max_iter=10000, random_state=0)
model.fit(X_train, y_train)
predicts = model.predict(X_test)
accuracy_score(y_test, predicts)

0.8485333333333334

In [None]:
# Depth-3 model again, this time with the objective spelled out explicitly.
xgb = XGBClassifier(max_depth=3, n_estimators=1000, objective = 'binary:logistic' )
xgb.fit(X_train, y_train)

pred = xgb.predict(X_test)

accuracy_score(y_test, pred)

0.851

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters; seeded so the reported accuracy
# can be reproduced.
model = RandomForestClassifier(random_state=0)
model.fit(X_train, y_train)
pred_rfc = model.predict(X_test)
accuracy_score(y_test, pred_rfc)

0.8656

In [None]:
from sklearn.linear_model import SGDClassifier

# Linear model trained with SGD; seeded (as in the chi2 experiment further down)
# so the reported accuracy can be reproduced.
# NOTE: `pred_rfc` is just the name reused from the random-forest cell.
model = SGDClassifier(max_iter=10000, tol=1e-3, random_state=0)
model.fit(X_train, y_train)
pred_rfc = model.predict(X_test)
accuracy_score(y_test, pred_rfc)

0.8632

In [None]:
# SGD with an explicit hinge loss and a tighter tolerance; seeded so the reported
# accuracy can be reproduced.
model = SGDClassifier(max_iter=10000, loss='hinge', tol=1e-4, alpha=0.0001, random_state=0)
model.fit(X_train, y_train)
pred_rfc = model.predict(X_test)
accuracy_score(y_test, pred_rfc)

0.8352

In [145]:
from sklearn.naive_bayes import BernoulliNB
# Naive Bayes baseline (Bernoulli variant) fitted on the count matrix.
# NOTE: `pred_rfc` is just the name reused from the random-forest cell; here it
# holds the BernoulliNB predictions.
model = BernoulliNB()
model.fit(X_train, y_train)
pred_rfc = model.predict(X_test)
accuracy_score(y_test, pred_rfc)

0.8334666666666667

0.8556

In [146]:
from sklearn.naive_bayes import MultinomialNB
# Naive Bayes baseline (multinomial variant) fitted on the count matrix.
# NOTE: `pred_rfc` is just the name reused from the random-forest cell; here it
# holds the MultinomialNB predictions.
model = MultinomialNB()
model.fit(X_train, y_train)
pred_rfc = model.predict(X_test)
accuracy_score(y_test, pred_rfc)

0.8515333333333334

In [147]:
from sklearn.kernel_approximation import AdditiveChi2Sampler
from sklearn.linear_model import SGDClassifier

# Approximate an additive chi-squared kernel, then train a linear SGD model on the
# transformed features.
chi2sampler = AdditiveChi2Sampler(sample_steps=2)
X_transformed = chi2sampler.fit_transform(X_train, y_train)

model = SGDClassifier(max_iter=50, random_state=0, tol=1e-3)
model.fit(X_transformed, y_train)
# The sampler fitted on the training data is only *applied* to the test data;
# a transformer must never be re-fitted on the evaluation set.
pred_rfc = model.predict(chi2sampler.transform(X_test))
accuracy_score(y_test, pred_rfc)

0.7316666666666667

In [148]:
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import MultinomialNB
# Bagging ensemble of five MultinomialNB models. Seeded because the bootstrap
# samples are drawn at random, which otherwise changes the score from run to run.
bagging_classifier = BaggingClassifier(estimator=MultinomialNB(), n_estimators=5, random_state=0)
bagging_classifier.fit(X_train, y_train)
predictions = bagging_classifier.predict(X_test)
accuracy_score(y_test, predictions)

0.8516

In [149]:
from sklearn.ensemble import BaggingClassifier
from xgboost import XGBClassifier
# Bagging ensemble of five depth-3 XGBoost models; this is the classifier that gets
# pickled further down. Seeded because the bootstrap samples are drawn at random,
# which otherwise makes the saved model impossible to reproduce.
bagging_classifier = BaggingClassifier(
    estimator=XGBClassifier(max_depth=3, n_estimators=1000),
    n_estimators=5,
    random_state=0,
)
bagging_classifier.fit(X_train, y_train)
predictions = bagging_classifier.predict(X_test)
accuracy_score(y_test, predictions)

0.873

In [160]:
import pickle
# Persist the fitted bagging classifier and the fitted vectorizer to disk.
# Both files are needed to classify new text, because predictions are made on the
# output of `vectorized.transform(...)`.

with open("model.pkl", "wb") as f:
    pickle.dump(bagging_classifier, f)
with open("vectorized.pkl", "wb") as f:
    pickle.dump(vectorized, f)

In [161]:
def predict_sentiment(texts, vectorizer, classifier):
    """Classify raw review strings with the same preprocessing used for training.

    Keeping the intermediate frames local avoids overwriting the notebook-level
    `without_stopwords`, `tagged_words` and `lemmatized_words` variables that hold
    the training data.

    Args:
        texts: Iterable of raw review strings.
        vectorizer: Fitted vectorizer exposing ``transform``.
        classifier: Fitted model exposing ``predict``.

    Returns:
        Whatever ``classifier.predict`` returns for the vectorized reviews.
    """
    frame = pd.DataFrame({"review": list(texts)})
    # Same order as the training pipeline:
    # tokens -> stop words -> punctuation -> POS tags -> lemmas.
    frame = tokenizer(frame)
    frame = stopwords_remover(frame)
    frame = punctuation_remover(frame)
    frame = pos_tagger(frame)
    frame = lemmatizer(frame)
    documents = [" ".join(str(token) for token in tokens) for tokens in frame["review"]]
    return classifier.predict(vectorizer.transform(documents))


sample_reviews = [
    "Avoid this movie at all costs, everything about it is bad",
    "I love it",
    "It's a great movie !",
]
predict_sentiment(sample_reviews, vectorized, bagging_classifier)


array([0, 1, 1])