In [29]:
# import libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
tqdm.pandas()
import string
import re
import nltk
nltk.download(['movie_reviews', 'punkt', 'punkt_tab', 'wordnet', 'stopwords'])
from nltk import word_tokenize, sent_tokenize

from nltk.corpus import stopwords, movie_reviews
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score, classification_report

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\savic\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\savic\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\savic\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\savic\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\savic\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# get negative and positive reviews
pr, nr = [], []

for rate in ['pos', 'neg']:
    for fileid in movie_reviews.fileids(rate):
        words = movie_reviews.words(fileid)
        if rate == 'pos':
            pr.append(' '.join(words))
        else:
            nr.append(' '.join(words))
            
print(f'Positive reviews count: {len(pr)}')
print(f'Negative reviews count: {len(nr)}')

Reviews count: 2000


In [3]:
# get full dataframe
pos_df = pd.DataFrame(data=pr, columns=['review'])
pos_df['rate'] = 'pos'
neg_df = pd.DataFrame(data=nr, columns=['review'])
neg_df['rate'] = 'neg'
full_df = pd.concat([pos_df, neg_df], ignore_index=True)
full_df

Unnamed: 0,review,rate
0,films adapted from comic books have had plenty...,pos
1,every now and then a movie comes along from a ...,pos
2,you ' ve got mail works alot better than it de...,pos
3,""" jaws "" is a rare film that grabs your attent...",pos
4,moviemaking is a lot like being the general ma...,pos
...,...,...
1995,"if anything , "" stigmata "" should be taken as ...",neg
1996,"john boorman ' s "" zardoz "" is a goofy cinemat...",neg
1997,the kids in the hall are an acquired taste . i...,neg
1998,there was a time when john carpenter was a gre...,neg


In [4]:
# get full text preprocessing
def full_text_preprocessing(reviews_series: pd.Series = full_df['review'], 
                            tokenizer : nltk = sent_tokenize,
                            stopwords : nltk.corpus.stopwords.words = stopwords.words('english'),
                            stemmer : nltk.stem = None, 
                            lemmatizer : nltk.stem = WordNetLemmatizer()):

    """
    Function makes full text preprocessing like removing punctuation / stopwords, lowercasing, tokenizing, stemming / lemmatizing

    Args:
    * reviews_series - pd.Series of input text;
    * tokenizer - nltk.tokenizer working on input text; 
    * stopwords - stopwords for deleting from input text;
    * stemmer - stemmer for input text - works if lemmatizer is None;
    * lemmatizer - lemmatizer for input text - works if stemmer is None;

    Returns:
    * Output preprocessed text - pd.Series
    """

    # checking stemmer and lemmatizer
    if stemmer is None and lemmatizer is None:
        print('Check your stemmer and lemmatizer: both of them cannot be None')
        return
    elif stemmer is not None and lemmatizer is not None:
        print('Check your stemmer and lemmatizer: both of them cannot be not None')
        return
        
    # remove URL and HTML code
    def remove_url_and_html_words(tokens_lst: list):

        """
        Function deletes HTML and URL pattern from input tokens list
    
        Args:
        * tokens_lst;
    
        Returns:
        * Output tokens list without HTML and URL pattern - list
        """
        
        # define a regular expression pattern to match URLs
        url_pattern = r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"
        
        # define a regular expression pattern to match HTML tags
        html_pattern = r"<[^>]+>"
    
        # replace URLs with an empty string
        cleaned_lst = re.sub(url_pattern, "", tokens_lst)
    
        # replace HTML tags with an empty string
        cleaned_lst = re.sub(html_pattern, "", tokens_lst)
    
        return cleaned_lst

    print('Removing URL and HTML words from text...')
    reviews_series = reviews_series.progress_apply(lambda x: remove_url_and_html_words(x))
    
    # tokenization
    print('Tokenizing text...')
    tokens = reviews_series.progress_apply(lambda x: word_tokenize(x))

    # lowercasing
    print('Lowercasing text...')
    tokens = tokens.progress_apply(lambda x: list(map(str.lower, x)))

    # remove punctuation
    print('Removing punctuation from text...')
    def remove_punctation(tokens_lst: list):

        """
        Function deletes punctuation from input tokens list
    
        Args:
        * tokens_lst;
    
        Returns:
        * Output tokens list without punctuation - list
        """
        
        return [token for token in tokens_lst if token not in string.punctuation + '``']

    tokens = tokens.progress_apply(lambda x: remove_punctation(x))

    # remove stopwords
    print('Removing stopwords from text...')
    def remove_stopwords(tokens_lst: list):

        """
        Function deletes stopwords from input tokens list
    
        Args:
        * tokens_lst;
    
        Returns:
        * Output tokens list without stopwords - list
        """
        
        return [token for token in tokens_lst if token not in stopwords]

    tokens = tokens.progress_apply(lambda x: remove_stopwords(x))

    # remove the most common words (e.g., the top 10% of words by frequency)
    def remove_frequency_words(tokens_lst):
        return [token for token in tokens_lst if nltk.FreqDist(token)[token] < nltk.FreqDist(token).N() * 0.1]

    print('Removing the most common words from text...')
    tokens = tokens.progress_apply(lambda x: remove_frequency_words(x))

    # stemming or lemmatizing
    def stemming(tokens_lst: list):

        """
        Function makes stemming from input tokens list
    
        Args:
        * tokens_lst;
    
        Returns:
        * Output tokens list after stemming - list
        """
        
        return [stemmer.stem(token) for token in tokens_lst]

    def lemmatizing(tokens_lst: list):

        """
        Function makes lemmatizing from input tokens list
    
        Args:
        * tokens_lst;
    
        Returns:
        * Output tokens list after lemmatizing - list
        """
        
        return [lemmatizer.lemmatize(token) for token in tokens_lst]

    if stemmer is not None:
        print('Stemming...')
        tokens = tokens.progress_apply(lambda x: stemming(x))
    else:
        print('Lemmatizing...')
        tokens = tokens.progress_apply(lambda x: lemmatizing(x))

    print('Joining...')
    tokens = tokens.progress_apply(lambda x: ' '.join(x))

    return tokens

tokens = full_text_preprocessing()

Removing URL and HTML words from text...


100%|███████████████████████████████████████████████████████████████████████████| 2000/2000 [00:00<00:00, 37341.62it/s]


Tokenizing text...


100%|█████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:07<00:00, 252.06it/s]


Lowercasing text...


100%|███████████████████████████████████████████████████████████████████████████| 2000/2000 [00:00<00:00, 15499.64it/s]


Removing punctuation from text...


100%|████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:00<00:00, 9613.89it/s]


Removing stopwords from text...


100%|█████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:02<00:00, 864.17it/s]


Removing the most common words from text...


100%|█████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:07<00:00, 266.85it/s]


Lemmatizing...


100%|█████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:06<00:00, 325.15it/s]


Joining...


100%|███████████████████████████████████████████████████████████████████████████| 2000/2000 [00:00<00:00, 63404.04it/s]


In [5]:
# dataframe after full text preprocessing and get X, y
for_model_df = pd.DataFrame(tokens)
for_model_df['rate'] = full_df['rate']
X, y = for_model_df[['review']], for_model_df['rate']
print(X.shape, y.shape)

(2000, 1) (2000,)


In [6]:
# tf-idf transformer
tf_idf_transformer = TfidfVectorizer(stop_words='english', ngram_range=(1, 5), max_features=150000)

# instance of StratifiedKFold
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# get train and test parts of dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, stratify=y, random_state=42)
                      
X_train_text = tf_idf_transformer.fit_transform(X_train['review'])
X_test_text = tf_idf_transformer.transform(X_test['review'])

In [57]:
# logistic regression
logreg = LogisticRegression()
logreg_param_grid = [{'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']},
                     {'penalty':['none', 'elasticnet', 'l1', 'l2']},
                     {'C':np.logspace(-5, 5, 11)},
                     {'max_iter': [300, 500, 1000, 2000]}]


logreg_grid_search = GridSearchCV(estimator = logreg,  
                                  param_grid = logreg_param_grid,
                                  scoring = 'accuracy',
                                  cv = skf,
                                  verbose=3,
                                  n_jobs=-1)

logreg_grid_search.fit(X_train_text, y_train)

y_logreg_train_pred = logreg_grid_search.predict(X_train_text)
y_logreg_test_pred = logreg_grid_search.predict(X_test_text)

# evaluation of model
print(f'{logreg} Train accuracy score: {accuracy_score(y_true = y_train, y_pred = y_logreg_train_pred)}')
print(f'{logreg} Train classification report:\n {classification_report(y_true = y_train, y_pred = y_logreg_train_pred)}')
print('-' * 70)
print(f'{logreg} Test accuracy score: {accuracy_score(y_true = y_test, y_pred = y_logreg_test_pred)}')
print(f'{logreg} Test classification report:\n {classification_report(y_true = y_test, y_pred = y_logreg_test_pred)}')

Fitting 5 folds for each of 24 candidates, totalling 120 fits
LogisticRegression() Train accuracy score: 1.0
LogisticRegression() Train classification report:
               precision    recall  f1-score   support

         neg       1.00      1.00      1.00       700
         pos       1.00      1.00      1.00       700

    accuracy                           1.00      1400
   macro avg       1.00      1.00      1.00      1400
weighted avg       1.00      1.00      1.00      1400

----------------------------------------------------------------------
LogisticRegression() Test accuracy score: 0.8266666666666667
LogisticRegression() Test classification report:
               precision    recall  f1-score   support

         neg       0.85      0.80      0.82       300
         pos       0.81      0.85      0.83       300

    accuracy                           0.83       600
   macro avg       0.83      0.83      0.83       600
weighted avg       0.83      0.83      0.83       600



In [8]:
# decision tree classifier
dt = DecisionTreeClassifier()
dt_param_grid = [{'criterion': ['gini', 'entropy', 'log_loss']},
                 {'splitter': ['best', 'random']},
                 {'max_depth': np.linspace(3, 7, 5)},
                 {'min_samples_split': np.linspace(2, 5, 4)}]


dt_grid_search = GridSearchCV(estimator = dt,  
                              param_grid = dt_param_grid,
                              scoring = 'accuracy',
                              cv = skf,
                              verbose=3,
                              n_jobs=-1)

dt_grid_search.fit(X_train_text, y_train)

y_dt_train_pred = dt_grid_search.predict(X_train_text)
y_dt_test_pred = dt_grid_search.predict(X_test_text)

# evaluation of model
print(f'{dt} Train accuracy score: {accuracy_score(y_true = y_train, y_pred = y_dt_train_pred)}')
print(f'{dt} Train classification report:\n {classification_report(y_true = y_train, y_pred = y_dt_train_pred)}')
print('-' * 70)
print(f'{dt} Test accuracy score: {accuracy_score(y_true = y_test, y_pred = y_dt_test_pred)}')
print(f'{dt} Test classification report:\n {classification_report(y_true = y_test, y_pred = y_dt_test_pred)}')

Fitting 5 folds for each of 14 candidates, totalling 70 fits
DecisionTreeClassifier() Train accuracy score: 1.0
DecisionTreeClassifier() Train classification report:
               precision    recall  f1-score   support

         neg       1.00      1.00      1.00       700
         pos       1.00      1.00      1.00       700

    accuracy                           1.00      1400
   macro avg       1.00      1.00      1.00      1400
weighted avg       1.00      1.00      1.00      1400

----------------------------------------------------------------------
DecisionTreeClassifier() Test accuracy score: 0.635
DecisionTreeClassifier() Test classification report:
               precision    recall  f1-score   support

         neg       0.63      0.65      0.64       300
         pos       0.64      0.62      0.63       300

    accuracy                           0.64       600
   macro avg       0.64      0.64      0.63       600
weighted avg       0.64      0.64      0.63       600



In [9]:
# random forest classifier
rf = RandomForestClassifier()
rf_param_grid = [{'criterion': ['gini', 'entropy', 'log_loss']},
                 {'max_depth': np.linspace(3, 10, 8)},
                 {'min_samples_split': np.linspace(2, 6, 5)},
                 {'n_estimators': [300, 500, 1000, 2000]}]


rf_grid_search = GridSearchCV(estimator = rf,  
                              param_grid = rf_param_grid,
                              scoring = 'accuracy',
                              cv = skf,
                              verbose=3,
                              n_jobs=-1)

rf_grid_search.fit(X_train_text, y_train)

y_rf_train_pred = rf_grid_search.predict(X_train_text)
y_rf_test_pred = rf_grid_search.predict(X_test_text)

# evaluation of model
print(f'{rf} Train accuracy score: {accuracy_score(y_true = y_train, y_pred = y_rf_train_pred)}')
print(f'{rf} Train classification report:\n {classification_report(y_true = y_train, y_pred = y_rf_train_pred)}')
print('-' * 70)
print(f'{rf} Test accuracy score: {accuracy_score(y_true = y_test, y_pred = y_rf_test_pred)}')
print(f'{rf} Test classification report:\n {classification_report(y_true = y_test, y_pred = y_rf_test_pred)}')

Fitting 5 folds for each of 20 candidates, totalling 100 fits
RandomForestClassifier() Train accuracy score: 1.0
RandomForestClassifier() Train classification report:
               precision    recall  f1-score   support

         neg       1.00      1.00      1.00       700
         pos       1.00      1.00      1.00       700

    accuracy                           1.00      1400
   macro avg       1.00      1.00      1.00      1400
weighted avg       1.00      1.00      1.00      1400

----------------------------------------------------------------------
RandomForestClassifier() Test accuracy score: 0.835
RandomForestClassifier() Test classification report:
               precision    recall  f1-score   support

         neg       0.79      0.92      0.85       300
         pos       0.90      0.75      0.82       300

    accuracy                           0.83       600
   macro avg       0.84      0.83      0.83       600
weighted avg       0.84      0.83      0.83       600



In [10]:
# SVC
svc = SVC()
svc_param_grid = [{'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
                  {'C': np.logspace(-5, 5, 11)},
                  {'degree': np.linspace(3, 10, 8)},
                  {'gamma': ['scale', 'auto']},
                  {'max_iter': [300, 500, 1000, 2000]}]


svc_grid_search = GridSearchCV(estimator = svc,  
                               param_grid = svc_param_grid,
                               scoring = 'accuracy',
                               cv = skf,
                               verbose=3,
                               n_jobs=-1)

svc_grid_search.fit(X_train_text, y_train)

y_svc_train_pred = svc_grid_search.predict(X_train_text)
y_svc_test_pred = svc_grid_search.predict(X_test_text)

# evaluation of model
print(f'{svc} Train accuracy score: {accuracy_score(y_true = y_train, y_pred = y_svc_train_pred)}')
print(f'{svc} Train classification report:\n {classification_report(y_true = y_train, y_pred = y_svc_train_pred)}')
print('-' * 70)
print(f'{svc} Test accuracy score: {accuracy_score(y_true = y_test, y_pred = y_svc_test_pred)}')
print(f'{svc} Test classification report:\n {classification_report(y_true = y_test, y_pred = y_svc_test_pred)}')

Fitting 5 folds for each of 29 candidates, totalling 145 fits
SVC() Train accuracy score: 1.0
SVC() Train classification report:
               precision    recall  f1-score   support

         neg       1.00      1.00      1.00       700
         pos       1.00      1.00      1.00       700

    accuracy                           1.00      1400
   macro avg       1.00      1.00      1.00      1400
weighted avg       1.00      1.00      1.00      1400

----------------------------------------------------------------------
SVC() Test accuracy score: 0.8166666666666667
SVC() Test classification report:
               precision    recall  f1-score   support

         neg       0.84      0.78      0.81       300
         pos       0.80      0.85      0.82       300

    accuracy                           0.82       600
   macro avg       0.82      0.82      0.82       600
weighted avg       0.82      0.82      0.82       600



In [24]:
# XGBCLassifier
xgb = XGBClassifier()
xgb_param_grid = [{'booster': ['gbtree', 'gblinear']},
                  {'max_depth': np.linspace(3, 7, 5)},
                  {'max_leaves': np.linspace(3, 6, 4)},
                  {'n_estimators': [30, 50, 70, 100]},
                  {'sampling_method': ['gradient_based', 'uniform']}]


xgb_grid_search = GridSearchCV(estimator = xgb,  
                               param_grid = xgb_param_grid,
                               scoring = 'accuracy',
                               cv = skf,
                               verbose=3,
                               n_jobs=-1)

xgb_grid_search.fit(X_train_text, y_train.map({'neg': 0, 'pos': 1}))

y_xgb_train_pred = xgb_grid_search.predict(X_train_text)
y_xgb_test_pred = xgb_grid_search.predict(X_test_text)

# evaluation of model
print(f"XGB Train accuracy score: {accuracy_score(y_true = y_train.map({'neg': 0, 'pos': 1}), y_pred = y_xgb_train_pred)}")
print(f"XGB Train classification report:\n {classification_report(y_true = y_train.map({'neg': 0, 'pos': 1}), y_pred = y_xgb_train_pred)}")
print('-' * 70)
print(f"XGB Test accuracy score: {accuracy_score(y_true = y_test.map({'neg': 0, 'pos': 1}), y_pred = y_xgb_test_pred)}")
print(f"XGB Test classification report:\n {classification_report(y_true = y_test.map({'neg': 0, 'pos': 1}), y_pred = y_xgb_test_pred)}")

Fitting 5 folds for each of 17 candidates, totalling 85 fits
[CV 1/5] END ....................booster=gbtree;, score=0.771 total time=   8.1s
[CV 2/5] END ....................booster=gbtree;, score=0.771 total time=   8.2s
[CV 3/5] END ....................booster=gbtree;, score=0.832 total time=   8.2s
[CV 4/5] END ....................booster=gbtree;, score=0.786 total time=   8.0s
[CV 5/5] END ....................booster=gbtree;, score=0.764 total time=   8.1s
[CV 1/5] END ..................booster=gblinear;, score=0.761 total time=   0.1s
[CV 2/5] END ..................booster=gblinear;, score=0.739 total time=   0.1s
[CV 3/5] END ..................booster=gblinear;, score=0.789 total time=   0.1s
[CV 4/5] END ..................booster=gblinear;, score=0.736 total time=   0.1s
[CV 5/5] END ..................booster=gblinear;, score=0.729 total time=   0.1s
[CV 1/5] END .......................max_depth=3.0;, score=nan total time=   0.0s
[CV 2/5] END .......................max_depth=3.

In [52]:
# CatBoostClassifier
eval_dataset = Pool(X_test_text, y_test)
cb = CatBoostClassifier(l2_leaf_reg = 0.9, depth = 7, iterations = 400, eval_metric='Accuracy')
cb.fit(X_train_text, y_train, eval_set=eval_dataset)

y_cb_train_pred = cb.predict(X_train_text)
y_cb_test_pred = cb.predict(X_test_text)

# evaluation of model
print(f'CatBoost Train accuracy score: {accuracy_score(y_true = y_train, y_pred = y_cb_train_pred)}')
print(f'CatBoost Train classification report:\n {classification_report(y_true = y_train, y_pred = y_cb_train_pred)}')
print('-' * 70)
print(f'CatBoost Test accuracy score: {accuracy_score(y_true = y_test, y_pred = y_cb_test_pred)}')
print(f'CatBoost Test classification report:\n {classification_report(y_true = y_test, y_pred = y_cb_test_pred)}')

0:	learn: 0.6735714	test: 0.6566667	best: 0.6566667 (0)	total: 702ms	remaining: 4m 39s
1:	learn: 0.6950000	test: 0.6633333	best: 0.6633333 (1)	total: 1.36s	remaining: 4m 31s
2:	learn: 0.6992857	test: 0.6700000	best: 0.6700000 (2)	total: 2.02s	remaining: 4m 27s
3:	learn: 0.7114286	test: 0.6866667	best: 0.6866667 (3)	total: 2.67s	remaining: 4m 24s
4:	learn: 0.7092857	test: 0.6766667	best: 0.6866667 (3)	total: 3.33s	remaining: 4m 22s
5:	learn: 0.7192857	test: 0.6966667	best: 0.6966667 (5)	total: 3.97s	remaining: 4m 20s
6:	learn: 0.7128571	test: 0.6850000	best: 0.6966667 (5)	total: 4.63s	remaining: 4m 19s
7:	learn: 0.7221429	test: 0.6916667	best: 0.6966667 (5)	total: 5.27s	remaining: 4m 18s
8:	learn: 0.7221429	test: 0.6966667	best: 0.6966667 (5)	total: 5.93s	remaining: 4m 17s
9:	learn: 0.7135714	test: 0.6850000	best: 0.6966667 (5)	total: 6.61s	remaining: 4m 17s
10:	learn: 0.7128571	test: 0.6833333	best: 0.6966667 (5)	total: 7.25s	remaining: 4m 16s
11:	learn: 0.7235714	test: 0.6800000	best: