### 1. Import modules

In [15]:
import os
import pandas as pd
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
# Fetch the NLTK resources this notebook relies on. The captured log below
# shows that packages already present are reported as up to date.
nltk.download('wordnet')  # WordNet corpus: used by WordNetLemmatizer and wn.synsets
nltk.download('stopwords')  # Stop-word lists: used by stopwords.words('english')
nltk.download('punkt')    # For tokenization. NOTE(review): the visible code tokenizes with RegexpTokenizer, so this may be unused - confirm
nltk.download('averaged_perceptron_tagger')  # For POS tagging (nltk.pos_tag)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vanng\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vanng\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vanng\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\vanng\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

### 2. Load data

In [16]:
# Root folder that holds one sub-folder of review files per sentiment class.
train_path = "../datasets/train"
# Sub-folder names to read; the loading loop labels 'neg' as 0 and anything else as 1.
categories = ['neg', 'pos']
# Accumulates (review text, label) tuples before they are turned into a DataFrame.
data = []

def is_git_lfs_file(file_path):
    """
    Tell whether `file_path` holds a Git LFS pointer rather than real content.

    Only the first line is inspected: after surrounding whitespace is
    stripped, it must begin with the Git LFS version marker.
    The file is decoded as UTF-8.
    """
    lfs_marker = "version https://git-lfs.github.com"
    with open(file_path, "r", encoding="utf-8") as handle:
        header = handle.readline()
    return header.strip().startswith(lfs_marker)

# Read every review file under <train_path>/<category>/ and collect
# (text, label) pairs, where the label is 0 for 'neg' and 1 otherwise.
for category in categories:
    category_path = os.path.join(train_path, category)
    # Use a separate name for the numeric label so the folder name held by the
    # loop variable is not overwritten.
    label = 0 if category == "neg" else 1

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order (and therefore the seeded train/test split) reproducible.
    for filename in sorted(os.listdir(category_path)):
        file_path = os.path.join(category_path, filename)

        if not os.path.isfile(file_path):  # skip sub-directories and other non-files
            continue

        if is_git_lfs_file(file_path):  # skip Git LFS pointer files
            continue

        with open(file_path, "r", encoding='utf-8') as file:
            content = file.read().strip()

        data.append((content, label))

df = pd.DataFrame(data, columns=['content', 'category'])

df


Unnamed: 0,content,category
0,Airport '77 starts as a brand new luxury 747 p...,0
1,"I don't know who to blame, the timid writers o...",0
2,This film is one giant pant load. Paul Schrade...,0
3,"The plot for Descent, if it actually can be ca...",0
4,"""Ghost of Dragstrip Hollow"" appears to take pl...",0
...,...,...
1120,At first i didn't think that Ben Affleck could...,1
1121,What would you expect from a film titled 'Surv...,1
1122,This movie isn't as bad as I heard. It was enj...,1
1123,I laughed so hard during this movie my face hu...,1


### 3. Transform data

#### 3.1. Convert data from RAW to Tokens

In [17]:
def convert_tokens(rawtext, verbose=False):
    """
    Turn one raw review string into a list of cleaned, lemmatized tokens.

    Pipeline (each stage feeds the next):
      1. split into runs of word characters (regex ``\\w+``);
      2. (verbose only) show the lowercased tokens;
      3. drop NLTK English stop words (case-insensitive);
      4. convert ALL-CAPS tokens to Title Case;
      5. drop salutations such as 'mr', 'dr' (case-insensitive);
      6-7. POS-tag the tokens and lemmatize each with the matching WordNet tag;
      8. keep only tokens that have more than one WordNet synset;
      9. drop tokens made only of digits.

    Parameters
    ----------
    rawtext : str
        The text to process.
    verbose : bool, default False
        When true, print a short preview after each stage.

    Returns
    -------
    list of str
        The surviving tokens, in their original order. Case is preserved apart
        from the Title-Casing done in step 4.
    """
    # 1. Tokenization
    tokenizer = RegexpTokenizer(r'\w+')
    token_words = tokenizer.tokenize(rawtext)
    if verbose:
        print('Tokens:'+str(token_words[0:10]))

    # 2. Decapitalization
    # The lowercased list is only ever shown in the verbose preview; later
    # stages compare with .lower() instead, so build it only when needed.
    if verbose:
        decap_token_words = [word.lower() for word in token_words]
        print("Decapitalized tokens:" + str(decap_token_words[0:10]))

    # 3. Remove stop words
    stopwords_nltk_en = set(stopwords.words('english'))
    rmsw_token_words = [word for word in token_words if word.lower() not in stopwords_nltk_en]
    if verbose:
        print('Stopwords removed:' + str(rmsw_token_words[0:20]))

    # 4. Remove CAP words (ALL-CAPS tokens become Title Case, others are kept as is)
    rmcap_token_words = [word.title() if word.isupper() else word for word in rmsw_token_words]
    if verbose:
        print('CAPITALIZED removed:' + str(rmcap_token_words[0:20]))

    # 5. Remove salutation
    # Filter the output of step 4. Previously this step read the step-3 list,
    # which silently threw away the capitalisation handling above.
    salutation = {'mr', 'mrs', 'ms', 'dr', 'phd', 'prof', 'rev'}
    rmsalu_token_words = [word for word in rmcap_token_words if word.lower() not in salutation]
    if verbose:
        print('Salutation removed:' + str(rmsalu_token_words[0:20]))

    # 6. Define transfer tag function:
    def transfer_tag(treebank_tag):
        """Map a POS tag to a WordNet POS letter by its first character; default to noun."""
        treebank_tag = treebank_tag.lower()
        if treebank_tag.startswith('j'):
            return "a"
        elif treebank_tag.startswith('v'):
            return "v"
        elif treebank_tag.startswith('r'):
            return 'r'
        # Nouns and every other tag are lemmatized as nouns.
        return 'n'

    # 7. Lemmatization
    # transfer_tag always returns a non-empty POS letter, so every token is
    # lemmatized; the old "no tag" fallback branch could never run.
    wnl = WordNetLemmatizer()
    lemma_words = [
        wnl.lemmatize(word, transfer_tag(tag))
        for word, tag in nltk.pos_tag(rmsalu_token_words)
    ]
    if verbose:
        print('Lemma:' + str(lemma_words[0:10]))

    # 8. English words
    # NOTE(review): "> 1" also drops words that have exactly one WordNet synset.
    # Kept as is to preserve the existing vocabulary; confirm whether "> 0" was intended.
    eng_words = [word for word in lemma_words if len(wn.synsets(word.lower())) > 1]

    # 9 Remove numbers
    rmnb_token_words = [word for word in eng_words if not word.isdigit()]
    if verbose:
        print('Number removed:' + str(rmnb_token_words[0:20]))

    return rmnb_token_words

In [18]:
# Work on a copy so the raw `df` stays untouched.
df_tokenized = df.copy()
n, d = df_tokenized.shape  # number of rows and columns before 'tokens' is added

# Run the cleaning pipeline on every review; each cell of 'tokens' holds a list of strings.
df_tokenized['tokens'] = df_tokenized['content'].map(
    lambda text: convert_tokens(text, verbose=False)
)


df_tokenized.head(10)

Unnamed: 0,content,category,tokens
0,Airport '77 starts as a brand new luxury 747 p...,0,"[start, brand, new, luxury, plane, load, valua..."
1,"I don't know who to blame, the timid writers o...",0,"[know, blame, timid, writer, director, seem, o..."
2,This film is one giant pant load. Paul Schrade...,0,"[film, one, giant, pant, load, Paul, lose, bad..."
3,"The plot for Descent, if it actually can be ca...",0,"[plot, Descent, actually, call, plot, two, not..."
4,"""Ghost of Dragstrip Hollow"" appears to take pl...",0,"[Ghost, Hollow, appear, take, place, era, long..."
5,Summer season is here when the choices in the ...,0,"[Summer, season, choice, cinemas, limited, hot..."
6,Shame on Yash Raj films and Aditya Chopra who ...,0,"[Shame, film, seem, lose, intelligence, year, ..."
7,If this is a 2008 product from one of the bigg...,0,"[product, one, big, production, house, Indian,..."
8,"I had some expectation for the movie, since it...",0,"[expectation, nice, star, cast, return, duo, W..."
9,I had a lot of expectations from this movie an...,0,"[lot, expectation, Film, br, br, Jimmy, operat..."


#### 3.2. TF-IDF

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

# norm=None keeps the raw tf-idf weights (no per-document normalisation).
tfidf_vectorizer = TfidfVectorizer(norm=None)

# The vectorizer expects one string per document, so re-join each token list.
list_contents = [" ".join(tokens) for tokens in df_tokenized["tokens"]]

tfidf_matrix = tfidf_vectorizer.fit_transform(list_contents)

# Pass the feature names directly. Wrapping them in a list makes pandas build
# a one-level MultiIndex, so every column label becomes a tuple instead of a
# plain string. The index is copied from df_tokenized to keep rows aligned.
df_tfidf = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
    index=df_tokenized.index,
)

df_tfidf.head(10)

Unnamed: 0,3d,aback,abandon,abandoned,abandoning,abbey,abduct,abducts,abide,ability,...,zen,zero,zest,zillion,zip,zodiac,zombie,zombies,zone,zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,9.870769,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
from sklearn.model_selection import train_test_split

# Features: the TF-IDF frame. Target: the 0/1 'category' column.
# 80/20 split, stratified on the target so both parts keep the class balance;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_tfidf,
    df_tokenized['category'],
    test_size=0.2,
    random_state=42,
    stratify=df_tokenized['category'],
)

### 3. Apply models

In [22]:
# Logistic Regression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

# Baseline models fitted on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.

# Logistic Regression
log_reg = LogisticRegression(max_iter=500).fit(X_train, y_train)

# Naive Bayes
nb = MultinomialNB().fit(X_train, y_train)

# Random Forest (seeded so the forest is repeatable)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

In [23]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation on the training split; the mean fold score is reported.

# Logistic Regression (the model is a LogisticRegression classifier, so the
# label says so; it previously read "linear regression"):
cv_scores_lr = cross_val_score(log_reg, X_train, y_train, cv=5)  # 5-fold cross-validation
print("Cross-validation Accuracy for Logistic Regression:", cv_scores_lr.mean())

# Naive Bayes:
cv_scores_nb = cross_val_score(nb, X_train, y_train, cv=5)  # 5-fold cross-validation
print("Cross-validation Accuracy for Naive Bayes:", cv_scores_nb.mean())

# Random Forest:
cv_scores_rf = cross_val_score(rf, X_train, y_train, cv=5)  # 5-fold cross-validation
print("Cross-validation Accuracy for Random Forest:", cv_scores_rf.mean())


Cross-validation Accuracy for linear regression: 0.8422222222222222
Cross-validation Accuracy for Naive Bayes: 0.8433333333333334
Cross-validation Accuracy for Random Forest: 0.8044444444444444


<span style="color:yellow">Naive Bayes (MultinomialNB) achieved the highest 5-fold cross-validation accuracy: 0.843</span>

In [24]:
# Quick survey of many classifiers with lazypredict, using default settings.
from lazypredict.Supervised import LazyClassifier
lazy_clf = LazyClassifier()
# The test split is passed in here, so the scores in the printed table are
# presumably computed on X_test / y_test - confirm against lazypredict's docs.
clf_models = lazy_clf.fit(X_train, X_test, y_train, y_test)
# The captured output starts with "(", i.e. the result prints as a tuple whose
# first element is the per-model score table.
print(clf_models)

 97%|█████████▋| 30/31 [01:00<00:01,  1.76s/it]

[LightGBM] [Info] Number of positive: 464, number of negative: 436
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012792 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3795
[LightGBM] [Info] Number of data points in the train set: 900, number of used features: 780
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.515556 -> initscore=0.062242
[LightGBM] [Info] Start training from score 0.062242


100%|██████████| 31/31 [01:01<00:00,  1.97s/it]

(                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
BernoulliNB                        0.85               0.85     0.85      0.85   
NearestCentroid                    0.84               0.84     0.84      0.84   
RandomForestClassifier             0.83               0.83     0.83      0.83   
ExtraTreesClassifier               0.83               0.82     0.82      0.83   
LogisticRegression                 0.81               0.81     0.81      0.81   
RidgeClassifierCV                  0.80               0.80     0.80      0.80   
RidgeClassifier                    0.80               0.80     0.80      0.80   
XGBClassifier                      0.80               0.79     0.79      0.80   
LinearSVC                          0.80               0.79     0.79      0.79   
PassiveAggressiveClassifier        0.80               0.79     0.79      0.79   
NuSVC                      




<span style="color:yellow">In the results table, Naive Bayes (BernoulliNB) achieved the best scores: 0.85 for accuracy, balanced accuracy, ROC AUC, and F1 score.</span>

### 4. Hyperparameter Tuning

#### 4.1. For Logistic Regression

In [25]:
from sklearn.model_selection import GridSearchCV

# Candidate settings for logistic regression.
param_grid_lr = dict(
    C=[0.01, 0.1, 1, 10, 100],      # inverse regularization strength (smaller = stronger)
    solver=['liblinear', 'lbfgs'],  # optimisation algorithms to compare
)

# Exhaustive search over the grid, scored by accuracy with 5-fold cross-validation.
grid_search = GridSearchCV(
    estimator=LogisticRegression(max_iter=500),
    param_grid=param_grid_lr,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated accuracy.
print("Best Parameters:", grid_search.best_params_)
print("Best Validation Accuracy:", grid_search.best_score_)


Best Parameters: {'C': 0.01, 'solver': 'lbfgs'}
Best Validation Accuracy: 0.8477777777777777


#### 4.2. For Naive Bayes

In [26]:
# Candidate values for the additive smoothing parameter of MultinomialNB.
param_grid_nb = dict(alpha=[0.01, 0.1, 1, 10])

# 5-fold cross-validated grid search, scored by accuracy.
grid_search_nb = GridSearchCV(
    estimator=MultinomialNB(),
    param_grid=param_grid_nb,
    cv=5,
    scoring='accuracy',
)
grid_search_nb.fit(X_train, y_train)

print("Best Alpha:", grid_search_nb.best_params_)
print("Best Validation Accuracy:", grid_search_nb.best_score_)

Best Alpha: {'alpha': 10}
Best Validation Accuracy: 0.8677777777777778


#### 4.3. For Random Forest

In [27]:
# Candidate settings for the random forest (3 x 3 x 3 x 3 = 81 combinations).
param_grid_rf = dict(
    n_estimators=[100, 200, 500],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 5],
)

# Grid search scored by accuracy. Note that this search uses 3 folds, whereas
# the logistic-regression and Naive Bayes searches use 5.
grid_search_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid_rf,
    cv=3,
    scoring='accuracy',
)
grid_search_rf.fit(X_train, y_train)

print("Best Parameters:", grid_search_rf.best_params_)
print("Best Validation Accuracy:", grid_search_rf.best_score_)


Best Parameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 200}
Best Validation Accuracy: 0.8177777777777778


### 5. Evaluate Model Performance

In [28]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Tuned estimators selected by the three grid searches.
best_lr = grid_search.best_estimator_
best_nb = grid_search_nb.best_estimator_
best_rf = grid_search_rf.best_estimator_

# Predict the held-out test split with each tuned model.
y_pred_log_reg = best_lr.predict(X_test)
y_pred_nb = best_nb.predict(X_test)
y_pred_rf = best_rf.predict(X_test)

# Logistic Regression report
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_log_reg))
print(classification_report(y_test, y_pred_log_reg))

# Naive Bayes report (the only model for which the confusion matrix is printed)
print("Naïve Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))
print(f"Confusion Matrix for Naive Bayes: \n{confusion_matrix(y_test, y_pred_nb)}")

# Random Forest report
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

Logistic Regression Accuracy: 0.8533333333333334
              precision    recall  f1-score   support

           0       0.88      0.81      0.84       109
           1       0.83      0.90      0.86       116

    accuracy                           0.85       225
   macro avg       0.86      0.85      0.85       225
weighted avg       0.86      0.85      0.85       225

Naïve Bayes Accuracy: 0.8488888888888889
              precision    recall  f1-score   support

           0       0.84      0.84      0.84       109
           1       0.85      0.85      0.85       116

    accuracy                           0.85       225
   macro avg       0.85      0.85      0.85       225
weighted avg       0.85      0.85      0.85       225

Confusion Matrix for Naive Bayes: 
[[92 17]
 [17 99]]
Random Forest Accuracy: 0.8266666666666667
              precision    recall  f1-score   support

           0       0.86      0.76      0.81       109
           1       0.80      0.89      0.84       