# Model training and evaluation

In [46]:
import pandas as pd
import numpy as np
df = pd.read_csv("data/datasetTransformed.csv")
df

Unnamed: 0,Review_Text,Sentiment
0,If you've ever been to Disneyland anywhere you...,0
1,Its been a while since d last time we visit HK...,0
2,Thanks God it wasn t too hot or too humid wh...,0
3,HK Disneyland is a great compact park. Unfortu...,0
4,"the location is not in the city, took around 1...",0
...,...,...
42651,i went to disneyland paris in july 03 and thou...,1
42652,2 adults and 1 child of 11 visited Disneyland ...,1
42653,My eleven year old daughter and myself went to...,1
42654,"This hotel, part of the Disneyland Paris compl...",0


In [45]:
# Imports
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from bs4 import BeautifulSoup
import re


In [None]:
# Cleaning Review_Text column
def text_cleaning(text):
    soup = BeautifulSoup(text, "html.parser")
    text = re.sub(r'\[[^]]*\]', '', soup.get_text())
    pattern = r"[^a-zA-Z0-9\s,']"
    text = re.sub(pattern, '', text)
    stop_words = set(stopwords.words('english'))

    custom_stopwords = {
        'movie', 'film', 'disney', 'disneyland', 'park', 'ride', 
        'attraction', 'character', 'show', 'time', 'day', 'get',
        'go', 'one', 'would', 'could', 'also', 'even', 'us', 'get'
    }
    stop_words.update(custom_stopwords)
    return text

df['Clean_text'] = df['Review_Text'].apply(text_cleaning).tolist()

In [48]:
# Split dataframe into train and test 
X = df['Clean_text']
Y = df['Sentiment']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, shuffle=True)


# Model 1: Logistic Regression
The first model I am going to use for the classification problem is Logistic Regression. This section includes:
- Best Parameter Search
- Model training
- Model Evaluation

In [25]:
# BEST PARAMETER SEARCH
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Define pipeline
parametersTest = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(max_iter=10000))
])

# Params combination
param_grid = {
    'tfidf__max_features': [1000, 5000, 10000],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'clf__C': [0.1, 1, 10, 100]
}

# Instantiate the grid with the complete pipeline
grid_search = GridSearchCV(
    parametersTest, 
    param_grid, 
    cv=5, 
    scoring='accuracy',
    n_jobs=-1 
)

grid_search.fit(X_train, Y_train)

# Best parameters founded
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)



Best parameters: {'clf__C': 1, 'tfidf__max_features': 10000, 'tfidf__ngram_range': (1, 2)}
Best score: 0.7811510935332742


In [57]:
# MODEL TRAINING
model = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2),stop_words='english', max_features=10000)), # Convert text to numbers
    ('clf', LogisticRegression(C=1))        # Apply logistic Regression
])

# Train model
model.fit(X_train, Y_train)

0,1,2
,steps,"[('tfidf', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [59]:
# MODEL EVALUATION
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

Y_pred=model.predict(X_test)

# Instances classified correctly
correctly_classified_instances = sum(Y_pred == Y_test)
print(f"Instances classified correctly: {correctly_classified_instances}")

# Instances classified incorrectly
incorrectly_classified_instances = sum(Y_pred != Y_test)
print(f"Instances classified incorrectly: {incorrectly_classified_instances}")

# Model accuracy
model_accuracy = accuracy_score(Y_test, Y_pred)
print(f"Model accuracy: {model_accuracy}")

# Confusion matrix
confusion_matrix_dt = confusion_matrix(Y_test, Y_pred)
print(f"Confusion matrix:\n{confusion_matrix_dt}")

# Classification report
classification_report_dt = classification_report(Y_test, Y_pred, output_dict=True)
print(f"Classification report:\n{classification_report(Y_test, Y_pred)}")

# FPR - False Positive Rate FPR = FP / (FP + TN)
false_positives = confusion_matrix_dt.sum(axis=0) - np.diag(confusion_matrix_dt)
true_negatives = confusion_matrix_dt.sum() - (false_positives + confusion_matrix_dt.sum(axis=1))
false_positive_rate = false_positives / (false_positives + true_negatives)
print(f"False positive rate for different classes \n[negative positive] \n{false_positive_rate}")

Instances classified correctly: 6622
Instances classified incorrectly: 1910
Model accuracy: 0.7761368963900609
Confusion matrix:
[[2764 1129]
 [ 781 3858]]
Classification report:
              precision    recall  f1-score   support

           0       0.78      0.71      0.74      3893
           1       0.77      0.83      0.80      4639

    accuracy                           0.78      8532
   macro avg       0.78      0.77      0.77      8532
weighted avg       0.78      0.78      0.77      8532

False positive rate for different classes 
[negative positive] 
[0.16835525 0.29000771]


# Model 2: MultinomialNB
This section includes:
- Best Parameter Search
- Model training
- Model Evaluation

In [29]:
from sklearn.naive_bayes import MultinomialNB
# BEST PARAMETER SEARCH
# Define pipeline
parametersTest = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB())
])

# Params combination
param_grid = {
    'tfidf__max_features': [1000, 5000, 10000],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'clf__alpha': [0.1, 0.5, 1.0, 2.0]               # Parámetro de suavizado de NB

}

# Instantiate the grid with the complete pipeline
grid_search = GridSearchCV(
    parametersTest, 
    param_grid, 
    cv=5, 
    scoring='accuracy',
    n_jobs=-1 
)

grid_search.fit(X_train, Y_train)

# Best parameters founded
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Best parameters: {'clf__alpha': 1.0, 'tfidf__max_features': 10000, 'tfidf__ngram_range': (1, 2)}
Best score: 0.7737075308435214


In [None]:
# MODEL TRAINING
model_NB = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2),stop_words='english', max_features=10000)), # Convert text to numbers
    ('clf', MultinomialNB(alpha=1.0))        
])

# Train model
model_NB.fit(X_train, Y_train)

0,1,2
,steps,"[('tfidf', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [39]:
# MODEL EVALUATION
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

Y_pred=model_NB.predict(X_test)

# Instances classified correctly
correctly_classified_instances_NB = sum(Y_pred == Y_test)
print(f"Instances classified correctly: {correctly_classified_instances}")

# Instances classified incorrectly
incorrectly_classified_instances_NB = sum(Y_pred != Y_test)
print(f"Instances classified incorrectly: {incorrectly_classified_instances}")

# Model accuracy
model_accuracy_NB= accuracy_score(Y_test, Y_pred)
print(f"Model accuracy: {model_accuracy_NB}")

# Confusion matrix
confusion_matrix_dt_NB = confusion_matrix(Y_test, Y_pred)
print(f"Confusion matrix:\n{confusion_matrix_dt}")

# Classification report
classification_report_dt_NB = classification_report(Y_test, Y_pred, output_dict=True)
print(f"Classification report:\n{classification_report(Y_test, Y_pred)}")

# FPR - False Positive Rate FPR = FP / (FP + TN)
false_positives = confusion_matrix_dt_NB.sum(axis=0) - np.diag(confusion_matrix_dt_NB)
true_negatives = confusion_matrix_dt_NB.sum() - (false_positives + confusion_matrix_dt_NB.sum(axis=1))
false_positive_rate_NB = false_positives / (false_positives + true_negatives)
print(f"False positive rate for different classes \n[negative positive] \n{false_positive_rate_NB}")

Instances classified correctly: 6702
Instances classified incorrectly: 1830
Model accuracy: 0.7747304266291608
Confusion matrix:
[[2864 1066]
 [ 764 3838]]
Classification report:
              precision    recall  f1-score   support

           0       0.79      0.70      0.74      3930
           1       0.77      0.84      0.80      4602

    accuracy                           0.77      8532
   macro avg       0.78      0.77      0.77      8532
weighted avg       0.78      0.77      0.77      8532

False positive rate for different classes 
[negative positive] 
[0.16232073 0.29898219]


# Model 3: RandomForestClassifier
This section includes:
- Best Parameter Search
- Model training
- Model Evaluation

In [34]:
from sklearn.ensemble import RandomForestClassifier

# BEST PARAMETER SEARCH
# Define pipeline
parametersTest = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', RandomForestClassifier(random_state=42))
])

# Params combination
param_grid = {
    'tfidf__max_features': [5000, 10000],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'clf__n_estimators': [100, 200],        
    'clf__max_depth': [None, 20, 30],       
    'clf__min_samples_split': [2, 5],          
    'clf__min_samples_leaf': [1, 2]           
}

# Instantiate the grid with the complete pipeline
grid_search = GridSearchCV(
    parametersTest, 
    param_grid, 
    cv=3, 
    scoring='accuracy'
)

grid_search.fit(X_train, Y_train)

# Best parameters founded
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

KeyboardInterrupt: 

In [36]:
# MODEL TRAINING
model_RF = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 4), max_features=1000)), # Convert text to numbers
    ('clf', RandomForestClassifier(
        random_state=42,
        n_estimators=100,
        max_depth=20,
        min_samples_split=5
    ))      
])

# Train model
model_RF.fit(X_train, Y_train)

0,1,2
,steps,"[('tfidf', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,20
,min_samples_split,5
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [37]:
# MODEL EVALUATION
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

Y_pred=model_RF.predict(X_test)

# Instances classified correctly
correctly_classified_instances_RF = sum(Y_pred == Y_test)
print(f"Instances classified correctly: {correctly_classified_instances_RF}")

# Instances classified incorrectly
incorrectly_classified_instances_RF = sum(Y_pred != Y_test)
print(f"Instances classified incorrectly: {incorrectly_classified_instances_RF}")

# Model accuracy
model_accuracy_RF = accuracy_score(Y_test, Y_pred)
print(f"Model accuracy: {model_accuracy_RF}")

# Confusion matrix
confusion_matrix_dt_RF = confusion_matrix(Y_test, Y_pred)
print(f"Confusion matrix:\n{confusion_matrix_dt_RF}")

# Classification report
classification_report_dt_RF = classification_report(Y_test, Y_pred, output_dict=True)
print(f"Classification report:\n{classification_report(Y_test, Y_pred)}")

# FPR - False Positive Rate FPR = FP / (FP + TN)
false_positives = confusion_matrix_dt_RF.sum(axis=0) - np.diag(confusion_matrix_dt_RF)
true_negatives = confusion_matrix_dt_RF.sum() - (false_positives + confusion_matrix_dt_RF.sum(axis=1))
false_positive_rate_RF = false_positives / (false_positives + true_negatives)
print(f"False positive rate for different classes \n[negative positive] \n{false_positive_rate_RF}")

Instances classified correctly: 6137
Instances classified incorrectly: 2395
Model accuracy: 0.7192920768870136
Confusion matrix:
[[2380 1550]
 [ 845 3757]]
Classification report:
              precision    recall  f1-score   support

           0       0.74      0.61      0.67      3930
           1       0.71      0.82      0.76      4602

    accuracy                           0.72      8532
   macro avg       0.72      0.71      0.71      8532
weighted avg       0.72      0.72      0.72      8532

False positive rate for different classes 
[negative positive] 
[0.18361582 0.39440204]


# Comparison


In [40]:
# Compare all models
models = {
    'Logistic Regression': model_accuracy,
    'Multinomial NB': model_accuracy_NB, 
    'Random Forest': model_accuracy_RF
}

print("MODEL COMPARISON:")
for model_name, acc in models.items():
    print(f"{model_name}: {acc:.4f}")

# Find the best model
best_model_name = max(models, key=models.get)
print(f"\n🎉 BEST MODEL: {best_model_name}")

MODEL COMPARISON:
Logistic Regression: 0.7855
Multinomial NB: 0.7747
Random Forest: 0.7193

🎉 BEST MODEL: Logistic Regression


# Save models

In [43]:
import joblib

joblib.dump(model, 'models/model_LR.joblib')
joblib.dump(model_NB, 'models/model_NB.joblib')
joblib.dump(model_RF, 'models/model_RF.joblib')

['models/model_RF.joblib']