In [1]:
# Imports
import kagglehub
import pandas as pd
import numpy as np
import kagglehub
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from kagglehub import KaggleDatasetAdapter

In [2]:
#Load the dataset
#df = pd.read_csv("drive/MyDrive/McDonald_s_Reviews.csv", encoding="latin1")
df = pd.read_csv("McDonald_s_Reviews.csv", encoding="latin1")
print(df.head())
print("\nShape: ", df.shape)

df.columns = df.columns.str.strip()
print("Missing values before handling:")
print(df.isnull().sum())

# Handle missing values numerical columns with median
df['latitude'].fillna(df['latitude'].mean(), inplace=True)
df['longitude'].fillna(df['longitude'].mean(), inplace=True)

print("\nMissing values after handling:")
print(df.isnull().sum())

   reviewer_id  store_name              category  \
0            1  McDonald's  Fast food restaurant   
1            2  McDonald's  Fast food restaurant   
2            3  McDonald's  Fast food restaurant   
3            4  McDonald's  Fast food restaurant   
4            5  McDonald's  Fast food restaurant   

                                       store_address  latitude   longitude  \
0  13749 US-183 Hwy, Austin, TX 78750, United States  30.460718 -97.792874   
1  13749 US-183 Hwy, Austin, TX 78750, United States  30.460718 -97.792874   
2  13749 US-183 Hwy, Austin, TX 78750, United States  30.460718 -97.792874   
3  13749 US-183 Hwy, Austin, TX 78750, United States  30.460718 -97.792874   
4  13749 US-183 Hwy, Austin, TX 78750, United States  30.460718 -97.792874   

  rating_count   review_time  \
0        1,240  3 months ago   
1        1,240    5 days ago   
2        1,240    5 days ago   
3        1,240   a month ago   
4        1,240  2 months ago   

                         

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['latitude'].fillna(df['latitude'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['longitude'].fillna(df['longitude'].mean(), inplace=True)


In [3]:
# Removing unnecessary features
df = df.drop("reviewer_id", axis=1)
df = df.drop("store_name", axis=1)
df = df.drop("category", axis=1)
df.head()

Unnamed: 0,store_address,latitude,longitude,rating_count,review_time,review,rating
0,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,3 months ago,Why does it look like someone spit on my food?...,1 star
1,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,5 days ago,It'd McDonalds. It is what it is as far as the...,4 stars
2,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,5 days ago,Made a mobile order got to the speaker and che...,1 star
3,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,a month ago,My mc. Crispy chicken sandwich was ï¿½ï¿½ï¿½ï¿...,5 stars
4,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,2 months ago,"I repeat my order 3 times in the drive thru, a...",1 star


In [4]:
# Encode rating column
df['rating'] = df['rating'].astype(str).str.extract(r'(\d)').astype(int)
df['rating'] = df['rating'].map(lambda x: -1 if x <= 2 else (0 if x == 3 else 1))

#Encode rating_count
df['rating_count'] = df['rating_count'].str.replace(',', '').astype(int)

#Encode store_address
from sklearn.preprocessing import LabelEncoder

#Encode store_address
le = LabelEncoder()
df['store_address'] = le.fit_transform(df['store_address'])

#Encode review_time
import re

def convert_review_time(time_str):
    if pd.isna(time_str):  # Handle missing values
        return None

    match = re.search(r'\d+', str(time_str))  # Ensure it's a string
    if not match:
        return None  # If no number is found, return None

    num = int(match.group())  # Extract number

    if "day" in time_str:
        return num
    elif "month" in time_str:
        return num * 30
    elif "year" in time_str:
        return num * 365
    else:
        return None  # Catch unexpected cases

df['review_time'] = df['review_time'].apply(convert_review_time)
df.head()

Unnamed: 0,store_address,latitude,longitude,rating_count,review_time,review,rating
0,8,30.460718,-97.792874,1240,90.0,Why does it look like someone spit on my food?...,-1
1,8,30.460718,-97.792874,1240,5.0,It'd McDonalds. It is what it is as far as the...,1
2,8,30.460718,-97.792874,1240,5.0,Made a mobile order got to the speaker and che...,-1
3,8,30.460718,-97.792874,1240,,My mc. Crispy chicken sandwich was ï¿½ï¿½ï¿½ï¿...,1
4,8,30.460718,-97.792874,1240,60.0,"I repeat my order 3 times in the drive thru, a...",-1


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=100)  # Limit to 100 features

# Transform the text column
tfidf_matrix = vectorizer.fit_transform(df['review'].fillna(""))  # Convert NaN to empty strings

# Convert sparse matrix to DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Merge with original dataset (drop original review column)
df = pd.concat([df.drop(columns=['review']), tfidf_df], axis=1)
df.head()

Unnamed: 0,store_address,latitude,longitude,rating_count,review_time,rating,10,20,area,ask,...,waiting,want,way,went,window,work,worst,wrong,½s,½ï
0,8,30.460718,-97.792874,1240,90.0,-1,0.0,0.0,0.0,0.0,...,0.0,0.527387,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,8,30.460718,-97.792874,1240,5.0,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,8,30.460718,-97.792874,1240,5.0,-1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.048056,0.0,0.053121,0.0,0.0,0.0,0.986156
3,8,30.460718,-97.792874,1240,,1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.997586
4,8,30.460718,-97.792874,1240,60.0,-1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.283198,0.0,0.263631,0.0,0.0


In [6]:
# Removes review column and adds these columns
tfidf_df.columns

Index(['10', '20', 'area', 'ask', 'asked', 'bad', 'best', 'better', 'big',
       'breakfast', 'burger', 'busy', 'came', 'chicken', 'clean', 'coffee',
       'cold', 'come', 'customer', 'customers', 'day', 'did', 'didn', 'dirty',
       'don', 'drive', 'eat', 'employees', 'excellent', 'experience', 'fast',
       'food', 'fresh', 'friendly', 'fries', 'gave', 'going', 'good', 'got',
       'great', 'homeless', 'horrible', 'hot', 'ice', 'inside', 'just', 'kids',
       'know', 'like', 'line', 'location', 'long', 'lot', 'love', 'make',
       'manager', 'mcdonald', 'mcdonalds', 'meal', 'minutes', 'need',
       'neutral', 'new', 'nice', 'night', 'open', 'order', 'ordered', 'orders',
       'people', 'place', 'poor', 'quick', 'really', 'restaurant', 'right',
       'rude', 'said', 'say', 'service', 'slow', 'staff', 'terrible', 'time',
       'times', 'told', 'took', 've', 'wait', 'waited', 'waiting', 'want',
       'way', 'went', 'window', 'work', 'worst', 'wrong', '½s', '½ï'],
      dtype

In [7]:
# Drop columns that are completely empty
X = df.drop(columns=['rating'])
X = X.dropna(axis=1, how='all')  # Drop columns where all values are NaN

# Apply imputation
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Convert back to DataFrame
df_imputed = pd.DataFrame(X_imputed, columns=X.columns)

# SelectKBest for feature selection
selector = SelectKBest(score_func=f_classif, k=99)
X_selected = selector.fit_transform(df_imputed, df['rating'])
selected_features = X.columns[selector.get_support()]

# Convert selected features into DataFrame
df_selected = pd.DataFrame(X_selected, columns=selected_features)

print(selected_features)


Index(['store_address', 'latitude', 'longitude', 'rating_count', 'review_time',
       '10', '20', 'area', 'ask', 'asked', 'bad', 'best', 'better',
       'breakfast', 'burger', 'busy', 'came', 'chicken', 'clean', 'cold',
       'come', 'customer', 'customers', 'day', 'did', 'didn', 'dirty', 'don',
       'drive', 'eat', 'employees', 'excellent', 'experience', 'fast', 'food',
       'fresh', 'friendly', 'fries', 'gave', 'going', 'good', 'got', 'great',
       'homeless', 'horrible', 'hot', 'ice', 'inside', 'just', 'kids', 'know',
       'like', 'line', 'location', 'long', 'lot', 'love', 'make', 'manager',
       'mcdonald', 'meal', 'minutes', 'need', 'neutral', 'nice', 'night',
       'open', 'order', 'ordered', 'orders', 'people', 'place', 'poor',
       'quick', 'right', 'rude', 'said', 'say', 'service', 'slow', 'staff',
       'terrible', 'time', 'times', 'told', 'took', 've', 'wait', 'waited',
       'waiting', 'want', 'way', 'went', 'window', 'work', 'worst', 'wrong',
       '½s',

In [8]:
# Define feature matrix (X) and target variable (y)
X = df_selected  # Features from previous feature selection step
y = df['rating']  # Target variable

# Apply RandomOverSampler to balance dataset
oversampler = RandomOverSampler(random_state=42)
X_balanced, y_balanced = oversampler.fit_resample(X, y)

# Convert back to DataFrame
df_balanced = pd.DataFrame(X_balanced, columns=X.columns)
df_balanced['rating'] = y_balanced  # Add the target column back

print(df_balanced['rating'].value_counts())  # Check class distribution


rating
-1    16061
 1    16061
 0    16061
Name: count, dtype: int64


In [9]:
# Split the Data
X = df_selected
y = df['rating']
X_train, X_temp, y_train, y_temp = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Re-apply imputation AFTER splitting
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_val = imputer.transform(X_val)
X_test = imputer.transform(X_test)

print(f"Train set: {X_train.shape}, {y_train.shape}")
print(f"Validation set: {X_val.shape}, {y_val.shape}")
print(f"Test set: {X_test.shape}, {y_test.shape}")

Train set: (33728, 99), (33728,)
Validation set: (7227, 99), (7227,)
Test set: (7228, 99), (7228,)


In [10]:
rf_model = RandomForestClassifier(n_estimators=10, random_state=42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
print(f"Random Forest")

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix,
    roc_auc_score, matthews_corrcoef, cohen_kappa_score, classification_report
)
from sklearn.preprocessing import label_binarize
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Compute Performance Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
mcc = matthews_corrcoef(y_test, y_pred)
cohen_kappa = cohen_kappa_score(y_test, y_pred)

# Multi-class ROC-AUC Score (Fixed)
y_test_bin = label_binarize(y_test, classes=np.unique(y_test))  # Convert to one-hot encoding
roc_auc = roc_auc_score(y_test_bin, rf_model.predict_proba(X_test), multi_class="ovr")

# 4️⃣ Print Metrics
print("\n🔹 Classification Report:\n", classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"MCC: {mcc:.4f}")
print(f"Cohen's Kappa: {cohen_kappa:.4f}")
print(f"ROC-AUC Score: {roc_auc:.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Random Forest

🔹 Classification Report:
               precision    recall  f1-score   support

          -1       0.84      0.86      0.85      2388
           0       0.84      0.92      0.88      2464
           1       0.87      0.76      0.81      2376

    accuracy                           0.85      7228
   macro avg       0.85      0.85      0.84      7228
weighted avg       0.85      0.85      0.84      7228

Accuracy: 0.8462
Precision: 0.8472
Recall: 0.8462
F1-score: 0.8450
MCC: 0.7706
Cohen's Kappa: 0.7691
ROC-AUC Score: 0.9524

Confusion Matrix:
[[2051  181  156]
 [  84 2255  125]
 [ 312  254 1810]]


In [11]:
from sklearn.linear_model import LogisticRegression

logreg_model = LogisticRegression(max_iter=1000, random_state=42)
logreg_model.fit(X_train, y_train)
y_pred = logreg_model.predict(X_test)

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predictions
y_pred = logreg_model.predict(X_test)
print(f"Logistic Regression")

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

precision = precision_score(y_test, y_pred, average='weighted')
print(f"Precision: {precision:.4f}")

mcc = matthews_corrcoef(y_test, y_pred)
print(f"MCC: {mcc:.4f}")

cohen_kappa = cohen_kappa_score(y_test, y_pred)
print(f"Kappa score: {cohen_kappa:.4f}")

# Classification Report (Precision, Recall, F1-score)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Logistic Regression
Accuracy: 0.6346
Precision: 0.6402
MCC: 0.4571
Kappa score: 0.4522

Classification Report:
              precision    recall  f1-score   support

          -1       0.60      0.78      0.68      2388
           0       0.58      0.50      0.54      2464
           1       0.74      0.63      0.68      2376

    accuracy                           0.63      7228
   macro avg       0.64      0.64      0.63      7228
weighted avg       0.64      0.63      0.63      7228


Confusion Matrix:
[[1866  426   96]
 [ 792 1234  438]
 [ 436  453 1487]]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
# Random Forest (Model 42 – Best)
{
    'n_estimators': 100,
    'max_depth': 20,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'bootstrap': False
}


{'n_estimators': 100,
 'max_depth': 20,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'bootstrap': False}

In [13]:
# Logistic Regression (Top 1)
{
    'solver': 'lbfgs',
    'penalty': 'l2',
    'max_iter': 500,
    'C': 10
}


{'solver': 'lbfgs', 'penalty': 'l2', 'max_iter': 500, 'C': 10}

In [14]:

# Apply PCA for dimensionality reduction
from sklearn.decomposition import PCA

# Apply PCA to reduce dimensions while keeping 95% variance
pca = PCA(n_components=30)  # Adjust components as needed
X_train_pca = pca.fit_transform(X_train)
X_val_pca = pca.transform(X_val)
X_test_pca = pca.transform(X_test)

print(f"Original Features: {X_train.shape[1]}")
print(f"Reduced Features (PCA): {X_train_pca.shape[1]}")



Original Features: 99
Reduced Features (PCA): 30


In [15]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, matthews_corrcoef, cohen_kappa_score, 
    classification_report, confusion_matrix
)
import joblib

# 🧪 Optional: If not already done, apply PCA and load the reduced data
# X_train_pca, X_val_pca, y_train, y_val should already be defined

# ✅ 1. Best Random Forest Model
best_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=False,
    random_state=42
)

# ✅ 2. Best Logistic Regression Model
best_lr = LogisticRegression(
    solver='lbfgs',
    penalty='l2',
    max_iter=500,
    C=10,
    random_state=42
)

# ✅ 3. Voting Ensemble using hard voting
ensemble_model = VotingClassifier(
    estimators=[
        ('RandomForest', best_rf),
        ('LogisticRegression', best_lr)
    ],
    voting='hard'
)

# 🚀 Fit the ensemble
ensemble_model.fit(X_train_pca, y_train)

# 📊 Predict and evaluate
y_pred = ensemble_model.predict(X_val_pca)

print("\n📢 Ensemble Model Evaluation:")
print(f"✅ Accuracy: {accuracy_score(y_val, y_pred):.4f}")
print(f"✅ MCC: {matthews_corrcoef(y_val, y_pred):.4f}")
print(f"✅ Kappa Score: {cohen_kappa_score(y_val, y_pred):.4f}")
print("\n✅ Classification Report:")
print(classification_report(y_val, y_pred))

print("\n🧮 Confusion Matrix:")
print(confusion_matrix(y_val, y_pred))

# 💾 Save the ensemble model
joblib.dump(ensemble_model, "rf_lr_ensemble_model.pkl")


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



📢 Ensemble Model Evaluation:
✅ Accuracy: 0.7393
✅ MCC: 0.6262
✅ Kappa Score: 0.6087

✅ Classification Report:
              precision    recall  f1-score   support

          -1       0.62      0.91      0.74      2424
           0       0.80      0.64      0.71      2374
           1       0.90      0.66      0.76      2429

    accuracy                           0.74      7227
   macro avg       0.78      0.74      0.74      7227
weighted avg       0.78      0.74      0.74      7227


🧮 Confusion Matrix:
[[2213  118   93]
 [ 768 1520   86]
 [ 568  251 1610]]


['rf_lr_ensemble_model.pkl']

In [16]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, matthews_corrcoef, cohen_kappa_score, classification_report, confusion_matrix

# Random Forest: Best Parameters from Your Search (Model 41)
best_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=False,
    random_state=42
)

# Logistic Regression: Best Parameters from Your Search
best_lr = LogisticRegression(
    solver='lbfgs',
    penalty='l2',
    max_iter=500,
    C=10,
    multi_class='auto',
    random_state=42
)

# Decision Tree with default or basic tuning (can be tuned later if needed)
dt_clf = DecisionTreeClassifier(random_state=42)

# ✅ 1. Voting Classifier (Soft Voting)
voting_clf = VotingClassifier(
    estimators=[
        ('rf', best_rf),
        ('lr', best_lr),
        ('dt', dt_clf)
    ],
    voting='soft'
)
# ✅ 2. Bagging Classifier (with Decision Tree)
bagging_clf = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=50,
    max_samples=0.8,
    max_features=0.8,
    bootstrap=True,
    random_state=42
)

# --- Training and Evaluation Function ---
def evaluate_model(model, name, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    print(f"\n✅ Evaluation: {name}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"MCC: {matthews_corrcoef(y_test, y_pred):.4f}")
    print(f"Kappa Score: {cohen_kappa_score(y_test, y_pred):.4f}")
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# --- Run All Models ---
models = [
    (voting_clf, "Voting Classifier (RF + LR + DT)"),

]

import joblib

for model, name in models:
    evaluate_model(model, name, X_train, y_train, X_test, y_test)
    
    # Save model
    filename = name.replace(" ", "_").replace("(", "").replace(")", "").replace("+", "plus") + ".pkl"
    joblib.dump(model, filename)
    print(f"✅ Saved model to: {filename}")



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



✅ Evaluation: Voting Classifier (RF + LR + DT)
Accuracy: 0.8307
MCC: 0.7480
Kappa Score: 0.7457
Classification Report:
               precision    recall  f1-score   support

          -1       0.84      0.82      0.83      2388
           0       0.80      0.92      0.86      2464
           1       0.85      0.75      0.80      2376

    accuracy                           0.83      7228
   macro avg       0.83      0.83      0.83      7228
weighted avg       0.83      0.83      0.83      7228

Confusion Matrix:
 [[1958  236  194]
 [  82 2266  116]
 [ 279  317 1780]]
✅ Saved model to: Voting_Classifier_RF_plus_LR_plus_DT.pkl


In [17]:
from sklearn.ensemble import (
    RandomForestClassifier,
    VotingClassifier,
    BaggingClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier
)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    accuracy_score,
    matthews_corrcoef,
    cohen_kappa_score,
    classification_report,
    confusion_matrix
)
from sklearn.neighbors import KNeighborsClassifier
import joblib
import numpy as np

# ✅ Best Individual Models
best_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=False,
    random_state=42
)

best_lr = LogisticRegression(
    solver='lbfgs',
    penalty='l2',
    max_iter=500,
    C=10,
    multi_class='auto',
    random_state=42
)

best_knn = KNeighborsClassifier(
    n_neighbors=3,
    weights='distance',
    algorithm='kd_tree',
    p=1,
    leaf_size=20
)

# ✅ 1. Voting Classifier (Soft Voting)
voting_clf = VotingClassifier(
    estimators=[
        ('rf', best_rf),
        ('lr', best_lr),
        ('knn', best_knn)
    ],
    voting='soft'
)

# ✅ 2. Bagging Classifier
bagging_clf = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=50,
    max_samples=0.8,
    max_features=0.8,
    bootstrap=True,
    random_state=42
)

# ✅ 3. AdaBoost Classifier
adaboost_clf = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1),
    n_estimators=50,
    learning_rate=1.0,
    random_state=42
)

# ✅ 4. Gradient Boosting Classifier
gboost_clf = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)

# ✅ 5. Super Ensemble (Combines All Above Ensembles)
super_ensemble = VotingClassifier(
    estimators=[
        ('vote', voting_clf),
        ('bagging', bagging_clf),
        ('adaboost', adaboost_clf),
        ('gboost', gboost_clf)
    ],
    voting='soft'
)


# ✅ Evaluation Function
def evaluate_model(model, name, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(f"\n🔍 Evaluation: {name}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"MCC: {matthews_corrcoef(y_test, y_pred):.4f}")
    print(f"Kappa Score: {cohen_kappa_score(y_test, y_pred):.4f}")
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

    # Save model
    filename = name.replace(" ", "_").replace("(", "").replace(")", "").replace("+", "plus") + ".pkl"
    joblib.dump(model, filename)
    print(f"✅ Model saved as: {filename}")

# ✅ Train and Evaluate All Models
models = [
    (voting_clf, "Voting Classifier (RF + LR + KNN)"),
    (bagging_clf, "Bagging Classifier (DT)"),
    (adaboost_clf, "AdaBoost Classifier"),
    (gboost_clf, "Gradient Boosting Classifier"),
    (super_ensemble, "Super Voting Classifier (All Ensembles Combined)")
]

# 🔁 Run all models (Assumes X_train, y_train, X_test, y_test are already defined)
for model, name in models:
    evaluate_model(model, name, X_train, y_train, X_test, y_test)

# ✅ Optional: Manual Averaging of Predicted Probabilities
voting_clf.fit(X_train, y_train)
bagging_clf.fit(X_train, y_train)
adaboost_clf.fit(X_train, y_train)
gboost_clf.fit(X_train, y_train)

voting_proba = voting_clf.predict_proba(X_test)
bagging_proba = bagging_clf.predict_proba(X_test)
adaboost_proba = adaboost_clf.predict_proba(X_test)
gboost_proba = gboost_clf.predict_proba(X_test)

avg_proba = (voting_proba + bagging_proba + adaboost_proba + gboost_proba) / 4
y_pred_avg = np.argmax(avg_proba, axis=1)

print("\n🧠 Evaluation: Custom Averaged Ensemble (Manual Soft Voting)")
print("Accuracy:", accuracy_score(y_test, y_pred_avg))
print("MCC:", matthews_corrcoef(y_test, y_pred_avg))
print("Kappa Score:", cohen_kappa_score(y_test, y_pred_avg))
print("Classification Report:\n", classification_report(y_test, y_pred_avg))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_avg))


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



🔍 Evaluation: Voting Classifier (RF + LR + KNN)
Accuracy: 0.8087
MCC: 0.7152
Kappa Score: 0.7127
Classification Report:
               precision    recall  f1-score   support

          -1       0.84      0.79      0.81      2388
           0       0.77      0.90      0.83      2464
           1       0.83      0.73      0.78      2376

    accuracy                           0.81      7228
   macro avg       0.81      0.81      0.81      7228
weighted avg       0.81      0.81      0.81      7228

Confusion Matrix:
 [[1897  291  200]
 [  95 2209  160]
 [ 277  360 1739]]
✅ Model saved as: Voting_Classifier_RF_plus_LR_plus_KNN.pkl

🔍 Evaluation: Bagging Classifier (DT)
Accuracy: 0.8554
MCC: 0.7841
Kappa Score: 0.7830
Classification Report:
               precision    recall  f1-score   support

          -1       0.86      0.86      0.86      2388
           0       0.84      0.92      0.88      2464
           1       0.86      0.79      0.82      2376

    accuracy                     

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



🔍 Evaluation: Super Voting Classifier (All Ensembles Combined)
Accuracy: 0.8493
MCC: 0.7758
Kappa Score: 0.7738
Classification Report:
               precision    recall  f1-score   support

          -1       0.86      0.86      0.86      2388
           0       0.82      0.92      0.87      2464
           1       0.88      0.77      0.82      2376

    accuracy                           0.85      7228
   macro avg       0.85      0.85      0.85      7228
weighted avg       0.85      0.85      0.85      7228

Confusion Matrix:
 [[2051  212  125]
 [  82 2262  120]
 [ 262  288 1826]]
✅ Model saved as: Super_Voting_Classifier_All_Ensembles_Combined.pkl


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



🧠 Evaluation: Custom Averaged Ensemble (Manual Soft Voting)
Accuracy: 0.05118981737686774
MCC: -0.2820566717592112
Kappa Score: -0.24608810334550424
Classification Report:
               precision    recall  f1-score   support

          -1       0.00      0.00      0.00      2388
           0       0.03      0.03      0.03      2464
           1       0.10      0.12      0.11      2376
           2       0.00      0.00      0.00         0

    accuracy                           0.05      7228
   macro avg       0.03      0.04      0.04      7228
weighted avg       0.05      0.05      0.05      7228

Confusion Matrix:
 [[   0 2051  212  125]
 [   0   82 2262  120]
 [   0  262  288 1826]
 [   0    0    0    0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [19]:
import pandas as pd
import numpy as np
import joblib
from sklearn.decomposition import PCA
from sklearn.ensemble import (
    RandomForestClassifier, VotingClassifier, BaggingClassifier,
    AdaBoostClassifier, GradientBoostingClassifier
)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import (
    accuracy_score, matthews_corrcoef, cohen_kappa_score,
    classification_report, confusion_matrix
)
from sklearn.impute import SimpleImputer

# 📌 Assume df_selected and df['rating'] are already defined
X = df_selected
y = df['rating']

# 🧪 Train/Val/Test Split
X_train_full, X_temp, y_train_full, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# 🔄 Impute missing values
imputer = SimpleImputer(strategy='mean')
X_train_full = imputer.fit_transform(X_train_full)
X_val = imputer.transform(X_val)
X_test = imputer.transform(X_test)

# 🎯 PCA: Fixed number of components (or you can use n_components=0.95 for 95% variance)
pca = PCA(n_components=30)
X_train_pca = pca.fit_transform(X_train_full)
X_val_pca = pca.transform(X_val)
X_test_pca = pca.transform(X_test)

# ✅ Base Models
best_rf = RandomForestClassifier(n_estimators=100, max_depth=20, min_samples_split=2,
                                 min_samples_leaf=1, bootstrap=False, random_state=42)
best_lr = LogisticRegression(solver='lbfgs', penalty='l2', max_iter=500, C=10,
                             multi_class='auto', random_state=42)
best_knn = KNeighborsClassifier(n_neighbors=3, weights='distance',
                                algorithm='kd_tree', p=1, leaf_size=20)
dt_clf = DecisionTreeClassifier(random_state=42)

# ✅ Ensemble Models
voting_clf = VotingClassifier(estimators=[
    ('rf', best_rf),
    ('lr', best_lr),
    ('knn', best_knn)
], voting='soft')

bagging_clf = BaggingClassifier(estimator=dt_clf, n_estimators=50,
                                max_samples=0.8, max_features=0.8,
                                bootstrap=True, random_state=42)

adaboost_clf = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=1),
                                  n_estimators=50, learning_rate=1.0, random_state=42)

gboost_clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,
                                        max_depth=3, random_state=42)

# 🧠 Results table
results = []

# 📊 Evaluation Function with Tabulation
def evaluate_model(model, name, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)

    train_acc = model.score(X_train, y_train)
    test_acc = model.score(X_test, y_test)

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')
    cv_acc = np.mean(cv_scores)

    y_pred = model.predict(X_test)

    mcc = matthews_corrcoef(y_test, y_pred)
    kappa = cohen_kappa_score(y_test, y_pred)

    print(f"\n🔍 Evaluation: {name}")
    print(f"Training Accuracy: {train_acc:.4f}")
    print(f"Testing Accuracy: {test_acc:.4f}")
    print(f"Cross-Validation Accuracy (5-fold): {cv_acc:.4f}")
    print(f"MCC: {mcc:.4f}")
    print(f"Kappa Score: {kappa:.4f}")
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

    # Save model
    filename = name.replace(" ", "_").replace("(", "").replace(")", "").replace("+", "plus") + ".pkl"
    joblib.dump(model, filename)
    print(f"✅ Model saved as: {filename}")

    # Append results
    results.append({
        'Model': name,
        'Train Acc': round(train_acc, 4),
        'Test Acc': round(test_acc, 4),
        'CV Acc (5-fold)': round(cv_acc, 4),
        'MCC': round(mcc, 4),
        'Kappa': round(kappa, 4)
    })

# 🚀 Train & Evaluate Supported Models
models = [
    (voting_clf, "Voting Classifier (RF + LR + KNN)"),
    (bagging_clf, "Bagging Classifier (DT)"),
    (adaboost_clf, "AdaBoost Classifier"),
    (gboost_clf, "Gradient Boosting Classifier")
]

for model, name in models:
    evaluate_model(model, name, X_train_pca, y_train_full, X_test_pca, y_test)

# 🧪 Manual Soft Voting (Custom Averaged Probabilities)
voting_clf.fit(X_train_pca, y_train_full)
bagging_clf.fit(X_train_pca, y_train_full)
adaboost_clf.fit(X_train_pca, y_train_full)
gboost_clf.fit(X_train_pca, y_train_full)

voting_proba = voting_clf.predict_proba(X_test_pca)
bagging_proba = bagging_clf.predict_proba(X_test_pca)
adaboost_proba = adaboost_clf.predict_proba(X_test_pca)
gboost_proba = gboost_clf.predict_proba(X_test_pca)

avg_proba = (voting_proba + bagging_proba + adaboost_proba + gboost_proba) / 4
y_pred_avg = np.argmax(avg_proba, axis=1)

manual_acc = accuracy_score(y_test, y_pred_avg)
manual_mcc = matthews_corrcoef(y_test, y_pred_avg)
manual_kappa = cohen_kappa_score(y_test, y_pred_avg)

print("\n🧠 Evaluation: Custom Averaged Ensemble (Manual Soft Voting)")
print(f"Accuracy: {manual_acc:.4f}")
print(f"MCC: {manual_mcc:.4f}")
print(f"Kappa Score: {manual_kappa:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred_avg))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_avg))

results.append({
    'Model': 'Manual Soft Voting (Avg Proba)',
    'Train Acc': '-',  # not computed
    'Test Acc': round(manual_acc, 4),
    'CV Acc (5-fold)': '-',
    'MCC': round(manual_mcc, 4),
    'Kappa': round(manual_kappa, 4)
})

# 📊 Final Results Table
results_df = pd.DataFrame(results)
print("\n📋 Tabulated Results:")
print(results_df)

# 💾 Save to CSV
results_df.to_csv("model_performance_summary.csv", index=False)
print("\n📁 Results saved to 'model_performance_summary.csv'")


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt


🔍 Evaluation: Voting Classifier (RF + LR + KNN)
Training Accuracy: 0.9645
Testing Accuracy: 0.7575
Cross-Validation Accuracy (5-fold): 0.7466
MCC: 0.5889
Kappa Score: 0.5818
Classification Report:
               precision    recall  f1-score   support

          -1       0.73      0.82      0.77      1920
           0       0.81      0.29      0.42       696
           1       0.78      0.84      0.81      2394

    accuracy                           0.76      5010
   macro avg       0.77      0.65      0.67      5010
weighted avg       0.76      0.76      0.74      5010

Confusion Matrix:
 [[1582   21  317]
 [ 240  199  257]
 [ 354   26 2014]]
✅ Model saved as: Voting_Classifier_RF_plus_LR_plus_KNN.pkl

🔍 Evaluation: Bagging Classifier (DT)
Training Accuracy: 0.9624
Testing Accuracy: 0.7808
Cross-Validation Accuracy (5-fold): 0.7731
MCC: 0.6295
Kappa Score: 0.6246
Classification Report:
               precision    recall  f1-score   support

          -1       0.78      0.84      0.8

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



🧠 Evaluation: Custom Averaged Ensemble (Manual Soft Voting)
Accuracy: 0.0509
MCC: -0.0518
Kappa Score: -0.0321
Classification Report:
               precision    recall  f1-score   support

          -1       0.00      0.00      0.00      1920
           0       0.11      0.34      0.17       696
           1       0.09      0.01      0.01      2394
           2       0.00      0.00      0.00         0

    accuracy                           0.05      5010
   macro avg       0.05      0.09      0.05      5010
weighted avg       0.06      0.05      0.03      5010

Confusion Matrix:
 [[   0 1630   16  274]
 [   0  236  182  278]
 [   0  287   19 2088]
 [   0    0    0    0]]

📋 Tabulated Results:
                               Model Train Acc  Test Acc CV Acc (5-fold)  \
0  Voting Classifier (RF + LR + KNN)    0.9645    0.7575          0.7466   
1            Bagging Classifier (DT)    0.9624    0.7808          0.7731   
2                AdaBoost Classifier    0.7228    0.7208          0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [23]:
from sklearn.decomposition import PCA

# Apply PCA only on X_train
pca = PCA(n_components=30)
X_train_pca = pca.fit_transform(X_train)

# Transform validation and test using the same PCA
X_val_pca = pca.transform(X_val)
X_test_pca = pca.transform(X_test)

# Double-check shapes
print(f"X_train_pca shape: {X_train_pca.shape}, y_train shape: {y_train.shape}")
print(f"X_val_pca shape: {X_val_pca.shape}, y_val shape: {y_val.shape}")
print(f"X_test_pca shape: {X_test_pca.shape}, y_test shape: {y_test.shape}")


X_train_pca shape: (33728, 30), y_train shape: (33728,)
X_val_pca shape: (5009, 30), y_val shape: (5009,)
X_test_pca shape: (5010, 30), y_test shape: (5010,)


In [24]:
from sklearn.ensemble import (
    RandomForestClassifier, VotingClassifier, 
    BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import (
    accuracy_score, matthews_corrcoef, cohen_kappa_score, 
    classification_report, confusion_matrix
)
import joblib
import numpy as np

# ✅ Define Base Models with Best Params
best_rf = RandomForestClassifier(
    n_estimators=100, max_depth=20, min_samples_split=2,
    min_samples_leaf=1, bootstrap=False, random_state=42
)

best_lr = LogisticRegression(
    solver='lbfgs', penalty='l2', max_iter=500, C=10,
    multi_class='auto', random_state=42
)

best_knn = KNeighborsClassifier(
    n_neighbors=3, weights='distance', algorithm='kd_tree',
    p=1, leaf_size=20
)

dt_clf = DecisionTreeClassifier(random_state=42)

# ✅ 1. Voting Classifier (RF + LR)
voting_rf_lr = VotingClassifier(
    estimators=[('rf', best_rf), ('lr', best_lr)],
    voting='soft'
)

# ✅ 2. Voting Classifier (RF + LR + DT)
voting_rf_lr_dt = VotingClassifier(
    estimators=[('rf', best_rf), ('lr', best_lr), ('dt', dt_clf)],
    voting='soft'
)

# ✅ 3. Voting Classifier (RF + LR + KNN)
voting_rf_lr_knn = VotingClassifier(
    estimators=[('rf', best_rf), ('lr', best_lr), ('knn', best_knn)],
    voting='soft'
)

# ✅ 4. Bagging Classifier
bagging_clf = BaggingClassifier(
    estimator=DecisionTreeClassifier(), n_estimators=50,
    max_samples=0.8, max_features=0.8, bootstrap=True,
    random_state=42
)

# ✅ 5. AdaBoost Classifier
adaboost_clf = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1), n_estimators=50,
    learning_rate=1.0, random_state=42
)

# ✅ 6. Gradient Boosting Classifier
gboost_clf = GradientBoostingClassifier(
    n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42
)

# ✅ 7. Super Ensemble (All Ensembles Combined)
super_ensemble = VotingClassifier(
    estimators=[
        ('v1', voting_rf_lr_knn),
        ('b1', bagging_clf),
        ('a1', adaboost_clf),
        ('g1', gboost_clf)
    ],
    voting='soft'
)

# ✅ Evaluation Function
from sklearn.model_selection import cross_val_score

def evaluate_model(model, name, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_train_pred = model.predict(X_train)

    print(f"\n🔍 Evaluation: {name}")
    print(f"✅ Training Accuracy: {accuracy_score(y_train, y_train_pred):.4f}")
    print(f"✅ Testing Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    
    # Cross-validation (5-fold)
    cv_scores = cross_val_score(model, X_train, y_train, cv=5)
    print(f"✅ Cross-Val Accuracy (5-fold): {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")

    print(f"✅ MCC: {matthews_corrcoef(y_test, y_pred):.4f}")
    print(f"✅ Kappa Score: {cohen_kappa_score(y_test, y_pred):.4f}")
    print("✅ Classification Report:\n", classification_report(y_test, y_pred))
    print("✅ Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

    # Save model
    filename = name.replace(" ", "_").replace("(", "").replace(")", "").replace("+", "plus") + ".pkl"
    joblib.dump(model, filename)
    print(f"💾 Model saved as: {filename}")

# ✅ All Models with Names
models = [
    (voting_rf_lr, "Voting Classifier (RF + LR)"),
    (voting_rf_lr_dt, "Voting Classifier (RF + LR + DT)"),
    (voting_rf_lr_knn, "Voting Classifier (RF + LR + KNN)"),
    (bagging_clf, "Bagging Classifier (DT)"),
    (adaboost_clf, "AdaBoost Classifier"),
    (gboost_clf, "Gradient Boosting Classifier"),
    (super_ensemble, "Super Voting Classifier (All Ensembles Combined)")
]

# ✅ PCA-reduced data assumed as input
# Replace these with your actual PCA reduced data variables
# X_train_pca, X_test_pca, y_train, y_test

# Run all models on PCA data
for model, name in models:
    evaluate_model(model, name, X_train_pca, y_train, X_test_pca, y_test)

# ✅ 8. Manual Averaged Ensemble
# Train base ensemble models
voting_rf_lr_knn.fit(X_train_pca, y_train)
bagging_clf.fit(X_train_pca, y_train)
adaboost_clf.fit(X_train_pca, y_train)
gboost_clf.fit(X_train_pca, y_train)

# Predict probabilities
proba_vote = voting_rf_lr_knn.predict_proba(X_test_pca)
proba_bag = bagging_clf.predict_proba(X_test_pca)
proba_ada = adaboost_clf.predict_proba(X_test_pca)
proba_gboost = gboost_clf.predict_proba(X_test_pca)

# Average probabilities and predict
avg_proba = (proba_vote + proba_bag + proba_ada + proba_gboost) / 4
y_pred_avg = np.argmax(avg_proba, axis=1)

print("\n🧠 Evaluation: Custom Averaged Ensemble (Manual Soft Voting)")
print(f"✅ Testing Accuracy: {accuracy_score(y_test, y_pred_avg):.4f}")
print(f"✅ MCC: {matthews_corrcoef(y_test, y_pred_avg):.4f}")
print(f"✅ Kappa Score: {cohen_kappa_score(y_test, y_pred_avg):.4f}")
print("✅ Classification Report:\n", classification_report(y_test, y_pred_avg))
print("✅ Confusion Matrix:\n", confusion_matrix(y_test, y_pred_avg))


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



🔍 Evaluation: Voting Classifier (RF + LR)
✅ Training Accuracy: 0.9331
✅ Testing Accuracy: 0.8750


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

✅ Cross-Val Accuracy (5-fold): 0.8423 ± 0.0044
✅ MCC: 0.8001
✅ Kappa Score: 0.7978
✅ Classification Report:
               precision    recall  f1-score   support

          -1       0.87      0.91      0.89      1920
           0       0.72      0.89      0.79       696
           1       0.94      0.85      0.89      2394

    accuracy                           0.88      5010
   macro avg       0.84      0.88      0.86      5010
weighted avg       0.88      0.88      0.88      5010

✅ Confusion Matrix:
 [[1740   85   95]
 [  46  616   34]
 [ 210  156 2028]]
💾 Model saved as: Voting_Classifier_RF_plus_LR.pkl


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



🔍 Evaluation: Voting Classifier (RF + LR + DT)
✅ Training Accuracy: 0.9593
✅ Testing Accuracy: 0.8681


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

✅ Cross-Val Accuracy (5-fold): 0.8096 ± 0.0035
✅ MCC: 0.7909
✅ Kappa Score: 0.7879
✅ Classification Report:
               precision    recall  f1-score   support

          -1       0.90      0.86      0.88      1920
           0       0.66      0.92      0.77       696
           1       0.93      0.86      0.89      2394

    accuracy                           0.87      5010
   macro avg       0.83      0.88      0.85      5010
weighted avg       0.88      0.87      0.87      5010

✅ Confusion Matrix:
 [[1655  141  124]
 [  23  638   35]
 [ 152  186 2056]]
💾 Model saved as: Voting_Classifier_RF_plus_LR_plus_DT.pkl


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



🔍 Evaluation: Voting Classifier (RF + LR + KNN)
✅ Training Accuracy: 0.9567
✅ Testing Accuracy: 0.8762


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

✅ Cross-Val Accuracy (5-fold): 0.8156 ± 0.0048
✅ MCC: 0.8027
✅ Kappa Score: 0.8003
✅ Classification Report:
               precision    recall  f1-score   support

          -1       0.89      0.89      0.89      1920
           0       0.70      0.91      0.79       696
           1       0.94      0.86      0.89      2394

    accuracy                           0.88      5010
   macro avg       0.84      0.89      0.86      5010
weighted avg       0.89      0.88      0.88      5010

✅ Confusion Matrix:
 [[1706  115   99]
 [  20  635   41]
 [ 186  159 2049]]
💾 Model saved as: Voting_Classifier_RF_plus_LR_plus_KNN.pkl

🔍 Evaluation: Bagging Classifier (DT)
✅ Training Accuracy: 0.9598
✅ Testing Accuracy: 0.8878
✅ Cross-Val Accuracy (5-fold): 0.8438 ± 0.0034
✅ MCC: 0.8210
✅ Kappa Score: 0.8188
✅ Classification Report:
               precision    recall  f1-score   support

          -1       0.91      0.90      0.90      1920
           0       0.71      0.92      0.81       696
        

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



🔍 Evaluation: Super Voting Classifier (All Ensembles Combined)
✅ Training Accuracy: 0.9527
✅ Testing Accuracy: 0.8800


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

✅ Cross-Val Accuracy (5-fold): 0.8389 ± 0.0044
✅ MCC: 0.8090
✅ Kappa Score: 0.8065
✅ Classification Report:
               precision    recall  f1-score   support

          -1       0.90      0.90      0.90      1920
           0       0.70      0.91      0.79       696
           1       0.94      0.86      0.90      2394

    accuracy                           0.88      5010
   macro avg       0.84      0.89      0.86      5010
weighted avg       0.89      0.88      0.88      5010

✅ Confusion Matrix:
 [[1730  100   90]
 [  27  631   38]
 [ 171  175 2048]]
💾 Model saved as: Super_Voting_Classifier_All_Ensembles_Combined.pkl


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



🧠 Evaluation: Custom Averaged Ensemble (Manual Soft Voting)
✅ Testing Accuracy: 0.0403
✅ MCC: -0.1611
✅ Kappa Score: -0.1157
✅ Classification Report:
               precision    recall  f1-score   support

          -1       0.00      0.00      0.00      1920
           0       0.01      0.04      0.02       696
           1       0.19      0.07      0.11      2394
           2       0.00      0.00      0.00         0

    accuracy                           0.04      5010
   macro avg       0.05      0.03      0.03      5010
weighted avg       0.09      0.04      0.05      5010

✅ Confusion Matrix:
 [[   0 1730  100   90]
 [   0   27  631   38]
 [   0  171  175 2048]
 [   0    0    0    0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

#Load the dataset
#df = pd.read_csv("drive/MyDrive/McDonald_s_Reviews.csv", encoding="latin1")
df = pd.read_csv("McDonald_s_Reviews.csv", encoding="latin1")
print(df.head())
print("\nShape: ", df.shape)

# Ensure it's a list of strings, not list of lists
X_train = df['review'].tolist()

# Vectorize
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)

# Save the vectorizer for use in your app
import pickle
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)


   reviewer_id  store_name              category  \
0            1  McDonald's  Fast food restaurant   
1            2  McDonald's  Fast food restaurant   
2            3  McDonald's  Fast food restaurant   
3            4  McDonald's  Fast food restaurant   
4            5  McDonald's  Fast food restaurant   

                                       store_address  latitude   longitude  \
0  13749 US-183 Hwy, Austin, TX 78750, United States  30.460718 -97.792874   
1  13749 US-183 Hwy, Austin, TX 78750, United States  30.460718 -97.792874   
2  13749 US-183 Hwy, Austin, TX 78750, United States  30.460718 -97.792874   
3  13749 US-183 Hwy, Austin, TX 78750, United States  30.460718 -97.792874   
4  13749 US-183 Hwy, Austin, TX 78750, United States  30.460718 -97.792874   

  rating_count   review_time  \
0        1,240  3 months ago   
1        1,240    5 days ago   
2        1,240    5 days ago   
3        1,240   a month ago   
4        1,240  2 months ago   

                         