In [74]:
# !pip install catboost

In [75]:
import ast
import os
import re
import io
import uuid
import warnings
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression , Ridge, Lasso, ElasticNet
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
import lightgbm as lgb
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
# from catboost import CatBoostRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from scipy.stats import f_oneway, spearmanr
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate , KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score ,explained_variance_score , mean_absolute_percentage_error
from transformers import BertTokenizer
from functools import reduce
from pickle import TRUE
from collections import Counter
import statistics
from sklearn.linear_model import BayesianRidge
import xgboost as xgb
from datetime import datetime
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.svm import SVR
warnings.filterwarnings('ignore')

In [76]:
import pickle
import joblib
import json

In [77]:
# Load the four raw input tables from /content: demos (d1), DLCs (d2),
# base-game info (d3) and the sales table (d4).
d1 = pd.read_csv("/content/demos.csv")
d2 = pd.read_csv("/content/dlcs.csv")
d3 = pd.read_csv("/content/info_base_games.csv")
d4 = pd.read_csv("/content/gamalytic_cls_sales.csv")

In [78]:
# Drop each listed column in place, but only when the frame actually has it,
# so the cell can be re-run without raising.
for frame, unwanted in (
    (d1, 'demo_appid'),
    (d2, 'dlc_appid'),
    (d1, 'Unnamed: 0'),
    (d4, 'aiContent'),
    (d3, 'metacritic'),
):
    if unwanted in frame.columns:
        frame.drop(columns=[unwanted], inplace=True)

In [79]:
output_file = "/content/merged_output.csv"

# Give every table the same join-key name.
d1 = d1.rename(columns={"full_game_appid": "ID"})
d2 = d2.rename(columns={"base_appid": "ID"})
d3 = d3.rename(columns={"steam_appid": "ID"})
d4 = d4.rename(columns={"steamId": "ID"})

# Validate BEFORE the key is first used: previously this check came after the
# astype() calls, so a missing column surfaced as a bare KeyError and this
# message could never be shown.
if not all('ID' in df.columns for df in [d1, d2, d3, d4]):
    raise ValueError("One or more files are missing the 'ID' column after renaming")

# Cast the keys to str so all four frames join on the same dtype.
# NOTE(review): a float-typed key would stringify as "123.0" and not match
# "123" from an int-typed key -- confirm the source dtypes agree.
d1['ID'] = d1['ID'].astype(str)
d2['ID'] = d2['ID'].astype(str)
d3['ID'] = d3['ID'].astype(str)
d4['ID'] = d4['ID'].astype(str)


print("ID dtypes:", d1['ID'].dtype, d2['ID'].dtype, d3['ID'].dtype, d4['ID'].dtype)


# Outer-join everything onto the sales table; clashing column names receive
# the suffixes given on each merge.
merged_df = d4.merge(d1, on='ID', how='outer', suffixes=('_d4', '_d1'))
merged_df = merged_df.merge(d2, on='ID', how='outer', suffixes=('_prev', '_d2'))
merged_df = merged_df.merge(d3, on='ID', how='outer', suffixes=('_prev', '_d3'))


print("Rows in merged_df:", len(merged_df))
print("Columns in merged_df:", merged_df.columns)


# Make sure the target directory exists, then persist the merged table.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
print("Saving to:", output_file)

merged_df.to_csv(output_file, index=False)
print(f"Successfully merged files into {output_file}")
print("File exists after save:", os.path.exists(output_file))

ID dtypes: object object object object
Rows in merged_df: 5759
Columns in merged_df: Index(['ID', 'price', 'copiesSold', 'publisherClass', 'reviewScore',
       'name_prev', 'name_d2', 'name', 'steam_achievements',
       'steam_trading_cards', 'workshop_support', 'genres',
       'achievements_total', 'release_date', 'supported_platforms'],
      dtype='object')
Saving to: /content/merged_output.csv
Successfully merged files into /content/merged_output.csv
File exists after save: True


In [80]:
# Reload the merged table from the CSV written by the merge cell.
df = pd.read_csv("/content/merged_output.csv")

In [81]:
# Keep only the first row for each ID.
df = df.drop_duplicates(subset='ID', keep='first')

In [82]:
# Remove the identifier and name columns when they are present.
cols_to_drop = [col for col in ('ID', 'name') if col in df.columns]
df = df.drop(columns=cols_to_drop)

In [83]:
# Drop the suffixed name columns produced by the merges, if they exist.
for leftover in ('name_prev', 'name_d2'):
    if leftover in df.columns:
        df.drop(columns=[leftover], inplace=True)

In [84]:
# Load the stored per-column fill values; later cells index this object by
# column name when calling fillna().
# NOTE(review): pickle.load can execute arbitrary code from the file -- only
# load artifacts from a trusted source.
with open('imputation_values.pkl', 'rb') as f:
    imputation_values = pickle.load(f)

In [85]:
def find_columns_with_matching_values(df):
    """Return the columns in which at least one cell equals the column's own name.

    Both the cell values and the column label are compared as strings, so a
    header row that leaked into the data is detected regardless of dtype.
    The result preserves the order of ``df.columns``.
    """
    return [
        name
        for name in df.columns
        if any(df[name].astype(str) == str(name))
    ]

# Columns of df that contain their own header text as a value (displayed below).
train_matching_columns = find_columns_with_matching_values(df)
train_matching_columns

[]

In [86]:
# Replace any cell that merely repeats its column header with NaN.
for column in train_matching_columns:
    header_echo = df[column].astype(str) == str(column)
    df.loc[header_echo, column] = np.nan

# Test The Test Script

In [87]:
# Hold out 20% of the rows as the test split; random_state fixes the shuffle.
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Persist both splits as CSV files (relative paths, i.e. the working directory).
train_data.to_csv('train_data.csv', index=False)
test_data.to_csv('test_data.csv', index=False)

In [88]:
# Read the splits back from the same relative paths the previous cell wrote
# them to. The former hard-coded "/content/..." paths only pointed at those
# files when the working directory happened to be /content; otherwise a
# missing or stale file would be read.
train = pd.read_csv("train_data.csv")
test  = pd.read_csv("test_data.csv")

In [89]:
# test = df.copy()

# Handling Outliers

In [90]:
# Names of the int/float columns of the test frame; these are the columns the
# outlier-capping loop iterates over.
test_numerical_columns = test.select_dtypes(include=['int64', 'float64', 'int32', 'float32']).columns.tolist()
print(test_numerical_columns)

['price', 'reviewScore', 'achievements_total']


In [91]:
# Load the stored outlier bounds; handle_outliers_iqr reads
# loaded_outliers_handling[column]['lower_bound'] / ['upper_bound'].
# NOTE(review): pickle.load can execute arbitrary code -- trusted files only.
with open('outliers_handling.pkl', 'rb') as f:
    loaded_outliers_handling = pickle.load(f)

In [92]:
def handle_outliers_iqr(test_df, column):
    """Cap ``test_df[column]`` to the stored lower/upper bounds for that column.

    The bounds are read from the module-level ``loaded_outliers_handling``
    mapping (presumably IQR-derived, judging by the function name -- confirm
    against the code that produced the pickle). Values below the lower bound
    are raised to it, then values above the upper bound are lowered to it.
    ``test_df`` is modified in place and also returned.

    Raises ``KeyError`` if ``column`` has no stored bounds.
    """
    bounds = loaded_outliers_handling[column]
    floor = bounds['lower_bound']
    ceiling = bounds['upper_bound']

    raw_values = test_df[column]
    test_df[column] = np.where(raw_values < floor, floor, raw_values)

    floored_values = test_df[column]
    test_df[column] = np.where(floored_values > ceiling, ceiling, floored_values)
    return test_df

In [93]:
# Cap every numeric column to its stored bounds and show the resulting summary.
for col in test_numerical_columns:
    if col not in test.columns:
        continue
    test = handle_outliers_iqr(test.copy(), col)
    print("\nTest data after IQR-based outlier handling:")
    print(test[col].describe())


Test data after IQR-based outlier handling:
count    669.000000
mean       4.525426
std        3.198948
min        0.000000
25%        0.990000
50%        4.990000
75%        7.990000
max        7.990000
Name: price, dtype: float64

Test data after IQR-based outlier handling:
count    669.000000
mean      72.270305
std        6.907868
min       62.181359
25%       62.181359
50%       77.697734
75%       77.697734
max       77.697734
Name: reviewScore, dtype: float64

Test data after IQR-based outlier handling:
count    352.0
mean      18.0
std        0.0
min       18.0
25%       18.0
50%       18.0
75%       18.0
max       18.0
Name: achievements_total, dtype: float64


# Numerical Columns

In [94]:
# Recompute the list of int/float columns of the test frame for the imputation step.
test_numerical_columns = test.select_dtypes(include=['int64', 'float64', 'int32', 'float32']).columns.tolist()

In [95]:
# Fill missing numeric values with the value stored for each column.
# The original skew check had two identical branches (both applied the same
# stored value), so it is dropped. The result is assigned back instead of
# using a chained `test[col].fillna(..., inplace=True)`, which acts on a
# temporary object and does not update `test` under pandas copy-on-write.
skew_threshold = 0.5  # no longer consulted here; kept in case a later cell references it
for col in test_numerical_columns:
    if col in test.columns and col in imputation_values:
        test[col] = test[col].fillna(imputation_values[col])

# Date Column

In [96]:
# Blank out the placeholder texts that stand in for a real release date.
if 'release_date' in test.columns:
    placeholder_texts = ['Coming soon', 'To be announced']
    has_placeholder = test['release_date'].isin(placeholder_texts).any()
    if has_placeholder:
        test['release_date'] = test['release_date'].replace(placeholder_texts, '')

In [97]:
def remove_future_dates(cell):
    """Blank out full dates later than 2025-12-31 and normalise the others.

    * Missing values are returned unchanged.
    * A value shaped like "Mon D, YYYY" that parses as a real date becomes ""
      when it is after 2025-12-31, otherwise it is re-emitted via
      ``strftime('%b %d, %Y')`` (zero-padded day).
    * Anything else, including strings that match the shape but are not valid
      dates, is returned unchanged.
    """
    if pd.isna(cell):
        return cell
    try:
        text = str(cell).strip()
        if not re.fullmatch(r'^[A-Za-z]{3} \d{1,2}, \d{4}$', text):
            return cell
        parsed = datetime.strptime(text, '%b %d, %Y')
        if parsed > pd.Timestamp("2025-12-31"):
            return ""
        return parsed.strftime('%b %d, %Y')
    except (ValueError, TypeError):
        return cell

In [98]:
# Run remove_future_dates over every release_date cell, writing each result
# back by row label. The values are snapshotted first so the frame is not
# read while it is being written.
for row_label, raw_date in list(test['release_date'].items()):
    test.at[row_label, 'release_date'] = remove_future_dates(raw_date)

In [99]:
def check_no_future(cell):
    """Return "Future Exist" for a full date after 2025-12-31, else ``cell``.

    Used as a sanity check: missing values, values that are not shaped like
    "Mon D, YYYY", and strings that fail to parse as a date are all returned
    unchanged.
    """
    if pd.isna(cell):
        return cell
    try:
        text = str(cell).strip()
        if re.fullmatch(r'^[A-Za-z]{3} \d{1,2}, \d{4}$', text):
            if datetime.strptime(text, '%b %d, %Y') > pd.Timestamp("2025-12-31"):
                return "Future Exist"
    except (ValueError, TypeError):
        pass
    return cell

In [100]:
# Count the release_date cells that check_no_future still flags as future dates.
counter1 = sum(
    check_no_future(raw_date) == "Future Exist"
    for raw_date in test['release_date']
)
print(counter1)

0


In [101]:
# Regex for quarter-style dates such as "Q1 2024".
pattern = r'Q\d \d{4}'

In [102]:
# Number of release_date cells that contain a quarter-style date.
count = test['release_date'].str.contains(pattern, regex=True).sum()

In [103]:
# Strip the quarter prefix and keep only the captured year, e.g. "Q1 2024" -> "2024".
regex_pattern = r'Q\d (\d{4})'
test['release_date'] = test['release_date'].replace(to_replace=regex_pattern, value=r'\g<1>', regex=True)

In [104]:
# Load the stored fallback (mode) release date; replace_missing_date parses it
# with the "%b %d, %Y" format.
# NOTE(review): pickle.load can execute arbitrary code -- trusted files only.
with open('mode_date_value.pkl', 'rb') as f:
    loaded_mode_date = pickle.load(f)

In [105]:
import pandas as pd
import re
from datetime import datetime

def replace_missing_date(mode_date, cell):
    """Complete a missing or partial release-date string using the mode date.

    Parameters
    ----------
    mode_date : str
        Fallback date in "%b %d, %Y" form (e.g. "Mar 15, 2024").
    cell : object
        Raw release-date value: NaN/None, an empty string, a full date, or a
        fragment (day, year, month, "Mon, YYYY", "Mon D" or "D, YYYY").

    Returns
    -------
    str
        ``mode_date`` for missing, empty, unrecognised or invalid input;
        otherwise a "Mon D, YYYY" string whose missing parts are taken from
        ``mode_date``.

    Raises
    ------
    ValueError
        If ``cell`` is not missing and ``mode_date`` is not in "%b %d, %Y" form.

    Notes
    -----
    The day patterns accept an optional leading zero. Previously "Jan 05, 2020"
    matched no pattern and was replaced by ``mode_date``; since
    ``remove_future_dates`` emits zero-padded days, every valid date with a
    single-digit day was being overwritten.
    """
    if pd.isna(cell) or str(cell).strip() == '':
        return mode_date

    cell = str(cell).strip()
    mode_date_dt = datetime.strptime(mode_date, "%b %d, %Y")
    mode_month = mode_date_dt.strftime("%b")
    mode_day = mode_date_dt.strftime("%d").lstrip('0')
    mode_year = mode_date_dt.strftime("%Y")

    # Reusable pattern fragments; re.fullmatch anchors them at both ends.
    month_re = r'(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)'
    day_re = r'(?:0?[1-9]|[12]\d|3[01])'
    year_re = r'\d{4}'

    if re.fullmatch(day_re, cell):                       # "7"
        return f"{mode_month} {int(cell):02d}, {mode_year}"
    if re.fullmatch(year_re, cell):                      # "2021"
        return f"{mode_month} {mode_day}, {cell}"
    if re.fullmatch(month_re, cell):                     # "Feb"
        return f"{cell} {mode_day}, {mode_year}"
    if re.fullmatch(f"{month_re}, {year_re}", cell):     # "Feb, 2021"
        month, year = cell.split(", ")
        return f"{month} {mode_day}, {year}"
    if re.fullmatch(f"{month_re} {day_re}", cell):       # "Feb 7"
        month, day = cell.split(" ")
        return f"{month} {int(day):02d}, {mode_year}"
    if re.fullmatch(f"{day_re}, {year_re}", cell):       # "7, 2021"
        day, year = cell.split(", ")
        return f"{mode_month} {int(day):02d}, {year}"
    if re.fullmatch(f"{month_re} {day_re}, {year_re}", cell):  # full date
        try:
            return datetime.strptime(cell, "%b %d, %Y").strftime("%b %d, %Y")
        except ValueError:
            # Shaped like a date but not a real one (e.g. "Feb 30, 2021").
            return mode_date
    return mode_date

In [106]:
# Complete or replace every release_date cell using the stored mode date.
# The values are snapshotted first so the frame is not read while being written.
test_date = loaded_mode_date
for row_label, raw_date in list(test['release_date'].items()):
    test.at[row_label, 'release_date'] = replace_missing_date(test_date, raw_date)

In [107]:
# Parse the "Mon D, YYYY" strings into datetimes; values that do not parse
# become NaT because of errors='coerce'.
test['release_date'] = pd.to_datetime(test['release_date'], format='%b %d, %Y', errors='coerce')

In [108]:
# Fill dates that failed to parse (NaT) with the stored mode date.
# The mode date is a "%b %d, %Y" string, so parse it first to keep the column
# datetime-typed, and assign the result back: a chained
# `test[col].fillna(..., inplace=True)` acts on a temporary object and does
# not update `test` under pandas copy-on-write.
test['release_date'] = test['release_date'].fillna(
    pd.to_datetime(loaded_mode_date, format='%b %d, %Y')
)

In [109]:
# Store the datetime column as its raw int64 representation.
# NOTE(review): presumably nanoseconds since the epoch (datetime64[ns]); the
# unit follows the column's resolution -- confirm it matches training.
test['timestamp'] = test['release_date'].astype('int64')

In [110]:
# The date is now carried by the 'timestamp' column, so drop the original.
test.drop('release_date', axis=1, inplace=True)

# Boolean Columns

In [111]:
# A column counts as boolean when at least one of its values is a Python bool.
Boolean_columns1 = [
    col
    for col in test.columns
    if test[col].apply(lambda x: isinstance(x, bool)).any()
]

print("Boolean columns in test Data:", Boolean_columns1)

Boolean columns in test Data: ['steam_achievements', 'steam_trading_cards', 'workshop_support']


In [112]:
# Fill missing boolean values with the stored value for that column; columns
# without a stored value are left untouched.
for col in Boolean_columns1:
    if col not in test.columns or col not in imputation_values:
        continue
    test[col] = test[col].fillna(imputation_values[col])

In [113]:
# Cast the boolean columns that are present in the frame to int.
test = test.astype({col: int for col in Boolean_columns1 if col in test.columns})

# Categorical Columns

In [114]:
def get_categorical_columns(df, threshold=10):
    """Return the names of the object-dtype columns of ``df`` in column order.

    Datetime columns are excluded explicitly. ``threshold`` is accepted for
    interface compatibility but is not used by the selection.
    """
    return [
        name
        for name in df.columns
        if not np.issubdtype(df[name].dtype, np.datetime64)
        and df[name].dtype == 'object'
    ]

In [115]:
# Object-dtype columns of the test frame (the threshold argument is ignored by
# get_categorical_columns).
categorical_columns1 = get_categorical_columns(test, threshold=10)
print("Categorical Columns:", categorical_columns1)

Categorical Columns: ['copiesSold', 'publisherClass', 'genres', 'supported_platforms']


In [116]:
#In Test Data
for col in categorical_columns1:
  te_null_percentage = (test[col].isnull().sum() / len(test)) * 100
  print(f"In Test Data --> Column {col}: {te_null_percentage:.2f}% null values")

In Test Data --> Column copiesSold: 0.00% null values
In Test Data --> Column publisherClass: 0.00% null values
In Test Data --> Column genres: 0.15% null values
In Test Data --> Column supported_platforms: 0.00% null values


In [117]:
# Fill missing genres with the stored value for the 'genres' column.
most_common = imputation_values['genres']
test['genres'] = test['genres'].fillna(most_common)

In [118]:
# Fill missing platform entries with the stored value for 'supported_platforms'.
most_common = imputation_values['supported_platforms']
test['supported_platforms'] = test['supported_platforms'].fillna(most_common)

In [119]:
# Fill missing publisher classes with the stored value. The result is assigned
# back, matching the genres/platforms cells: a chained
# `test[col].fillna(..., inplace=True)` acts on a temporary object and does
# not update `test` under pandas copy-on-write.
mode_value = imputation_values['publisherClass']
test['publisherClass'] = test['publisherClass'].fillna(mode_value)

# Encoding Features

### Encoding Genres

In [120]:
# Load the stored genres binarizer; its classes_ and transform() are used in
# the next cell.
# NOTE(review): pickle.load can execute arbitrary code -- trusted files only.
with open('mlb_genres.pkl', 'rb') as f:
    mlb_genres = pickle.load(f)

In [121]:
# Split the comma-separated genre string into a list. The raw string avoids
# the invalid "\s" escape warning raised by a normal string literal.
test['genres_list'] = test['genres'].str.split(r',\s*')
known_genres = set(mlb_genres.classes_)

# The stored fallback is also used to fill the raw (string) genres column, so
# it may be a comma-separated string. Returning a string where a list of
# labels is expected would make transform() iterate over its characters, so
# convert it to a list the same way the column is split.
replacement_genres = imputation_values['genres']
if isinstance(replacement_genres, str):
    replacement_genres = re.split(r',\s*', replacement_genres)


def replace_if_unknown(genres):
    """Return ``genres`` if every label is known to the encoder, else the fallback list.

    Non-list input (e.g. a NaN left by a missing value) also gets the fallback.
    """
    if not isinstance(genres, (list, tuple, set)):
        return replacement_genres
    if any(g not in known_genres for g in genres):
        return replacement_genres
    return genres


test['genres_list'] = test['genres_list'].apply(replace_if_unknown)

# Encode with the stored binarizer and append one column per known genre.
genres_encoded_test = mlb_genres.transform(test['genres_list'])
genres_df_test = pd.DataFrame(genres_encoded_test, columns=mlb_genres.classes_, index=test.index)
test = pd.concat([test, genres_df_test], axis=1)
test = test.drop(['genres', 'genres_list'], axis=1)

### Encoding Supported Platforms

In [122]:
# Load the stored platforms binarizer; its classes_ and transform() are used
# in the next cell.
# NOTE(review): pickle.load can execute arbitrary code -- trusted files only.
with open('mlb_platforms.pkl', 'rb') as f:
    mlb_platforms = pickle.load(f)

In [123]:
# Parse the stringified platform lists. ast.literal_eval only accepts Python
# literals, unlike eval(), which would execute any expression found in the data.
test['supported_platforms'] = test['supported_platforms'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)
known_platforms = set(mlb_platforms.classes_)

# The fallback must exist in the training data. Resolve each name against the
# encoder's own classes (exact match first, then case-insensitively) so the
# fallback is actually encoded; names the encoder does not know are ignored by
# transform() and would yield an all-zero row.
# NOTE(review): the feature list printed later shows a lowercase 'mac' column,
# which suggests the classes are lowercase -- confirm.
_platform_by_lowercase = {str(p).lower(): p for p in known_platforms}
fallback_platforms = []
for _name in ['Windows', 'Linux']:
    _match = _name if _name in known_platforms else _platform_by_lowercase.get(_name.lower())
    if _match is not None:
        fallback_platforms.append(_match)


def replace_unknown_platforms(platform_list):
    """Return ``platform_list`` if every entry is known to the encoder, else the fallback."""
    if any(p not in known_platforms for p in platform_list):
        return fallback_platforms
    return platform_list


test['supported_platforms'] = test['supported_platforms'].apply(replace_unknown_platforms)
encoded_platforms_test = mlb_platforms.transform(test['supported_platforms'])
encoded_df_test = pd.DataFrame(encoded_platforms_test, columns=mlb_platforms.classes_, index=test.index)

# Update test DataFrame
test = pd.concat([test.drop('supported_platforms', axis=1), encoded_df_test], axis=1)

### Encoding Publisher Class

In [124]:
# Load the stored publisher-class encoder; its classes_ and transform() are
# used in the next cell.
# NOTE(review): pickle.load can execute arbitrary code -- trusted files only.
with open('encoder_publisher.pkl', 'rb') as f:
    encoder_publisher = pickle.load(f)

In [125]:
# Map labels the encoder has never seen to the stored fallback label, then
# encode the column and drop the original text.
fallback_label = imputation_values['publisherClass']


def _known_or_fallback(label):
    """Return ``label`` if the encoder knows it, otherwise the stored fallback label."""
    return label if label in encoder_publisher.classes_ else fallback_label


test['publisherClass'] = test['publisherClass'].apply(_known_or_fallback)
test['publisherClass_encoded'] = encoder_publisher.transform(test['publisherClass'])
test = test.drop(columns='publisherClass')

# Encoding Copies Sold

In [126]:
# Load the stored encoder for the copiesSold target.
encoder = joblib.load('copiesSold_encoder.pkl')

In [127]:
# Encode the target with the stored encoder.
# NOTE(review): if this is a LabelEncoder, labels unseen during fitting will
# raise here -- confirm the encoder type.
test['copiesSold_encoded'] = encoder.transform(test['copiesSold'])

In [128]:
# Drop the raw target text now that the encoded column exists.
test = test.drop('copiesSold', axis=1)

# Preparing for testing

In [129]:
# Separate the features from the encoded target.
X_test = test.drop(columns=['copiesSold_encoded'])
y_test = test['copiesSold_encoded']

In [130]:
# Load the stored feature selection and restrict X_test to those columns
# (raises KeyError if any of them is missing from the frame).
# NOTE(review): pickle.load can execute arbitrary code -- trusted files only.
with open('selected_features.pkl', 'rb') as f:
    selected_features = pickle.load(f)
X_test = X_test[selected_features]

# LightGBM Test

In [131]:
# Load the saved LightGBM booster from its text model file.
bst = lgb.Booster(model_file='lgbm_model.txt')

In [132]:
# Display the feature columns that are passed to the models.
X_test.columns

Index(['price', 'reviewScore', 'steam_achievements', 'steam_trading_cards',
       'workshop_support', 'mac', 'publisherClass_encoded'],
      dtype='object')

In [133]:
# Score the loaded LightGBM booster on the held-out features.
y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)

# Error metrics. MAPE divides by 1 wherever the target is 0 to avoid a
# division by zero.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
safe_target = np.where(y_test != 0, y_test, 1)
mape = np.mean(np.abs((y_test - y_pred) / safe_target)) * 100

print(
    f"Test RMSE (simplified model): {rmse:.4f}",
    f"Test R2 Score (simplified model): {r2*100:.2f}%",
    f"Test MAE (simplified model): {mae:.4f}",
    f"Test MAPE (simplified model): {mape:.2f}%",
    sep="\n",
)

Test RMSE (simplified model): 1.2452
Test R2 Score (simplified model): 8.73%
Test MAE (simplified model): 1.1674
Test MAPE (simplified model): 87.85%


# Linear Regression

In [134]:
# Load the saved linear regression model.
lr = joblib.load('linear_regression_model.pkl')

In [135]:
# Predict with the loaded linear regression model.
y_test_pred = lr.predict(X_test)

# Test metrics. MAPE divides by 1 wherever the target is 0.
test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
safe_target = np.where(y_test != 0, y_test, 1)
test_mape = np.mean(np.abs((y_test - y_test_pred) / safe_target)) * 100

# Tolerance-based accuracy: share of predictions within +/- tolerance of the target.
tolerance = 0.1  # Adjust based on y_test.std() or domain knowledge
test_correct = np.abs(y_test - y_test_pred) <= tolerance
test_accuracy = np.mean(test_correct) * 100

print(
    "Test Metrics (Linear Regression):",
    f"Test RMSE: {test_rmse:.4f}",
    f"Test R2 Score: {test_r2*100:.2f}%",
    f"Test MAE: {test_mae:.4f}",
    f"Test MAPE: {test_mape:.2f}%",
    f"Test Tolerance-Based Accuracy (within ±{tolerance}): {test_accuracy:.2f}%",
    sep="\n",
)

Test Metrics (Linear Regression):
Test RMSE: 1.4701
Test R2 Score: -27.22%
Test MAE: 1.3557
Test MAPE: 110.28%
Test Tolerance-Based Accuracy (within ±0.1): 2.24%


# XGBoost

In [136]:
import xgboost as xgb

# Load the saved XGBoost model. Note that this rebinds `bst`, which previously
# held the LightGBM booster.
bst = xgb.Booster()
bst.load_model('xgboost_model.json')

In [137]:
# Wrap the features in a DMatrix and predict with the loaded XGBoost booster.
dtest = xgb.DMatrix(X_test)
y_test_pred = bst.predict(dtest)

# Test metrics. MAPE divides by 1 wherever the target is 0.
test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
safe_target = np.where(y_test != 0, y_test, 1)
test_mape = np.mean(np.abs((y_test - y_test_pred) / safe_target)) * 100

# Share of predictions within +/- tolerance of the target.
tolerance = 0.1
test_correct = np.abs(y_test - y_test_pred) <= tolerance
test_accuracy = np.mean(test_correct) * 100

print(
    "Test Metrics (XGBoost):",
    f"Test RMSE: {test_rmse:.4f}",
    f"Test R2 Score: {test_r2*100:.2f}%",
    f"Test MAE: {test_mae:.4f}",
    f"Test MAPE: {test_mape:.2f}%",
    f"Test Tolerance-Based Accuracy (within ±{tolerance}): {test_accuracy:.2f}%",
    sep="\n",
)

Test Metrics (XGBoost):
Test RMSE: 1.2767
Test R2 Score: 4.05%
Test MAE: 1.1896
Test MAPE: 92.91%
Test Tolerance-Based Accuracy (within ±0.1): 1.94%


# Decision Tree

In [138]:
# Load the saved decision tree model.
dt = joblib.load('decision_tree_model.pkl')

In [139]:
# Predict with the loaded decision tree.
y_test_pred = dt.predict(X_test)

# Test metrics. MAPE divides by 1 wherever the target is 0.
test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
safe_target = np.where(y_test != 0, y_test, 1)
test_mape = np.mean(np.abs((y_test - y_test_pred) / safe_target)) * 100

# Share of predictions within +/- tolerance of the target.
tolerance = 0.1
test_correct = np.abs(y_test - y_test_pred) <= tolerance
test_accuracy = np.mean(test_correct) * 100

print(
    "Test Metrics (Decision Tree):",
    f"Test RMSE: {test_rmse:.4f}",
    f"Test R2 Score: {test_r2*100:.2f}%",
    f"Test MAE: {test_mae:.4f}",
    f"Test MAPE: {test_mape:.2f}%",
    f"Test Tolerance-Based Accuracy (within ±{tolerance}): {test_accuracy:.2f}%",
    sep="\n",
)

Test Metrics (Decision Tree):
Test RMSE: 1.2201
Test R2 Score: 12.36%
Test MAE: 1.1385
Test MAPE: 84.26%
Test Tolerance-Based Accuracy (within ±0.1): 0.75%


# Random Forest

In [140]:
# Load the saved random forest model.
rf = joblib.load('random_forest_model.pkl')

In [141]:
# Predict with the loaded random forest.
y_test_pred = rf.predict(X_test)

# Test metrics. MAPE divides by 1 wherever the target is 0.
test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
safe_target = np.where(y_test != 0, y_test, 1)
test_mape = np.mean(np.abs((y_test - y_test_pred) / safe_target)) * 100

# Share of predictions within +/- tolerance of the target.
tolerance = 0.1
test_correct = np.abs(y_test - y_test_pred) <= tolerance
test_accuracy = np.mean(test_correct) * 100

print(
    "Test Metrics (Random Forest):",
    f"Test RMSE: {test_rmse:.4f}",
    f"Test R2 Score: {test_r2*100:.2f}%",
    f"Test MAE: {test_mae:.4f}",
    f"Test MAPE: {test_mape:.2f}%",
    f"Test Tolerance-Based Accuracy (within ±{tolerance}): {test_accuracy:.2f}%",
    sep="\n",
)

Test Metrics (Random Forest):
Test RMSE: 1.2199
Test R2 Score: 12.40%
Test MAE: 1.1390
Test MAPE: 84.17%
Test Tolerance-Based Accuracy (within ±0.1): 0.75%


# Gradient Boosting

In [142]:
# Load the saved gradient boosting model.
gb = joblib.load('gradient_boosting_model.pkl')

In [143]:
# Predict with the loaded gradient boosting model.
y_test_pred = gb.predict(X_test)

# Test metrics. MAPE divides by 1 wherever the target is 0.
test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
safe_target = np.where(y_test != 0, y_test, 1)
test_mape = np.mean(np.abs((y_test - y_test_pred) / safe_target)) * 100

# Share of predictions within +/- tolerance of the target.
tolerance = 0.1
test_correct = np.abs(y_test - y_test_pred) <= tolerance
test_accuracy = np.mean(test_correct) * 100

print(
    "Test Metrics (Gradient Boosting):",
    f"Test RMSE: {test_rmse:.4f}",
    f"Test R2 Score: {test_r2*100:.2f}%",
    f"Test MAE: {test_mae:.4f}",
    f"Test MAPE: {test_mape:.2f}%",
    f"Test Tolerance-Based Accuracy (within ±{tolerance}): {test_accuracy:.2f}%",
    sep="\n",
)

Test Metrics (Gradient Boosting):
Test RMSE: 1.2457
Test R2 Score: 8.65%
Test MAE: 1.1577
Test MAPE: 86.85%
Test Tolerance-Based Accuracy (within ±0.1): 0.60%


# Bayesian Ridge Regression

In [144]:
# Load the saved Bayesian ridge model.
br = joblib.load('bayesian_ridge_model.pkl')

In [145]:
# Predict with the loaded Bayesian ridge model.
y_test_pred = br.predict(X_test)

# Test metrics. MAPE divides by 1 wherever the target is 0.
test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
safe_target = np.where(y_test != 0, y_test, 1)
test_mape = np.mean(np.abs((y_test - y_test_pred) / safe_target)) * 100

# Share of predictions within +/- tolerance of the target.
tolerance = 0.1
test_correct = np.abs(y_test - y_test_pred) <= tolerance
test_accuracy = np.mean(test_correct) * 100

print(
    "Test Metrics (Bayesian Ridge):",
    f"Test RMSE: {test_rmse:.4f}",
    f"Test R2 Score: {test_r2*100:.2f}%",
    f"Test MAE: {test_mae:.4f}",
    f"Test MAPE: {test_mape:.2f}%",
    f"Test Tolerance-Based Accuracy (within ±{tolerance}): {test_accuracy:.2f}%",
    sep="\n",
)

Test Metrics (Bayesian Ridge):
Test RMSE: 1.4699
Test R2 Score: -27.20%
Test MAE: 1.3556
Test MAPE: 110.28%
Test Tolerance-Based Accuracy (within ±0.1): 1.94%


# Logistic Regression

In [146]:
# Load the model saved as 'logistic_regression_model.pkl'.
model = joblib.load('logistic_regression_model.pkl')

In [147]:
# Predict continuous values on test set
y_test_pred = model.predict(X_test)

# Calculate regression metrics
test_mse = mean_squared_error(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
epsilon = 1e-10  # Avoid division by zero in MAPE
test_mape = np.mean(np.abs((y_test - y_test_pred) / (np.abs(y_test) + epsilon))) * 100
tolerance = 0.1 * (y_test.max() - y_test.min())  # Scale tolerance to target range
test_correct = np.abs(y_test - y_test_pred) <= tolerance
test_accuracy = np.mean(test_correct) * 100

print("\nTest Metrics (Linear Regression):")
print(f"Test RMSE: {test_rmse:.4f}")
print(f"Test R2 Score: {test_r2*100:.2f}%")
print(f"Test MAE: {test_mae:.4f}")
print(f"Test MAPE: {test_mape:.2f}%")
print(f"Test Tolerance-Based Accuracy (within ±{tolerance:.4f}): {test_accuracy:.2f}%")


Test Metrics (Linear Regression):
Test RMSE: 1.4701
Test R2 Score: -27.22%
Test MAE: 1.3557
Test MAPE: 867330748486.82%
Test Tolerance-Based Accuracy (within ±0.3000): 5.38%
