# Model Training Notebook
 - This notebook trains and compares different Machine Learning and ANN models.
 - Several models are tried here, and the best one is selected at the end based on the testing results.
 - The best model will be used for final training and in the Prediction Notebook for predicting the output.

## 1. Import Libraries

In [29]:
import numpy as np
import pandas as pd
# warnings
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, precision_score 

## 2. Load Training and Testing Data

In [7]:
# All processed splits live under one directory; naming it once means the
# location only has to be edited in a single place.
PROCESSED_DIR = 'D:\\DATA_SCIENCE_A.I\\INX Fututre Inc\\data\\processed'

# Train data
X_train_path = PROCESSED_DIR + '\\Train_Data\\X_train.csv'
y_train_path = PROCESSED_DIR + '\\Train_Data\\y_train.csv'

# Test data
X_test_path = PROCESSED_DIR + '\\Test_Data\\X_test.csv'
y_test_path = PROCESSED_DIR + '\\Test_Data\\y_test.csv'

# Loading
# squeeze("columns") collapses only the column axis, so a single-column target
# file always becomes a Series; a bare squeeze() would also collapse the row
# axis and turn a one-row file into a scalar.
X_train = pd.read_csv(X_train_path)
y_train = pd.read_csv(y_train_path).squeeze("columns")

X_test = pd.read_csv(X_test_path)
y_test = pd.read_csv(y_test_path).squeeze("columns")

In [9]:
# Print the (features, target) dimensions: train split first, then test split
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, target.shape)

(931, 53) (931,)
(233, 53) (233,)


## 3. Evaluate function
 - This function will return required metrics after model training.

In [25]:
def evaluate_model(true, predicted):
    """Score one set of predictions against the reference labels.

    Parameters
    ----------
    true
        Reference labels.
    predicted
        Labels produced by the model, aligned with ``true``.

    Returns
    -------
    tuple
        ``(accuracy, macro-averaged precision, macro-averaged F1,
        confusion matrix, classification report)``, in that order.
    """
    return (
        accuracy_score(true, predicted),
        precision_score(true, predicted, average='macro'),
        f1_score(true, predicted, average='macro'),
        confusion_matrix(true, predicted),
        classification_report(true, predicted),
    )

## 4. Models Training
 - This cell trains each model in the dictionary and reports its results on the training and test sets.


In [31]:
# Fixed seed so the randomised learners (tree, forest, boosting) produce the
# same scores on every Restart & Run All.
SEED = 42

# One dict of metrics per model; turned into a DataFrame after the loop.
result = []

# Encoding y labels for XG Boost as it needs labels starting 0,1,2...
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

# Candidate models, keyed by the display name used in the printed report.
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(random_state=SEED),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=SEED),
    'Gradient Boosting': GradientBoostingClassifier(random_state=SEED),
    'Ada Boost': AdaBoostClassifier(random_state=SEED),
    'SVM': SVC(),
    'XG Boost': XGBClassifier(random_state=SEED)}

# Fit each model, score it on both splits, print the report and record the metrics.
for name, model in models.items():

    # Only XG Boost is trained on the encoded target; every other model
    # is trained on the original labels.
    uses_encoded_labels = name == 'XG Boost'
    model.fit(X_train, y_train_enc if uses_encoded_labels else y_train)

    # Make predictions (same calls for every model)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Bring predictions back to the original label space so that every model
    # is evaluated against the same y_train / y_test.
    if uses_encoded_labels:
        y_train_decoded = le.inverse_transform(y_train_pred)
        y_test_decoded = le.inverse_transform(y_test_pred)
    else:
        y_train_decoded = y_train_pred
        y_test_decoded = y_test_pred

    # Evaluate train and test datasets
    train_acc, train_ps, train_f1, train_cm, train_cr = evaluate_model(y_train, y_train_decoded)
    test_acc, test_ps, test_f1, test_cm, test_cr = evaluate_model(y_test, y_test_decoded)

    print(name)
    print('Model Perfomance for training set\n')
    print("Accuracy Score:", train_acc)
    print("Precision Score:", train_ps)
    print("F-1 Score:", train_f1)

    print("-----------------------------")

    print(name)
    print('Model Perfomance for test set\n')
    print("Accuracy Score:", test_acc)
    print("Precision Score:", test_ps)
    print("F-1 Score:", test_f1)
    print("Confusion Matrix:\n", test_cm)
    print("Classification Report:\n", test_cr)

    print('='*32)
    print('\n')

    result.append({
        'Model': name,
        'Train Accuracy': train_acc,
        'Train Precision': train_ps,
        'Train F1': train_f1,
        'Test Accuracy': test_acc,
        'Test Precision': test_ps,
        'Test F1': test_f1,
    })

# Build the comparison table once, from the collected records.
result_df = pd.DataFrame(result)

Logistic Regression
Model Perfomance for training set

Accuracy Score: 0.8796992481203008
Precision Score: 0.851719064848663
F-1 Score: 0.8220176289826945
-----------------------------
Logistic Regression
Model Perfomance for test set

Accuracy Score: 0.8454935622317596
Precision Score: 0.8159346846846848
F-1 Score: 0.7409397179276899
Confusion Matrix:
 [[ 27  14   0]
 [  5 158   4]
 [  0  13  12]]
Classification Report:
               precision    recall  f1-score   support

           2       0.84      0.66      0.74        41
           3       0.85      0.95      0.90       167
           4       0.75      0.48      0.59        25

    accuracy                           0.85       233
   macro avg       0.82      0.69      0.74       233
weighted avg       0.84      0.85      0.84       233



Decision Tree
Model Perfomance for training set

Accuracy Score: 1.0
Precision Score: 1.0
F-1 Score: 1.0
-----------------------------
Decision Tree
Model Perfomance for test set

Accuracy Sc

In [35]:
# Rank the models: highest test F1 first, train F1 as the tie-breaker
result_df.sort_values(by=['Test F1', 'Train F1'], ascending=[False, False])

Unnamed: 0,Model,Train Accuracy,Train Precision,Train F1,Test Accuracy,Test Precision,Test F1
6,XG Boost,1.0,1.0,1.0,0.95279,0.951569,0.921072
3,Gradient Boosting,0.995704,0.994705,0.993697,0.948498,0.940992,0.909223
2,Random Forest,1.0,1.0,1.0,0.918455,0.910019,0.86043
1,Decision Tree,1.0,1.0,1.0,0.875536,0.819354,0.81412
0,Logistic Regression,0.879699,0.851719,0.822018,0.845494,0.815935,0.74094
5,SVM,0.887218,0.920318,0.814134,0.824034,0.809513,0.688168
4,Ada Boost,0.735768,0.664432,0.681172,0.729614,0.675421,0.670839


XGBoost and Gradient Boosting are the only models that consistently handle class 4 well.

AdaBoost, SVM, and Logistic Regression struggle significantly with class 4 — low recall and F1.

✅ Final Verdict
XGBoost is your best model — highest test F1, strong precision, and solid handling of all classes.

Gradient Boosting is a close second, slightly less robust on class 4.

Random Forest is strong but slightly weaker on minority class.

Decision Tree overfits hard (perfect training scores versus a test F1 of about 0.81), yet its test performance is still decent.

Logistic Regression and SVM are interpretable but underperform.

AdaBoost is not viable for your use case.

 - Current best model: **XG Boost**

## 5. Hyperparameter Tuning XG Boost

In [75]:
# Step 1: encode the y labels, since XG Boost needs labels starting 0, 1, 2, ...
# The encoder is fitted on the training target only and then applied to both splits.
le = LabelEncoder().fit(y_train)
y_train_enc = le.transform(y_train)
y_test_enc = le.transform(y_test)

In [77]:
# Step 2: class weighting for the multiclass target.
# 'balanced' gives samples of a minority class a higher weight and samples of
# a majority class a lower weight.
from sklearn.utils.class_weight import compute_sample_weight

sample_weights = compute_sample_weight('balanced', y_train_enc)

In [79]:
# Step 3: hyperparameter tuning
from sklearn.model_selection import GridSearchCV

# The number of classes comes from the fitted label encoder instead of a
# hard-coded 3, so this cell stays correct if the target's label set changes.
# NOTE(review): use_label_encoder is deprecated in recent xgboost releases -
# confirm against the installed version before removing it.
xgb = XGBClassifier(
    objective='multi:softprob',
    num_class=len(le.classes_),
    use_label_encoder=False,
    eval_metric='mlogloss',
)

# 2 * 3 * 2 * 2 * 2 * 2 * 2 * 2 = 384 candidate combinations
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 1],
    'reg_alpha': [0, 0.5],
    'reg_lambda': [1, 2]
}

# 3-fold cross-validation, candidates ranked by weighted F1, all cores used.
grid = GridSearchCV(xgb, param_grid, cv=3, scoring='f1_weighted', verbose=1, n_jobs=-1)

# sample_weight is passed through to the estimator's fit so the class
# balancing from the previous step is applied during training.
grid.fit(X_train, y_train_enc, sample_weight=sample_weights)

Fitting 3 folds for each of 384 candidates, totalling 1152 fits


In [83]:
# Best model: display the estimator selected by the grid search
# (its repr shows the chosen hyperparameters).

grid.best_estimator_

In [87]:
# Evaluate the tuned model on the test split
best_model = grid.best_estimator_

# The model was trained on encoded labels, so its predictions are mapped
# back to the original labels before being compared with y_test.
y_pred_enc = best_model.predict(X_test)
y_pred = le.inverse_transform(y_pred_enc)

tuned_cm = confusion_matrix(y_test, y_pred)
tuned_report = classification_report(y_test, y_pred)

print("Confusion Matrix:\n", tuned_cm)
print("Classification Report:\n", tuned_report)

Confusion Matrix:
 [[ 39   2   0]
 [  5 158   4]
 [  0   6  19]]
Classification Report:
               precision    recall  f1-score   support

           2       0.89      0.95      0.92        41
           3       0.95      0.95      0.95       167
           4       0.83      0.76      0.79        25

    accuracy                           0.93       233
   macro avg       0.89      0.89      0.89       233
weighted avg       0.93      0.93      0.93       233



## 6. Saving the Best Model and the Report of the Other Models

In [93]:
import json
import os
import joblib

# Output locations, defined once instead of repeating the absolute path in
# every call. The resulting path strings are identical to the previous literals.
ARTIFACTS_DIR = 'D:\\DATA_SCIENCE_A.I\\INX Fututre Inc\\artifacts'
METRICS_DIR = ARTIFACTS_DIR + '\\best_model_metrics'

# Create the output folders up front (parents included) so the first write
# does not fail when they are missing.
os.makedirs(METRICS_DIR, exist_ok=True)

# 1. Save model scores (result_df is already a DataFrame, no re-wrapping needed)
result_df.to_csv(METRICS_DIR + '\\model_scores.csv', index=False)

# 2. Save model and label encoder
joblib.dump(best_model, ARTIFACTS_DIR + '\\xgb_model.pkl')
joblib.dump(le, ARTIFACTS_DIR + '\\label_encoder.pkl')

# 3. Save classification report (as a dict so it can be serialised to JSON)
report = classification_report(y_test, y_pred, output_dict=True)
with open(METRICS_DIR + '\\classification_report.json', 'w') as f:
    json.dump(report, f)

# 4. Save confusion matrix
cm = confusion_matrix(y_test, y_pred)
np.save(METRICS_DIR + '\\confusion_matrix.npy', cm)

# 5. Save predictions vs true labels
pd.DataFrame({'y_true': y_test, 'y_pred': y_pred}).to_csv(METRICS_DIR + '\\predictions.csv', index=False)

In [91]:
# Display the model comparison table in the order the models were trained
result_df

Unnamed: 0,Model,Train Accuracy,Train Precision,Train F1,Test Accuracy,Test Precision,Test F1
0,Logistic Regression,0.879699,0.851719,0.822018,0.845494,0.815935,0.74094
1,Decision Tree,1.0,1.0,1.0,0.875536,0.819354,0.81412
2,Random Forest,1.0,1.0,1.0,0.918455,0.910019,0.86043
3,Gradient Boosting,0.995704,0.994705,0.993697,0.948498,0.940992,0.909223
4,Ada Boost,0.735768,0.664432,0.681172,0.729614,0.675421,0.670839
5,SVM,0.887218,0.920318,0.814134,0.824034,0.809513,0.688168
6,XG Boost,1.0,1.0,1.0,0.95279,0.951569,0.921072
