In [1]:
#Import important libraries
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import joblib

In [2]:
#Import warnings
import warnings
warnings.filterwarnings("ignore")
warnings.warn("this will not show")

In [3]:
#Load the dataset
data = pd.read_excel("sample_data.xlsx")
data.head()

Unnamed: 0,AD,R,E,T
0,437.086107,2040.288035,70,2
1,955.642876,3761.475756,54,5
2,758.794548,2884.90502,32,9
3,638.792636,2758.10797,82,7
4,240.416776,1633.378851,69,8


In [4]:
#Seperate features and target variables
X = data.drop(columns=['R'])
y = data["R"]

In [5]:
#seperate numeric and categorical columns
numeric_features = X.select_dtypes(include=['int64','float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

In [6]:
#Create transformers for data preprocessing
numeric_transformer = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='mean'))])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num',numeric_transformer,numeric_features),
        ('cat',categorical_transformer,categorical_features)])

In [7]:
#Create a pipeline

base_pipeline=Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('regressor',KNeighborsRegressor())
])

In [8]:
#Performing train test split
X_train, X_test, y_train,y_test = train_test_split(X,y, test_size=0.20, random_state=42)

In [9]:
#Train and evaluate the Base Model
base_pipeline.fit(X_train, y_train)

In [10]:
#Predictions and evaluation metrics
print("EVAL METRICS FOR BASE MODEL")
y_train_pred=base_pipeline.predict(X_train)
y_test_pred=base_pipeline.predict(X_test)

def evaluate_model(y_true,y_pred,data_type="Train"):
    print(f"Evaluation metrics for {data_type} data:")
    print(f"Mean Squared Eror: {mean_squared_error(y_true, y_pred)}")
    print(f"R2 Score:{r2_score(y_true, y_pred)}")
    print("\n")

evaluate_model(y_train, y_train_pred,"train")
evaluate_model(y_test, y_test_pred,"Test")

EVAL METRICS FOR BASE MODEL
Evaluation metrics for train data:
Mean Squared Eror: 2718.78815008907
R2 Score:0.9960420224610349


Evaluation metrics for Test data:
Mean Squared Eror: 5070.264908017641
R2 Score:0.9922474811962217




In [11]:
#Hyperparameters tuning
param_grid={
    'regressor__n_neighbors':[3,5,7,9],
    'regressor__weights':['uniform','distance'],
    'regressor__metric':['euclidean','manhattan']
}
grid_search = GridSearchCV(base_pipeline, param_grid, cv=5, n_jobs=-1,
                           scoring='neg_mean_squared_error')
grid_search.fit(X_train,y_train)

print(f"Best Hyperparameters: {grid_search.best_params_}")

Best Hyperparameters: {'regressor__metric': 'euclidean', 'regressor__n_neighbors': 5, 'regressor__weights': 'distance'}


In [12]:
#Build the model and evaluate
print("")
print("-----EVAL METRICS FOR BEST MODEL-----")
print("")

best_pipeline = grid_search.best_estimator_

y_train_best_pred = best_pipeline.predict(X_train)
y_test_best_pred = best_pipeline.predict(X_test)

evaluate_model(y_train_best_pred, y_train,"Train(Best Model)")
evaluate_model(y_test_best_pred, y_test,"Test(Best Model)")


-----EVAL METRICS FOR BEST MODEL-----

Evaluation metrics for Train(Best Model) data:
Mean Squared Eror: 0.0
R2 Score:1.0


Evaluation metrics for Test(Best Model) data:
Mean Squared Eror: 4909.920602129284
R2 Score:0.9924965355410391




In [14]:
#Compare metrics

metrics_comparison=pd.DataFrame({
    'Metric':['Mean Squared Error','R2 Score'],
    'BaseModelTrain':[mean_squared_error(y_train,y_train_pred),r2_score(y_train,y_train_pred)],
    'BaseModelTest':[mean_squared_error(y_test, y_test_pred),r2_score(y_test,y_test_pred)],
    'BestModelTrain':[mean_squared_error(y_train, y_train_best_pred), r2_score(y_train,y_train_best_pred)],
    'BestModelTest':[mean_squared_error(y_test,y_test_best_pred),r2_score(y_test, y_test_best_pred)]
})
print(metrics_comparison)

               Metric  BaseModelTrain  BaseModelTest  BestModelTrain  \
0  Mean Squared Error     2718.788150    5070.264908             0.0   
1            R2 Score        0.996042       0.992247             1.0   

   BestModelTest  
0    4909.920602  
1       0.992493  


In [15]:
#Train the model on the entire dataset
final_model= best_pipeline.fit(X,y)

In [16]:
#Saving the final model
joblib.dump(final_model,'final_KNN_regressor_model.pkl')

print("Final model saved as 'final_KNN_regressor_model.pkl'")

Final model saved as 'final_KNN_regressor_model.pkl'
