In [1]:
import mlflow
import dagshub
import pandas as pd
import numpy as np
import os
import yaml
import pickle
import optuna
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK, space_eval
from hyperopt.pyll.base import scope
from pathlib import Path
from box import ConfigBox

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

import warnings as w
w.filterwarnings("ignore")

from typing import NewType #type: ignore
ML_Model = NewType('Machine_Learning_Model', object)
os.chdir("F:\iNeuron\End2End\ML Project - KrishNaik - Gemstone Price Prediction\Studentmlprojectregression")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
pwd

'F:\\iNeuron\\End2End\\ML Project - KrishNaik - Gemstone Price Prediction\\Studentmlprojectregression'

In [3]:
data_path = os.path.join("F:\iNeuron\End2End\ML Project - KrishNaik - Gemstone Price Prediction\Studentmlprojectregression\\notebook\data","gemstone.csv")
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [4]:
X = df.drop(labels=['id','price'],axis=1)
Y = df[['price']]

In [5]:
xtrain, xtest, ytrain, ytest = train_test_split(X,Y,test_size=0.2,random_state=42)

In [6]:
# Define which columns should be ordinal-encoded and which should be scaled
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns
            
# Define the custom ranking for each ordinal variable
cut_categories = ['Fair', 'Good', 'Very Good','Premium','Ideal']
color_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_categories = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']

num_pipeline = Pipeline(
                steps = [
                ('imputer',SimpleImputer(strategy='median')),
                ('scaler',StandardScaler())                
                ]
            )

# Categorical Pipeline
cat_pipeline = Pipeline(
                steps=[
                ('imputer',SimpleImputer(strategy='most_frequent')),
                ('ordinal_encoder',OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories])),
                ('scaler',StandardScaler())
                ]
            )

preprocessor = ColumnTransformer(
                [
                ('num_pipeline',num_pipeline,numerical_cols),
                ('cat_pipeline',cat_pipeline,categorical_cols)
                ]
            )

In [7]:
xtrain = pd.DataFrame(preprocessor.fit_transform(xtrain),columns=preprocessor.get_feature_names_out())
xtest = pd.DataFrame(preprocessor.transform(xtest),columns=preprocessor.get_feature_names_out())

In [8]:
preprocessor.get_feature_names_out()

array(['num_pipeline__carat', 'num_pipeline__depth',
       'num_pipeline__table', 'num_pipeline__x', 'num_pipeline__y',
       'num_pipeline__z', 'cat_pipeline__cut', 'cat_pipeline__color',
       'cat_pipeline__clarity'], dtype=object)

In [9]:
def load_yaml(filepath:Path):
    try:
        filepath_,filename = os.path.split(filepath)
        with open(filepath) as yaml_file:
            config = yaml.load(yaml_file,
                               Loader = yaml.CLoader)
            # logger.info(f"{filename} yaml_file is loaded")
            return ConfigBox(config)
    except Exception as e:
        raise e

In [10]:
def eval_metrics(true , predicted) -> float:
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    # return mae, rmse, r2_square
    # tn, fp, fn, tp = confusion_matrix(y_true=y_true, y_pred=y_pred).ravel()      
    return (mae, mse, rmse, r2_square)

In [11]:
TOKEN = dagshub.auth.get_token()
REPO = 'test_student_performance'
USER_NAME = 'Raj-Narayanan-B'

In [12]:
from dagshub.upload import create_repo
repo_ = create_repo(repo_name = 'test_student_performance', private=False)

In [21]:
repo_.upload_files

TypeError: Repo.get_repo_url() missing 2 required positional arguments: 'url_format' and 'directory'

In [15]:
!git clone https://{USER_NAME}:{TOKEN}@dagshub.com/{USER_NAME}/{REPO}.git

Cloning into 'test_student_performance'...


In [16]:
%cd {REPO}

F:\iNeuron\End2End\ML Project - KrishNaik - Gemstone Price Prediction\Studentmlprojectregression\test_student_performance


In [17]:
!dvc get https://dagshub.com/Dean/Walkthrough requirements.txt
!dvc get https://dagshub.com/Dean/Walkthrough src
!dvc get https://dagshub.com/NirBarazida/hello-world-files data/

In [19]:
dagshub.upload_files(local_path="data/", remote_path="data/",
            commit_message="Added Raw Data",versioning="dvc",
               repo=f"{USER_NAME}/{REPO}")

In [24]:
!dvc remote add origin https://dagshub.com/Raj-Narayanan-B/test_student_performance.dvc -f

In [29]:
!dvc remote modify origin --local auth basic 
!dvc remote modify origin --local user Raj-Narayanan-B 
!dvc remote modify origin --local password 8af4cc66be8aec751397fd525e47ae395fa67442

In [16]:
dagshub.upload_files(repo = Raj-Narayanan-B/test_student_performance,
    local_path = 'artifacts\data.csv',
    commit_message = "adding data.csv",
    remote_path = 'https://dagshub.com/Raj-Narayanan-B/test_student_performance/data',
)

NameError: name 'Raj' is not defined

In [11]:
PARAMS_PATH = 'params.yaml'

def parameter_tuning(model_class : ML_Model, 
                     model_name: str, 
                     x_train: pd.DataFrame, 
                     x_test: pd.DataFrame, 
                     y_train: pd.DataFrame, 
                     y_test: pd.DataFrame,
                     report_: dict,
                     *args):
    tuner_report = {}
    tuner_report['Optuna'] = {}
    tuner_report['HyperOpt'] = {}

    params_config = load_yaml(PARAMS_PATH)

####################################################### OPTUNA #######################################################
    def optuna_objective(trial):
        space_optuna = {}
        for key,value in params_config['optuna'][model_name].items():
            space_optuna[key] = eval(value)
        if model_name == 'Stacked_Classifier':
            model = model_class.set_params(**space_optuna)
        else:
            model = model_class(**space_optuna)
        # model.set_params(**space_optuna)
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        mae, mse, rmse, r2_square = eval_metrics(y_test , y_pred)
    
        return rmse
    find_param=optuna.create_study(direction = "minimize")
    find_param.optimize(optuna_objective,n_trials=2)

    tuner_report['Optuna'] = {'RMSE':find_param.best_value, 'params': find_param.best_params}
    print (f"Optuna: {model_name} --- {tuner_report['Optuna']}\n\n")

####################################################### HYPEROPT #######################################################
    def hp_objective(space):
        if model_name == 'Stacked_Classifier':
            model = model_class.set_params(**space)
        else:
            model = model_class(**space)
        # model.set_params(**space)
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        mae, mse, rmse, r2_square = eval_metrics(y_test , y_pred)
        print ("RMSE: ", rmse)
        return rmse
    trials = Trials()
    space = {}
    for key,value in params_config['hyperopt'][model_name].items():
        space[key] = eval(value)
    best = fmin(fn= hp_objective,
                space= space,
                algo= tpe.suggest,
                max_evals = 2,
                trials= trials)
    best_params = space_eval(space,best)
    tuner_report['HyperOpt'] = {'RMSE':int(trials.average_best_error()), 'params': best_params}
    print (f"HyperOpt: {model_name} --- {tuner_report['HyperOpt']}\n\n")

####################################################### Best_RMSE & Best_Fittable_Params #######################################################
    min_rmse_value = min(tuner_report['Optuna']['RMSE'],tuner_report['HyperOpt']['RMSE'])
    if min_rmse_value == tuner_report['Optuna']['RMSE']:
        params = tuner_report['Optuna']['params']
    else:
        params = tuner_report['HyperOpt']['params']
    tuner_report['Fittable_Params'] = params
    tuner_report['Best_RMSE'] = min_rmse_value

    report_[model_name] = tuner_report
    print (f'\n\n{model_name}\nMin RMSE: {min_rmse_value}\n{report_[model_name]}\n\n')
    # print(report_.values())
    rmses = [value['Best_RMSE'] for value in report_.values()]
    min_rmse = min(rmses)
    best_model_so_far_ = [(i, min_rmse, report_[i]['Fittable_Params']) for i in report_.keys() if min_rmse == report_[i]['Best_RMSE']]

    return (tuner_report, report_, best_model_so_far_)

In [12]:
models = {
    # "Decision_Tree_Regressor": DecisionTreeRegressor,
    # "Random Forest": RandomForestRegressor,
    # "XGB_Regressor": XGBRegressor, 
    # # "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    # "Ada_Boost": AdaBoostRegressor,
    "Linear Regression": LinearRegression,
    "Lasso": Lasso,
    "Ridge": Ridge,
}
model_list = []
r2_list =[]

report = {}
for model_key, model_value in models.items():
    tuning_report,reports, best_model_so_far = parameter_tuning(model_class = model_value,
                                                                    model_name = model_key,
                                                                    x_train = xtrain,
                                                                    x_test = xtest,
                                                                    y_train = ytrain,
                                                                    y_test = ytest,
                                                                    report_ = report)
    report[model_key] = reports[model_key]
    best_model_so_far_ = best_model_so_far
    print(f"\nBest model so far: {best_model_so_far_[0]}\n")
    print(f"Model: {model_key}\nReport:\n{tuning_report}\n")

[I 2024-01-18 23:59:01,515] A new study created in memory with name: no-name-b47df972-5557-4b20-af59-4224c7215291
[I 2024-01-18 23:59:01,636] Trial 0 finished with value: 1006.6009861315857 and parameters: {'fit_intercept': True}. Best is trial 0 with value: 1006.6009861315857.
[I 2024-01-18 23:59:01,745] Trial 1 finished with value: 1006.6009861315857 and parameters: {'fit_intercept': True}. Best is trial 0 with value: 1006.6009861315857.


Optuna: Linear Regression --- {'RMSE': 1006.6009861315857, 'params': {'fit_intercept': True}}


RMSE:                                                
4098.130848102527                                    
RMSE:                                                                         
4098.130848102527                                                             
100%|██████████| 2/2 [00:00<00:00,  9.42trial/s, best loss: 4098.130848102527]

[I 2024-01-18 23:59:01,978] A new study created in memory with name: no-name-0e26b165-c26c-4b89-aeaf-d2ea2143720e



HyperOpt: Linear Regression --- {'RMSE': 4098, 'params': {'fit_intercept': False}}




Linear Regression
Min RMSE: 1006.6009861315857
{'Optuna': {'RMSE': 1006.6009861315857, 'params': {'fit_intercept': True}}, 'HyperOpt': {'RMSE': 4098, 'params': {'fit_intercept': False}}, 'Fittable_Params': {'fit_intercept': True}, 'Best_RMSE': 1006.6009861315857}



Best model so far: ('Linear Regression', 1006.6009861315857, {'fit_intercept': True})

Model: Linear Regression
Report:
{'Optuna': {'RMSE': 1006.6009861315857, 'params': {'fit_intercept': True}}, 'HyperOpt': {'RMSE': 4098, 'params': {'fit_intercept': False}}, 'Fittable_Params': {'fit_intercept': True}, 'Best_RMSE': 1006.6009861315857}



[I 2024-01-18 23:59:02,477] Trial 0 finished with value: 1011.5477499323323 and parameters: {'alpha': 7.759774679813352, 'selection': 'cyclic'}. Best is trial 0 with value: 1011.5477499323323.
[I 2024-01-18 23:59:02,970] Trial 1 finished with value: 1008.9191956383542 and parameters: {'alpha': 4.909432354672935, 'selection': 'cyclic'}. Best is trial 1 with value: 1008.9191956383542.


Optuna: Lasso --- {'RMSE': 1008.9191956383542, 'params': {'alpha': 4.909432354672935, 'selection': 'cyclic'}}


RMSE:                                                
1007.832953861104                                    
RMSE:                                                                         
1007.9941462309794                                                            
100%|██████████| 2/2 [00:01<00:00,  1.78trial/s, best loss: 1007.832953861104]

[I 2024-01-18 23:59:04,101] A new study created in memory with name: no-name-6cef2a75-3d24-4247-8bec-f35da408ecf9
[I 2024-01-18 23:59:04,168] Trial 0 finished with value: 1006.6410068003378 and parameters: {'alpha': 7.242565844823842, 'solver': 'auto'}. Best is trial 0 with value: 1006.6410068003378.



HyperOpt: Lasso --- {'RMSE': 1007, 'params': {'alpha': 3.2568021538208622, 'selection': 'cyclic'}}




Lasso
Min RMSE: 1007
{'Optuna': {'RMSE': 1008.9191956383542, 'params': {'alpha': 4.909432354672935, 'selection': 'cyclic'}}, 'HyperOpt': {'RMSE': 1007, 'params': {'alpha': 3.2568021538208622, 'selection': 'cyclic'}}, 'Fittable_Params': {'alpha': 3.2568021538208622, 'selection': 'cyclic'}, 'Best_RMSE': 1007}



Best model so far: ('Linear Regression', 1006.6009861315857, {'fit_intercept': True})

Model: Lasso
Report:
{'Optuna': {'RMSE': 1008.9191956383542, 'params': {'alpha': 4.909432354672935, 'selection': 'cyclic'}}, 'HyperOpt': {'RMSE': 1007, 'params': {'alpha': 3.2568021538208622, 'selection': 'cyclic'}}, 'Fittable_Params': {'alpha': 3.2568021538208622, 'selection': 'cyclic'}, 'Best_RMSE': 1007}



[I 2024-01-18 23:59:09,848] Trial 1 finished with value: 1006.6430432778286 and parameters: {'alpha': 7.147564641279873, 'solver': 'saga'}. Best is trial 0 with value: 1006.6410068003378.


Optuna: Ridge --- {'RMSE': 1006.6410068003378, 'params': {'alpha': 7.242565844823842, 'solver': 'auto'}}


RMSE:                                                
1006.6368401908288                                   
RMSE:                                                
1006.6443379132719                                                             
100%|██████████| 2/2 [00:00<00:00, 11.83trial/s, best loss: 1006.6368401908288]
HyperOpt: Ridge --- {'RMSE': 1006, 'params': {'alpha': 7.45539899599816, 'solver': 'sparse_cg'}}




Ridge
Min RMSE: 1006
{'Optuna': {'RMSE': 1006.6410068003378, 'params': {'alpha': 7.242565844823842, 'solver': 'auto'}}, 'HyperOpt': {'RMSE': 1006, 'params': {'alpha': 7.45539899599816, 'solver': 'sparse_cg'}}, 'Fittable_Params': {'alpha': 7.45539899599816, 'solver': 'sparse_cg'}, 'Best_RMSE': 1006}



Best model so far: ('Ridge', 1006, {'alpha': 7.45539899599816, 'solver': 'sparse_cg'})

Model: Ridge
Report:
{'Optuna': {'RMSE': 1006.6410068003378, 'params': {'alpha'