## Importing libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import matplotlib
import time
from sklearn.metrics import classification_report
warnings.filterwarnings('ignore')

## 1. Load data

In [2]:
# Location of the raw dataset. Set the CARS_CSV_PATH environment variable to read the
# file from somewhere else (for example '/root/source_code/raw_data/Cars.csv', the
# alternative location this cell previously kept as a commented-out line); when the
# variable is not set, the original local path is used.
import os

CARS_CSV_PATH = os.environ.get(
    'CARS_CSV_PATH',
    'C:/AIT/FirstSem/ML/Assignment/A3/A3_Car_Price_Prediction/Source_Code/raw_data/Cars.csv',
)
df = pd.read_csv(CARS_CSV_PATH)

## 2. Preprocessing

In [3]:
# Keep only the text before the first space of `max_power` and convert it to a number.
# astype(str) makes the cell safe to re-run after the column has already been converted.
var_value = df['max_power'].astype(str).str.split(' ', expand=True)[0]
# errors='coerce' maps every token that is not a number (a bare unit string, an empty
# string, 'nan', ...) to NaN instead of raising, so those rows are treated as missing.
var_value_2 = pd.to_numeric(var_value, errors='coerce')
df['max_power'] = var_value_2

In [4]:
# The numeric part of `mileage` is the text before the first space; cast it to float.
df['mileage'] = df['mileage'].str.split(' ').str.get(0).astype(float)

In [5]:
from datetime import datetime

# Age of the car in years, measured against the calendar year in which this cell runs
# (so the values depend on the execution date).
df['car_age'] = df['year'].rsub(datetime.now().year)

## Convert the continuous label "selling_price" into a discrete variable with four classes (0, 1, 2, 3)

In [6]:
# Range of the target, used to choose the bin edges below. The Series methods are
# vectorized and skip missing values, unlike the builtin min()/max().
df['selling_price'].min(), df['selling_price'].max()

(29999, 10000000)

In [7]:
# Bin edges for the price classes; the outer edges are infinite so every price falls in a bin.
bins = [-np.inf, 100_000, 500_000, 800_000, np.inf]

In [8]:
# One integer label per price bin (four bins -> classes 0..3).
labels = list(range(4))

In [9]:
# Discretize the price into the classes defined by `bins` / `labels`.
# right=True is the pd.cut default: each bin includes its right edge.
df['selling_price_category'] = pd.cut(
    df['selling_price'],
    bins=bins,
    labels=labels,
    right=True,
)

In [10]:
# Preview the first 10 rows, including the derived car_age and selling_price_category columns.
df.head(10)

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats,car_age,selling_price_category
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4,1248 CC,74.0,190Nm@ 2000rpm,5.0,9,1
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14,1498 CC,103.52,250Nm@ 1500-2500rpm,5.0,9,1
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7,1497 CC,78.0,"12.7@ 2,700(kgm@ rpm)",5.0,17,1
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0,1396 CC,90.0,22.4 kgm at 1750-2750rpm,5.0,13,1
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1,1298 CC,88.2,"11.5@ 4,500(kgm@ rpm)",5.0,16,1
5,Hyundai Xcent 1.2 VTVT E Plus,2017,440000,45000,Petrol,Individual,Manual,First Owner,20.14,1197 CC,81.86,113.75nm@ 4000rpm,5.0,6,1
6,Maruti Wagon R LXI DUO BSIII,2007,96000,175000,LPG,Individual,Manual,First Owner,17.3,1061 CC,57.5,"7.8@ 4,500(kgm@ rpm)",5.0,16,0
7,Maruti 800 DX BSII,2001,45000,5000,Petrol,Individual,Manual,Second Owner,16.1,796 CC,37.0,59Nm@ 2500rpm,4.0,22,0
8,Toyota Etios VXD,2011,350000,90000,Diesel,Individual,Manual,First Owner,23.59,1364 CC,67.1,170Nm@ 1800-2400rpm,5.0,12,1
9,Ford Figo Diesel Celebration Edition,2013,200000,169000,Diesel,Individual,Manual,First Owner,20.0,1399 CC,68.1,160Nm@ 2000rpm,5.0,10,1


In [11]:
# Non-null value count of every column, per price class.
df.groupby('selling_price_category').count()

Unnamed: 0_level_0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats,car_age
selling_price_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,376,376,376,376,376,376,376,376,319,319,318,318,319,376
1,4263,4263,4263,4263,4263,4263,4263,4263,4113,4113,4117,4113,4113,4263
2,2194,2194,2194,2194,2194,2194,2194,2194,2183,2183,2185,2183,2183,2194
3,1295,1295,1295,1295,1295,1295,1295,1295,1292,1292,1292,1292,1292,1295


## 4. Feature selection

In [12]:
# Model inputs: the three numeric features used for training.
X = df.loc[:, ['max_power', 'car_age', 'mileage']]

# Target: the discretized price class.
y = df.loc[:, 'selling_price_category']

### Train test split

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 30 % of the rows for testing; the fixed random_state keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.3, random_state=72)
X_train, X_test, y_train, y_test = split_parts

Check for null values

In [14]:
# Missing values per feature in the training split (X_train holds exactly these three columns).
X_train.isna().sum()

max_power    149
car_age        0
mileage      154
dtype: int64

In [15]:
# Missing values per feature in the test split (X_test holds exactly these three columns).
X_test.isna().sum()

max_power    67
car_age       0
mileage      67
dtype: int64

In [16]:
# Missing labels in the train and test targets, shown as a (train, test) pair.
tuple(target.isna().sum() for target in (y_train, y_test))

(0, 0)

Fill null values

In [17]:
# Impute missing max_power in the training split with the training median.
# The filled column is assigned back rather than using fillna(inplace=True) on the
# selected column: the in-place form operates on an intermediate object and, under
# pandas Copy-on-Write, leaves X_train unchanged.
X_train['max_power'] = X_train['max_power'].fillna(X_train['max_power'].median())

In [18]:
# Impute missing max_power in the test split with the *training* median, so no
# statistic is computed from the test data. The result is assigned back instead of
# using fillna(inplace=True) on the selected column, which does not update X_test
# under pandas Copy-on-Write.
X_test['max_power'] = X_test['max_power'].fillna(X_train['max_power'].median())

In [19]:
# Impute missing mileage in the training split with the training mean.
# The filled column is assigned back rather than using fillna(inplace=True) on the
# selected column, which does not update X_train under pandas Copy-on-Write.
X_train['mileage'] = X_train['mileage'].fillna(X_train['mileage'].mean())

In [20]:
# Impute missing mileage in the test split with the *training* mean, so no statistic
# is computed from the test data. The result is assigned back instead of using
# fillna(inplace=True) on the selected column, which does not update X_test under
# pandas Copy-on-Write.
X_test['mileage'] = X_test['mileage'].fillna(X_train['mileage'].mean())

In [21]:
# Re-check the training features after imputation (X_train holds exactly these three columns).
X_train.isna().sum()

max_power    0
car_age      0
mileage      0
dtype: int64

In [22]:
# Re-check the test features after imputation (X_test holds exactly these three columns).
X_test.isna().sum()

max_power    0
car_age      0
mileage      0
dtype: int64

In [23]:
# Missing labels in the train and test targets, shown as a (train, test) pair.
tuple(target.isna().sum() for target in (y_train, y_test))

(0, 0)

### Scaling

In [24]:
from sklearn.preprocessing import StandardScaler

# Standardize the features. The scaler learns its statistics from the training split
# only and the same transformation is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [25]:
# Prepend a column of ones to each feature matrix so the first weight row acts as the intercept.
intercept = np.ones((X_train.shape[0], 1))
X_train = np.hstack([intercept, X_train])

intercept = np.ones((X_test.shape[0], 1))
X_test = np.hstack([intercept, X_test])

In [26]:
import os
import mlflow

# Point the MLflow client at the tracking server used for this assignment.
mlflow.set_tracking_uri("https://mlflow.cs.ait.ac.th/")

# LOGNAME is set before any run is created; the experiment below is selected by name.
os.environ["LOGNAME"] = "st124377"
mlflow.set_experiment(experiment_name="st124377-a3")

<Experiment: artifact_location='mlflow-artifacts:/864302170325912155', creation_time=1695814774427, experiment_id='864302170325912155', last_update_time=1695814774427, lifecycle_stage='active', name='st124377-a3', tags={}>

In [27]:
class LogisticRegression:
    """Multinomial (softmax) logistic regression fitted by gradient descent.

    Parameters
    ----------
    regularization : object
        Penalty object. Calling it with the weight matrix returns the value added to
        the loss; ``regularization.derivation(W)`` returns the term added to the gradient.
    k : int
        Number of classes (columns of the weight matrix).
    n : int
        Number of features (rows of the weight matrix).
    method : str
        Update scheme: "batch", "minibatch" or "sto".
    alpha : float
        Step size of each weight update.
    max_iter : int
        Number of weight updates performed by ``fit``.
    """

    def __init__(self, regularization, k, n, method, alpha = 0.001, max_iter=5000):
        self.k = k
        self.n = n
        self.alpha = alpha
        self.max_iter = max_iter
        self.method = method
        self.regularization = regularization

    def fit(self, X, Y):
        """Fit the weights on ``X`` (m, n) and one-hot targets ``Y`` (m, k).

        Stores the weights in ``self.W`` and the per-update loss in ``self.losses``,
        and prints the elapsed training time.

        Raises
        ------
        ValueError
            If ``self.method`` is not "batch", "minibatch" or "sto".
        """
        self.W = np.random.rand(self.n, self.k)
        self.losses = []
        m = X.shape[0]

        start_time = time.time()

        if self.method == "batch":
            # Every update uses the full data set.
            for _ in range(self.max_iter):
                self._update(X, Y)

        elif self.method == "minibatch":
            # Contiguous window of 30 % of the rows (at least one row). The start index
            # is limited so the window never runs past the end of the data, which would
            # silently shrink the batch.
            batch_size = max(1, int(0.3 * m))
            for _ in range(self.max_iter):
                ix = np.random.randint(0, m - batch_size + 1)
                self._update(X[ix:ix + batch_size], Y[ix:ix + batch_size])

        elif self.method == "sto":
            # One sample per update, drawn without replacement: walk through a random
            # permutation of the rows and reshuffle once every row has been used.
            order = np.random.permutation(m)
            cursor = 0
            for _ in range(self.max_iter):
                if cursor == m:
                    order = np.random.permutation(m)
                    cursor = 0
                idx = order[cursor]
                cursor += 1
                self._update(X[idx, :].reshape(1, -1), Y[idx].reshape(1, -1))

        else:
            raise ValueError('Method must be one of the followings: "batch", "minibatch" or "sto".')

        print(f"time taken: {time.time() - start_time}")

    def _update(self, X, Y):
        """Run one gradient step on the given samples and record the loss."""
        loss, grad = self.gradient(X, Y)
        self.losses.append(loss)
        self.W = self.W - self.alpha * grad

    def gradient(self, X, Y):
        """Return ``(loss, grad)`` for samples ``X`` and one-hot targets ``Y``.

        The loss is the cross-entropy averaged over the samples plus the penalty.
        NOTE: the data term of the gradient is not divided by the number of samples
        (unlike the loss); this is kept so existing step sizes behave as before.
        """
        m = X.shape[0]
        h = self.h_theta(X, self.W)
        # Clip before the log so a probability that underflowed to 0 cannot produce
        # -inf / NaN in the loss; the gradient uses the unclipped probabilities.
        loss = (- np.sum(Y * np.log(np.clip(h, 1e-15, 1.0))) / m) + self.regularization(self.W)
        error = h - Y
        grad = self.softmax_grad(X, error) + self.regularization.derivation(self.W)
        return loss, grad

    def softmax(self, theta_t_x):
        """Row-wise softmax of a 2-D array of scores."""
        # Subtracting the row maximum leaves the result mathematically unchanged but
        # keeps np.exp from overflowing for large scores.
        shifted = theta_t_x - np.max(theta_t_x, axis=1, keepdims=True)
        exp_scores = np.exp(shifted)
        return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

    def softmax_grad(self, X, error):
        """Data term of the gradient: ``X.T @ error``."""
        return  X.T @ error

    def h_theta(self, X, W):
        '''
        Input:
            X shape: (m, n)
            w shape: (n, k)
        Returns:
            yhat shape: (m, k)
        '''
        return self.softmax(X @ W)

    def predict(self, X_test):
        """Return, for each row of ``X_test``, the index of the most probable class."""
        return np.argmax(self.h_theta(X_test, self.W), axis=1)

    def plot(self):
        """Plot the losses recorded by ``fit`` against the update number."""
        plt.plot(np.arange(len(self.losses)) , self.losses, label = "Train Losses")
        plt.title("Losses")
        plt.xlabel("epoch")
        plt.ylabel("losses")
        plt.legend()

    def set_confustion_matrix(self, confusion_matrix):
        """Store the confusion matrix used by the metric methods.

        The precision/recall formulas below treat rows as the true class and columns
        as the predicted class.
        """
        self.confusion_matrix = confusion_matrix

    # Correctly spelled alias; the original (misspelled) name is kept for existing callers.
    set_confusion_matrix = set_confustion_matrix

    def cal_accuracy(self):
        """Fraction of samples on the diagonal of the confusion matrix (0 if it is empty)."""
        true_pred = np.sum(np.diagonal(self.confusion_matrix))
        total_pred = np.sum(self.confusion_matrix)

        if total_pred == 0:
            return 0  # Avoid division by zero

        return true_pred / total_pred

    def cal_precision(self, class_label):
        """Precision of one class: TP / (TP + FP), or 0 when nothing was predicted as it."""
        true_positives = self.confusion_matrix[class_label, class_label]
        false_positives = np.sum(self.confusion_matrix[:, class_label]) - true_positives

        if true_positives + false_positives == 0:
            return 0  # Avoid division by zero

        return true_positives / (true_positives + false_positives)

    def cal_recall(self, class_label):
        """Recall of one class: TP / (TP + FN), or 0 when the class has no samples."""
        true_positives = self.confusion_matrix[class_label, class_label]
        false_negatives = np.sum(self.confusion_matrix[class_label, :]) - true_positives

        if true_positives + false_negatives == 0:
            return 0  # Avoid division by zero

        return true_positives / (true_positives + false_negatives)

    def cal_f1_score(self, class_label):
        """F1 score of one class (harmonic mean of precision and recall), or 0 if both are 0."""
        prec = self.cal_precision(class_label)
        rec = self.cal_recall(class_label)

        if prec + rec == 0:
            return 0  # Avoid division by zero

        return 2 * (prec * rec) / (prec + rec)

    def _macro_average(self, per_class_metric):
        """Unweighted mean of ``per_class_metric(c)`` over the ``k`` classes."""
        return sum((per_class_metric(c) for c in range(self.k)), 0.0) / self.k

    def _weighted_average(self, per_class_metric):
        """Sum of ``class_weight[c] * per_class_metric(c)`` over the ``k`` classes."""
        return sum((self.class_weight[c] * per_class_metric(c) for c in range(self.k)), 0.0)

    def macro_precision(self):
        """Macro-averaged precision."""
        return self._macro_average(self.cal_precision)

    def macro_recall(self):
        """Macro-averaged recall."""
        return self._macro_average(self.cal_recall)

    def macro_f1_score(self):
        """Macro-averaged F1 score."""
        return self._macro_average(self.cal_f1_score)

    def set_class_weights(self, class_weight):
        """Store the per-class weights (indexable by class label) used by the weighted metrics."""
        self.class_weight = class_weight

    def weighted_precision(self):
        """Precision averaged with the weights given to ``set_class_weights``."""
        return self._weighted_average(self.cal_precision)

    def weighted_recall(self):
        """Recall averaged with the weights given to ``set_class_weights``."""
        return self._weighted_average(self.cal_recall)

    def weighted_f1_score(self):
        """F1 score averaged with the weights given to ``set_class_weights``."""
        return self._weighted_average(self.cal_f1_score)

In [28]:
class LassoPenalty:
    """L1 penalty with strength ``l``."""

    def __init__(self, l):
        # lambda: multiplier applied to the penalty and to its derivative
        self.l = l

    def __call__(self, theta):
        """Penalty value: ``l`` times the sum of absolute weights."""
        absolute_total = np.abs(theta).sum()
        return self.l * absolute_total

    def derivation(self, theta):
        """Term added to the gradient: ``l`` times the element-wise sign of the weights."""
        return np.sign(theta) * self.l
    
class RidgePenalty:
    """L2 penalty with strength ``l``."""

    def __init__(self, l):
        # lambda: multiplier applied to the penalty and to its derivative
        self.l = l

    def __call__(self, theta):
        """Penalty value: ``l`` times the sum of squared weights."""
        squared_total = np.square(theta).sum()
        return self.l * squared_total

    def derivation(self, theta):
        """Term added to the gradient: ``2 * l * theta``."""
        return (2 * self.l) * theta
    
class NormalPenalty:
    """No-op penalty: contributes 0 to both the loss and the gradient."""

    def __init__(self):
        # Nothing to configure.
        return None

    def __call__(self, theta):
        """Penalty value, always 0 regardless of ``theta``."""
        return 0

    def derivation(self, theta):
        """Gradient term, always 0 regardless of ``theta``."""
        return 0
    
class Lasso(LogisticRegression):
    """LogisticRegression with a LassoPenalty of strength ``l``."""

    def __init__(self, k, n, method, l):
        # The base initializer stores the penalty on self.regularization.
        penalty = LassoPenalty(l)
        super().__init__(penalty, k, n, method)
        
class Ridge(LogisticRegression):
    """LogisticRegression with a RidgePenalty of strength ``l``."""

    def __init__(self, k, n, method, l):
        # The base initializer stores the penalty on self.regularization.
        penalty = RidgePenalty(l)
        super().__init__(penalty, k, n, method)
        
class Normal(LogisticRegression):
    """LogisticRegression without regularization (uses NormalPenalty)."""

    def __init__(self, k, n, method, l):
        # `l` is accepted so all three variants share one signature; it is not used here.
        penalty = NormalPenalty()
        super().__init__(penalty, k, n, method)

In [29]:
k = len(y.unique())   # number of classes
m, n = X_train.shape  # number of samples, number of features

# One-hot encode the training labels: row i gets a 1 in the column of its class.
Y_train_encoded = np.zeros((m, k))
for each_class in range(k):
    is_class = (y_train == each_class).to_numpy()
    Y_train_encoded[is_class, each_class] = 1

In [30]:
# Helper for looping over class names given as strings.
import sys

def str_to_class(classname):
    """Return the attribute called ``classname`` from the module this function is defined in."""
    this_module = sys.modules[__name__]
    return getattr(this_module, classname)

In [118]:
regs = ["Ridge","Lasso", "Normal"]
methods = ["batch", "minibatch" , "sto"] # "batch", "minibatch" or "sto"

# Loop-invariant inputs, computed once instead of on every iteration.
# Share of each class in the training labels, used for the weighted averages.
class_weight = (y_train.value_counts() / y_train.count()).to_dict()
# True labels as a plain integer array so they can index the confusion matrix.
y_true = np.asarray(y_train).astype(int)

# Train every regularization / update-method combination, report the metrics on the
# training data and log parameters, metrics and the model to MLflow.
for reg in regs:
    for method in methods:

        params = {"k" : k, "n" : X_train.shape[1],"method" : method, "l": 0.1}

        # The context manager ends the run even when training or logging raises,
        # so a failed iteration cannot leave a run open.
        with mlflow.start_run(run_name=f"method-{params['method']}-l-{params['l']}-reg-{reg}", nested=True):
            mlflow.log_params(params=params)

            type_of_regression = str_to_class(reg)
            model = type_of_regression(**params)
            model.fit(X_train, Y_train_encoded)
            yhat = model.predict(X_train)

            # Confusion matrix sized by the number of classes: rows are the true
            # class, columns the predicted class. np.add.at accumulates repeated
            # (row, col) pairs correctly.
            confusion_matrix = np.zeros((k, k))
            np.add.at(confusion_matrix, (y_true, yhat), 1)

            model.set_confustion_matrix(confusion_matrix)

            print("="*5, reg, "="*5)

            print("Method: ", method)

            print("precision","\t","recall","\t","f1-score")

            for each_class in range(k):
                precision = model.cal_precision(each_class)
                recall    = model.cal_recall(each_class)
                f1_score  = model.cal_f1_score(each_class)
                print(f"{each_class:.2f}\t{precision:.2f}\t{recall:.2f}\t{f1_score:.2f}")

            accuracy = model.cal_accuracy()
            print(f"Accuracy:\t\t\t{accuracy:.2f}")

            macro_precision = model.macro_precision()
            macro_recall    = model.macro_recall()
            macro_f1score   = model.macro_f1_score()
            print(f"Macro-Avg:\t{macro_precision:.2f}\t{macro_recall:.2f}\t{macro_f1score:.2f}")

            model.set_class_weights(class_weight)

            weight_precision = model.weighted_precision()
            weight_recall    = model.weighted_recall()
            weight_f1score   = model.weighted_f1_score()
            print(f"Weighted-Avg:\t{weight_precision:.2f}\t{weight_recall:.2f}\t{weight_f1score:.2f}")

            mlflow.log_metric(key="Accuracy", value=accuracy)
            mlflow.log_metric(key="Macro_F1_Score", value=macro_f1score)
            mlflow.log_metric(key="Weighted_F1_Score", value=weight_f1score)

            # Reuse the predictions computed above instead of predicting again.
            signature = mlflow.models.infer_signature(X_train, yhat)
            mlflow.sklearn.log_model(model, artifact_path='model', signature=signature)

time taken: 2.076197862625122
===== Ridge =====
Method:  batch
precision 	 recall 	 f1-score
0.00	0.73	0.40	0.52
1.00	0.83	0.80	0.82
2.00	0.58	0.75	0.66
3.00	0.87	0.67	0.76
Accuracy:			0.75
Macro-Avg:	0.75	0.66	0.69
Weighted-Avg:	0.77	0.75	0.75
time taken: 0.5689020156860352
===== Ridge =====
Method:  minibatch
precision 	 recall 	 f1-score
0.00	0.75	0.45	0.56
1.00	0.82	0.84	0.83
2.00	0.60	0.67	0.63
3.00	0.86	0.70	0.77
Accuracy:			0.76
Macro-Avg:	0.76	0.67	0.70
Weighted-Avg:	0.76	0.76	0.76
time taken: 0.12984132766723633
===== Ridge =====
Method:  sto
precision 	 recall 	 f1-score
0.00	0.00	0.00	0.00
1.00	0.70	0.96	0.81
2.00	0.67	0.29	0.41
3.00	0.74	0.73	0.74
Accuracy:			0.70
Macro-Avg:	0.53	0.50	0.49
Weighted-Avg:	0.67	0.70	0.65
time taken: 2.01151442527771
===== Lasso =====
Method:  batch
precision 	 recall 	 f1-score
0.00	0.72	0.40	0.51
1.00	0.83	0.80	0.82
2.00	0.58	0.75	0.66
3.00	0.87	0.67	0.76
Accuracy:			0.75
Macro-Avg:	0.75	0.66	0.69
Weighted-Avg:	0.77	0.75	0.75
time taken: 1.19

## Load Model from ML Flow

In [31]:
import os
import mlflow.pyfunc

# Same tracking server and user name as in the training section.
mlflow.set_tracking_uri("https://mlflow.cs.ait.ac.th/")
os.environ["LOGNAME"] = "st124377"

# Registered model to fetch, addressed by name and version.
model_name = "st124377-a3-model"
model_version = 2
model_uri = f"models:/{model_name}/{model_version}"

loaded_model = mlflow.pyfunc.load_model(model_uri=model_uri)

Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00,  5.53it/s]


In [32]:
# Predict with the model loaded from the registry, on the training matrix
# (scaled features with the intercept column prepended).
yhat = loaded_model.predict(X_train)