## Importing libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import matplotlib
import time
from sklearn.metrics import classification_report
warnings.filterwarnings('ignore')

## 1. Load data

In [2]:
# Location of the raw dataset. Set the CARS_CSV_PATH environment variable to
# point somewhere else (for example '/root/source_code/raw_data/Cars.csv', the
# alternative that used to be kept here as a commented-out line); when it is
# not set, the original local path is used.
import os

DATA_PATH = os.environ.get(
    'CARS_CSV_PATH',
    'C:/AIT/FirstSem/ML/Assignment/A3/A3_Car_Price_Prediction/Source_Code/raw_data/Cars.csv',
)
df = pd.read_csv(DATA_PATH)

## 2. Preprocessing

In [3]:
# Keep only the text before the first space of each max_power entry and turn
# it into a float. Anything that does not parse as a number (a bare unit, an
# empty token, a missing value) becomes NaN instead of raising.
# astype(str) lets the cell be re-run after the column is already numeric.
var_value = df['max_power'].astype(str).str.split(' ').str[0]
var_value_2 = pd.to_numeric(var_value, errors='coerce')
df['max_power'] = var_value_2

In [4]:
# Keep the text before the first space of each mileage entry and convert it to float.
df['mileage'] = df['mileage'].str.split(' ').str[0].astype(float)

In [5]:
from datetime import datetime

# Age in years, measured against the calendar year in which this cell runs.
# NOTE(review): the values therefore shift when the notebook is re-run in a later year.
df['car_age'] = datetime.now().year - df['year']

## Convert the continuous label "selling_price" into a discrete variable with four classes (0, 1, 2, 3)

In [6]:
# Price range of the dataset, used to choose the class boundaries below.
# The Series reductions are vectorised and skip NaN, unlike the builtins.
df['selling_price'].min(), df['selling_price'].max()

(29999, 10000000)

In [7]:
# Class boundaries for selling_price; the open-ended outer edges catch every value.
bins = [float('-inf'), 100_000, 500_000, 800_000, float('inf')]

In [8]:
# One integer label per interval defined by `bins`.
labels = list(range(4))

In [9]:
# Assign each price to the class whose (lower, upper] interval contains it.
df['selling_price_category'] = pd.cut(
    x=df['selling_price'],
    bins=bins,
    labels=labels,
    right=True,
)

In [31]:
# Preview the first ten rows, including the derived car_age and selling_price_category columns.
df.head(10)

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats,car_age,selling_price_category
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4,1248 CC,74.0,190Nm@ 2000rpm,5.0,9,1
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14,1498 CC,103.52,250Nm@ 1500-2500rpm,5.0,9,1
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7,1497 CC,78.0,"12.7@ 2,700(kgm@ rpm)",5.0,17,1
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0,1396 CC,90.0,22.4 kgm at 1750-2750rpm,5.0,13,1
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1,1298 CC,88.2,"11.5@ 4,500(kgm@ rpm)",5.0,16,1
5,Hyundai Xcent 1.2 VTVT E Plus,2017,440000,45000,Petrol,Individual,Manual,First Owner,20.14,1197 CC,81.86,113.75nm@ 4000rpm,5.0,6,1
6,Maruti Wagon R LXI DUO BSIII,2007,96000,175000,LPG,Individual,Manual,First Owner,17.3,1061 CC,57.5,"7.8@ 4,500(kgm@ rpm)",5.0,16,0
7,Maruti 800 DX BSII,2001,45000,5000,Petrol,Individual,Manual,Second Owner,16.1,796 CC,37.0,59Nm@ 2500rpm,4.0,22,0
8,Toyota Etios VXD,2011,350000,90000,Diesel,Individual,Manual,First Owner,23.59,1364 CC,67.1,170Nm@ 1800-2400rpm,5.0,12,1
9,Ford Figo Diesel Celebration Edition,2013,200000,169000,Diesel,Individual,Manual,First Owner,20.0,1399 CC,68.1,160Nm@ 2000rpm,5.0,10,1


In [10]:
# Non-null count of every column per price class (shows how unbalanced the classes are).
df.groupby('selling_price_category').count()

Unnamed: 0_level_0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats,car_age
selling_price_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,376,376,376,376,376,376,376,376,319,319,318,318,319,376
1,4263,4263,4263,4263,4263,4263,4263,4263,4113,4113,4117,4113,4113,4263
2,2194,2194,2194,2194,2194,2194,2194,2194,2183,2183,2185,2183,2183,2194
3,1295,1295,1295,1295,1295,1295,1295,1295,1292,1292,1292,1292,1292,1295


## 4. Feature selection

In [11]:
# Model inputs: the three numeric columns used as features.
X = df.loc[:, ['max_power', 'car_age', 'mileage']]

# Target: the price class derived from selling_price.
y = df.loc[:, 'selling_price_category']

### Train test split

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=72,
)

Check for null values in the training and test splits

In [13]:
# Missing values per feature in the training split, before imputation.
X_train.loc[:, ['max_power', 'car_age', 'mileage']].isna().sum()

max_power    149
car_age        0
mileage      154
dtype: int64

In [14]:
# Missing values per feature in the test split, before imputation.
X_test.loc[:, ['max_power', 'car_age', 'mileage']].isna().sum()

max_power    67
car_age       0
mileage      67
dtype: int64

In [15]:
# Number of missing labels in (train, test).
tuple(target.isna().sum() for target in (y_train, y_test))

(0, 0)

Fill null values (using statistics computed on the training split only)

In [16]:
# Impute missing max_power with the training median.
# Rebinding the frame replaces the chained `X_train['max_power'].fillna(..., inplace=True)`,
# which operates on a temporary column object and leaves X_train untouched
# when pandas Copy-on-Write is active.
X_train = X_train.fillna({'max_power': X_train['max_power'].median()})

In [17]:
# Impute missing max_power in the test split with the *training* median, so no
# test-set statistic is used. Rebinding the frame avoids the chained in-place
# fillna, which leaves X_test untouched when pandas Copy-on-Write is active.
X_test = X_test.fillna({'max_power': X_train['max_power'].median()})

In [18]:
# Impute missing mileage with the training mean.
# Rebinding the frame replaces the chained `X_train['mileage'].fillna(..., inplace=True)`,
# which operates on a temporary column object and leaves X_train untouched
# when pandas Copy-on-Write is active.
X_train = X_train.fillna({'mileage': X_train['mileage'].mean()})

In [19]:
# Impute missing mileage in the test split with the *training* mean, so no
# test-set statistic is used. Rebinding the frame avoids the chained in-place
# fillna, which leaves X_test untouched when pandas Copy-on-Write is active.
X_test = X_test.fillna({'mileage': X_train['mileage'].mean()})

In [20]:
# Missing values per feature in the training split after imputation (all should be 0).
X_train.loc[:, ['max_power', 'car_age', 'mileage']].isna().sum()

max_power    0
car_age      0
mileage      0
dtype: int64

In [21]:
# Missing values per feature in the test split after imputation (all should be 0).
X_test.loc[:, ['max_power', 'car_age', 'mileage']].isna().sum()

max_power    0
car_age      0
mileage      0
dtype: int64

In [22]:
# Number of missing labels in (train, test).
tuple(target.isna().sum() for target in (y_train, y_test))

(0, 0)

### Scaling

In [23]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same
# transformation to both splits. From here on X_train / X_test are the arrays
# returned by the scaler rather than the original frames.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test  = scaler.transform(X_test)

In [24]:
# Prepend a column of ones to each split so the first weight row acts as the bias term.
# NOTE: running this cell twice adds a second column of ones.
intercept = np.ones((X_train.shape[0], 1))
X_train   = np.hstack((intercept, X_train))
intercept = np.ones((X_test.shape[0], 1))
X_test    = np.hstack((intercept, X_test))

In [25]:
class LogisticRegression:
    """Multinomial (softmax) logistic regression trained by gradient descent.

    The model keeps a weight matrix ``W`` of shape ``(n, k)`` and predicts the
    class with the highest softmax score. The evaluation helpers work on a
    confusion matrix registered through ``set_confusion_matrix`` and index it
    as ``[true class, predicted class]``.
    """

    def __init__(self, k, n, method, alpha = 0.001, max_iter=5000):
        """Store the hyper-parameters.

        Args:
            k: number of classes (columns of ``W``).
            n: number of input columns (rows of ``W``).
            method: one of ``"batch"``, ``"minibatch"`` or ``"sto"``.
            alpha: step size applied to the gradient.
            max_iter: number of weight updates performed by ``fit``.
        """
        self.k = k
        self.n = n
        self.alpha = alpha
        self.max_iter = max_iter
        self.method = method

    def fit(self, X, Y):
        """Initialise ``W`` randomly and run ``max_iter`` gradient updates.

        Args:
            X: array of shape (m, n).
            Y: one-hot encoded targets of shape (m, k).

        Raises:
            ValueError: if ``X`` has no rows or ``method`` is not recognised.
        """
        m = X.shape[0]
        if m == 0:
            raise ValueError("Cannot fit on an empty training set.")

        self.W = np.random.rand(self.n, self.k)
        self.losses = []
        start_time = time.time()

        if self.method == "batch":
            for i in range(self.max_iter):
                self._step(X, Y, i)

        elif self.method == "minibatch":
            batch_size = max(1, int(0.3 * m))
            for i in range(self.max_iter):
                # Restrict the start index so the contiguous slice always holds
                # a full batch; a start near the end used to give a truncated one.
                ix = np.random.randint(0, m - batch_size + 1)
                self._step(X[ix:ix + batch_size], Y[ix:ix + batch_size], i)

        elif self.method == "sto":
            # Visit the samples in a random order without repetition and
            # reshuffle once every sample has been used. The earlier version
            # tested the loop counter instead of the sampled index, so it
            # never excluded an already-used sample.
            order = np.random.permutation(m)
            cursor = 0
            for i in range(self.max_iter):
                if cursor == m:
                    order = np.random.permutation(m)
                    cursor = 0
                idx = order[cursor]
                cursor += 1
                self._step(X[idx, :].reshape(1, -1), Y[idx].reshape(1, -1), i)

        else:
            raise ValueError('Method must be one of the followings: "batch", "minibatch" or "sto".')

        print(f"time taken: {time.time() - start_time}")

    def _step(self, X, Y, i):
        """Apply one gradient update on (X, Y), record the loss, log every 500th iteration."""
        loss, grad = self.gradient(X, Y)
        self.losses.append(loss)
        self.W = self.W - self.alpha * grad
        if i % 500 == 0:
            print(f"Loss at iteration {i}", loss)

    def gradient(self, X, Y):
        """Return ``(loss, grad)`` for the current weights.

        ``loss`` is the mean cross-entropy over the rows of ``X``. ``grad`` is
        the sum (not the mean) of the per-sample gradients, so the effective
        step size grows with the number of rows.
        """
        m = X.shape[0]
        h = self.h_theta(X, self.W)
        # Clip so a probability that underflows to 0 cannot produce log(0) -> NaN.
        loss = - np.sum(Y * np.log(np.clip(h, 1e-15, None))) / m
        error = h - Y
        grad = self.softmax_grad(X, error)
        return loss, grad

    def softmax(self, theta_t_x):
        """Row-wise softmax of a 2-D score array."""
        # Subtracting the row maximum leaves the result unchanged but keeps
        # np.exp from overflowing on large scores.
        shifted = theta_t_x - np.max(theta_t_x, axis=1, keepdims=True)
        exp_scores = np.exp(shifted)
        return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

    def softmax_grad(self, X, error):
        """Gradient of the summed cross-entropy with respect to ``W``."""
        return  X.T @ error

    def h_theta(self, X, W):
        '''
        Input:
            X shape: (m, n)
            w shape: (n, k)
        Returns:
            yhat shape: (m, k)
        '''
        return self.softmax(X @ W)

    def predict(self, X_test):
        """Return the index of the most probable class for each row of ``X_test``."""
        return np.argmax(self.h_theta(X_test, self.W), axis=1)

    def plot(self):
        """Plot the loss recorded at every update of the last ``fit`` call."""
        plt.plot(np.arange(len(self.losses)) , self.losses, label = "Train Losses")
        plt.title("Losses")
        plt.xlabel("epoch")
        plt.ylabel("losses")
        plt.legend()

    def set_confustion_matrix(self, confusion_matrix):
        """Store the confusion matrix used by the metric methods.

        The misspelled name is kept for existing callers; prefer
        ``set_confusion_matrix``.
        """
        self.confusion_matrix = confusion_matrix

    # Correctly spelled alias of the setter above.
    set_confusion_matrix = set_confustion_matrix

    def accuracy(self):
        """Fraction of samples on the diagonal of the confusion matrix (0 if it is empty)."""
        true_pred = np.sum(np.diagonal(self.confusion_matrix))
        total_pred = np.sum(self.confusion_matrix)

        if total_pred == 0:
            return 0  # Avoid division by zero

        return true_pred / total_pred

    def precision(self, class_label):
        """Precision of ``class_label``: TP / (TP + FP), or 0 when nothing was predicted as it."""
        true_positives = self.confusion_matrix[class_label, class_label]
        false_positives = np.sum(self.confusion_matrix[:, class_label]) - true_positives

        if true_positives + false_positives == 0:
            return 0  # Avoid division by zero

        precision_c = true_positives / (true_positives + false_positives)
        return precision_c

    def recall(self, class_label):
        """Recall of ``class_label``: TP / (TP + FN), or 0 when the class has no samples."""
        true_positives = self.confusion_matrix[class_label, class_label]
        false_negatives = np.sum(self.confusion_matrix[class_label, :]) - true_positives

        if true_positives + false_negatives == 0:
            return 0  # Avoid division by zero

        recall_c = true_positives / (true_positives + false_negatives)
        return recall_c

    def f1_score(self, class_label):
        """Harmonic mean of precision and recall for ``class_label`` (0 when both are 0)."""
        prec = self.precision(class_label)
        rec = self.recall(class_label)

        if prec + rec == 0:
            return 0  # Avoid division by zero

        f1_c = 2 * (prec * rec) / (prec + rec)
        return f1_c

    def macro_precision(self):
        """Unweighted mean of the per-class precision over all rows of the confusion matrix."""
        num_classes = self.confusion_matrix.shape[0]
        return sum(self.precision(c) for c in range(num_classes)) / num_classes

    def macro_recall(self):
        """Unweighted mean of the per-class recall over all rows of the confusion matrix."""
        num_classes = self.confusion_matrix.shape[0]
        return sum(self.recall(c) for c in range(num_classes)) / num_classes

    def macro_f1_score(self):
        """Unweighted mean of the per-class F1 score over all rows of the confusion matrix."""
        num_classes = self.confusion_matrix.shape[0]
        return sum(self.f1_score(c) for c in range(num_classes)) / num_classes

In [None]:
class LogisticRegression:
    """Macro-averaged metrics computed from ``self.confusion_matrix``.

    NOTE(review): this cell rebinds the name ``LogisticRegression``. The class
    defined here has no ``__init__``/``fit``/``predict`` of its own, so it only
    works on an object whose ``k`` and ``confusion_matrix`` attributes have
    been set. The confusion matrix is read as ``[true class, predicted class]``.
    """
    # ... (previous code)

    def _per_class_precision_recall(self):
        """Return ``(precision, recall)`` arrays with one entry per class (0 where undefined)."""
        cm = np.asarray(self.confusion_matrix, dtype=float)
        true_positives = np.diag(cm).copy()
        predicted = cm.sum(axis=0)   # column totals: samples predicted as each class
        actual = cm.sum(axis=1)      # row totals: samples that belong to each class
        precision = np.divide(true_positives, predicted,
                              out=np.zeros_like(true_positives), where=predicted != 0)
        recall = np.divide(true_positives, actual,
                           out=np.zeros_like(true_positives), where=actual != 0)
        return precision, recall

    def macro_precision(self):
        """Unweighted mean of the per-class precision over the first ``k`` classes."""
        precision, _ = self._per_class_precision_recall()
        macro_precision_score = float(np.sum(precision[:self.k]) / self.k)
        return macro_precision_score

    def macro_recall(self):
        """Unweighted mean of the per-class recall over the first ``k`` classes."""
        _, recall = self._per_class_precision_recall()
        macro_recall_score = float(np.sum(recall[:self.k]) / self.k)
        return macro_recall_score

    def macro_f1_score(self):
        """Unweighted mean of the per-class F1 score over the first ``k`` classes."""
        precision, recall = self._per_class_precision_recall()
        denominator = precision + recall
        f1 = np.divide(2 * precision * recall, denominator,
                       out=np.zeros_like(denominator), where=denominator != 0)
        macro_f1_score = float(np.sum(f1[:self.k]) / self.k)
        return macro_f1_score


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import time

class LogisticRegression:
    """Hyper-parameter holder plus confusion-matrix metrics.

    NOTE(review): this definition has no ``fit``/``predict`` (see the
    "Other methods..." placeholder below) and rebinds the name of the
    trainable class defined earlier in the notebook.

    The confusion matrix is read as ``[true class, predicted class]``.
    """

    def __init__(self, k, n, method, alpha=0.001, max_iter=5000):
        """Store the hyper-parameters and start with no weights, losses or confusion matrix."""
        self.k = k
        self.n = n
        self.alpha = alpha
        self.max_iter = max_iter
        self.method = method
        self.W = None
        self.losses = []
        self.confusion_matrix = None

    # Other methods...

    def set_confusion_matrix(self, confusion_matrix):
        """Store the confusion matrix used by every metric method."""
        self.confusion_matrix = confusion_matrix

    def compute_accuracy(self):
        """Fraction of samples on the diagonal of the confusion matrix (0 if it is empty)."""
        true_pred = np.sum(np.diagonal(self.confusion_matrix))
        total_pred = np.sum(self.confusion_matrix)

        if total_pred == 0:
            return 0  # Avoid division by zero, as the per-class metrics do

        return true_pred / total_pred

    def compute_precision(self, class_label):
        """Precision of ``class_label``: TP / (TP + FP), or 0 when nothing was predicted as it."""
        true_positives = self.confusion_matrix[class_label, class_label]
        false_positives = np.sum(self.confusion_matrix[:, class_label]) - true_positives

        if true_positives + false_positives == 0:
            return 0  # Avoid division by zero

        precision_c = true_positives / (true_positives + false_positives)
        return precision_c

    def compute_recall(self, class_label):
        """Recall of ``class_label``: TP / (TP + FN), or 0 when the class has no samples."""
        true_positives = self.confusion_matrix[class_label, class_label]
        false_negatives = np.sum(self.confusion_matrix[class_label, :]) - true_positives

        if true_positives + false_negatives == 0:
            return 0  # Avoid division by zero

        recall_c = true_positives / (true_positives + false_negatives)
        return recall_c

    def compute_f1_score(self, class_label):
        """Harmonic mean of precision and recall for ``class_label`` (0 when both are 0)."""
        prec = self.compute_precision(class_label)
        rec = self.compute_recall(class_label)

        if prec + rec == 0:
            return 0  # Avoid division by zero

        f1_c = 2 * (prec * rec) / (prec + rec)
        return f1_c

    def macro_precision(self):
        """Unweighted mean of the per-class precision."""
        num_classes = self.confusion_matrix.shape[0]
        precision_sum = sum([self.compute_precision(class_label) for class_label in range(num_classes)])
        macro_precision_score = precision_sum / num_classes
        return macro_precision_score

    def macro_recall(self):
        """Unweighted mean of the per-class recall."""
        num_classes = self.confusion_matrix.shape[0]
        recall_sum = sum([self.compute_recall(class_label) for class_label in range(num_classes)])
        macro_recall_score = recall_sum / num_classes
        return macro_recall_score

    def macro_f1_score(self):
        """Unweighted mean of the per-class F1 score."""
        num_classes = self.confusion_matrix.shape[0]
        f1_sum = sum([self.compute_f1_score(class_label) for class_label in range(num_classes)])
        macro_f1_score = f1_sum / num_classes
        return macro_f1_score


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import time

class LogisticRegression:
    """Hyper-parameter holder plus confusion-matrix metrics.

    NOTE(review): this cell repeats an earlier definition of the same class.
    It has no ``fit``/``predict`` (see the "Other methods..." placeholder
    below) and rebinds the name of the trainable class defined earlier in the
    notebook.

    The confusion matrix is read as ``[true class, predicted class]``.
    """

    def __init__(self, k, n, method, alpha=0.001, max_iter=5000):
        """Store the hyper-parameters and start with no weights, losses or confusion matrix."""
        self.k = k
        self.n = n
        self.alpha = alpha
        self.max_iter = max_iter
        self.method = method
        self.W = None
        self.losses = []
        self.confusion_matrix = None

    # Other methods...

    def set_confusion_matrix(self, confusion_matrix):
        """Store the confusion matrix used by every metric method."""
        self.confusion_matrix = confusion_matrix

    def compute_accuracy(self):
        """Fraction of samples on the diagonal of the confusion matrix (0 if it is empty)."""
        true_pred = np.sum(np.diagonal(self.confusion_matrix))
        total_pred = np.sum(self.confusion_matrix)

        if total_pred == 0:
            return 0  # Avoid division by zero, as the per-class metrics do

        return true_pred / total_pred

    def compute_precision(self, class_label):
        """Precision of ``class_label``: TP / (TP + FP), or 0 when nothing was predicted as it."""
        true_positives = self.confusion_matrix[class_label, class_label]
        false_positives = np.sum(self.confusion_matrix[:, class_label]) - true_positives

        if true_positives + false_positives == 0:
            return 0  # Avoid division by zero

        precision_c = true_positives / (true_positives + false_positives)
        return precision_c

    def compute_recall(self, class_label):
        """Recall of ``class_label``: TP / (TP + FN), or 0 when the class has no samples."""
        true_positives = self.confusion_matrix[class_label, class_label]
        false_negatives = np.sum(self.confusion_matrix[class_label, :]) - true_positives

        if true_positives + false_negatives == 0:
            return 0  # Avoid division by zero

        recall_c = true_positives / (true_positives + false_negatives)
        return recall_c

    def compute_f1_score(self, class_label):
        """Harmonic mean of precision and recall for ``class_label`` (0 when both are 0)."""
        prec = self.compute_precision(class_label)
        rec = self.compute_recall(class_label)

        if prec + rec == 0:
            return 0  # Avoid division by zero

        f1_c = 2 * (prec * rec) / (prec + rec)
        return f1_c

    def macro_precision(self):
        """Unweighted mean of the per-class precision."""
        num_classes = self.confusion_matrix.shape[0]
        precision_sum = sum([self.compute_precision(class_label) for class_label in range(num_classes)])
        macro_precision_score = precision_sum / num_classes
        return macro_precision_score

    def macro_recall(self):
        """Unweighted mean of the per-class recall."""
        num_classes = self.confusion_matrix.shape[0]
        recall_sum = sum([self.compute_recall(class_label) for class_label in range(num_classes)])
        macro_recall_score = recall_sum / num_classes
        return macro_recall_score

    def macro_f1_score(self):
        """Unweighted mean of the per-class F1 score."""
        num_classes = self.confusion_matrix.shape[0]
        f1_sum = sum([self.compute_f1_score(class_label) for class_label in range(num_classes)])
        macro_f1_score = f1_sum / num_classes
        return macro_f1_score


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import time

class LogisticRegression:
    """Hyper-parameter holder plus confusion-matrix metrics.

    The per-class metric methods accept an optional ``class_label``: with a
    label they return that class's score, without one they return a list with
    one score per class. The confusion matrix is read as
    ``[true class, predicted class]``.

    NOTE(review): this definition has no ``fit``/``predict`` (see the
    "Other methods..." placeholder below) and rebinds the name of the
    trainable class defined earlier in the notebook.
    """

    def __init__(self, k, n, method, alpha=0.001, max_iter=5000):
        """Store the hyper-parameters and start with no weights, losses or confusion matrix."""
        self.k = k
        self.n = n
        self.alpha = alpha
        self.max_iter = max_iter
        self.method = method
        self.W = None
        self.losses = []
        self.confusion_matrix = None

    # Other methods...

    def set_confusion_matrix(self, confusion_matrix):
        """Store the confusion matrix used by every metric method."""
        self.confusion_matrix = confusion_matrix

    def compute_accuracy(self):
        """Fraction of samples on the diagonal of the confusion matrix (0 if it is empty)."""
        true_pred = np.sum(np.diagonal(self.confusion_matrix))
        total_pred = np.sum(self.confusion_matrix)

        if total_pred == 0:
            return 0  # Avoid division by zero, as the per-class metrics do

        return true_pred / total_pred

    def compute_precision(self, class_label=None):
        """Precision of one class, or a list of per-class precisions when ``class_label`` is None."""
        if class_label is not None:
            true_positives = self.confusion_matrix[class_label, class_label]
            false_positives = np.sum(self.confusion_matrix[:, class_label]) - true_positives

            if true_positives + false_positives == 0:
                return 0  # Avoid division by zero

            precision_c = true_positives / (true_positives + false_positives)
            return precision_c
        else:
            num_classes = self.confusion_matrix.shape[0]
            precision_scores = [self.compute_precision(class_label) for class_label in range(num_classes)]
            return precision_scores

    def compute_recall(self, class_label=None):
        """Recall of one class, or a list of per-class recalls when ``class_label`` is None."""
        if class_label is not None:
            true_positives = self.confusion_matrix[class_label, class_label]
            false_negatives = np.sum(self.confusion_matrix[class_label, :]) - true_positives

            if true_positives + false_negatives == 0:
                return 0  # Avoid division by zero

            recall_c = true_positives / (true_positives + false_negatives)
            return recall_c
        else:
            num_classes = self.confusion_matrix.shape[0]
            recall_scores = [self.compute_recall(class_label) for class_label in range(num_classes)]
            return recall_scores

    def compute_f1_score(self, class_label=None):
        """F1 score of one class, or a list of per-class F1 scores when ``class_label`` is None."""
        if class_label is not None:
            prec = self.compute_precision(class_label)
            rec = self.compute_recall(class_label)

            if prec + rec == 0:
                return 0  # Avoid division by zero

            f1_c = 2 * (prec * rec) / (prec + rec)
            return f1_c
        else:
            num_classes = self.confusion_matrix.shape[0]
            f1_scores = [self.compute_f1_score(class_label) for class_label in range(num_classes)]
            return f1_scores

    def macro_precision(self):
        """Unweighted mean of the per-class precision."""
        num_classes = self.confusion_matrix.shape[0]
        precision_sum = sum([self.compute_precision(class_label) for class_label in range(num_classes)])
        macro_precision_score = precision_sum / num_classes
        return macro_precision_score

    def macro_recall(self):
        """Unweighted mean of the per-class recall."""
        num_classes = self.confusion_matrix.shape[0]
        recall_sum = sum([self.compute_recall(class_label) for class_label in range(num_classes)])
        macro_recall_score = recall_sum / num_classes
        return macro_recall_score

    def macro_f1_score(self):
        """Unweighted mean of the per-class F1 score."""
        num_classes = self.confusion_matrix.shape[0]
        f1_sum = sum([self.compute_f1_score(class_label) for class_label in range(num_classes)])
        macro_f1_score = f1_sum / num_classes
        return macro_f1_score


In [58]:
k = len(set(y))       # number of distinct class labels
m, n = X_train.shape  # m samples, n columns

# One-hot encode the training labels: row i gets a 1 in the column of its class.
Y_train_encoded = np.zeros((m, k))
for each_class in range(k):
    cond = y_train == each_class
    Y_train_encoded[cond.to_numpy(), each_class] = 1

In [137]:
# Train with the mini-batch variant, then predict on the training rows themselves.
model = LogisticRegression(k=k, n=X_train.shape[1], method="minibatch")
model.fit(X=X_train, Y=Y_train_encoded)
yhat = model.predict(X_train)

Loss at iteration 0 1.3434983509468537
Loss at iteration 500 0.5609686400699181
Loss at iteration 1000 0.5854418120071745
Loss at iteration 1500 0.5626690824405367
Loss at iteration 2000 0.5704552735276375
Loss at iteration 2500 0.6270643397431978
Loss at iteration 3000 0.5379177759591983
Loss at iteration 3500 0.5484444535983476
Loss at iteration 4000 0.5971013922958279
Loss at iteration 4500 0.5503689639610956
time taken: 0.9316177368164062


In [138]:
# Start from an all-zero 4x4 count matrix.
rows, cols = 4, 4
confustion_matrix = np.zeros(shape=(rows, cols))

In [139]:
# Display the freshly created count matrix.
confustion_matrix

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [136]:
# yhat = model.predict(X_train)
# y_train.shape
# yhat[123]

In [140]:
# Tally every training sample at [true class, predicted class].
# NOTE: the counts accumulate, so re-running this cell without recreating the matrix doubles them.
for i, (row, col) in enumerate(zip(y_train.values, yhat)):
    confustion_matrix[row, col] += 1

confustion_matrix

array([[ 127.,  133.,    0.,    0.],
       [  66., 2517.,  414.,   10.],
       [   0.,  443.,  982.,   94.],
       [   0.,   20.,  251.,  632.]])

In [None]:
# Display the hand-built confusion matrix again.
confustion_matrix

In [141]:
from sklearn.metrics import confusion_matrix

# Cross-check the hand-built matrix against scikit-learn's implementation.
conf_matrix = confusion_matrix(y_true=y_train, y_pred=yhat)

In [142]:
# Display the scikit-learn confusion matrix for comparison with the hand-built one.
conf_matrix

array([[ 127,  133,    0,    0],
       [  66, 2517,  414,   10],
       [   0,  443,  982,   94],
       [   0,   20,  251,  632]], dtype=int64)

In [None]:
# accuracy() takes no arguments: it reads the confusion matrix stored on the
# model, so that matrix has to be registered first.
model.set_confustion_matrix(conf_matrix)
accuracy = model.accuracy()