<a href="https://colab.research.google.com/github/Nkeeydata/ML-Beginning/blob/main/Machine_Learning_Algorithms.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Installation
!pip install pandas==2.1.4
!pip install scikit-learn==1.6.0

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [None]:
df = pd.read_csv('data/car_data.csv')
df.head()

Unnamed: 0,Make,Year of manufacture,Condition,Mileage,Engine Size,Fuel,Transmission,Price,Price Category
0,Toyota,2007.0,Nigerian Used,166418.0,2400.0,Petrol,Automatic,3120000,0
1,Lexus,,,138024.0,,,Automatic,5834000,1
2,Mercedes-Benz,2008.0,Nigerian Used,376807.0,3000.0,Petrol,Automatic,3640000,0
3,Lexus,,,213362.0,,,Automatic,3594000,0
4,Mercedes-Benz,,,106199.0,,,Automatic,8410000,1


In [None]:
# Handle missing values - fill numeric columns with median
df["Year of manufacture"].fillna(df["Year of manufacture"].median(), inplace=True)
df["Engine Size"].fillna(df["Engine Size"].median(), inplace=True)
df["Mileage"].fillna(df["Mileage"].median(), inplace=True)
df["Price"].fillna(df["Price"].median(), inplace=True)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4095 entries, 0 to 4094
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Make                 4095 non-null   object 
 1   Year of manufacture  4095 non-null   float64
 2   Condition            3616 non-null   object 
 3   Mileage              4095 non-null   float64
 4   Engine Size          4095 non-null   float64
 5   Fuel                 3607 non-null   object 
 6   Transmission         4075 non-null   object 
 7   Price                4095 non-null   int64  
 8   Price Category       4095 non-null   int64  
dtypes: float64(3), int64(2), object(4)
memory usage: 288.1+ KB


In [None]:
df["Condition"].fillna(df["Condition"].mode()[0], inplace=True)
df["Fuel"].fillna(df["Fuel"].mode()[0], inplace=True)
df["Transmission"].fillna(df["Transmission"].mode()[0], inplace=True)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Create a LabelEncoder instance
le_condition = LabelEncoder()
le_fuel = LabelEncoder()
le_transmission = LabelEncoder()

In [None]:
# Apply Label Encoding to each categorical column
df["Condition"] = le_condition.fit_transform(df["Condition"])
df["Fuel"] = le_fuel.fit_transform(df["Fuel"])
df["Transmission"] = le_transmission.fit_transform(df["Transmission"])
df["Make"] = le_transmission.fit_transform(df["Make"])

In [None]:
df.head()

Unnamed: 0,Make,Year of manufacture,Condition,Mileage,Engine Size,Fuel,Transmission,Price,Price Category
0,46,2007.0,2,166418.0,2400.0,3,1,3120000,0
1,26,2008.0,2,138024.0,2500.0,3,1,5834000,1
2,30,2008.0,2,376807.0,3000.0,3,1,3640000,0
3,26,2008.0,2,213362.0,2500.0,3,1,3594000,0
4,30,2008.0,2,106199.0,2500.0,3,1,8410000,1


## Model Training

In [None]:
# Split data into features (X) and target (y)
X = df.drop(columns=["Price"])
y = df["Price"]

In [None]:
X

Unnamed: 0,Make,Year of manufacture,Condition,Mileage,Engine Size,Fuel,Transmission,Price Category
0,46,2007.0,2,166418.0,2400.0,3,1,0
1,26,2008.0,2,138024.0,2500.0,3,1,1
2,30,2008.0,2,376807.0,3000.0,3,1,0
3,26,2008.0,2,213362.0,2500.0,3,1,0
4,30,2008.0,2,106199.0,2500.0,3,1,1
...,...,...,...,...,...,...,...,...
4090,16,2004.0,2,207446.0,3500.0,3,1,0
4091,46,2005.0,2,106914.0,1800.0,3,1,0
4092,16,2006.0,2,247149.0,1800.0,3,1,0
4093,46,2007.0,2,249325.0,2500.0,3,1,0


In [None]:
y

0       3120000
1       5834000
2       3640000
3       3594000
4       8410000
         ...   
4090    1125000
4091    2643750
4092    1462500
4093    2475000
4094    6300000
Name: Price, Length: 4095, dtype: int64

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
print(X_train.shape)
print(y_train.shape)

(3276, 8)
(3276,)


In [None]:
X_train

Unnamed: 0,Make,Year of manufacture,Condition,Mileage,Engine Size,Fuel,Transmission,Price Category
2576,33,2009.0,2,224443.0,2500.0,3,1,0
1373,7,2009.0,1,274542.0,3600.0,3,1,1
221,26,2007.0,2,156579.0,3500.0,3,1,1
2866,44,2008.0,2,86291.0,2000.0,3,1,0
2750,16,2008.0,2,161352.5,2500.0,3,1,1
...,...,...,...,...,...,...,...,...
1130,46,2012.0,1,78080.0,2800.0,3,1,0
1294,26,2008.0,2,213189.0,3500.0,3,1,0
860,30,2006.0,2,72991.0,3500.0,3,1,0
3507,35,2004.0,1,89405.0,14.0,3,1,0


In [None]:
y_train

2576     1890000
1373     6300000
221      5200000
2866     1365000
2750    28800000
          ...   
1130     4200000
1294     3360000
860      3360000
3507     2288000
3174     1995000
Name: Price, Length: 3276, dtype: int64

In [None]:
print(X_test.shape)
print(y_test.shape)

(819, 8)
(819,)


In [None]:
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

In [None]:
X_test

Unnamed: 0,Make,Year of manufacture,Condition,Mileage,Engine Size,Fuel,Transmission,Price Category
1298,13,2010.0,1,463172.0,3000.0,3,1,0
2711,16,2008.0,2,306026.0,2500.0,3,1,0
3543,16,2012.0,2,58920.0,2400.0,3,1,0
3357,47,2010.0,2,184724.0,2500.0,3,1,0
2865,46,2003.0,2,224344.0,2400.0,3,1,0
...,...,...,...,...,...,...,...,...
862,46,2016.0,2,71610.0,2500.0,3,1,1
3760,16,2002.0,2,298834.0,1700.0,3,1,0
2628,46,2007.0,2,223493.0,3500.0,2,1,0
1425,46,2010.0,1,189486.0,1800.0,3,1,0


In [None]:
linear_predictions = linear_model.predict(X_test)
linear_predictions

array([ 3.69764115e+06,  2.49684058e+06,  3.72761326e+06,  3.40644163e+06,
        1.50282478e+06,  2.85837141e+06,  1.75705442e+06,  1.00397902e+07,
        4.45059161e+06,  1.81978211e+06,  1.18012485e+07,  8.58207551e+06,
        2.28033634e+06,  4.21664829e+06,  3.15910400e+06,  3.64439710e+06,
        3.09867724e+06,  1.71131097e+06, -4.99322995e+05,  8.72113027e+06,
        8.98978609e+06,  2.71413606e+06,  3.32128681e+06,  1.38622399e+06,
        1.21810275e+07,  1.77492813e+06,  1.07546704e+07,  2.76438998e+06,
        1.04983247e+07,  8.50115244e+05,  8.54281424e+06,  1.79028888e+06,
        4.45852909e+06,  1.06797392e+07,  1.03057285e+07,  1.29599115e+06,
        1.06952548e+07,  2.54978314e+06,  9.67515943e+06,  1.06947691e+07,
        2.75636922e+06,  1.07868766e+07,  2.92402833e+06,  2.78526546e+06,
        4.27468318e+06,  1.09912307e+07,  3.14864333e+06,  4.03743159e+06,
        3.32954245e+06,  1.05768823e+06,  1.07806272e+07,  3.10747730e+06,
        4.37343279e+06,  

In [None]:
len(linear_predictions)

819

In [None]:
y_test

1298    2835000
2711    3057500
3543    2512000
3357    1050000
2865    1275000
         ...   
862     5775000
3760     840000
2628    2100000
1425    3570000
67      4725000
Name: Price, Length: 819, dtype: int64

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
tree_model = DecisionTreeRegressor()
tree_model.fit(X_train, y_train)

In [None]:
tree_predictions = tree_model.predict(X_test)
tree_predictions

array([ 2835000.,  2474000.,  1995000.,  3120000.,  1768000.,  2625000.,
        3432000.,  5250000.,  4680000.,  1050000.,  6864000.,  7290000.,
        1575000.,  2205000.,  2835000.,  3990000.,  1995000.,  1680000.,
         900000.,  5775000.,  8900000.,  2625000.,  1260000.,  1560000.,
       52000000.,  2184000., 10400000.,  2940000., 15690000.,   832000.,
       14010000.,  1160000.,  4200000.,  5250000., 16275000.,  1352000.,
        8112000.,  2100000., 10500000.,  8746000.,  1578000., 12480000.,
        1560000.,  3395000.,  3120000.,  9880000.,  1995000.,  4200000.,
        1050000.,  1456000., 11550000.,  2940000.,  4784000.,   832000.,
        6344000.,  2310000., 11250000.,  4714000.,  3100000.,  2415000.,
        1890000.,  2500000.,  3068000.,  5720000.,  2205000.,  1785000.,
        3360000.,   792500.,  2940000.,  9880000.,  2940000.,  3150000.,
        3952000.,  6300000.,  7875000.,  2520000.,  1560000.,  1914000.,
        3780000.,  2600000.,   945000.,  4160000., 

In [None]:
# Define evaluation metrics
metrics = {
    "Mean Absolute Error": mean_absolute_error,
    "Mean Squared Error": mean_squared_error
}

results = {}
for name, func in metrics.items():
    # Calculate scores for each model
    linear_score = func(y_test, linear_predictions)
    tree_score = func(y_test, tree_predictions)

    # Store results in the dictionary
    results[name] = {
        "Linear Regression": linear_score,
        "Decision Tree": tree_score
    }

In [None]:
results

{'Mean Absolute Error': {'Linear Regression': 1515165.032366688,
  'Decision Tree': 1030917.5702075702},
 'Mean Squared Error': {'Linear Regression': 9721159878874.846,
  'Decision Tree': 9612736137992.797}}

## Classification

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score

In [None]:
df

Unnamed: 0,Make,Year of manufacture,Condition,Mileage,Engine Size,Fuel,Transmission,Price,Price Category
0,46,2007.0,2,166418.0,2400.0,3,1,3120000,0
1,26,2008.0,2,138024.0,2500.0,3,1,5834000,1
2,30,2008.0,2,376807.0,3000.0,3,1,3640000,0
3,26,2008.0,2,213362.0,2500.0,3,1,3594000,0
4,30,2008.0,2,106199.0,2500.0,3,1,8410000,1
...,...,...,...,...,...,...,...,...,...
4090,16,2004.0,2,207446.0,3500.0,3,1,1125000,0
4091,46,2005.0,2,106914.0,1800.0,3,1,2643750,0
4092,16,2006.0,2,247149.0,1800.0,3,1,1462500,0
4093,46,2007.0,2,249325.0,2500.0,3,1,2475000,0


In [None]:
X = df.drop(columns=["Price Category"])
y = df["Price Category"]

In [None]:
X

Unnamed: 0,Make,Year of manufacture,Condition,Mileage,Engine Size,Fuel,Transmission,Price
0,46,2007.0,2,166418.0,2400.0,3,1,3120000
1,26,2008.0,2,138024.0,2500.0,3,1,5834000
2,30,2008.0,2,376807.0,3000.0,3,1,3640000
3,26,2008.0,2,213362.0,2500.0,3,1,3594000
4,30,2008.0,2,106199.0,2500.0,3,1,8410000
...,...,...,...,...,...,...,...,...
4090,16,2004.0,2,207446.0,3500.0,3,1,1125000
4091,46,2005.0,2,106914.0,1800.0,3,1,2643750
4092,16,2006.0,2,247149.0,1800.0,3,1,1462500
4093,46,2007.0,2,249325.0,2500.0,3,1,2475000


In [None]:
y

0       0
1       1
2       0
3       0
4       1
       ..
4090    0
4091    0
4092    0
4093    0
4094    1
Name: Price Category, Length: 4095, dtype: int64

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
X_test

Unnamed: 0,Make,Year of manufacture,Condition,Mileage,Engine Size,Fuel,Transmission,Price
1298,13,2010.0,1,463172.0,3000.0,3,1,2835000
2711,16,2008.0,2,306026.0,2500.0,3,1,3057500
3543,16,2012.0,2,58920.0,2400.0,3,1,2512000
3357,47,2010.0,2,184724.0,2500.0,3,1,1050000
2865,46,2003.0,2,224344.0,2400.0,3,1,1275000
...,...,...,...,...,...,...,...,...
862,46,2016.0,2,71610.0,2500.0,3,1,5775000
3760,16,2002.0,2,298834.0,1700.0,3,1,840000
2628,46,2007.0,2,223493.0,3500.0,2,1,2100000
1425,46,2010.0,1,189486.0,1800.0,3,1,3570000


In [None]:
y_test

1298    0
2711    0
3543    0
3357    0
2865    0
       ..
862     1
3760    0
2628    0
1425    0
67      0
Name: Price Category, Length: 819, dtype: int64

In [None]:
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)
logistic_predictions = logistic_model.predict(X_test)

In [None]:
logistic_predictions

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,

In [None]:
y_test

1298    0
2711    0
3543    0
3357    0
2865    0
       ..
862     1
3760    0
2628    0
1425    0
67      0
Name: Price Category, Length: 819, dtype: int64

In [None]:
metrics = {
    "Accuracy": accuracy_score,
    "Precision": precision_score
}

results = {}
for name, func in metrics.items():
    logistic_score = func(y_test, logistic_predictions)
    results[name] = {"Logistic Regression": logistic_score}

In [None]:
results

{'Accuracy': {'Logistic Regression': 0.9987789987789988},
 'Precision': {'Logistic Regression': 0.9949494949494949}}

In [None]:

# Decision Tree model
tree_model = DecisionTreeClassifier(random_state=42)
tree_model.fit(X_train, y_train)
tree_predictions = tree_model.predict(X_test)

print("Model Evaluation Results:\n")
for name, func in metrics.items():
    logistic_score = func(y_test, logistic_predictions)
    tree_score = func(y_test, tree_predictions)
    results[name] = {"Logistic Regression": logistic_score, "Decision Tree": tree_score}
    print(f"{name}:\n  Logistic Regression: {logistic_score:.2f}\n  Decision Tree: {tree_score:.2f}\n")

# Print summary of best model per metric
print("Best Model per Metric:")
for metric, scores in results.items():
    best_model = max(scores, key=scores.get)
    print(f"{metric}: {best_model}")

Model Evaluation Results:

Accuracy:
  Logistic Regression: 1.00
  Decision Tree: 1.00

Precision:
  Logistic Regression: 0.99
  Decision Tree: 1.00

Best Model per Metric:
Accuracy: Decision Tree
Precision: Decision Tree
