In [200]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import sklearn

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostRegressor
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder, LabelBinarizer

In [201]:
df = pd.read_excel("AI_Invasion_In-Class_Dataset.xlsx") 

In [202]:
df.shape

(4487, 8)

In [203]:
df.head(5)

Unnamed: 0,Location,Maker,Model,Year,Colour,Amount (Million ₦),Type,Distance_Km
0,Abuja,Mercedes-Benz,GLA 250,2015.0,Brown,14.5,Foreign Used,50000.0
1,Abuja,Hyundai,Accent,2013.0,Red,1.55,Nigerian Used,
2,Lagos,Lexus,GX 460 Premium,2011.0,White,14.0,Foreign Used,85000.0
3,Lagos,Lexus,ES 350,2011.0,Gray,4.95,Foreign Used,
4,Ibadan,Toyota,Verso 1.6,2009.0,Silver,1.69,Nigerian Used,118906.0


In [204]:
df.columns

Index(['Location', 'Maker', 'Model', 'Year', 'Colour', 'Amount (Million ₦)',
       'Type', 'Distance_Km'],
      dtype='object')

In [205]:
df.isnull().sum()

Location                 0
Maker                    0
Model                    0
Year                     0
Colour                   0
Amount (Million ₦)       0
Type                     0
Distance_Km           1555
dtype: int64

In [206]:
meanValue = df["Distance_Km"].mean()

In [207]:
meanValue

101038.32128240108

In [208]:
df["Distance_Km"] = df["Distance_Km"].fillna( df["Distance_Km"].mean())

In [209]:
df.isnull().sum()

Location              0
Maker                 0
Model                 0
Year                  0
Colour                0
Amount (Million ₦)    0
Type                  0
Distance_Km           0
dtype: int64

In [210]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4487 entries, 0 to 4486
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Location            4487 non-null   object 
 1   Maker               4487 non-null   object 
 2   Model               4487 non-null   object 
 3   Year                4487 non-null   float64
 4   Colour              4487 non-null   object 
 5   Amount (Million ₦)  4487 non-null   float64
 6   Type                4487 non-null   object 
 7   Distance_Km         4487 non-null   float64
dtypes: float64(3), object(5)
memory usage: 280.6+ KB


In [211]:
catFeatures = {"Location", "Maker", "Model", "Colour", "Type", "Year"}

In [212]:
for catFeature in catFeatures:
    print(catFeature,  df[catFeature].unique(), sep= ":")
    print("#" * 50)

Maker:['Mercedes-Benz' 'Hyundai' 'Lexus' 'Toyota' 'Mazda' 'Honda' 'Land Rover'
 'Porsche' 'Acura' 'Nissan' 'Pontiac' 'Ford' 'Jeep' 'Kia' 'Peugeot' 'BMW'
 'Mitsubishi' 'Dodge' 'Chevrolet' 'Scion' 'Audi' 'Infiniti' 'Mini'
 'Volkswagen' 'Suzuki' 'Chrysler' 'Volvo' 'Rolls-Royce' 'JAC' 'Subaru'
 'Renault' 'GMC' 'Rover' 'IVM' 'Bentley' 'Opel' 'Lincoln' 'Hummer'
 'Saturn' 'Cadillac' 'Lamborghini' 'Buick' 'Smart' 'Jaguar' 'Ferrari'
 'Tata' 'Skoda']
##################################################
Type:['Foreign Used' 'Nigerian Used' 'Brand New']
##################################################
Location:['Abuja' 'Lagos' 'Ibadan']
##################################################
Year:[2015. 2013. 2011. 2009. 2008. 2010. 2014. 2012. 2022. 2006. 2021. 2017.
 2007. 2002. 2016. 2019. 2020. 2004. 2018. 2005. 2003. 2000. 1999. 2001.
 1989. 1998. 1982. 1994. 1993. 1997.]
##################################################
Model:['GLA 250' 'Accent' 'GX 460 Premium' 'ES 350' 'Verso 1.6' 'Corolla 1.8

In [213]:
df["Model"].value_counts()

Camry              437
Corolla            202
ES 350             188
C300               133
Accord             104
                  ... 
Ranger XL            1
MDX Base FWD         1
Odyssey 2.4 2WD      1
Traverse 1LT         1
Outback              1
Name: Model, Length: 897, dtype: int64

In [214]:
df.drop("Model", axis = "columns", inplace = True)

In [215]:
encFeatures = ["Location", "Maker", "Colour", "Type", "Year"]

In [216]:
# Creating a for loop to encode
for encFeature in encFeatures:
    # Giving each "encFeature" i.e each column, a value
    df[f"{encFeature}_cat"] = df[encFeature].astype("category")
    # Converting the categorized datapoints into integers
    df[f"{encFeature}_cat"].cat.codes
    # instantiating the converted datapoints into new series/ separate from the original series
    df[f"{encFeature}_cat"] = df[f"{encFeature}_cat"].cat.codes
    

     
    

In [217]:
df.head()

Unnamed: 0,Location,Maker,Year,Colour,Amount (Million ₦),Type,Distance_Km,Location_cat,Maker_cat,Colour_cat,Type_cat,Year_cat
0,Abuja,Mercedes-Benz,2015.0,Brown,14.5,Foreign Used,50000.0,0,26,3,1,22
1,Abuja,Hyundai,2013.0,Red,1.55,Nigerian Used,101038.321282,0,14,14,2,20
2,Lagos,Lexus,2011.0,White,14.0,Foreign Used,85000.0,2,23,17,1,18
3,Lagos,Lexus,2011.0,Gray,4.95,Foreign Used,101038.321282,2,23,6,1,18
4,Ibadan,Toyota,2009.0,Silver,1.69,Nigerian Used,118906.0,1,44,15,2,16


In [218]:
#df.drop(encFeatures, axis = 1, inplace = True)
df.drop(["Location","Maker","Year","Colour", "Type"], axis=1, inplace=True)

In [219]:
df.head()

Unnamed: 0,Amount (Million ₦),Distance_Km,Location_cat,Maker_cat,Colour_cat,Type_cat,Year_cat
0,14.5,50000.0,0,26,3,1,22
1,1.55,101038.321282,0,14,14,2,20
2,14.0,85000.0,2,23,17,1,18
3,4.95,101038.321282,2,23,6,1,18
4,1.69,118906.0,1,44,15,2,16


In [220]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4487 entries, 0 to 4486
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Amount (Million ₦)  4487 non-null   float64
 1   Distance_Km         4487 non-null   float64
 2   Location_cat        4487 non-null   int8   
 3   Maker_cat           4487 non-null   int8   
 4   Colour_cat          4487 non-null   int8   
 5   Type_cat            4487 non-null   int8   
 6   Year_cat            4487 non-null   int8   
dtypes: float64(2), int8(5)
memory usage: 92.1 KB


# Performing Data Segmentation

In [231]:
X = df.drop("Amount (Million ₦)", axis = "columns")

In [232]:
X

Unnamed: 0,Distance_Km,Location_cat,Maker_cat,Colour_cat,Type_cat,Year_cat
0,50000.000000,0,26,3,1,22
1,101038.321282,0,14,14,2,20
2,85000.000000,2,23,17,1,18
3,101038.321282,2,23,6,1,18
4,118906.000000,1,44,15,2,16
...,...,...,...,...,...,...
4482,90282.000000,2,23,2,1,13
4483,85000.000000,2,23,2,1,14
4484,65214.000000,0,26,7,1,21
4485,45000.000000,2,23,1,1,27


In [233]:
y = df["Amount (Million ₦)"]

In [234]:
y

0       14.50
1        1.55
2       14.00
3        4.95
4        1.69
        ...  
4482     4.60
4483     4.50
4484    10.45
4485    31.00
4486    14.00
Name: Amount (Million ₦), Length: 4487, dtype: float64

In [235]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=1234, shuffle=True)

In [236]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4487 entries, 0 to 4486
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Distance_Km   4487 non-null   float64
 1   Location_cat  4487 non-null   int8   
 2   Maker_cat     4487 non-null   int8   
 3   Colour_cat    4487 non-null   int8   
 4   Type_cat      4487 non-null   int8   
 5   Year_cat      4487 non-null   int8   
dtypes: float64(1), int8(5)
memory usage: 57.1 KB


In [237]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(3140, 6)
(1347, 6)
(3140,)
(1347,)


In [238]:
y_test

1070     1.45
4173     6.80
3186    10.50
3854     4.00
3910    12.90
        ...  
2055     4.00
1619     5.35
3019     5.40
2664     3.75
912      3.40
Name: Amount (Million ₦), Length: 1347, dtype: float64

In [239]:
logist = LogisticRegression()

In [240]:
logist.fit(X_train, y_train)

ValueError: Unknown label type: 'continuous'

In [241]:
one = OneHotEncoder()

In [242]:
one.fit(X_train, y_train)

OneHotEncoder()

In [243]:
one.predict(X_test)

AttributeError: 'OneHotEncoder' object has no attribute 'predict'

In [244]:
gaus = GaussianNB()

In [245]:
gaus.fit(X_train, y_train)

ValueError: Unknown label type: (array([4.20000000e-01, 4.50000000e-01, 6.70000000e-01, 6.75000000e-01,
       6.80000000e-01, 6.90000000e-01, 7.87500000e-01, 8.00000000e-01,
       8.50000000e-01, 9.00000000e-01, 9.50000000e-01, 9.56250000e-01,
       9.80000000e-01, 9.90000000e-01, 1.01250000e+00, 1.05000000e+00,
       1.06875000e+00, 1.10000000e+00, 1.12500000e+00, 1.15000000e+00,
       1.18000000e+00, 1.20000000e+00, 1.23750000e+00, 1.25000000e+00,
       1.29000000e+00, 1.30000000e+00, 1.35000000e+00, 1.40000000e+00,
       1.43000000e+00, 1.45000000e+00, 1.46250000e+00, 1.47000000e+00,
       1.48000000e+00, 1.50000000e+00, 1.51875000e+00, 1.55000000e+00,
       1.57500000e+00, 1.58000000e+00, 1.59000000e+00, 1.60000000e+00,
       1.65000000e+00, 1.68750000e+00, 1.69000000e+00, 1.70000000e+00,
       1.72000000e+00, 1.74375000e+00, 1.75000000e+00, 1.78000000e+00,
       1.80000000e+00, 1.85000000e+00, 1.87000000e+00, 1.90000000e+00,
       1.91250000e+00, 1.92000000e+00, 1.95000000e+00, 1.98000000e+00,
       2.00000000e+00, 2.00812500e+00, 2.02500000e+00, 2.05000000e+00,
       2.10000000e+00, 2.13750000e+00, 2.15000000e+00, 2.20000000e+00,
       2.25000000e+00, 2.28000000e+00, 2.29500000e+00, 2.30000000e+00,
       2.32000000e+00, 2.35000000e+00, 2.36250000e+00, 2.38000000e+00,
       2.39000000e+00, 2.40000000e+00, 2.45000000e+00, 2.47500000e+00,
       2.50000000e+00, 2.53125000e+00, 2.55000000e+00, 2.58750000e+00,
       2.60000000e+00, 2.62000000e+00, 2.64375000e+00, 2.65000000e+00,
       2.66000000e+00, 2.68000000e+00, 2.69000000e+00, 2.69999900e+00,
       2.70000000e+00, 2.75000000e+00, 2.75625000e+00, 2.78000000e+00,
       2.80000000e+00, 2.81250000e+00, 2.85000000e+00, 2.86875000e+00,
       2.90000000e+00, 2.92500000e+00, 2.95000000e+00, 2.95312500e+00,
       2.98125000e+00, 3.00000000e+00, 3.03750000e+00, 3.05000000e+00,
       3.09375000e+00, 3.10000000e+00, 3.15000000e+00, 3.18000000e+00,
       3.19000000e+00, 3.20000000e+00, 3.25000000e+00, 3.26250000e+00,
       3.30000000e+00, 3.32000000e+00, 3.35000000e+00, 3.37500000e+00,
       3.38000000e+00, 3.40000000e+00, 3.45000000e+00, 3.48750000e+00,
       3.49999900e+00, 3.50000000e+00, 3.54000000e+00, 3.55000000e+00,
       3.60000000e+00, 3.65000000e+00, 3.69999900e+00, 3.70000000e+00,
       3.71250000e+00, 3.75000000e+00, 3.79000000e+00, 3.79999900e+00,
       3.80000000e+00, 3.82500000e+00, 3.85000000e+00, 3.86500000e+00,
       3.88000000e+00, 3.90000000e+00, 3.93750000e+00, 3.95000000e+00,
       3.99000000e+00, 3.99999900e+00, 4.00000000e+00, 4.01000000e+00,
       4.05000000e+00, 4.10000000e+00, 4.10625000e+00, 4.15000000e+00,
       4.20000000e+00, 4.25000000e+00, 4.27500000e+00, 4.30000000e+00,
       4.33000000e+00, 4.33125000e+00, 4.35000000e+00, 4.38750000e+00,
       4.40000000e+00, 4.40020000e+00, 4.45000000e+00, 4.46000000e+00,
       4.47000000e+00, 4.48000000e+00, 4.49999900e+00, 4.50000000e+00,
       4.55000000e+00, 4.59000000e+00, 4.60000000e+00, 4.62000000e+00,
       4.65000000e+00, 4.68000000e+00, 4.70000000e+00, 4.72500000e+00,
       4.75000000e+00, 4.78000000e+00, 4.79900000e+00, 4.80000000e+00,
       4.85000000e+00, 4.86000000e+00, 4.89000000e+00, 4.90000000e+00,
       4.95000000e+00, 4.99999900e+00, 5.00000000e+00, 5.05000000e+00,
       5.06250000e+00, 5.10000000e+00, 5.15000000e+00, 5.17000000e+00,
       5.17500000e+00, 5.20000000e+00, 5.25000000e+00, 5.30000000e+00,
       5.34999900e+00, 5.35000000e+00, 5.39900000e+00, 5.40000000e+00,
       5.45000000e+00, 5.50000000e+00, 5.55000000e+00, 5.60000000e+00,
       5.62500000e+00, 5.65000000e+00, 5.70000000e+00, 5.75000000e+00,
       5.80000000e+00, 5.85000000e+00, 5.86011100e+00, 5.90000000e+00,
       5.95000000e+00, 5.99999900e+00, 6.00000000e+00, 6.07500000e+00,
       6.10000000e+00, 6.15000000e+00, 6.18000000e+00, 6.18750000e+00,
       6.20000000e+00, 6.25000000e+00, 6.30000000e+00, 6.35000000e+00,
       6.40000000e+00, 6.41250000e+00, 6.45000000e+00, 6.48000000e+00,
       6.50000000e+00, 6.52500000e+00, 6.55000000e+00, 6.60000000e+00,
       6.65000000e+00, 6.70000000e+00, 6.75000000e+00, 6.79999900e+00,
       6.80000000e+00, 6.83000000e+00, 6.85000000e+00, 6.89999900e+00,
       6.90000000e+00, 6.95000000e+00, 6.97500000e+00, 7.00000000e+00,
       7.10000000e+00, 7.15000000e+00, 7.20000000e+00, 7.25000000e+00,
       7.30000000e+00, 7.31000000e+00, 7.31250000e+00, 7.35000000e+00,
       7.40000000e+00, 7.45000000e+00, 7.50000000e+00, 7.52000000e+00,
       7.55000000e+00, 7.60000000e+00, 7.65000000e+00, 7.70000000e+00,
       7.75000000e+00, 7.80000000e+00, 7.85000000e+00, 7.87500000e+00,
       7.90000000e+00, 7.95000000e+00, 7.99999900e+00, 8.00000000e+00,
       8.05000000e+00, 8.10000000e+00, 8.20000000e+00, 8.25000000e+00,
       8.30000000e+00, 8.35000000e+00, 8.40000000e+00, 8.43750000e+00,
       8.49000000e+00, 8.50000000e+00, 8.55000000e+00, 8.60000000e+00,
       8.65000000e+00, 8.70000000e+00, 8.75000000e+00, 8.80000000e+00,
       8.85000000e+00, 8.90000000e+00, 8.99000000e+00, 9.00000000e+00,
       9.05000000e+00, 9.20000000e+00, 9.22500000e+00, 9.25000000e+00,
       9.30000000e+00, 9.33750000e+00, 9.40000000e+00, 9.45000000e+00,
       9.50000000e+00, 9.56250000e+00, 9.60000000e+00, 9.65000000e+00,
       9.70000000e+00, 9.80000000e+00, 9.85000000e+00, 9.90000000e+00,
       9.99999900e+00, 1.00000000e+01, 1.01250000e+01, 1.01500000e+01,
       1.02000000e+01, 1.04000000e+01, 1.04500000e+01, 1.05000000e+01,
       1.05500000e+01, 1.06000000e+01, 1.07000000e+01, 1.08000000e+01,
       1.09000000e+01, 1.10000000e+01, 1.10250000e+01, 1.12000000e+01,
       1.12500000e+01, 1.13000000e+01, 1.15000000e+01, 1.16000000e+01,
       1.17000000e+01, 1.17500000e+01, 1.18000000e+01, 1.18125000e+01,
       1.18500000e+01, 1.19000000e+01, 1.20000000e+01, 1.20500000e+01,
       1.21000000e+01, 1.22000000e+01, 1.22500000e+01, 1.23000000e+01,
       1.23500000e+01, 1.23750000e+01, 1.24000000e+01, 1.25000000e+01,
       1.27000000e+01, 1.28000000e+01, 1.28500000e+01, 1.29000000e+01,
       1.29990000e+01, 1.30000000e+01, 1.32000000e+01, 1.34000000e+01,
       1.35000000e+01, 1.36000000e+01, 1.37000000e+01, 1.37500000e+01,
       1.38000000e+01, 1.38500000e+01, 1.39000000e+01, 1.39999990e+01,
       1.40000000e+01, 1.42000000e+01, 1.42500000e+01, 1.42999990e+01,
       1.43000000e+01, 1.45000000e+01, 1.46000000e+01, 1.46250000e+01,
       1.46500000e+01, 1.47000000e+01, 1.47500000e+01, 1.48000000e+01,
       1.49000000e+01, 1.50000000e+01, 1.52000000e+01, 1.53000000e+01,
       1.54700000e+01, 1.55000000e+01, 1.57500000e+01, 1.58000000e+01,
       1.60000000e+01, 1.63500000e+01, 1.65000000e+01, 1.65500000e+01,
       1.66000000e+01, 1.68000000e+01, 1.68750000e+01, 1.69000000e+01,
       1.69500000e+01, 1.70000000e+01, 1.72000000e+01, 1.73000000e+01,
       1.74000000e+01, 1.74375000e+01, 1.75000000e+01, 1.76000000e+01,
       1.77000000e+01, 1.78000000e+01, 1.79000000e+01, 1.79500000e+01,
       1.80000000e+01, 1.82000000e+01, 1.83000000e+01, 1.85000000e+01,
       1.87000000e+01, 1.87500000e+01, 1.88000000e+01, 1.89000000e+01,
       1.89999990e+01, 1.90000000e+01, 1.91250000e+01, 1.92000000e+01,
       1.95000000e+01, 1.97500000e+01, 1.98000000e+01, 1.99500000e+01,
       2.00000000e+01, 2.03000000e+01, 2.05000000e+01, 2.08000000e+01,
       2.08500000e+01, 2.10000000e+01, 2.15000000e+01, 2.17000000e+01,
       2.18500000e+01, 2.19999990e+01, 2.20000000e+01, 2.25000000e+01,
       2.29500000e+01, 2.30000000e+01, 2.35000000e+01, 2.36250000e+01,
       2.36500000e+01, 2.38000000e+01, 2.38500000e+01, 2.40000000e+01,
       2.45000000e+01, 2.47500000e+01, 2.50000000e+01, 2.53000000e+01,
       2.55000000e+01, 2.58000000e+01, 2.60000000e+01, 2.64000000e+01,
       2.65000000e+01, 2.67000000e+01, 2.70000000e+01, 2.75000000e+01,
       2.80000000e+01, 2.81250000e+01, 2.85000000e+01, 2.85500000e+01,
       2.90000000e+01, 2.95000000e+01, 2.99999990e+01, 3.00000000e+01,
       3.05000000e+01, 3.10000000e+01, 3.15000000e+01, 3.20000000e+01,
       3.25000000e+01, 3.30000000e+01, 3.35000000e+01, 3.40000000e+01,
       3.41000000e+01, 3.45000000e+01, 3.50000000e+01, 3.55000000e+01,
       3.60000000e+01, 3.68000000e+01, 3.70000000e+01, 3.75000000e+01,
       3.80000000e+01, 3.85000000e+01, 3.88000000e+01, 3.90000000e+01,
       3.95000000e+01, 4.00000000e+01, 4.20000000e+01, 4.30000000e+01,
       4.33500000e+01, 4.35000000e+01, 4.38000000e+01, 4.40000000e+01,
       4.50000000e+01, 4.60000000e+01, 4.70000000e+01, 4.80000000e+01,
       4.85000000e+01, 4.94000000e+01, 4.95000000e+01, 5.00000000e+01,
       5.06250000e+01, 5.10000000e+01, 5.35000000e+01, 5.40000000e+01,
       5.45000000e+01, 5.49000000e+01, 5.50000000e+01, 5.60000000e+01,
       5.65000000e+01, 5.70000000e+01, 5.80000000e+01, 5.90000000e+01,
       5.95000000e+01, 6.00000000e+01, 6.30000000e+01, 6.50000000e+01,
       6.80000000e+01, 6.90000000e+01, 7.00000000e+01, 7.10000000e+01,
       7.45000000e+01, 7.50000000e+01, 7.70000000e+01, 7.80000000e+01,
       7.90000000e+01, 8.00000000e+01, 8.20000000e+01, 8.50000000e+01,
       8.90000000e+01, 9.50000000e+01, 9.60000000e+01, 9.80000000e+01,
       9.90000000e+01, 1.00000000e+02, 1.20000000e+02, 1.25000000e+02,
       1.35000000e+02, 1.48000000e+02, 1.55000000e+02, 1.70000000e+02,
       1.75000000e+02, 1.80000000e+02, 1.90000000e+02, 1.98000000e+02,
       2.10000000e+02, 2.20000000e+02, 2.69999999e+02, 3.60000000e+02,
       4.54000000e+02]),)

# Evaluating Model
### Absolute Error
### Mean

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error