In [109]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import sklearn

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression, GammaRegressor, HuberRegressor, PoissonRegressor, LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesRegressor, BaggingRegressor
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder, LabelBinarizer

In [110]:
df = pd.read_excel("AI_Invasion_In-Class_Dataset.xlsx") 

In [111]:
df.shape

(4487, 8)

In [112]:
df.head(5)

Unnamed: 0,Location,Maker,Model,Year,Colour,Amount (Million ₦),Type,Distance_Km
0,Abuja,Mercedes-Benz,GLA 250,2015.0,Brown,14.5,Foreign Used,50000.0
1,Abuja,Hyundai,Accent,2013.0,Red,1.55,Nigerian Used,
2,Lagos,Lexus,GX 460 Premium,2011.0,White,14.0,Foreign Used,85000.0
3,Lagos,Lexus,ES 350,2011.0,Gray,4.95,Foreign Used,
4,Ibadan,Toyota,Verso 1.6,2009.0,Silver,1.69,Nigerian Used,118906.0


In [113]:
df.columns

Index(['Location', 'Maker', 'Model', 'Year', 'Colour', 'Amount (Million ₦)',
       'Type', 'Distance_Km'],
      dtype='object')

In [114]:
df.isnull().sum()

Location                 0
Maker                    0
Model                    0
Year                     0
Colour                   0
Amount (Million ₦)       0
Type                     0
Distance_Km           1555
dtype: int64

In [115]:
meanValue = df["Distance_Km"].mean()

In [116]:
meanValue

101038.32128240108

In [117]:
df["Distance_Km"] = df["Distance_Km"].fillna( df["Distance_Km"].mean())

In [118]:
df.isnull().sum()

Location              0
Maker                 0
Model                 0
Year                  0
Colour                0
Amount (Million ₦)    0
Type                  0
Distance_Km           0
dtype: int64

In [119]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4487 entries, 0 to 4486
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Location            4487 non-null   object 
 1   Maker               4487 non-null   object 
 2   Model               4487 non-null   object 
 3   Year                4487 non-null   float64
 4   Colour              4487 non-null   object 
 5   Amount (Million ₦)  4487 non-null   float64
 6   Type                4487 non-null   object 
 7   Distance_Km         4487 non-null   float64
dtypes: float64(3), object(5)
memory usage: 280.6+ KB


In [120]:
catFeatures = {"Location", "Maker", "Model", "Colour", "Type", "Year"}

In [121]:
for catFeature in catFeatures:
    print(catFeature,  df[catFeature].unique(), sep= ":")
    print("#" * 50)

Location:['Abuja' 'Lagos' 'Ibadan']
##################################################
Model:['GLA 250' 'Accent' 'GX 460 Premium' 'ES 350' 'Verso 1.6' 'Corolla 1.8 LE'
 'E350' 'GL-Class' 'RX 350 AWD' 'Land Cruiser 3.5 V6' 'Matrix'
 'Land Cruiser' 'C350' 'Corolla' 'IS 250 4WD' 'Venza V6' 'CX-7' 'RX 350'
 'Highlander Limited 4x4' 'RX' 'RX 350 F Sport AWD' 'Camry'
 'Land Cruiser 5.7 V8 VX-S' 'GLK-Class' 'Avalon' 'GS 300' 'Accord'
 '4-Runner' 'Civic' 'ES 330 Sedan' 'Corolla LE (1.8L 4cyl 2A)' 'Santa Fe'
 'Highlander' 'Elantra' '4-Runner Limited V6' 'Venza Limited FWD V6'
 'M Class ML 350 4Matic' 'M Class' 'Hyundai Kona' 'C300'
 'Camry XLE V6 FWD' 'Range Rover Velar' 'IS 250' 'Highlander Limited'
 'RAV4 Limited FWD' 'Cayenne' 'RX 330' 'RDX' 'Corolla XSE (1.8L 4cyl 2A)'
 'Micra' 'Vibe 2.4L' 'Tacoma TRD Sport' 'Focus 1.8 TDDi Viva' 'GX 460'
 'RAV4 Limited V6 4x4' 'Commander Limited 4x4' 'Tundra' 'Sonata' 'RAV4'
 'F-150' 'GX' 'GS 300 Automatic' 'Sienna' 'Altima' 'Hyundai Ioniq'
 'Tacoma' 'CLA-

In [15]:
df["Model"].value_counts()

Camry              437
Corolla            202
ES 350             188
C300               133
Accord             104
                  ... 
Ranger XL            1
MDX Base FWD         1
Odyssey 2.4 2WD      1
Traverse 1LT         1
Outback              1
Name: Model, Length: 897, dtype: int64

In [122]:
df.drop("Model", axis = "columns", inplace = True)

In [123]:
encFeatures = ["Location", "Maker", "Colour", "Type", "Year"]

In [124]:
# Creating a for loop to encode
for encFeature in encFeatures:
    # Giving each "encFeature" i.e each column, a value
    df[f"{encFeature}_cat"] = df[encFeature].astype("category")
    # Converting the categorized datapoints into integers
    df[f"{encFeature}_cat"].cat.codes
    # instantiating the converted datapoints into new series/ separate from the original series
    df[f"{encFeature}_cat"] = df[f"{encFeature}_cat"].cat.codes
    

     
    

In [125]:
df.head()

Unnamed: 0,Location,Maker,Year,Colour,Amount (Million ₦),Type,Distance_Km,Location_cat,Maker_cat,Colour_cat,Type_cat,Year_cat
0,Abuja,Mercedes-Benz,2015.0,Brown,14.5,Foreign Used,50000.0,0,26,3,1,22
1,Abuja,Hyundai,2013.0,Red,1.55,Nigerian Used,101038.321282,0,14,14,2,20
2,Lagos,Lexus,2011.0,White,14.0,Foreign Used,85000.0,2,23,17,1,18
3,Lagos,Lexus,2011.0,Gray,4.95,Foreign Used,101038.321282,2,23,6,1,18
4,Ibadan,Toyota,2009.0,Silver,1.69,Nigerian Used,118906.0,1,44,15,2,16


In [126]:
#df.drop(encFeatures, axis = 1, inplace = True)
df.drop(["Location","Maker","Year","Colour", "Type"], axis=1, inplace=True)

In [127]:
df.head()

Unnamed: 0,Amount (Million ₦),Distance_Km,Location_cat,Maker_cat,Colour_cat,Type_cat,Year_cat
0,14.5,50000.0,0,26,3,1,22
1,1.55,101038.321282,0,14,14,2,20
2,14.0,85000.0,2,23,17,1,18
3,4.95,101038.321282,2,23,6,1,18
4,1.69,118906.0,1,44,15,2,16


In [128]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4487 entries, 0 to 4486
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Amount (Million ₦)  4487 non-null   float64
 1   Distance_Km         4487 non-null   float64
 2   Location_cat        4487 non-null   int8   
 3   Maker_cat           4487 non-null   int8   
 4   Colour_cat          4487 non-null   int8   
 5   Type_cat            4487 non-null   int8   
 6   Year_cat            4487 non-null   int8   
dtypes: float64(2), int8(5)
memory usage: 92.1 KB


# Performing Data Segmentation

In [129]:
X = df.drop("Amount (Million ₦)", axis = "columns")

In [130]:
X

Unnamed: 0,Distance_Km,Location_cat,Maker_cat,Colour_cat,Type_cat,Year_cat
0,50000.000000,0,26,3,1,22
1,101038.321282,0,14,14,2,20
2,85000.000000,2,23,17,1,18
3,101038.321282,2,23,6,1,18
4,118906.000000,1,44,15,2,16
...,...,...,...,...,...,...
4482,90282.000000,2,23,2,1,13
4483,85000.000000,2,23,2,1,14
4484,65214.000000,0,26,7,1,21
4485,45000.000000,2,23,1,1,27


In [131]:
y

0       14.50
1        1.55
2       14.00
3        4.95
4        1.69
        ...  
4482     4.60
4483     4.50
4484    10.45
4485    31.00
4486    14.00
Name: Amount (Million ₦), Length: 4487, dtype: float64

In [82]:
df["Distance_Km"].value_counts()

101038.321282    1555
75000.000000       39
65000.000000       38
85000.000000       35
80000.000000       32
                 ... 
62385.000000        1
61515.000000        1
62527.000000        1
19736.000000        1
90282.000000        1
Name: Distance_Km, Length: 1933, dtype: int64

In [92]:
df["Colour_cat"].unique()

array([ 3, 14, 17,  6, 15,  1,  2,  5,  7,  0, 13, 10,  4,  8, 12, 11, 18,
        9, 16], dtype=int8)

In [93]:
df["Type_cat"].unique()

array([1, 2, 0], dtype=int8)

In [26]:
y = df["Amount (Million ₦)"]

In [27]:
y

0       14.50
1        1.55
2       14.00
3        4.95
4        1.69
        ...  
4482     4.60
4483     4.50
4484    10.45
4485    31.00
4486    14.00
Name: Amount (Million ₦), Length: 4487, dtype: float64

In [132]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=1234, shuffle=True)

In [133]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4487 entries, 0 to 4486
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Distance_Km   4487 non-null   float64
 1   Location_cat  4487 non-null   int8   
 2   Maker_cat     4487 non-null   int8   
 3   Colour_cat    4487 non-null   int8   
 4   Type_cat      4487 non-null   int8   
 5   Year_cat      4487 non-null   int8   
dtypes: float64(1), int8(5)
memory usage: 57.1 KB


In [30]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(3140, 6)
(1347, 6)
(3140,)
(1347,)


In [31]:
y_test

1070     1.45
4173     6.80
3186    10.50
3854     4.00
3910    12.90
        ...  
2055     4.00
1619     5.35
3019     5.40
2664     3.75
912      3.40
Name: Amount (Million ₦), Length: 1347, dtype: float64

In [134]:
linear = LinearRegression()

In [135]:
linear.fit(X_train, y_train)

LinearRegression()

In [136]:
linear.predict(X_test)

array([-10.32917044,  11.7652606 ,  21.93416557, ...,   9.31520146,
         7.6200497 ,   9.85098018])

# Evaluating Model
### Absolute Error

In [137]:
linearPred = linear.predict(X_test)

In [138]:
mean_absolute_error(y_test, linearPred)

7.94593449242345

# Other Regressors

## Gamma Regressor

In [37]:
gamma = GammaRegressor()

In [38]:
gamma.fit(X_train, y_train)

  return -2 * (y - y_pred) / self.unit_variance(y_pred)
  temp = d1 * family.deviance_derivative(y, y_pred, weights)
  devp = np.concatenate(([temp.sum()], temp @ X))
  dev = 2 * (np.log(y_pred / y) + y / y_pred - 1)
  dev = 2 * (np.log(y_pred / y) + y / y_pred - 1)
  dev = 2 * (np.log(y_pred / y) + y / y_pred - 1)
  dev = 2 * (np.log(y_pred / y) + y / y_pred - 1)
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res)


GammaRegressor()

In [39]:
gamma.predict(X_test)

array([11.26406602, 11.26406602, 11.26406602, ..., 11.26406602,
       11.26406602, 11.26406602])

In [40]:
gammaPred = gamma.predict(X_test)

In [41]:
mean_absolute_error(y_test, gammaPred)

9.571488888333118

# HuberRegressor

In [42]:
huber = HuberRegressor()

In [43]:
huber.fit(X_train, y_train)

HuberRegressor()

In [44]:
huber.predict(X_test)

array([ 0.58230236,  8.31099242, 10.95063323, ...,  7.12934138,
        5.54231209,  6.63405118])

In [45]:
huberPred = huber.predict(X_test)

In [46]:
mean_absolute_error(y_test, huberPred)

6.385831027536613

## PoissonRegressor

In [47]:
poison = PoissonRegressor()

In [48]:
poison.fit(X_train, y_train)

  return -2 * (y - y_pred) / self.unit_variance(y_pred)
  temp = d1 * family.deviance_derivative(y, y_pred, weights)
  dev = 2 * (xlogy(y, y / y_pred) - y + y_pred)


PoissonRegressor()

In [49]:
poison.predict(X_test)

array([11.26406602, 11.26406602, 11.26406602, ..., 11.26406602,
       11.26406602, 11.26406602])

In [50]:
poisonPred = poison.predict(X_test)

In [51]:
mean_absolute_error(y_test, poisonPred)

9.571488888333118

## AdaBoostRegressor

In [52]:
ada = AdaBoostRegressor()

In [53]:
ada.fit(X_train, y_train)

AdaBoostRegressor()

In [54]:
ada.predict(X_test)

array([14.36301354, 32.71411776, 36.66129032, ..., 28.60184391,
       14.36301354, 12.53466781])

In [55]:
adaPred = ada.predict(X_test)

In [56]:
mean_absolute_error(y_test, adaPred)

20.79512160351856

## GradientBoostingRegressor

In [139]:
grad = GradientBoostingRegressor()

In [140]:
grad.fit(X_train, y_train)

GradientBoostingRegressor()

In [141]:
grad.predict(X_test)

array([ 1.62859943,  7.50274722, 14.42774657, ...,  7.03947282,
        4.73458257,  2.52282439])

In [142]:
gradPred = grad.predict(X_test)

In [143]:
mean_absolute_error(y_test, gradPred)

4.022826846434734

In [106]:
gradPred

array([ 1.62859943,  7.50274722, 14.42774657, ...,  7.03947282,
        4.73458257,  2.52282439])

## ExtraTreesRegressor

In [62]:
extra = ExtraTreesRegressor()

In [63]:
extra.fit(X_train, y_train)

ExtraTreesRegressor()

In [64]:
extra.predict(X_test)

array([ 1.744475  ,  6.9869    , 10.40466667, ...,  7.90083333,
        3.85      ,  3.7       ])

In [65]:
extraPred = extra.predict(X_test)

In [66]:
mean_absolute_error(y_test, extraPred)

4.611022033366741

## RadiusNeighborsRegressor

In [67]:
rad = RadiusNeighborsRegressor()

In [68]:
rad.fit(X_train, y_train)

RadiusNeighborsRegressor()

In [69]:
rad.predict(X_test)



array([       nan,        nan,        nan, ..., 9.58333333, 5.23857143,
       3.46666667])

In [70]:
radPred = rad.predict(X_test)



## RandomForestRegressor

In [71]:
random = RandomForestRegressor()

In [72]:
random.fit(X_train, y_train)

RandomForestRegressor()

In [73]:
random.predict(X_test)

array([ 1.947775  ,  6.67414996, 10.08233333, ...,  7.78190833,
        4.221417  ,  3.62483333])

In [74]:
randomPred = random.predict(X_test)

In [75]:
mean_absolute_error(y_test, randomPred)

4.2835035880504995

# Exporting for Model Deployment

In [76]:
grad = GradientBoostingRegressor()

In [77]:
import pickle

In [145]:
filename = "vehicle.sav"
pickle.dump(grad, open(filename, "wb"))

In [95]:
X

Unnamed: 0,Distance_Km,Location_cat,Maker_cat,Colour_cat,Type_cat,Year_cat
0,50000.000000,0,26,3,1,22
1,101038.321282,0,14,14,2,20
2,85000.000000,2,23,17,1,18
3,101038.321282,2,23,6,1,18
4,118906.000000,1,44,15,2,16
...,...,...,...,...,...,...
4482,90282.000000,2,23,2,1,13
4483,85000.000000,2,23,2,1,14
4484,65214.000000,0,26,7,1,21
4485,45000.000000,2,23,1,1,27


In [107]:
linear = LinearRegression()

In [144]:
filename = "mypred.sav"
pickle.dump(grad, open(filename, "wb"))