In [12]:
#importing all the necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

In [13]:
#loading the dataset
df = pd.read_csv("Train.csv")
test = pd.read_csv("Test.csv")
print(df.shape)
df.head()

(7205, 9)


Unnamed: 0,VehicleID,Location,Maker,Model,Year,Colour,Amount (Million Naira),Type,Distance
0,VHL12546,Abuja,Honda,Accord Coupe EX V-6,2011,Silver,2.2,Nigerian Used,
1,VHL18827,Ibadan,Hyundai,Sonata,2012,Silver,3.5,Nigerian Used,125000.0
2,VHL19499,Lagos,Lexus,RX 350,2010,Red,9.2,Foreign Used,110852.0
3,VHL17991,Abuja,Mercedes-Benz,GLE-Class,2017,Blue,22.8,Foreign Used,30000.0
4,VHL12170,Ibadan,Toyota,Highlander,2002,Red,2.6,Nigerian Used,125206.0


In [14]:
test["Location"].unique()

array(['Abuja', 'Lagos', 'Ibadan'], dtype=object)

In [15]:
submission = pd.read_csv("SampleSubmission.csv")
submission.head()

Unnamed: 0,VehicleID,Amount (Million Naira)
0,VHL18518,1.0
1,VHL17149,1.0
2,VHL10927,1.0
3,VHL12909,1.0
4,VHL12348,1.0


In [16]:
#checking for missing values
df.isnull().sum()

VehicleID                    0
Location                     0
Maker                        0
Model                        0
Year                        21
Colour                       0
Amount (Million Naira)      17
Type                       197
Distance                  2360
dtype: int64

In [17]:
#df.dtypes
test.dtypes

VehicleID     object
Location      object
Maker         object
Model         object
Year          object
Colour        object
Type          object
Distance     float64
dtype: object

In [18]:
def preprocessing(data):
    data = data[["VehicleID","Location","Maker","Model","Year","Colour","Type","Distance"]]
    
    #removing the , in the year and distance column
    data["Year"] = data["Year"].replace(",","",regex=True)
    data["Distance"] = data["Distance"].replace(",","",regex=True)
    
    #converting year and Distance to floats
    data["Year"] = pd.to_numeric(data["Year"].fillna(0).astype(float))
    data["Distance"] = pd.to_numeric(data["Distance"].fillna(0).astype(float))
    
    #filling in nan values
    data["Type"].fillna(value=data["Type"].mode()[0], inplace=True)
    data['Year'] = data['Year'].replace(0,float(data["Year"].mode()[0]))
    data["Location"].fillna(value=data["Location"].mode()[0], inplace=True)
    data["Maker"].fillna(value=data["Maker"].mode()[0], inplace=True)
    data['Distance'] = data['Distance'].replace(0,data["Distance"].mean())
    
    #Feature Engineering
    X = data[["Year","Type","Distance","Maker","Model","Colour","Location"]]
    X["Present_Year"]=2022
    X["Number_of_Years_old"] = X["Present_Year"]- X["Year"]
    
    
    #Encoding Categorical features
    X["Maker"] = LabelEncoder().fit_transform(X["Maker"])
    X["Model"] = LabelEncoder().fit_transform(X["Model"])
    X["Colour"]= LabelEncoder().fit_transform(X["Colour"])
    Type = X[["Type"]]
    Type = pd.get_dummies(Type,drop_first=True)
    Location = X["Location"]
    Location = pd.get_dummies(Location,drop_first=True)
    X = pd.concat([X,Type,Location], axis=1)
    X.drop(labels=["Type","Year","Present_Year","Location"], axis=1,inplace=True)
    col_transformer = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        ('scaler', StandardScaler(), ["Distance","Maker","Model","Colour",
                                     "Number_of_Years_old"]) # first 6 columns
        ]
    )

    col_transformer.fit(X)
    Xt = col_transformer.transform(X)
    #df = pd.DataFrame(Xt,columns=["Distance","Maker","Model","Colour","Number_of_Years_old",
     #                            "Type_Foreign Used","Type_Nigerian Used"])
    return Xt

In [19]:
df.isnull().sum()

VehicleID                    0
Location                     0
Maker                        0
Model                        0
Year                        21
Colour                       0
Amount (Million Naira)      17
Type                       197
Distance                  2360
dtype: int64

In [20]:
#Checking the unique features of the categorical features
print("Unique elements in Location: ", df["Location"].unique())
print("Unique elements in Maker: ", df["Maker"].unique())
print("Unique elements in Model: ", df["Model"].unique())
print("Unique elements in Colour: ", df["Colour"].unique())
print("Unique elements in Type: ", df["Type"].unique())

Unique elements in Location:  ['Abuja' 'Ibadan' 'Lagos']
Unique elements in Maker:  ['Honda' 'Hyundai' 'Lexus' 'Mercedes-Benz' 'Toyota' 'Acura' 'Dodge'
 'Nissan' 'Kia' 'BMW' 'Volvo' 'Ford' 'Land Rover' 'Lincoln' 'Peugeot'
 'Chevrolet' 'Audi' 'Jaguar' 'Infiniti' 'Porsche' 'Fiat' 'Maserati'
 'Volkswagen' 'Suzuki' 'Bentley' 'GAC' 'Mazda' 'Scion' 'Renault'
 'Mitsubishi' 'Mini' 'Pontiac' 'Cadillac' 'Ferrari' 'Jeep' 'Buick'
 'Rolls-Royce' 'GMC' 'Chrysler' 'Lamborghini' 'Citroen' 'King' 'BAW'
 'Saturn' 'Tata' 'Opel' 'JAC' 'MG' 'Hummer' 'Subaru' 'Rover' 'Saab'
 'Skoda' 'IVM' 'Brabus']
Unique elements in Model:  ['Accord Coupe EX V-6' 'Sonata' 'RX 350' ... 'Almera 1.6 Lux'
 'X5 3.0i Sports Activity' '320i SV Premium']
Unique elements in Colour:  ['Silver' 'Red' 'Blue' 'Black' 'Gold' 'White' 'Gray' 'Burgandy' 'Green'
 'Violet' 'Brown' 'Yellow' 'Orange' 'Pink' 'Beige' 'Purple' 'Ivory' 'G'
 'Teal' 'Mica' 'Pearl']
Unique elements in Type:  ['Nigerian Used' 'Foreign Used' 'Brand New' nan]


In [21]:

df.describe()

Unnamed: 0,Amount (Million Naira)
count,7188.0
mean,11.847999
std,25.318922
min,0.45
25%,3.5
50%,5.65
75%,11.6625
max,456.0


In [22]:
df["Amount (Million Naira)"].fillna(value=df["Amount (Million Naira)"].mean(), inplace=True)
Y = df["Amount (Million Naira)"]

In [23]:
#Building Model


# preprocess the train data 
processed_train = preprocessing(df)
processed_test = preprocessing(test)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pan

In [24]:
processed_train

array([[-0.23128415, -1.37643214, -1.44518408, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.33448395, -1.23669234,  1.5320183 , ...,  1.        ,
         1.        ,  0.        ],
       [ 0.19052853, -0.53799336,  1.08160464, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.23128415,  1.06901428,  0.92827233, ...,  1.        ,
         0.        ,  0.        ],
       [-0.06488347, -0.18864388, -0.18338693, ...,  0.        ,
         0.        ,  1.        ],
       [-0.08973074, -1.37643214, -1.42601755, ...,  1.        ,
         0.        ,  0.        ]])

In [25]:
#splitting the data
X_train,X_cross,Y_train,Y_cross = train_test_split(processed_train,Y,test_size=0.2)


In [35]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=50, max_depth=40)#50

gs = GridSearchCV(model,
                  {'max_features': np.arange(.05, 1, .05)},
                  cv=5,n_jobs=2,verbose=1
                )

gs.fit(X_train, Y_train)

Fitting 5 folds for each of 19 candidates, totalling 95 fits


GridSearchCV(cv=5,
             estimator=RandomForestRegressor(max_depth=40, n_estimators=50),
             n_jobs=2,
             param_grid={'max_features': array([0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 , 0.55,
       0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95])},
             verbose=1)

In [36]:
processed_test.shape

(2061, 9)

In [37]:
Y_train.shape

(5764,)

In [42]:
print(gs.best_score_)
y_pred = gs.predict(X_cross)
mean_squared_error(y_pred,Y_cross)

0.6981903622124405


115.41562140472153

In [39]:
# Get the predicted result for the test Data
test['Amount (Million Naira)'] = gs.predict(processed_test)


In [40]:
# Create submission DataFrame
submission = pd.DataFrame({"VehicleID": test["VehicleID"] ,
                          "Amount (Million Naira)": test['Amount (Million Naira)']})
submission

Unnamed: 0,VehicleID,Amount (Million Naira)
0,VHL18518,14.163960
1,VHL17149,5.651333
2,VHL10927,4.970000
3,VHL12909,3.799200
4,VHL12348,8.887200
...,...,...
2056,VHL17903,28.628000
2057,VHL14018,5.392200
2058,VHL17473,5.641400
2059,VHL11480,11.549200


In [41]:
# Create submission csv file csv file
submission.to_csv('fourth_submission.csv', index = False)