In [51]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
import lightgbm as lgb

from sklearn.metrics import mean_squared_error

In [52]:
data = pd.read_csv("hyundai.csv")

In [53]:
data

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax(£),mpg,engineSize
0,I20,2017,7999,Manual,17307,Petrol,145,58.9,1.2
1,Tucson,2016,14499,Automatic,25233,Diesel,235,43.5,2.0
2,Tucson,2016,11399,Manual,37877,Diesel,30,61.7,1.7
3,I10,2016,6499,Manual,23789,Petrol,20,60.1,1.0
4,IX35,2015,10199,Manual,33177,Diesel,160,51.4,2.0
...,...,...,...,...,...,...,...,...,...
4855,I30,2016,8680,Manual,25906,Diesel,0,78.4,1.6
4856,I40,2015,7830,Manual,59508,Diesel,30,65.7,1.7
4857,I10,2017,6830,Manual,13810,Petrol,20,60.1,1.0
4858,Tucson,2018,13994,Manual,23313,Petrol,145,44.8,1.6


In [54]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4860 entries, 0 to 4859
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         4860 non-null   object 
 1   year          4860 non-null   int64  
 2   price         4860 non-null   int64  
 3   transmission  4860 non-null   object 
 4   mileage       4860 non-null   int64  
 5   fuelType      4860 non-null   object 
 6   tax(£)        4860 non-null   int64  
 7   mpg           4860 non-null   float64
 8   engineSize    4860 non-null   float64
dtypes: float64(2), int64(4), object(3)
memory usage: 341.8+ KB


In [55]:
unneeded_columns = ['tax(£)', 'engineSize']

data = data.drop(unneeded_columns, axis=1)

In [56]:
data

Unnamed: 0,model,year,price,transmission,mileage,fuelType,mpg
0,I20,2017,7999,Manual,17307,Petrol,58.9
1,Tucson,2016,14499,Automatic,25233,Diesel,43.5
2,Tucson,2016,11399,Manual,37877,Diesel,61.7
3,I10,2016,6499,Manual,23789,Petrol,60.1
4,IX35,2015,10199,Manual,33177,Diesel,51.4
...,...,...,...,...,...,...,...
4855,I30,2016,8680,Manual,25906,Diesel,78.4
4856,I40,2015,7830,Manual,59508,Diesel,65.7
4857,I10,2017,6830,Manual,13810,Petrol,60.1
4858,Tucson,2018,13994,Manual,23313,Petrol,44.8


In [57]:
def onehot_encode(df, colums, prefixes):
    df = df.copy()
    for column, prefix in zip(colums, prefixes):
        dummies = pd.get_dummies(df[column], prefix=prefix)
        df = pd.concat([df, dummies], axis=1)
        df = df.drop(column, axis=1)
    return df

In [58]:
data = onehot_encode(
    data, ["model", "year", "transmission", "mileage", "fuelType", "mpg"],
    ["model", "year", "trans", "mile", "fuel", "mpg"]
)

In [59]:
data

Unnamed: 0,price,model_ Accent,model_ Amica,model_ Getz,model_ I10,model_ I20,model_ I30,model_ I40,model_ I800,model_ IX20,...,mpg_65.7,mpg_67.3,mpg_68.9,mpg_70.6,mpg_72.4,mpg_74.3,mpg_76.3,mpg_78.4,mpg_78.5,mpg_256.8
0,7999,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,14499,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,11399,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,6499,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,10199,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4855,8680,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4856,7830,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
4857,6830,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4858,13994,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [60]:
for column in data.columns:
    data[column] = data[column].fillna(data[column].mean())

In [61]:
data.isna().sum().sum()

0

In [None]:
y = data.loc[:, 'price']
x = data.drop('price', axis=1)