In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats
import plotly.express as px

sns.set_palette('muted')
sns.set_color_codes('muted')
sns.set_style('white')

import warnings
warnings.filterwarnings('ignore')

Use the line below if you have a high-DPI screen and don't want Matplotlib plots to look blurry.

In [2]:
%config InlineBackend.figure_format = 'retina'

# Business task

# Dataset
The dataset is already collected:

In [3]:
df = pd.read_csv('car_train.csv')
test = pd.read_csv('car_test.csv')

In [4]:
df['sample'] = 'train'
test['sample'] = 'test'

In [5]:
# the test rows are processed together with the training rows and are split back out later
train = pd.concat([df, test], axis=0, ignore_index=True)

In [6]:
train

Unnamed: 0,id,brand,model,year,transmission,mileage,fuelType,tax,mpg,engineSize,tax(£),price,sample
0,0,vauxhall,Astra,2017.0,Manual,19000 km,Diesel,20.0,72.4,1.6,,10095.0,train
1,1,ford,Focus,2017.0,Manual,26973 km,Petrol,145.0,57.7,1.0,,11999.0,train
2,2,Mercedes-Benz,A Class,2019.0,Manual,2078 km,Petrol,145.0,47.1,1.3,,23299.0,train
3,3,toyota,Aygo,2016.0,Manual,20169 km,Petrol,0.0,69.0,1.0,,6698.0,train
4,4,bmw,4 Series,2016.0,Manual,97706 km,Diesel,30.0,62.8,2.0,,11250.0,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...
108521,9573,ford,Mondeo,2016.0,Semi-Auto,59601 km,Diesel,125.0,57.7,2.0,,,test
108522,9574,audi,A3,2015.0,Manual,89060 km,Diesel,30.0,67.3,2.0,,,test
108523,9575,vw,Polo,2019.0,Manual,1267 km,Petrol,150.0,49.6,1.0,,,test
108524,9576,vw,Golf,2019.0,Semi-Auto,1658 km,Petrol,145.0,36.2,2.0,,,test


\>100K used-car listings from a British used-car site.

- **brand** : car manufacturer
- **model** : car model
- **year** : registration year
- **transmission** : type of gearbox (Manual, Semi-Auto, Automatic, Other)
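- **mileage** : distance driven; the raw column holds strings with a ` km` suffix (e.g. `19000 km`), although the source description lists the unit as miles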
- **fuelType** : engine fuel type (Diesel, Petrol, Hybrid, Electric, Other)
- **tax** : road tax
- **mpg** : miles per gallon (how many miles car can cover using 1 gallon of fuel; more miles -- less money spent on fuel)
- **engineSize** : engine size (volume) in litres
- **tax(£)** : road tax, £
- **price** : car price, £

# Solution

### Prep

In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108526 entries, 0 to 108525
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            108526 non-null  int64  
 1   brand         108526 non-null  object 
 2   model         108526 non-null  object 
 3   year          108517 non-null  float64
 4   transmission  108526 non-null  object 
 5   mileage       108526 non-null  object 
 6   fuelType      108526 non-null  object 
 7   tax           94312 non-null   float64
 8   mpg           99171 non-null   float64
 9   engineSize    108526 non-null  float64
 10  tax(£)        4859 non-null    float64
 11  price         98948 non-null   float64
 12  sample        108526 non-null  object 
dtypes: float64(6), int64(1), object(6)
memory usage: 10.8+ MB


In [8]:
%%time
def nan_search(df):
    """Return the percentage of missing values per column of ``df``.

    The result is indexed by column name and rounded to three decimals.
    """
    missing_per_column = df.isna().sum()
    share_percent = missing_per_column / len(df) * 100
    return round(share_percent, 3)

CPU times: total: 0 ns
Wall time: 0 ns


In [9]:
nan_search(df)

id               0.000
brand            0.000
model            0.000
year             0.009
transmission     0.000
mileage          0.000
fuelType         0.000
tax             13.116
mpg              8.613
engineSize       0.000
tax(£)          95.497
price            0.000
sample           0.000
dtype: float64

In [10]:
nan_search(test)

id               0.000
brand            0.000
model            0.000
year             0.000
transmission     0.000
mileage          0.000
fuelType         0.000
tax             12.905
mpg              8.697
engineSize       0.000
tax(£)          95.792
sample           0.000
dtype: float64

### Year

In [11]:
# rows without a registration year; the mean year is a possible fill value
train[train['year'].isna()]

Unnamed: 0,id,brand,model,year,transmission,mileage,fuelType,tax,mpg,engineSize,tax(£),price,sample
3889,3889,Hyundai,I10,,Manual,16512 km,Petrol,,57.7,1.2,145.0,7289.0,train
10869,10869,vw,Touareg,,Automatic,7094 km,Diesel,145.0,34.0,3.0,,36916.0,train
41026,41026,ford,Fiesta,,Manual,19741 km,Petrol,145.0,65.7,1.0,,10680.0,train
42118,42118,vw,Polo,,Manual,42140 km,Petrol,125.0,51.4,1.2,,6290.0,train
42551,42551,vauxhall,Mokka X,,Manual,20750 km,Petrol,150.0,47.1,1.4,,10495.0,train
53163,53163,audi,A6,,Automatic,17845 km,Diesel,145.0,58.9,2.0,,22800.0,train
61339,61339,vauxhall,Corsa,,Manual,17880 km,Petrol,145.0,54.3,1.4,,8995.0,train
66885,66885,vauxhall,Corsa,,Manual,3203 km,Petrol,150.0,43.5,1.4,,10975.0,train
77720,77720,vw,Sharan,,Manual,11565 km,Diesel,145.0,43.5,2.0,,23646.0,train


In [12]:
# 0 marks rows whose year is missing (and gets imputed below), 1 marks observed years
train['year_flg'] = np.where(train['year'].isna(), 0, 1)

In [13]:
# Fill the missing registration years with the mean year.
# The filled column is assigned back explicitly: calling fillna(inplace=True) on
# `train.year` operates on an intermediate object, which under pandas
# Copy-on-Write leaves `train` itself unchanged.
train['year'] = train['year'].fillna(train['year'].mean())

### MPG search

In [14]:
nan_search(train)

id               0.000
brand            0.000
model            0.000
year             0.000
transmission     0.000
mileage          0.000
fuelType         0.000
tax             13.097
mpg              8.620
engineSize       0.000
tax(£)          95.523
price            8.826
sample           0.000
year_flg         0.000
dtype: float64

In [15]:
# which samples (train / test) contain rows with a missing mpg
train.loc[train['mpg'].isna(), 'sample'].unique()

array(['train', 'test'], dtype=object)

In [16]:
# 0 marks rows whose mpg is missing (imputed on the next line), 1 marks observed values
train['mpg_flg'] = np.where(train['mpg'].isna(), 0, 1)
# Impute with the median and assign the result back: fillna(inplace=True) on
# `train.mpg` acts on an intermediate object and, under pandas Copy-on-Write,
# would leave `train` unchanged.
train['mpg'] = train['mpg'].fillna(train['mpg'].median())

In [17]:
nan_search(train)

id               0.000
brand            0.000
model            0.000
year             0.000
transmission     0.000
mileage          0.000
fuelType         0.000
tax             13.097
mpg              0.000
engineSize       0.000
tax(£)          95.523
price            8.826
sample           0.000
year_flg         0.000
mpg_flg          0.000
dtype: float64

### Categorical features

To build (one of) the simplest LR models that works, we need to:
1. select only numeric features.
2. drop incomplete examples.

In [18]:
def find_cat(data, num_uniq=10):
    """List the columns of ``data`` that look categorical and print why.

    A column is reported when its first non-missing value is a ``str`` and/or
    when it has at most ``num_uniq`` distinct non-missing values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are inspected.
    num_uniq : int, default 10
        Largest number of distinct values that still counts as "few".

    Returns
    -------
    list
        Names of the reported columns, in column order.
    """
    columns = []
    for name in data.columns:
        series = data[name]
        # Inspect the first non-missing value by position: label-based ``[0]``
        # raises KeyError when the index has no label 0, and a leading NaN
        # would hide a string column.
        present = series.dropna()
        is_string = len(present) > 0 and isinstance(present.iloc[0], str)
        few_unique = series.nunique() <= num_uniq
        if is_string or few_unique:
            message = str(name)
            if is_string:
                message += " строка,"
            if few_unique:
                message += " мало уникальных"
            columns.append(name)
            print(message)
    return columns
            
find_cat(train)

brand строка, мало уникальных
model строка,
transmission строка, мало уникальных
mileage строка,
fuelType строка, мало уникальных
sample строка, мало уникальных
year_flg мало уникальных
mpg_flg мало уникальных


['brand',
 'model',
 'transmission',
 'mileage',
 'fuelType',
 'sample',
 'year_flg',
 'mpg_flg']

In [19]:
cat_features = ['brand',
                'transmission',
                'fuelType']

In [20]:
numeric_features = ['year', 'mileage', 'tax', 'mpg', 'engineSize', 'year_flg', 'mpg_flg']

### Transmission

**Possible directions:**

- work more carefully with missing values
- work more carefully with feature scales
- include more features in the model
- optimize regression coefficients (using L1 or L2 regularization)

In [21]:
train.transmission.value_counts()

Manual       61319
Semi-Auto    24903
Automatic    22294
Other           10
Name: transmission, dtype: int64

We will use the **dummy** (**one-hot**) encoding technique to encode these features. This is not the only option available: ordinal encoding and target encoding are also possible for this feature ([brief explanation](https://medium.com/analytics-vidhya/target-encoding-vs-one-hot-encoding-with-simple-examples-276a7e7b3e64), [OHE API](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html#sklearn.preprocessing.OneHotEncoder), [ordinal API](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html#sklearn.preprocessing.OrdinalEncoder), [target API](https://contrib.scikit-learn.org/category_encoders/targetencoder.html)).

In [22]:
train.head()

Unnamed: 0,id,brand,model,year,transmission,mileage,fuelType,tax,mpg,engineSize,tax(£),price,sample,year_flg,mpg_flg
0,0,vauxhall,Astra,2017.0,Manual,19000 km,Diesel,20.0,72.4,1.6,,10095.0,train,1,1
1,1,ford,Focus,2017.0,Manual,26973 km,Petrol,145.0,57.7,1.0,,11999.0,train,1,1
2,2,Mercedes-Benz,A Class,2019.0,Manual,2078 km,Petrol,145.0,47.1,1.3,,23299.0,train,1,1
3,3,toyota,Aygo,2016.0,Manual,20169 km,Petrol,0.0,69.0,1.0,,6698.0,train,1,1
4,4,bmw,4 Series,2016.0,Manual,97706 km,Diesel,30.0,62.8,2.0,,11250.0,train,1,1


In [23]:
# One-hot encode the nominal columns collected in `cat_features`
# (brand, transmission, fuelType) instead of repeating the list here.
# dtype is pinned so the indicator columns are 0/1 integers regardless of the
# pandas version (the default dtype of get_dummies differs between versions).
train = pd.get_dummies(train, columns=cat_features, dtype='uint8')
train.head()

Unnamed: 0,id,model,year,mileage,tax,mpg,engineSize,tax(£),price,sample,...,brand_vw,transmission_Automatic,transmission_Manual,transmission_Other,transmission_Semi-Auto,fuelType_Diesel,fuelType_Electric,fuelType_Hybrid,fuelType_Other,fuelType_Petrol
0,0,Astra,2017.0,19000 km,20.0,72.4,1.6,,10095.0,train,...,0,0,1,0,0,1,0,0,0,0
1,1,Focus,2017.0,26973 km,145.0,57.7,1.0,,11999.0,train,...,0,0,1,0,0,0,0,0,0,1
2,2,A Class,2019.0,2078 km,145.0,47.1,1.3,,23299.0,train,...,0,0,1,0,0,0,0,0,0,1
3,3,Aygo,2016.0,20169 km,0.0,69.0,1.0,,6698.0,train,...,0,0,1,0,0,0,0,0,0,1
4,4,4 Series,2016.0,97706 km,30.0,62.8,2.0,,11250.0,train,...,0,0,1,0,0,1,0,0,0,0


In [24]:
nan_search(train)

id                         0.000
model                      0.000
year                       0.000
mileage                    0.000
tax                       13.097
mpg                        0.000
engineSize                 0.000
tax(£)                    95.523
price                      8.826
sample                     0.000
year_flg                   0.000
mpg_flg                    0.000
brand_Hyundai              0.000
brand_Mercedes-Benz        0.000
brand_audi                 0.000
brand_bmw                  0.000
brand_focus                0.000
brand_ford                 0.000
brand_skoda                0.000
brand_toyota               0.000
brand_vauxhall             0.000
brand_vw                   0.000
transmission_Automatic     0.000
transmission_Manual        0.000
transmission_Other         0.000
transmission_Semi-Auto     0.000
fuelType_Diesel            0.000
fuelType_Electric          0.000
fuelType_Hybrid            0.000
fuelType_Other             0.000
fuelType_P

In [25]:
sorted(train.year.unique())

[1970.0,
 1991.0,
 1995.0,
 1996.0,
 1997.0,
 1998.0,
 1999.0,
 2000.0,
 2001.0,
 2002.0,
 2003.0,
 2004.0,
 2005.0,
 2006.0,
 2007.0,
 2008.0,
 2009.0,
 2010.0,
 2011.0,
 2012.0,
 2013.0,
 2014.0,
 2015.0,
 2016.0,
 2017.0,
 2017.0963535667222,
 2018.0,
 2019.0,
 2020.0,
 2060.0]

In [26]:
# store the year as an integer (fractional parts, e.g. from the mean fill, are truncated)
train['year'] = train['year'].astype('int64')

In [27]:
import numpy as np
# strip the textual ' km' suffix and store the reading as a float
train['mileage'] = train['mileage'].str.replace(' km', '').astype(float)

In [28]:
# tax(£) is present only on rows where tax is missing, so it can be used to fill those NaNs
nan_search(train[train['tax(£)'].notna()])

id                          0.000
model                       0.000
year                        0.000
mileage                     0.000
tax                       100.000
mpg                         0.000
engineSize                  0.000
tax(£)                      0.000
price                       8.294
sample                      0.000
year_flg                    0.000
mpg_flg                     0.000
brand_Hyundai               0.000
brand_Mercedes-Benz         0.000
brand_audi                  0.000
brand_bmw                   0.000
brand_focus                 0.000
brand_ford                  0.000
brand_skoda                 0.000
brand_toyota                0.000
brand_vauxhall              0.000
brand_vw                    0.000
transmission_Automatic      0.000
transmission_Manual         0.000
transmission_Other          0.000
transmission_Semi-Auto      0.000
fuelType_Diesel             0.000
fuelType_Electric           0.000
fuelType_Hybrid             0.000
fuelType_Other

In [29]:
# index labels of the rows that carry a tax(£) value
index_tax = train.loc[train['tax(£)'].notna()].index

In [30]:
np.sum(train.index.isin(index_tax))

4859

In [31]:
# merged tax column: tax(£) on the rows listed in index_tax, the original tax everywhere else
train['tax_new'] = np.where(train.index.isin(index_tax), train['tax(£)'], train['tax'])

In [32]:
# 0 where tax_new was taken from tax(£), 1 where it comes from the original tax column
train['tax_flg'] = np.where(train.index.isin(index_tax), 0, 1)

In [33]:
# the two source columns are now merged into tax_new
train = train.drop(columns=['tax', 'tax(£)'])

In [34]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108526 entries, 0 to 108525
Data columns (total 31 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   id                      108526 non-null  int64  
 1   model                   108526 non-null  object 
 2   year                    108526 non-null  int64  
 3   mileage                 108526 non-null  float64
 4   mpg                     108526 non-null  float64
 5   engineSize              108526 non-null  float64
 6   price                   98948 non-null   float64
 7   sample                  108526 non-null  object 
 8   year_flg                108526 non-null  int32  
 9   mpg_flg                 108526 non-null  int32  
 10  brand_Hyundai           108526 non-null  uint8  
 11  brand_Mercedes-Benz     108526 non-null  uint8  
 12  brand_audi              108526 non-null  uint8  
 13  brand_bmw               108526 non-null  uint8  
 14  brand_focus         

In [35]:
# Most frequent tax value among the rows whose tax came from the original `tax` column.
# mode() returns a Series (several entries on ties), so its first entry is taken
# explicitly instead of calling float() on the whole Series.
tax_mode = float(train.loc[train['tax_flg'] == 1, 'tax_new'].mode().iloc[0])
# Assign the filled column back: fillna(inplace=True) on `train.tax_new` acts on an
# intermediate object and, under pandas Copy-on-Write, would leave `train` unchanged.
train['tax_new'] = train['tax_new'].fillna(tax_mode)

In [36]:
nan_search(train)

id                        0.000
model                     0.000
year                      0.000
mileage                   0.000
mpg                       0.000
engineSize                0.000
price                     8.826
sample                    0.000
year_flg                  0.000
mpg_flg                   0.000
brand_Hyundai             0.000
brand_Mercedes-Benz       0.000
brand_audi                0.000
brand_bmw                 0.000
brand_focus               0.000
brand_ford                0.000
brand_skoda               0.000
brand_toyota              0.000
brand_vauxhall            0.000
brand_vw                  0.000
transmission_Automatic    0.000
transmission_Manual       0.000
transmission_Other        0.000
transmission_Semi-Auto    0.000
fuelType_Diesel           0.000
fuelType_Electric         0.000
fuelType_Hybrid           0.000
fuelType_Other            0.000
fuelType_Petrol           0.000
tax_new                   0.000
tax_flg                   0.000
dtype: f

### Outliers fighting

In [37]:
numeric_features = ['year', 'mileage', 'tax_new', 'mpg', 'engineSize']

In [38]:
# quartiles and inter-quartile range of the numeric features, training rows only
train_numeric = train.loc[train['sample'] == 'train', numeric_features]
q1, q3 = (np.quantile(train_numeric, level, axis=0) for level in (.25, .75))
iqr = scipy.stats.iqr(train_numeric, axis=0)

In [39]:
# outlier fences per feature: row 0 = Q1 - 2.3*IQR (lower), row 1 = Q3 + 2.3*IQR (upper)
boundaries = [list(q1 - 2.3*iqr), list(q3 + 2.3*iqr)]
outliers_remedy = pd.DataFrame(boundaries, columns=numeric_features)

In [40]:
outliers_remedy

Unnamed: 0,year,mileage,tax_new,mpg,engineSize
0,2009.1,-49508.2,79.0,14.21,-0.64
1,2025.9,89240.2,191.0,94.29,3.84


In [41]:
# outliers are best removed, but only from the training sample!
train[train['sample']=='train'][numeric_features].describe()

Unnamed: 0,year,mileage,tax_new,mpg,engineSize
count,98948.0,98948.0,98948.0,98948.0,98948.0
mean,2017.095323,23038.666825,122.406719,55.0963,1.661181
std,2.138052,21213.11568,60.837794,15.391497,0.556509
min,1970.0,1.0,0.0,0.3,0.0
25%,2016.0,7477.75,125.0,47.1,1.2
50%,2017.0,17289.5,145.0,54.3,1.6
75%,2019.0,32254.25,145.0,61.4,2.0
max,2060.0,323000.0,580.0,470.8,6.6


In [42]:
# removing the outliers: start from the training rows only
train_train = train[train['sample']=='train']

In [43]:
# keep rows whose year lies inside the [lower, upper] fence stored in outliers_remedy
train_train = train_train[train_train['year'].between(outliers_remedy.loc[0, 'year'], outliers_remedy.loc[1, 'year'])]

In [44]:
# keep rows whose mileage lies inside the [lower, upper] fence stored in outliers_remedy
train_train = train_train[train_train['mileage'].between(outliers_remedy.loc[0, 'mileage'], outliers_remedy.loc[1, 'mileage'])]

In [45]:
# keep rows whose tax_new lies inside the [lower, upper] fence stored in outliers_remedy
train_train = train_train[train_train['tax_new'].between(outliers_remedy.loc[0, 'tax_new'], outliers_remedy.loc[1, 'tax_new'])]

In [46]:
# keep rows whose mpg lies inside the [lower, upper] fence stored in outliers_remedy
train_train = train_train[train_train['mpg'].between(outliers_remedy.loc[0, 'mpg'], outliers_remedy.loc[1, 'mpg'])]

In [47]:
# keep rows whose engineSize lies inside the [lower, upper] fence stored in outliers_remedy
train_train = train_train[train_train['engineSize'].between(outliers_remedy.loc[0, 'engineSize'], outliers_remedy.loc[1, 'engineSize'])]

In [48]:
# Training rows only: engine sizes below 0.6 are overwritten with the training median.
# NOTE(review): the later `train = pd.concat([train_train, ...])` cell rebuilds the
# training part from `train_train`, which was derived before this cell, so this
# replacement does not appear to reach the modelling data — confirm the intent.
sub = train[train['sample']=='train'].copy()
sub['engineSize'] = sub['engineSize'].mask(sub['engineSize'] < 0.6, sub['engineSize'].median())
train = pd.concat([sub, train[train['sample']=='test']], axis=0, ignore_index=True)

In [49]:
# Training rows only: mileages below 1500 are overwritten with the training median.
# NOTE(review): the later `train = pd.concat([train_train, ...])` cell rebuilds the
# training part from `train_train`, which was derived before this cell, so this
# replacement does not appear to reach the modelling data — confirm the intent.
sub1 = train[train['sample']=='train'].copy()
sub1['mileage'] = sub1['mileage'].mask(sub1['mileage'] < 1500, sub1['mileage'].median())
train = pd.concat([sub1, train[train['sample']=='test']], axis=0, ignore_index=True)

In [50]:
train_train[numeric_features].describe()

Unnamed: 0,year,mileage,tax_new,mpg,engineSize
count,71164.0,71164.0,71164.0,71164.0,71164.0
mean,2017.796414,16910.91698,144.423585,52.458637,1.643268
std,1.591428,15694.679137,6.848933,9.431533,0.510054
min,2010.0,1.0,110.0,24.8,0.0
25%,2017.0,5260.75,145.0,47.1,1.2
50%,2018.0,12431.5,145.0,53.3,1.5
75%,2019.0,23840.0,145.0,57.7,2.0
max,2020.0,89232.0,190.0,94.1,3.5


In [51]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108526 entries, 0 to 108525
Data columns (total 31 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   id                      108526 non-null  int64  
 1   model                   108526 non-null  object 
 2   year                    108526 non-null  int64  
 3   mileage                 108526 non-null  float64
 4   mpg                     108526 non-null  float64
 5   engineSize              108526 non-null  float64
 6   price                   98948 non-null   float64
 7   sample                  108526 non-null  object 
 8   year_flg                108526 non-null  int32  
 9   mpg_flg                 108526 non-null  int32  
 10  brand_Hyundai           108526 non-null  uint8  
 11  brand_Mercedes-Benz     108526 non-null  uint8  
 12  brand_audi              108526 non-null  uint8  
 13  brand_bmw               108526 non-null  uint8  
 14  brand_focus         

In [52]:
# rebuild the working frame: outlier-filtered training rows followed by the unfiltered test rows
train = pd.concat([train_train, train.loc[train['sample'] == 'test']], ignore_index=True)

In [53]:
# check the significance of the relation between price and tax_new, price and engineSize
import scipy.stats as stats
train_rows = train[train['sample']=='train']
Sp_test = pd.DataFrame(
    {'value': list(stats.spearmanr(train_rows['price'], train_rows['tax_new']))},
    index=['Rho', 'T-test p-value'],
)

print('\n --- The Spearman rank correlation coefficient ---')
print(round(Sp_test, 3))


 --- The Spearman rank correlation coefficient ---
                value
Rho             0.038
T-test p-value  0.000


In [54]:
# Spearman rank correlation between price and engineSize on the training rows
train_rows = train[train['sample']=='train']
Sp_test = pd.DataFrame(
    {'value': list(stats.spearmanr(train_rows['price'], train_rows['engineSize']))},
    index=['Rho', 'T-test p-value'],
)

print('\n --- The Spearman rank correlation coefficient ---')
print(round(Sp_test, 3))


 --- The Spearman rank correlation coefficient ---
                value
Rho             0.626
T-test p-value  0.000


In [55]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80742 entries, 0 to 80741
Data columns (total 31 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      80742 non-null  int64  
 1   model                   80742 non-null  object 
 2   year                    80742 non-null  int64  
 3   mileage                 80742 non-null  float64
 4   mpg                     80742 non-null  float64
 5   engineSize              80742 non-null  float64
 6   price                   71164 non-null  float64
 7   sample                  80742 non-null  object 
 8   year_flg                80742 non-null  int32  
 9   mpg_flg                 80742 non-null  int32  
 10  brand_Hyundai           80742 non-null  uint8  
 11  brand_Mercedes-Benz     80742 non-null  uint8  
 12  brand_audi              80742 non-null  uint8  
 13  brand_bmw               80742 non-null  uint8  
 14  brand_focus             80742 non-null

Tax_new should be excluded: it is practically uncorrelated with Price (Spearman ρ ≈ 0.04, versus ≈ 0.63 for engineSize).

### Preprocessing

In [56]:
cat_features

['brand', 'transmission', 'fuelType']

In [57]:

from sklearn import preprocessing
import pandas as pd
le = preprocessing.LabelEncoder()

# replace every model name by the integer code assigned by the fitted encoder
train['model'] = le.fit_transform(train['model'])

In [58]:
#train = pd.get_dummies(train, columns=['model'])
#train.head()

In [59]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80742 entries, 0 to 80741
Data columns (total 31 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      80742 non-null  int64  
 1   model                   80742 non-null  int32  
 2   year                    80742 non-null  int64  
 3   mileage                 80742 non-null  float64
 4   mpg                     80742 non-null  float64
 5   engineSize              80742 non-null  float64
 6   price                   71164 non-null  float64
 7   sample                  80742 non-null  object 
 8   year_flg                80742 non-null  int32  
 9   mpg_flg                 80742 non-null  int32  
 10  brand_Hyundai           80742 non-null  uint8  
 11  brand_Mercedes-Benz     80742 non-null  uint8  
 12  brand_audi              80742 non-null  uint8  
 13  brand_bmw               80742 non-null  uint8  
 14  brand_focus             80742 non-null

In [60]:
# car age in years, measured against the fixed reference year 2021
train = train.assign(age=2021 - train['year'])

In [61]:
train

Unnamed: 0,id,model,year,mileage,mpg,engineSize,price,sample,year_flg,mpg_flg,...,transmission_Other,transmission_Semi-Auto,fuelType_Diesel,fuelType_Electric,fuelType_Hybrid,fuelType_Other,fuelType_Petrol,tax_new,tax_flg,age
0,1,54,2017,26973.0,57.7,1.0,11999.0,train,1,1,...,0,0,0,0,0,0,1,145.0,1,4
1,2,8,2019,2078.0,47.1,1.3,23299.0,train,1,1,...,0,0,0,0,0,0,1,145.0,1,2
2,9,83,2018,11562.0,57.7,1.2,8495.0,train,1,1,...,0,0,0,0,0,0,1,145.0,1,3
3,10,11,2018,26796.0,44.8,2.0,24790.0,train,1,1,...,0,0,0,0,0,0,1,145.0,1,3
4,11,53,2019,5014.0,58.9,1.0,13495.0,train,1,1,...,0,0,0,0,0,0,1,145.0,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80737,9573,99,2016,59601.0,57.7,2.0,,test,1,1,...,0,1,1,0,0,0,0,125.0,1,5
80738,9574,10,2015,89060.0,67.3,2.0,,test,1,1,...,0,0,1,0,0,0,0,30.0,1,6
80739,9575,104,2019,1267.0,49.6,1.0,,test,1,1,...,0,0,0,0,0,0,1,150.0,1,2
80740,9576,66,2019,1658.0,36.2,2.0,,test,1,1,...,0,1,0,0,0,0,1,145.0,1,2


In [62]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Columns that get rescaled; their order fixes the column order of the arrays below.
scaled_columns = ['engineSize', 'mpg', 'tax_new', 'age', 'mileage']

# Fit on the training rows only, so that test-set statistics (including the test
# rows that were never outlier-filtered) do not leak into the scaling, then
# transform every row with those training statistics.
fit_rows = train.loc[train['sample'] == 'train', scaled_columns]

std_scaler = StandardScaler().fit(fit_rows)
data_std_scaled = std_scaler.transform(train[scaled_columns])

minmax_scaler = MinMaxScaler().fit(fit_rows)
data_minmax_scaled = minmax_scaler.transform(train[scaled_columns])

In [63]:
# attach the standardised columns; positions follow the column order used to fit std_scaler
for position, column in enumerate(['engineSize', 'mpg', 'tax_new', 'age', 'mileage']):
    train['st-ed_' + column] = data_std_scaled[:, position]



In [64]:
# Hand-made non-linear transforms of the raw numeric columns, added in the same order as before.
# Operator precedence: `-s**2` is -(s**2), and in `np.exp(-s+1e-7)` the 1e-7 is added to the exponent.
# NOTE(review): exp(-mileage) is presumably ~0 for almost every row because mileages are
# in the thousands — confirm that this column carries any signal.
train = train.assign(
    mileage_t1=1/(train.mileage+1e-7),
    mileage_t2=np.exp(-train.mileage+1e-7),
    mpg_t1=1/train.mpg,
    mpg_t2=np.exp(-train.mpg),
    tax_new_t1=-train.tax_new**2,
    tax_new_t2=-train.tax_new**2+1/(-train.tax_new**2+1e-7),
    age_t1=1/(train.age),
    age_t2=np.exp(-train.age),
)

In [65]:
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

# labelled rows only
data = train[train['sample']=='train']
y = data['price']

# features: everything except the identifier, the flags, the target, the sample marker
# and the raw columns that are represented by their scaled / transformed versions
X = data.drop(columns=['id', 'year_flg', 'mpg_flg', 'tax_flg', 'price', 'year', 'tax_new', 'mpg', 'engineSize', 'sample',
                       'mileage', 'age'])


In [66]:
# hold out 1% of the labelled rows for evaluation; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=42)

### Scaling

In [67]:
# to avoid having to check whether our features are normally distributed, let us apply the Min-Max scaler

### Linear and Ridge Regression

In [68]:
print(

SyntaxError: unexpected EOF while parsing (149104261.py, line 1)

In [80]:
from sklearn.ensemble import RandomForestRegressor

# max_features=1.0 considers every feature at each split, which is what 'auto'
# meant for regressors; the string 'auto' is deprecated and rejected by newer
# scikit-learn releases.
regressor = RandomForestRegressor(n_estimators=100, random_state=0, max_features=1.0, max_depth=90,
                                  min_samples_split=8, min_samples_leaf=3, n_jobs=-1)
regressor.fit(X_train, y_train)
# predictions on the hold-out split and on the training split (for the MAPE comparison)
y_pred = regressor.predict(X_test)
y_pred_train = regressor.predict(X_train)


In [81]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_percentage_error as mape

# MAPE in percent on the hold-out split and on the training split
mape_test_pct = mape(y_test, y_pred)*100
mape_train_pct = mape(y_train, y_pred_train)*100
print("MAPE_test = {:.5f}".format(mape_test_pct))
print("MAPE_train = {:.5f}".format(mape_train_pct))

MAPE_test = 6.48877
MAPE_train = 4.69124


In [None]:
depth

In [None]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_percentage_error as mape

# ordinary least squares
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
mape_lr = mape(y_test, y_pred_lr)

# ridge regression with the default regularisation strength
rr = Ridge()
rr.fit(X_train, y_train)
y_pred_rr = rr.predict(X_test)
mape_rr = mape(y_test, y_pred_rr)

print('-------------Linear Regression-------------')
# score() of a regressor is the coefficient of determination R^2, not an
# accuracy, so it is reported under that name and not as a percentage.
r2_lr = lr.score(X_test, y_test)
print("MAPE = {:.2f}".format(mape_lr))
print('R^2 = ', r2_lr)
print('-------------Ridge Regression--------------')
print("MAPE = {:.2f}".format(mape_rr))

In [None]:
# DecisionTreeClassifier stays imported so this cell keeps providing the name it did before.
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

depth = 10

# price is a continuous target, so a regression tree is used; a classifier
# would treat every distinct price as its own class.
dt = DecisionTreeRegressor(max_depth=depth, min_samples_leaf=8, max_features=4)
dt.fit(X_train, y_train)

dt_pred_train = dt.predict(X_train)
dt_pred_test = dt.predict(X_test)

mape_test = mape(y_test, dt_pred_test)
mape_train = mape(y_train, dt_pred_train)

print('-------------ДЕРЕВО-------------')
# score() is R^2 for a regressor; it is computed separately for each split
# instead of printing the test score twice.
print("MAPE_test = {:.2f}".format(mape_test))
print('R^2_test = ', dt.score(X_test, y_test))
print("MAPE_train = {:.2f}".format(mape_train))
print('R^2_train = ', dt.score(X_train, y_train))

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor

# Candidate values for depth, number of features per split and minimum leaf size.
# Each entry is a full range, so the search can actually sample different
# combinations; previously each list held one unseeded random number, which
# left a single, irreproducible candidate.
param_dist = {'max_depth': list(range(100, 301)),
              'max_features': list(range(1, 25)),
              'min_samples_leaf': list(range(1, 25))
              }

# price is continuous, hence a regression tree; seeds make the search repeatable
tree = DecisionTreeRegressor(random_state=42)

# scored with negative MAPE, the metric used throughout the notebook
tree_cv = RandomizedSearchCV(tree, param_dist, cv=5, random_state=42,
                             scoring='neg_mean_absolute_percentage_error')

tree_cv.fit(X_train, y_train)

print("Tuned Decision Tree Parameters: {}".format(tree_cv.best_params_))
# the best score is a negative MAPE: closer to zero is better
print("Best score is {}".format(tree_cv.best_score_))

depth=tree_cv.best_params_['max_depth']
feature=tree_cv.best_params_['max_features']
leaf=tree_cv.best_params_['min_samples_leaf']

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Refit a tree with the tuned values. `leaf` holds the tuned min_samples_leaf and
# `feature` the tuned max_features; the two were previously passed to each
# other's parameter. A regression tree is used because price is continuous.
dt = DecisionTreeRegressor(max_depth=depth, min_samples_leaf=leaf, max_features=feature)
dt.fit(X_train, y_train)

y_pr = dt.predict(X_test)
y_pred_train = dt.predict(X_train)
print("MAPE_test = {:.5f}".format((mape(y_test, y_pr)*100)))
print("MAPE_train = {:.5f}".format((mape(y_train, y_pred_train)*100)))

In [None]:
rf_random.best_params_

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in the random forest: 7 evenly spaced values from 100 to 1500
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1500, num = 7)]
# Share of features considered at every split; 1.0 (all features) replaces the
# string 'auto', which is deprecated and rejected by newer scikit-learn releases
max_features = [1.0]
# Maximum number of levels in a tree
max_depth = [235]

# Minimum number of samples required to split a node
min_samples_split = [5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [2]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}
print(random_grid)

In [None]:
# Randomised hyper-parameter search over `random_grid` with 3-fold cross-validation
# on all available cores. n_iter=100 is only an upper bound on the number of sampled
# combinations (as written, random_grid varies only n_estimators over seven values).
rf = RandomForestRegressor()
search_options = dict(n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, **search_options)
# Fit the random search model
rf_random.fit(X_train, y_train)

In [None]:
rf_random.best_params_

In [None]:
slll = print(

In [82]:
# rows to be scored for the submission
final = train[train['sample']=='test']
# X was built by dropping the same fixed column list from the same frame, so
# selecting X's columns yields the identical feature set in the identical order
X_f = final[X.columns]
kaggle = regressor.predict(X_f)

In [83]:
# Submission table: one predicted price per test row, keyed by that row's own `id`.
# Taking the ids from `final` keeps predictions and ids aligned even if the test
# ids are not simply 0..n-1 in file order (which the positional index assumed).
load = pd.DataFrame({'price': kaggle}, index=pd.Index(final['id'].to_numpy(), name='id'))

In [84]:
# final test data
load.to_csv('submission5.csv', sep=',')

In [None]:
# coefficient paths of Ridge as a function of the regularisation strength alpha

alphas = 10**np.linspace(10,-2,100)*0.5

from sklearn.preprocessing import scale 
from sklearn.linear_model import Ridge

# the standardised design matrix does not depend on alpha, so compute it once
X_train_scaled = scale(X_train)

clf= Ridge()
coefs = []

for a in alphas:
    clf.set_params(alpha=a)
    clf.fit(X_train_scaled, y_train)
    coefs.append(clf.coef_)
    
ax = plt.gca()
# plot against the alpha values actually used for fitting, matching the axis label
ax.plot(alphas, coefs)
ax.set_xscale('log')
plt.axis('tight')
plt.xlabel('alpha')
plt.ylabel('weights')

In [None]:
# pick the regularisation strength by cross-validation, scored with negative MAPE

from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV

ridgecv = RidgeCV(scoring='neg_mean_absolute_percentage_error', alphas=alphas).fit(X_train, y_train)
ridgecv.alpha_

In [None]:
pred2 = ridgecv.predict(X_test)

In [None]:
from sklearn import metrics
print('Mean Absolute Precentage Error:', round(metrics.mean_absolute_percentage_error(y_test, pred2),2))

In [None]:
# model, fuel types, the "Other" transmission type, Skoda, Hyundai and Toyota have no significant effect on prices, hmm
# coefficients of the cross-validated ridge model, labelled with the feature names of X
coef_ridge= pd.Series(ridgecv.coef_, index = X.columns)
imp_coef = coef_ridge.sort_values()
import matplotlib
matplotlib.rcParams['figure.figsize'] = (7.0, 7.0)
# horizontal bar chart of the sorted coefficients
imp_coef.plot(kind = "barh")
plt.title("Feature importance using Ridge Model")

In [None]:
# Lasso

from sklearn.linear_model import LassoCV

reg = LassoCV(alphas = None, cv = 10, max_iter = 100000)
reg.fit(X_train, y_train)
print("Best alpha using built-in LassoCV: %f" % reg.alpha_)
print("Best score using built-in LassoCV: %f" %reg.score(X_train,y_train))
coef = pd.Series(reg.coef_, index = X.columns)
print('Mean Absolute Precentage Error::', round(metrics.mean_absolute_percentage_error(y_test, reg.predict(X_test)),2))

In [None]:
y_pred = reg.predict(X_test)

In [None]:
# еще раз, контрольный
print('Mean Absolute Precentage Error:', round(metrics.mean_absolute_percentage_error(y_test, y_pred),2))

### XGBoost

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

In [None]:
import xgboost as xgb

# 'reg:squarederror' is the current name of the squared-error objective;
# 'reg:linear' is its deprecated alias.
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)
xg_reg.fit(X_train,y_train)

preds = xg_reg.predict(X_test)
# stored under its own name: assigning the result to `mape` would overwrite the
# `mape` function alias that the earlier evaluation cells call
mape_xgb = mean_absolute_percentage_error(y_test, preds)
print("MAPE: %f" % (mape_xgb))

In [None]:
# fit model
# 'reg:squarederror' is the current name of the squared-error objective;
# 'reg:linear' is its deprecated alias.
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)

xg_reg.fit(X_train,y_train)

# hold-out predictions, evaluated in the next cell
predictions = xg_reg.predict(X_test)

In [None]:
print('Mean Absolute Precentage Error:', round(metrics.mean_absolute_percentage_error(y_test, predictions),2))