# PART- 2 `( Handling Missing Values )`

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats.mstats import winsorize

import warnings
warnings.filterwarnings("ignore")
warnings.warn("this will not show")

%matplotlib inline
# %matplotlib notebook

plt.rcParams["figure.figsize"] = (10,6)
# plt.rcParams['figure.dpi'] = 100

sns.set_style("whitegrid")
pd.set_option('display.float_format', lambda x: '%.3f' % x)

pd.options.display.max_rows = 1000
pd.options.display.max_columns = 150

In [2]:
df = pd.read_csv("clean_scout.csv")  # If the previous notebook had not saved with index=False, the old index would come back as an extra "Unnamed: 0" column and we would have to read with index_col=0 here.

In [3]:
df.shape

(15919, 33)

In [4]:
df.head(3).T

Unnamed: 0,0,1,2
make_model,Audi A1,Audi A1,Audi A1
body_type,Sedans,Sedans,Sedans
price,15770,14500,14640
vat,VAT deductible,Price negotiable,VAT deductible
km,56013.000,80000.000,83450.000
Type,Used,Used,Used
Warranty,,,
Cylinders,3.000,4.000,
Fuel,Diesel,Benzine,Diesel
Gears,,7.000,


In [None]:
# We need to decide how each column should be filled. For example, make_model, body_type, km and registration can be treated as the columns to fill first.

In [5]:
df.isnull().sum()/df.shape[0]*100

make_model             0.000
body_type              0.377
price                  0.000
vat                   28.350
km                     6.433
Type                   0.013
Warranty              69.514
Cylinders             35.681
Fuel                   0.000
Gears                 29.600
Comfort_Convenience    5.779
Entertainment_Media    8.631
Extras                18.607
Safety_Security        6.169
age                   10.032
Previous_Owners       41.711
hp_kW                  0.553
Inspection_new        75.300
Body_Color             3.750
Paint_Type            36.259
Upholstery_type       30.599
Upholstery_color      31.899
Nr_of_Doors            1.332
Nr_of_Seats            6.137
Gearing_Type           0.000
Displacement_cc        3.116
Weight_kg             43.809
Drive_chain           43.081
cons_comb             12.771
cons_city             15.302
cons_country          14.926
CO2_Emission          15.302
Emission_Class        22.790
dtype: float64

In [6]:
# Collect the names of the columns that contain at least one missing value.
# Built directly as a list instead of calling append() inside a throwaway comprehension.

miss_val = [col for col in df.columns if df[col].isnull().any()]

miss_val

['body_type',
 'vat',
 'km',
 'Type',
 'Warranty',
 'Cylinders',
 'Gears',
 'Comfort_Convenience',
 'Entertainment_Media',
 'Extras',
 'Safety_Security',
 'age',
 'Previous_Owners',
 'hp_kW',
 'Inspection_new',
 'Body_Color',
 'Paint_Type',
 'Upholstery_type',
 'Upholstery_color',
 'Nr_of_Doors',
 'Nr_of_Seats',
 'Displacement_cc',
 'Weight_kg',
 'Drive_chain',
 'cons_comb',
 'cons_city',
 'cons_country',
 'CO2_Emission',
 'Emission_Class']

In [7]:
# function for first looking to the columns

def first_looking(col, data=None):
    '''Print a quick missing-value / cardinality summary for one column.

    Parameters
    ----------
    col : str
        Name of the column to summarise.
    data : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level ``df`` is used, so
        existing ``first_looking("age")`` calls keep working unchanged.

    Prints the percentage and number of nulls, the number of distinct non-null
    values and the value counts (NaN included). Returns None.
    '''
    frame = df if data is None else data
    column = frame[col]
    null_count = column.isnull().sum()  # computed once, reused for both the share and the count
    print("column name    : ", col)
    print("--------------------------------")
    print("per_of_nulls   : ", "%", round(null_count/frame.shape[0]*100, 2))
    print("num_of_nulls   : ", null_count)
    print("num_of_uniques : ", column.nunique())
    print("--------------------------------")
    print(column.value_counts(dropna = False))

## functions to fill the missing values

In [None]:
def fill_most(df, group_col, col_name):
    '''Fill the missing values of `col_name` in place with the mode of each `group_col` group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    group_col : str
        Column whose values define the groups.
    col_name : str
        Column whose NaNs are filled.

    Fallbacks: a group without any non-null value uses the column-wide mode.
    Rows whose `group_col` is itself missing also get the column-wide mode.
    The column-wide mode is taken from the observed values before any filling,
    so the result does not depend on the order in which groups are visited.
    If the column has no non-null value at all, nothing is filled (no exception).

    Prints the remaining NaN count and the value counts; returns None.
    '''
    overall_modes = df[col_name].mode()
    overall_mode = None if overall_modes.empty else overall_modes.iloc[0]

    # NaN never compares equal to anything, so a NaN group key is skipped here
    # and handled by the column-wide fallback after the loop.
    for group in df[group_col].dropna().unique():
        cond = df[group_col] == group
        group_modes = df.loc[cond, col_name].mode()
        fill_value = overall_mode if group_modes.empty else group_modes.iloc[0]
        if fill_value is not None:
            df.loc[cond, col_name] = df.loc[cond, col_name].fillna(fill_value)

    # Anything still missing belongs to rows with a missing group key.
    if overall_mode is not None:
        df[col_name] = df[col_name].fillna(overall_mode)

    print("Number of NaN : ",df[col_name].isnull().sum())
    print("------------------")
    print(df[col_name].value_counts(dropna=False))

In [None]:
def fill_prop(df, group_col, col_name):
    '''Fill the missing values of `col_name` in place by forward- then backward-filling within each `group_col` group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    group_col : str
        Column whose values define the groups.
    col_name : str
        Column whose NaNs are filled.

    After the per-group pass, a final forward/backward fill over the whole
    column covers groups that had no value at all and rows whose group key is
    missing. Prints the remaining NaN count and the value counts; returns None.
    '''
    for group in df[group_col].dropna().unique():
        cond = df[group_col] == group
        # .ffill()/.bfill() replace the deprecated fillna(method=...) spelling.
        df.loc[cond, col_name] = df.loc[cond, col_name].ffill().bfill()
    df[col_name] = df[col_name].ffill().bfill()
    print("Number of NaN : ",df[col_name].isnull().sum())
    print("------------------")
    print(df[col_name].value_counts(dropna=False))

In [None]:
def _fill_center(values, method):
    '''Return the mode/median of `values`, or None when there is none (always None for "ffill").'''
    if method == "mode":
        modes = values.mode()
        return None if modes.empty else modes.iloc[0]
    if method == "median":
        center = values.median()
        return None if pd.isnull(center) else center
    return None


def _fill_apply(values, method, center):
    '''Fill NaNs in `values`: propagate neighbours for "ffill", otherwise use `center` when it exists.'''
    if method == "ffill":
        return values.ffill().bfill()
    return values if center is None else values.fillna(center)


def fill(df, group_col1, group_col2, col_name, method): # method can be "mode" or "median" or "ffill"
    '''Fill the missing values of `col_name` in place using a two-stage grouping.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    group_col1, group_col2 : str
        Outer and inner grouping columns.
    col_name : str
        Column whose NaNs are filled.
    method : {"mode", "median", "ffill"}
        "mode"/"median" fill with that statistic; "ffill" forward- then
        backward-fills neighbouring values.

    Each missing value is filled from the narrowest scope that can provide a
    value: the (group_col1, group_col2) pair, then the group_col1 group, then
    the whole column. Rows with a missing grouping key fall through to the next
    wider scope. For "mode"/"median" the group-level and column-level statistics
    are taken before that scope is filled, so the result does not depend on the
    order in which groups are visited. A column with no non-null value is left
    untouched.

    Raises
    ------
    ValueError
        If `method` is not one of the supported names.

    Prints the remaining NaN count and the value counts; returns None.
    '''
    if method not in ("mode", "median", "ffill"):
        raise ValueError("method must be 'mode', 'median' or 'ffill', got %r" % (method,))

    overall_center = _fill_center(df[col_name], method)

    for group1 in df[group_col1].dropna().unique():
        cond1 = df[group_col1] == group1
        group1_center = _fill_center(df.loc[cond1, col_name], method)

        # Only visit the inner groups that actually occur inside this outer group.
        for group2 in df.loc[cond1, group_col2].dropna().unique():
            cond2 = cond1 & (df[group_col2] == group2)
            pair_values = df.loc[cond2, col_name]
            df.loc[cond2, col_name] = _fill_apply(pair_values, method, _fill_center(pair_values, method))

        # Second stage: whatever the pairs could not fill, including rows with a missing group_col2.
        df.loc[cond1, col_name] = _fill_apply(df.loc[cond1, col_name], method, group1_center)

    # Last resort: whole column, which also covers rows with a missing group_col1.
    df[col_name] = _fill_apply(df[col_name], method, overall_center)

    print("Number of NaN : ",df[col_name].isnull().sum())
    print("------------------")
    print(df[col_name].value_counts(dropna=False))

## Let's examine and fill the missing values of all the columns/features one by one

## age

In [8]:
first_looking("age")

column name    :  age
--------------------------------
per_of_nulls   :  % 10.03
num_of_nulls   :  1597
num_of_uniques :  4
--------------------------------
1.000    4522
3.000    3674
2.000    3273
0.000    2853
NaN      1597
Name: age, dtype: int64


In [9]:
# Use '-' as a temporary placeholder: groupby/describe drops NaN keys, so the missing ages would otherwise be invisible.
# Assigned back explicitly; a chained inplace fillna does not modify df under pandas Copy-on-Write.
df['age'] = df['age'].fillna('-')

In [10]:
df["age"].value_counts(dropna=False)

1.0    4522
3.0    3674
2.0    3273
0.0    2853
-      1597
Name: age, dtype: int64

In [11]:
df.groupby("age").km.describe()  # inspect the min, max and mean km of each age group

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,2706.0,2085.355,5365.881,1.0,10.0,50.0,3000.0,127022.0
1.0,4484.0,18035.239,11052.524,1.0,9990.0,17872.0,25078.5,136000.0
2.0,3272.0,41754.941,28295.748,1.0,21541.75,34752.0,54805.5,317000.0
3.0,3674.0,77442.521,39170.143,10.0,48000.0,72914.5,99950.0,291800.0
-,759.0,934.497,7416.244,0.0,5.0,10.0,10.0,89982.0


In [12]:
df[df["age"]=="-"]["km"].value_counts(dropna=False)

NaN          838
10.000       369
1.000        146
5.000         58
20.000        32
15.000        21
0.000         19
11.000        12
8.000         11
50.000        10
100.000        8
12.000         8
7.000          7
3.000          4
9.000          4
4.000          3
25.000         3
250.000        3
30.000         3
3000.000       2
22627.000      2
39962.000      2
2.000          2
19500.000      1
11000.000      1
85000.000      1
4307.000       1
89692.000      1
77.000         1
3500.000       1
68485.000      1
5000.000       1
141.000        1
150.000        1
34164.000      1
142.000        1
32084.000      1
81800.000      1
11200.000      1
20768.000      1
4500.000       1
40.000         1
784.000        1
89982.000      1
500.000        1
325.000        1
6100.000       1
196.000        1
6.000          1
60.000         1
497.000        1
99.000         1
281.000        1
Name: km, dtype: int64

In [13]:
# km buckets used to infer a missing age (0, 1, 2, 3 respectively).
# NOTE(review): the cut-offs look hand-picked from the per-age km statistics above — confirm.
# Rows whose km is NaN match none of these masks.
cond1 = (df['km'] < 10000)
cond2 = ((df['km'] >= 10000) & (df['km'] < 28000))
cond3 = ((df['km'] >= 28000) & (df['km'] < 50000))
cond4 = (df['km'] >= 50000)

In [14]:
# Replace the '-' placeholder with the age implied by the km bucket; known ages are left untouched.
# Rows with a missing km fall in no bucket and keep the placeholder.
df.loc[cond1,'age'] = df.loc[cond1,'age'].replace('-', 0)
df.loc[cond2,'age'] = df.loc[cond2,'age'].replace('-', 1)
df.loc[cond3,'age'] = df.loc[cond3,'age'].replace('-', 2)
df.loc[cond4,'age'] = df.loc[cond4,'age'].replace('-', 3)

In [15]:
df.groupby('age').km.mean()  # ages that also lack a km value could not be filled yet

age
0.0    1647.363
1.0   18035.130
2.0   41748.577
3.0   77450.063
-           NaN
Name: km, dtype: float64

In [None]:
df["age"].value_counts(dropna=False)

In [16]:
df.groupby(['make_model',"body_type", 'age']).price.describe()  

# Here we compared the prices of the '-' (unknown age) cars with those of the age-0 cars. They are very close, so we assign age 0 to the remaining '-' rows.

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,25%,50%,75%,max
make_model,body_type,age,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Audi A1,Compact,0.0,198.0,23277.434,3510.406,14900.0,20503.5,22492.0,26798.5,31990.0
Audi A1,Compact,1.0,268.0,18596.041,2659.91,13980.0,16445.0,16980.0,20950.0,23829.0
Audi A1,Compact,2.0,161.0,16602.807,2085.384,10999.0,15450.0,15850.0,17700.0,22150.0
Audi A1,Compact,3.0,234.0,14532.91,1908.909,9950.0,13407.5,13994.5,15480.0,18900.0
Audi A1,Compact,-,178.0,23996.264,3383.852,16220.0,21515.0,22875.0,27380.0,29181.0
Audi A1,Coupe,2.0,1.0,15900.0,,15900.0,15900.0,15900.0,15900.0,15900.0
Audi A1,Coupe,3.0,1.0,13950.0,,13950.0,13950.0,13950.0,13950.0,13950.0
Audi A1,Other,0.0,8.0,23826.25,2057.439,21490.0,22490.0,22720.0,25900.0,26900.0
Audi A1,Other,1.0,3.0,16796.667,178.979,16590.0,16745.0,16900.0,16900.0,16900.0
Audi A1,Other,2.0,1.0,23490.0,,23490.0,23490.0,23490.0,23490.0,23490.0


In [17]:
# Remaining placeholders become age 0. Assigned back explicitly (chained inplace replace is a no-op under
# pandas Copy-on-Write) and converted back to a numeric dtype, since the '-' placeholder made the column object.
df['age'] = pd.to_numeric(df['age'].replace('-', 0))

In [18]:
df.groupby('age').km.mean()

age
0.000    1647.363
1.000   18035.130
2.000   41748.577
3.000   77450.063
Name: km, dtype: float64

In [19]:
df["age"].value_counts(dropna=False)

1.000    4528
0.000    4435
3.000    3679
2.000    3277
Name: age, dtype: int64

## km

In [20]:
first_looking("km")

column name    :  km
--------------------------------
per_of_nulls   :  % 6.43
num_of_nulls   :  1024
num_of_uniques :  6689
--------------------------------
10.000        1045
NaN           1024
1.000          367
5.000          170
50.000         148
              ... 
160542.000       1
20719.000        1
91910.000        1
39860.000        1
57889.000        1
Name: km, Length: 6690, dtype: int64


In [21]:
df.groupby("age").km.mean()

age
0.000    1647.363
1.000   18035.130
2.000   41748.577
3.000   77450.063
Name: km, dtype: float64

In [22]:
df.groupby("age").km.transform("mean")   

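# Even though we group, transform keeps the original length of df: every row receives its group's value. With apply/agg the result is reduced to one row per group.

# transform does not aggregate the frame itself; it broadcasts the aggregation result back to each row.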

0       77450.063
1       41748.577
2       77450.063
3       77450.063
4       77450.063
           ...   
15914    1647.363
15915    1647.363
15916    1647.363
15917    1647.363
15918    1647.363
Name: km, Length: 15919, dtype: float64

In [23]:
# Fill each missing km with the mean km of the car's age group.
# Assigned back explicitly; a chained inplace fillna does not modify df under pandas Copy-on-Write.
df["km"] = df["km"].fillna(df.groupby("age")["km"].transform("mean"))

In [24]:
df.km.value_counts(dropna=False)

10.000        1045
1647.363       985
1.000          367
5.000          170
50.000         148
              ... 
160542.000       1
20719.000        1
91910.000        1
39860.000        1
57889.000        1
Name: km, Length: 6692, dtype: int64

## body type

In [25]:
first_looking("body_type")

column name    :  body_type
--------------------------------
per_of_nulls   :  % 0.38
num_of_nulls   :  60
num_of_uniques :  9
--------------------------------
Sedans           7903
Station wagon    3553
Compact          3153
Van               783
Other             290
Transporter        88
NaN                60
Off-Road           56
Coupe              25
Convertible         8
Name: body_type, dtype: int64


In [26]:
# Treat "Other" as missing so it is re-filled together with the real NaNs below.
# Assigned back explicitly; a chained inplace replace does not modify df under pandas Copy-on-Write.
df["body_type"] = df["body_type"].replace("Other", np.nan)

In [27]:
df['body_type'].value_counts(dropna=False)

Sedans           7903
Station wagon    3553
Compact          3153
Van               783
NaN               350
Transporter        88
Off-Road           56
Coupe              25
Convertible         8
Name: body_type, dtype: int64

In [28]:
df["body_type"].mode()

0    Sedans
dtype: object

In [29]:
df["body_type"].mode()[0]

'Sedans'

In [None]:
#Step-1
#df["body_type"].fillna(df["body_type"].mode()[0])

#Step-2
#df.loc[df["make_model"]=="Audi A1", "body_type"].fillna(df[df["make_model"]=="Audi A1"]["body_type"].mode()[0])

In [30]:
#Step-3
for group in df["make_model"].unique().tolist():
    cond = df["make_model"].eq(group)
    mode = df.loc[cond, "body_type"].mode().tolist()
    # mode() comes back empty when a model has no non-null body_type at all;
    # indexing it with [0] would then raise, so fall back to the column-wide mode.
    fill_value = mode[0] if len(mode) > 0 else df["body_type"].mode()[0]
    df.loc[cond, "body_type"] = df.loc[cond, "body_type"].fillna(fill_value)

In [None]:
df['body_type'].value_counts(dropna=False)

In [31]:
# Here we generalise the loop used above into a function that can be applied to any column.

def fill_most(df, group_col, col_name):
    '''Fill the missing values of `col_name` in place with the mode of each `group_col` group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    group_col : str
        Column whose values define the groups.
    col_name : str
        Column whose NaNs are filled.

    Fallbacks: a group without any non-null value uses the column-wide mode.
    Rows whose `group_col` is itself missing also get the column-wide mode.
    The column-wide mode is taken from the observed values before any filling,
    so the result does not depend on the order in which groups are visited.
    If the column has no non-null value at all, nothing is filled (no exception).

    Prints the remaining NaN count and the value counts; returns None.
    '''
    overall_modes = df[col_name].mode()
    overall_mode = None if overall_modes.empty else overall_modes.iloc[0]

    # NaN never compares equal to anything, so a NaN group key is skipped here
    # and handled by the column-wide fallback after the loop.
    for group in df[group_col].dropna().unique():
        cond = df[group_col] == group
        group_modes = df.loc[cond, col_name].mode()
        fill_value = overall_mode if group_modes.empty else group_modes.iloc[0]
        if fill_value is not None:
            df.loc[cond, col_name] = df.loc[cond, col_name].fillna(fill_value)

    # Anything still missing belongs to rows with a missing group key.
    if overall_mode is not None:
        df[col_name] = df[col_name].fillna(overall_mode)

    print("Number of NaN : ",df[col_name].isnull().sum())
    print("------------------")
    print(df[col_name].value_counts(dropna=False))

In [32]:
fill_most(df, "make_model", "body_type")

Number of NaN :  0
------------------
Sedans           8005
Station wagon    3678
Compact          3242
Van               817
Transporter        88
Off-Road           56
Coupe              25
Convertible         8
Name: body_type, dtype: int64


## Previous_Owners

In [33]:
first_looking("Previous_Owners")

column name    :  Previous_Owners
--------------------------------
per_of_nulls   :  % 41.71
num_of_nulls   :  6640
num_of_uniques :  5
--------------------------------
1.000    8294
NaN      6640
2.000     778
0.000     188
3.000      17
4.000       2
Name: Previous_Owners, dtype: int64


In [34]:
# Temporary '-' placeholder so the missing values show up as their own group in the groupby below.
# Assigned back explicitly; a chained inplace fillna does not modify df under pandas Copy-on-Write.
df["Previous_Owners"] = df["Previous_Owners"].fillna("-")

In [35]:
df["Previous_Owners"].value_counts(dropna=False)

1.0    8294
-      6640
2.0     778
0.0     188
3.0      17
4.0       2
Name: Previous_Owners, dtype: int64

In [36]:
df.groupby(['make_model', 'age', 'Previous_Owners']).km.describe() 

# We could not draw a clear conclusion about Previous_Owners from this table. However, the Renault Duster rows have very low km, so we assume those cars have had no previous owner.

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,25%,50%,75%,max
make_model,age,Previous_Owners,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Audi A1,0.0,0.0,45.0,958.365,815.362,0.0,10.0,1647.363,1647.363,1647.363
Audi A1,0.0,1.0,239.0,3069.543,3460.402,1.0,20.0,2500.0,5000.0,18000.0
Audi A1,0.0,2.0,1.0,3000.0,,3000.0,3000.0,3000.0,3000.0,3000.0
Audi A1,0.0,-,521.0,1549.992,2303.8,1.0,11.0,1647.363,1647.363,15500.0
Audi A1,1.0,0.0,1.0,15000.0,,15000.0,15000.0,15000.0,15000.0,15000.0
Audi A1,1.0,1.0,496.0,13772.192,8099.461,20.0,6898.0,11317.5,21054.0,35500.0
Audi A1,1.0,2.0,14.0,13734.286,10613.039,5000.0,8000.0,10500.0,13420.5,47000.0
Audi A1,1.0,-,236.0,13930.462,7544.057,1.0,9227.5,13437.0,19050.0,31877.0
Audi A1,2.0,0.0,1.0,68002.0,,68002.0,68002.0,68002.0,68002.0,68002.0
Audi A1,2.0,1.0,195.0,23858.369,19666.376,10.0,11659.5,18950.0,29968.5,148257.0


In [37]:
df[(df["make_model"]=="Renault Duster") & (df["Previous_Owners"] == "-")]["km"]

14894   1647.363
14895   1647.363
14896    101.000
14897   1647.363
14898    101.000
14899    101.000
14900   1647.363
14901    101.000
14903    101.000
14904   1647.363
14905   1647.363
14906    101.000
14907    101.000
14908   1647.363
14909   1647.363
14910    101.000
14911   1647.363
14912    101.000
14913   1647.363
14914    101.000
14915    101.000
14916    101.000
14917   1647.363
14918   1647.363
14919   1647.363
14920    101.000
14921   1647.363
14922   1647.363
14923    101.000
14924   1647.363
14925    101.000
14926    101.000
14927   1647.363
Name: km, dtype: float64

In [38]:
cond = (df["make_model"]=="Renault Duster") & (df["Previous_Owners"] == "-")
df.loc[cond, "Previous_Owners"] = 0.0

In [39]:
df["Previous_Owners"].value_counts(dropna=False)

1.0    8294
-      6607
2.0     778
0.0     221
3.0      17
4.0       2
Name: Previous_Owners, dtype: int64

In [None]:
# We filled what we could by hand; turn the remaining placeholders back into NaN so they can be filled below.
# Assigned back explicitly; a chained inplace replace does not modify df under pandas Copy-on-Write.
df["Previous_Owners"] = df["Previous_Owners"].replace("-", np.nan)

In [None]:
#Step-1
#df.["Previous_Owners"].fillna(method="ffill")

#Step-2
#df.loc[df["age"]==0, "Previous_Owners"].fillna(method="ffill")

In [None]:
#Step-3

# NOTE(review): ffill/bfill is expected not to distort the data much because neighbouring rows
# tend to be similar cars — this relies on the row order of the file; confirm.

for group in df["age"].dropna().unique():
    cond = df["age"]==group
    # .ffill()/.bfill() replace the deprecated fillna(method=...) spelling.
    df.loc[cond, "Previous_Owners"] = df.loc[cond, "Previous_Owners"].ffill().bfill()
# Safety net for anything the per-group pass could not fill (e.g. a group without any known value).
df["Previous_Owners"] = df["Previous_Owners"].ffill().bfill()

In [40]:
df["Previous_Owners"].value_counts(dropna=False)

1.0    8294
-      6607
2.0     778
0.0     221
3.0      17
4.0       2
Name: Previous_Owners, dtype: int64

In [None]:
# Function that generalises the loop above so it can be reused for other columns.

def fill_prop(df, group_col, col_name):
    '''Fill the missing values of `col_name` in place by forward- then backward-filling within each `group_col` group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    group_col : str
        Column whose values define the groups.
    col_name : str
        Column whose NaNs are filled.

    After the per-group pass, a final forward/backward fill over the whole
    column covers groups that had no value at all and rows whose group key is
    missing. Prints the remaining NaN count and the value counts; returns None.
    '''
    for group in df[group_col].dropna().unique():
        cond = df[group_col] == group
        # .ffill()/.bfill() replace the deprecated fillna(method=...) spelling.
        df.loc[cond, col_name] = df.loc[cond, col_name].ffill().bfill()
    df[col_name] = df[col_name].ffill().bfill()
    print("Number of NaN : ",df[col_name].isnull().sum())
    print("------------------")
    print(df[col_name].value_counts(dropna=False))

In [41]:
fill_prop(df, "age", "Previous_Owners")

NameError: name 'fill_prop' is not defined

## Warranty

In [None]:
first_looking("Warranty")

In [None]:
# Temporary '-' placeholder so missing warranties form their own group below.
# Assigned back explicitly; a chained inplace fillna does not modify df under pandas Copy-on-Write.
df["Warranty"] = df["Warranty"].fillna("-")

In [None]:
df.groupby(['make_model', 'age', 'Warranty']).price.describe()

In [None]:
df.groupby(["make_model","Warranty"]).price.describe().sort_values(by = ["make_model", "mean"], ascending = False)

##### This column has too many NaN values, and when we analysed them against the km, age and make_model columns we found no reliable pattern to fill them with, so we drop the column.

In [None]:
df.drop("Warranty", axis=1, inplace=True)

## vat

In [None]:
first_looking("vat")

There is no visible relation between `vat` and the other columns, so we simply forward-fill it (`ffill`), which roughly preserves the existing proportions of the values.

In [None]:
# Forward-fill vat; the trailing bfill only matters if the very first rows are missing.
# .ffill() replaces the deprecated fillna(method="ffill"), and the result is assigned back because
# a chained inplace call does not modify df under pandas Copy-on-Write.
df["vat"] = df["vat"].ffill().bfill()

In [None]:
df.vat.value_counts(dropna=False)

## Body_Color

In [None]:
first_looking("Body_Color")

In [None]:
# Temporary '-' placeholder so missing colours form their own group below.
# Assigned back explicitly; a chained inplace fillna does not modify df under pandas Copy-on-Write.
df["Body_Color"] = df["Body_Color"].fillna("-")

In [None]:
df["Body_Color"].value_counts(dropna=False)

In [None]:
df.groupby(["make_model", "body_type", 'Body_Color']).price.describe()

In [None]:
df.drop("Body_Color", axis=1, inplace=True)

### Paint Type

In [None]:
first_looking("Paint_Type")

In [None]:
# Temporary '-' placeholder so missing paint types form their own group below.
# Assigned back explicitly; a chained inplace fillna does not modify df under pandas Copy-on-Write.
df["Paint_Type"] = df["Paint_Type"].fillna("-")

In [None]:
df["Paint_Type"].value_counts(dropna=False)

In [None]:
df.groupby(["make_model", "body_type", "age", 'Paint_Type']).price.describe()

In [None]:
df.groupby(["make_model", "body_type", 'Paint_Type']).price.describe().sort_values(by = ["make_model", "body_type", "mean"], ascending = False)

In [None]:
# Turn the placeholder back into NaN before filling.
# Assigned back explicitly; a chained inplace replace does not modify df under pandas Copy-on-Write.
df["Paint_Type"] = df["Paint_Type"].replace("-", np.nan)

In [None]:
df["Paint_Type"].value_counts(dropna=False)

In [None]:
#Step-1
#df.["Paint_Type"].fillna(method="ffill")

#Step-2
#df.loc[df["age"]==0, "Paint_Type"].fillna(method="ffill")

#Step-3
#for group in list(df["make_model"].unique()):
#    cond = df["make_model"]==group
#    df.loc[cond, "Paint_Type"] = df.loc[cond, "Paint_Type"].fillna(method="ffill").fillna(method="bfill")
#df["Paint_Type"] = df["Paint_Type"].fillna(method="ffill").fillna(method="bfill")

In [None]:
# Step-4
# Forward/backward-fill Paint_Type inside progressively wider scopes:
# (make_model, body_type) pair -> make_model -> whole column.
# .ffill()/.bfill() replace the deprecated fillna(method=...) spelling.
for group1 in df["make_model"].dropna().unique():
    cond1 = df["make_model"]==group1
    # Only visit the body types that actually occur for this model.
    for group2 in df.loc[cond1, "body_type"].dropna().unique():
        cond2 = cond1 & (df["body_type"]==group2)
        df.loc[cond2, "Paint_Type"] = df.loc[cond2, "Paint_Type"].ffill().bfill()
    # Models are disjoint, so the model-level pass can run right after that model's pairs.
    df.loc[cond1, "Paint_Type"] = df.loc[cond1, "Paint_Type"].ffill().bfill()

df["Paint_Type"] = df["Paint_Type"].ffill().bfill()

In [None]:
df["Paint_Type"].value_counts(dropna=False)

In [None]:
def fill_ffill(df, group_col1, group_col2, col_name):
    '''Fill the missing values of `col_name` in place with forward/backward fill over a two-stage grouping.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    group_col1, group_col2 : str
        Outer and inner grouping columns.
    col_name : str
        Column whose NaNs are filled.

    Values are propagated first inside each (group_col1, group_col2) pair,
    then inside each group_col1 group, and finally across the whole column,
    so a value is borrowed from the narrowest scope that has one.
    Returns None.
    '''
    for group1 in df[group_col1].dropna().unique():
        cond1 = df[group_col1] == group1
        # Only visit the inner groups that actually occur inside this outer group.
        for group2 in df.loc[cond1, group_col2].dropna().unique():
            cond2 = cond1 & (df[group_col2] == group2)
            # .ffill()/.bfill() replace the deprecated fillna(method=...) spelling.
            df.loc[cond2, col_name] = df.loc[cond2, col_name].ffill().bfill()
        # Outer groups are disjoint, so this pass can run right after the group's own pairs.
        df.loc[cond1, col_name] = df.loc[cond1, col_name].ffill().bfill()

    df[col_name] = df[col_name].ffill().bfill()

In [None]:
fill_ffill(df, "make_model", "body_type", "Paint_Type")

In [None]:
df["Paint_Type"].value_counts(dropna=False)

### Type

In [None]:
first_looking("Type")

In [None]:
# Temporary '-' placeholder so the missing types form their own group below.
# Assigned back explicitly; a chained inplace fillna does not modify df under pandas Copy-on-Write.
df["Type"] = df["Type"].fillna("-")

In [None]:
df['Type'].value_counts(dropna=False)

In [None]:
df.groupby(["Type", "make_model", "age"]).km.describe()

In [None]:
cond1 = (df['make_model'] == "Audi A3") & (df["age"] == 0)
cond2 = (df['make_model'] == "Audi A3") & (df["age"] == 3)

In [None]:
df.loc[cond1,'Type'] = df.loc[cond1,'Type'].replace('-','New')
df.loc[cond2,'Type'] = df.loc[cond2,'Type'].replace('-','Used')

In [None]:
df['Type'].value_counts(dropna=False)

## Inspection new

In [None]:
first_looking("Inspection_new")

In [None]:
# Temporary '-' placeholder so the missing values form their own group below.
# Assigned back explicitly; a chained inplace fillna does not modify df under pandas Copy-on-Write.
df["Inspection_new"] = df["Inspection_new"].fillna("-")

In [None]:
df["Inspection_new"].value_counts(dropna=False)

In [None]:
df.groupby(["make_model", "body_type", "age", "Inspection_new"]).price.describe()

In [None]:
# A missing Inspection_new is treated as "No".
# Assigned back explicitly; a chained inplace replace does not modify df under pandas Copy-on-Write.
df["Inspection_new"] = df["Inspection_new"].replace("-", "No")

In [None]:
df["Inspection_new"].value_counts(dropna=False)

In [None]:
# Encode the flag as 1/0. replace() only touches the listed values and leaves anything else as it is.
# Assigned back explicitly; a chained inplace replace does not modify df under pandas Copy-on-Write.
df["Inspection_new"] = df["Inspection_new"].replace({"Yes": 1, "No": 0})
# Alternative: df["Inspection_new"] = df["Inspection_new"].map({"Yes": 1, "No": 0})
# map() turns every value that is not a key of the dict into NaN, and it has no `inplace`
# argument, so its result must always be assigned back.

In [None]:
df["Inspection_new"].value_counts(dropna=False)

## Upholstery_type

In [None]:
first_looking("Upholstery_type")

In [None]:
# Collapse the upholstery categories into "Cloth" and "Part/Full Leather".
# Assigned back explicitly; a chained inplace replace does not modify df under pandas Copy-on-Write.
df["Upholstery_type"] = df["Upholstery_type"].replace(["Velour", "alcantara", "Part leather", "Full leather"], ["Cloth", "Part/Full Leather", "Part/Full Leather", "Part/Full Leather"])

In [None]:
df["Upholstery_type"].value_counts(dropna=False)

In [None]:
fill(df, "make_model", "body_type", "Upholstery_type", "ffill")

## Upholstery_color

In [None]:
df.drop("Upholstery_color", axis=1, inplace=True)

### Nr. of Doors

In [None]:
first_looking("Nr_of_Doors")

In [None]:
fill(df, "make_model", "body_type", "Nr_of_Doors", "mode")

### Nr. of Seats

In [None]:
first_looking("Nr_of_Seats")

In [None]:
fill(df, "make_model", "body_type", "Nr_of_Seats", "mode")

### Cylinders

In [None]:
first_looking("Cylinders")

In [None]:
fill(df, "make_model", "body_type", "Cylinders", "mode")

In [None]:
df.drop("Cylinders", axis = 1, inplace = True)

### Drive chain

In [None]:
first_looking("Drive_chain")

In [None]:
# Temporary '-' placeholder so the missing values form their own group below.
# Assigned back explicitly; a chained inplace fillna does not modify df under pandas Copy-on-Write.
df["Drive_chain"] = df["Drive_chain"].fillna("-")

In [None]:
df.groupby(["make_model", "body_type", "Drive_chain"]).price.describe()

In [None]:
cond = (df['make_model'] == "Renault Duster") & (df["body_type"] == "Off-Road")

In [None]:
df.loc[cond,'Drive_chain'] = df.loc[cond,'Drive_chain'].replace('-','4WD')

In [None]:
df["Drive_chain"].value_counts(dropna=False)

In [None]:
df["Drive_chain"] = df["Drive_chain"].replace('-', np.nan)

In [None]:
df["Drive_chain"].value_counts(dropna=False)

In [None]:
fill(df, "make_model", "body_type", "Drive_chain", "mode")

### Emission Class

In [None]:
first_looking("Emission_Class")

In [None]:
# Temporary '-' placeholder so the missing values form their own group below.
# Assigned back explicitly; a chained inplace fillna does not modify df under pandas Copy-on-Write.
df["Emission_Class"] = df["Emission_Class"].fillna("-")

In [None]:
df["Emission_Class"].value_counts(dropna=False)

In [None]:
df.groupby(["make_model", "age", "Fuel", "Emission_Class"]).price.describe()

In [None]:
# Turn the placeholder back into NaN before filling.
# Assigned back explicitly; a chained inplace replace does not modify df under pandas Copy-on-Write.
df["Emission_Class"] = df["Emission_Class"].replace("-", np.nan)

In [None]:
df["Emission_Class"].value_counts(dropna=False)

In [None]:
fill(df, "age", "Fuel", "Emission_Class", "ffill")

In [None]:
df.drop("Emission_Class", axis=1, inplace=True)

### Gears

In [None]:
first_looking("Gears")

In [None]:
# Temporary '-' placeholder so the missing values form their own group below.
# Assigned back explicitly; a chained inplace fillna does not modify df under pandas Copy-on-Write.
df["Gears"] = df["Gears"].fillna("-")

In [None]:
df["Gears"].value_counts(dropna=False)

In [None]:
df.groupby(["make_model", "body_type", "Gearing_Type", "Gears"]).price.describe()

In [None]:
# Treat the placeholder and the listed gear counts as missing.
# NOTE(review): 1, 2, 3, 4, 9 and 50 are presumably considered implausible for these models — confirm.
# Assigned back explicitly; a chained inplace replace does not modify df under pandas Copy-on-Write.
df["Gears"] = df["Gears"].replace([1,2,3,4,9,50,"-"], np.nan)

In [None]:
df["Gears"].value_counts(dropna=False)

In [None]:
# Fill missing Gears with the most frequent value of the narrowest scope that has one:
# (make_model, body_type, Gearing_Type) -> (make_model, body_type) -> make_model -> whole column.
for group1 in df["make_model"].dropna().unique():
    cond1 = df["make_model"]==group1
    # Masks are built once per level and only for combinations that actually occur.
    for group2 in df.loc[cond1, "body_type"].dropna().unique():
        cond2 = cond1 & (df["body_type"]==group2)
        for group3 in df.loc[cond2, "Gearing_Type"].dropna().unique():
            cond3 = cond2 & (df["Gearing_Type"]==group3)
            # Walk outwards and stop at the first scope with a mode; wider modes are
            # only computed when needed. If no scope has one, the rows are left as NaN.
            for scope in (cond3, cond2, cond1, slice(None)):
                modes = df.loc[scope, "Gears"].mode()
                if not modes.empty:
                    df.loc[cond3, "Gears"] = df.loc[cond3, "Gears"].fillna(modes.iloc[0])
                    break

In [None]:
df["Gears"].value_counts(dropna=False)

### hp_kW

In [None]:
first_looking("hp_kW")

In [None]:
# Temporary '-' placeholder so the missing values form their own group below.
# Assigned back explicitly; a chained inplace fillna does not modify df under pandas Copy-on-Write.
df["hp_kW"] = df["hp_kW"].fillna("-")

In [None]:
df.groupby(["make_model", "body_type","hp_kW"]).price.describe()

In [None]:
# Turn the placeholder back into NaN before filling.
# Assigned back explicitly; a chained inplace replace does not modify df under pandas Copy-on-Write.
df["hp_kW"] = df["hp_kW"].replace("-", np.nan)

In [None]:
fill(df, "make_model", "body_type", "hp_kW", "mode")

## Displacement_cc

In [None]:
first_looking("Displacement_cc")

In [None]:
# Temporary '-' placeholder so the missing values form their own group below.
# Assigned back explicitly; a chained inplace fillna does not modify df under pandas Copy-on-Write.
df["Displacement_cc"] = df["Displacement_cc"].fillna("-")

In [None]:
df.groupby(["make_model", "body_type","Displacement_cc"]).price.describe()

In [None]:
# Turn the placeholder back into NaN before filling.
# Assigned back explicitly; a chained inplace replace does not modify df under pandas Copy-on-Write.
df["Displacement_cc"] = df["Displacement_cc"].replace("-", np.nan)

In [None]:
fill(df, "make_model", "body_type", "Displacement_cc", "mode")

## Weight_kg

In [None]:
first_looking("Weight_kg")

In [None]:
# Temporary '-' placeholder so the missing values form their own group below.
# Assigned back explicitly; a chained inplace fillna does not modify df under pandas Copy-on-Write.
df["Weight_kg"] = df["Weight_kg"].fillna("-")

In [None]:
df.groupby(["make_model", "body_type","Weight_kg"]).price.describe()

In [None]:
# Turn the placeholder back into NaN before filling.
# Assigned back explicitly; a chained inplace replace does not modify df under pandas Copy-on-Write.
df["Weight_kg"] = df["Weight_kg"].replace("-", np.nan)

In [None]:
fill(df, "make_model", "body_type", "Weight_kg", "mode")

## CO2 Emission

In [None]:
first_looking("CO2_Emission")

In [None]:
# Temporary '-' placeholder so the missing values form their own group below.
# Assigned back explicitly; a chained inplace fillna does not modify df under pandas Copy-on-Write.
df["CO2_Emission"] = df["CO2_Emission"].fillna("-")

In [None]:
df.groupby(["make_model", "body_type","CO2_Emission"]).price.describe()

In [None]:
# Turn the placeholder back into NaN before filling.
# Assigned back explicitly; a chained inplace replace does not modify df under pandas Copy-on-Write.
df["CO2_Emission"] = df["CO2_Emission"].replace("-", np.nan)

In [None]:
fill(df, "make_model", "body_type", "CO2_Emission", "median")

### Comfort_Convenience

In [None]:
first_looking("Comfort_Convenience")

In [None]:
fill(df, "make_model", "body_type", "Comfort_Convenience", "mode")

### Entertainment_Media

In [None]:
first_looking("Entertainment_Media")

In [None]:
fill(df, "make_model", "body_type", "Entertainment_Media", "mode")

### Extras

In [None]:
first_looking("Extras")

In [None]:
fill(df, "make_model", "body_type", "Extras", "mode")

### Safety_Security

In [None]:
first_looking("Safety_Security")

In [None]:
fill(df, "make_model", "body_type", "Safety_Security", "mode")

## cons_comb

In [None]:
first_looking("cons_comb")

In [None]:
cons_comb = (df["cons_country"] + df["cons_city"])/2

In [None]:
df["cons_comb"] = df["cons_comb"].fillna(cons_comb)

In [None]:
df["cons_comb"].value_counts(dropna=False)

In [None]:
# Temporary '-' placeholder so the missing values form their own group below.
# Assigned back explicitly; a chained inplace fillna does not modify df under pandas Copy-on-Write.
df["cons_comb"] = df["cons_comb"].fillna("-")

In [None]:
df.groupby(["make_model", "body_type","cons_comb"]).price.describe()

In [None]:
# Treat the placeholder and the listed consumption values as missing.
# NOTE(review): the listed values are presumably implausible combined-consumption readings — confirm.
# Assigned back explicitly; a chained inplace replace does not modify df under pandas Copy-on-Write.
df["cons_comb"] = df["cons_comb"].replace([0.0, 1.0, 1.2, 1.6, 10, 11, 13.8, 32.0, 33.0, 38.0, 40.0, 43.0, 46.0, 50.0, 51.0, 54.0, 55.0, "-"], np.nan)

In [None]:
df["cons_comb"].value_counts(dropna=False)

In [None]:
fill(df, "make_model", "body_type", "cons_comb", "median")

## cons_country

In [None]:
df.drop("cons_country", axis = 1, inplace = True)

## cons_city 

In [None]:
df.drop("cons_city", axis = 1, inplace = True)

## End of this phase

In [None]:
df.shape

In [None]:
df.isnull().sum()/df.shape[0]*100

In [None]:
df.to_csv("filled_scout.csv", index=False)