# Handling missing values using all techniques (Numerical features)

## Frequent category imputation

In [None]:
df = pd.read_csv("loan.csv")
df.head()

In [None]:
df.columns

In [None]:
df = pd.read_csv("loan.csv",usecols=['BsmtQual','FireplaceQu','GarageType','SalePrice'])

In [None]:
df.head()

In [None]:
df.isnull().sum()

In [None]:
df.shape

In [None]:
df.isnull().mean().sort_values(ascending = True)

In [None]:
df['BsmtQual'].value_counts().plot.bar()

In [None]:
df['FireplaceQu'].value_counts().plot.bar()

In [None]:
df['GarageType'].value_counts().plot.bar()

In [None]:
df['GarageType'].value_counts().index[0]

In [None]:
def impute_freq(df,variable):
    most_freq = df[variable].value_counts().index[0]
    df[variable] = df[variable].fillna(most_freq)

In [None]:
for feature in ['BsmtQual','FireplaceQu','GarageType']:
    impute_freq(df,feature)

In [None]:
df.head()

In [None]:
df.isnull().mean()

In [None]:
df.drop(['BsmtQual_freq','FireplaceQu_freq','GarageType_freq'],axis=1,inplace=True)

In [None]:
df.head()

## Adding a variable to capture NAN

In [None]:
df = pd.read_csv("loan.csv",usecols=['BsmtQual','FireplaceQu','GarageType','SalePrice'])
df.head()

In [None]:
df['BsmtQual_var'] = np.where(df['BsmtQual'].isnull(),1,0)

In [None]:
df.head()

In [None]:
def impute_new_feature(df,variable):
    df[variable+"_newvar"] = np.where(df[variable].isnull(),1,0)
    most_freq = df[variable].value_counts().index[0]
    df[variable] = df[variable].fillna(most_freq)

In [None]:
for feature in ['BsmtQual','FireplaceQu','GarageType']:
    impute_new_feature(df,feature)

In [None]:
df.head()

## Suppose we have more frequent categories, just replace NAN with new category

In [None]:
df = pd.read_csv("loan.csv",usecols=['BsmtQual','FireplaceQu','GarageType','SalePrice'])
df.head()

In [None]:
def impute_new_category(df,variable):
    df[variable+"_newvar"] = np.where(df[variable].isnull(),"Missing",df[variable])

In [None]:
for feature in ['BsmtQual','FireplaceQu','GarageType']:
    impute_new_category(df,feature)

In [None]:
df.head()

In [None]:
df.drop(['BsmtQual','FireplaceQu','GarageType'],axis=1,inplace=True)

In [None]:
df.head()

## One Hot Encoding (Nominal Features)

In [None]:
df = pd.read_csv("titanic.csv")
df.head()

In [None]:
df = pd.read_csv("titanic.csv", usecols = ["Sex","Embarked"])
df.head()

In [None]:
pd.get_dummies(df["Sex"],drop_first= True).head()

In [None]:
df["Embarked"].dropna(inplace = True)

In [None]:
pd.get_dummies(df["Embarked"],drop_first= True).head(10)

In [None]:
df.head(10)

### If we have many categories under single feature we follow top 10 approach

In [None]:
df= pd.read_csv("mercedes.csv")
df.head()

In [31]:
df= pd.read_csv("mercedes.csv" ,usecols =["X0","X1","X2","X3","X4","X5","X6"])
df.head()

<IPython.core.display.Javascript object>

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6
0,k,v,at,a,d,u,j
1,k,t,av,e,d,y,l
2,az,w,n,c,d,x,j
3,az,t,n,f,d,x,l
4,az,v,n,f,d,h,d


In [32]:
df["X0"].value_counts().head(10)

z     360
ak    349
y     324
ay    313
t     306
x     300
o     269
f     227
n     195
w     182
Name: X0, dtype: int64

In [33]:
for category in df.columns:
    print(len(df[category].value_counts()))
    

47
27
44
7
4
29
12


In [34]:
df["X1"].value_counts().head(10)

aa    833
s     598
b     592
l     590
v     408
r     251
i     203
a     143
c     121
o      82
Name: X1, dtype: int64

In [35]:
top_10_all=[]
for i in df.columns:
    top_10 = df[i].value_counts().head(10).index
    top_10= list(top_10)
    top_10_all.append(top_10)


In [36]:
top_10_all

[['z', 'ak', 'y', 'ay', 't', 'x', 'o', 'f', 'n', 'w'],
 ['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o'],
 ['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e'],
 ['c', 'f', 'a', 'd', 'g', 'e', 'b'],
 ['d', 'a', 'b', 'c'],
 ['v', 'w', 'q', 'r', 'd', 's', 'n', 'm', 'p', 'i'],
 ['g', 'j', 'd', 'i', 'l', 'a', 'h', 'k', 'c', 'b']]

In [None]:
top_10

In [27]:
def multi_encode(variable,top_10_all):
    print(top_10_all[0])
    for value in top_10_all:
        print(value)
        for categories in value:
            print(categories)
            print(df[variable])
            df[categories]= np.where(df[variable]== categories,1,0)
        value.append(variable)

In [28]:
for variable in ["X0","X1","X2","X3","X4","X5","X6"]:
    multi_encode(variable,top_10_all)
    

['z', 'ak', 'y', 'ay', 't', 'x', 'o', 'f', 'n', 'w']
['z', 'ak', 'y', 'ay', 't', 'x', 'o', 'f', 'n', 'w']
z
0       0
1       0
2       0
3       0
4       0
5       0
6       0
7       0
8       0
9       0
10      0
11      0
12      0
13      0
14      0
15      0
16      0
17      0
18      0
19      0
20      0
21      0
22      0
23      0
24      0
25      0
26      0
27      0
28      0
29      0
       ..
4179    0
4180    0
4181    0
4182    0
4183    0
4184    0
4185    0
4186    0
4187    0
4188    0
4189    0
4190    0
4191    0
4192    0
4193    0
4194    0
4195    0
4196    0
4197    0
4198    0
4199    0
4200    0
4201    0
4202    0
4203    0
4204    0
4205    0
4206    0
4207    0
4208    0
Name: X0, Length: 4209, dtype: int32


<IPython.core.display.Javascript object>

  result = method(y)


TypeError: invalid type comparison

In [10]:
top_10_all

[['z', 'ak', 'y', 'ay', 't', 'x', 'o', 'f', 'n', 'w'],
 ['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o'],
 ['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e'],
 ['c', 'f', 'a', 'd', 'g', 'e', 'b'],
 ['d', 'a', 'b', 'c'],
 ['v', 'w', 'q', 'r', 'd', 's', 'n', 'm', 'p', 'i'],
 ['g',
  'j',
  'd',
  'i',
  'l',
  'a',
  'h',
  'k',
  'c',
  'b',
  'X0',
  'X1',
  'X2',
  'X3',
  'X4',
  'X5',
  'X6']]