# Handling missing values using all techniques (Categorical features)

## Frequent category imputation

In [None]:
df = pd.read_csv("loan.csv")
df.head()

In [None]:
df.columns

In [None]:
df = pd.read_csv("loan.csv",usecols=['BsmtQual','FireplaceQu','GarageType','SalePrice'])

In [None]:
df.head()

In [None]:
df.isnull().sum()

In [None]:
df.shape

In [None]:
# Fraction of missing values per column, listed from least to most missing.
df.isnull().mean().sort_values(ascending = True)

In [None]:
df['BsmtQual'].value_counts().plot.bar()

In [None]:
df['FireplaceQu'].value_counts().plot.bar()

In [None]:
df['GarageType'].value_counts().plot.bar()

In [None]:
df['GarageType'].value_counts().index[0]

In [None]:
def impute_freq(df,variable):
    """Fill the missing entries of ``df[variable]`` with its most frequent value.

    The column is overwritten in place on ``df``; nothing is returned.
    """
    # value_counts() lists labels by descending count, so the first index
    # entry is the most frequent label.
    counts = df[variable].value_counts()
    df[variable] = df[variable].fillna(counts.index[0])

In [None]:
# Apply most-frequent-value imputation to each of the three categorical columns.
for feature in ['BsmtQual','FireplaceQu','GarageType']:
    impute_freq(df,feature)

In [None]:
df.head()

In [None]:
df.isnull().mean()

In [None]:
# impute_freq overwrites the original columns rather than creating "*_freq"
# copies, so these labels are normally absent; errors='ignore' keeps this
# cleanup cell from raising KeyError in that case.
df.drop(['BsmtQual_freq','FireplaceQu_freq','GarageType_freq'],axis=1,inplace=True,errors='ignore')

In [None]:
df.head()

## Adding a variable to capture NAN

In [None]:
df = pd.read_csv("loan.csv",usecols=['BsmtQual','FireplaceQu','GarageType','SalePrice'])
df.head()

In [None]:
# Indicator column: 1 where BsmtQual is missing, 0 otherwise.
df['BsmtQual_var'] = np.where(df['BsmtQual'].isnull(),1,0)

In [None]:
df.head()

In [None]:
def impute_new_feature(df,variable):
    """Flag missing values in a new column, then fill them with the most frequent value.

    Adds ``<variable>_newvar`` (1 where the value was missing, 0 otherwise) and
    overwrites ``df[variable]`` in place; nothing is returned.
    """
    column = df[variable]
    # Record missingness before filling; afterwards there is nothing left to flag.
    df[variable+"_newvar"] = np.where(column.isnull(),1,0)
    top_label = column.value_counts().index[0]
    df[variable] = column.fillna(top_label)

In [None]:
# Add a missing-value indicator and fill the gaps for each of the three columns.
for feature in ['BsmtQual','FireplaceQu','GarageType']:
    impute_new_feature(df,feature)

In [None]:
df.head()

## If several categories are frequent, replace NaN with a new category

In [None]:
df = pd.read_csv("loan.csv",usecols=['BsmtQual','FireplaceQu','GarageType','SalePrice'])
df.head()

In [None]:
def impute_new_category(df,variable):
    """Add ``<variable>_newvar``: the column's values with missing entries labelled "Missing".

    The original column is left untouched; nothing is returned.
    """
    source = df[variable]
    is_missing = source.isnull()
    df[variable+"_newvar"] = np.where(is_missing,"Missing",source)

In [None]:
# Create a "<feature>_newvar" column for each of the three categorical columns.
for feature in ['BsmtQual','FireplaceQu','GarageType']:
    impute_new_category(df,feature)

In [None]:
df.head()

In [None]:
# Remove the original columns so only their "*_newvar" versions remain.
df.drop(['BsmtQual','FireplaceQu','GarageType'],axis=1,inplace=True)

In [None]:
df.head()

## One Hot Encoding (Nominal Features)

In [None]:
df = pd.read_csv("titanic.csv")
df.head()

In [None]:
df = pd.read_csv("titanic.csv", usecols = ["Sex","Embarked"])
df.head()

In [None]:
# drop_first=True omits the first level, giving one fewer indicator column than categories.
pd.get_dummies(df["Sex"],drop_first= True).head()

In [None]:
# Calling dropna(inplace=True) on the extracted Series does not remove rows
# from df; drop the rows with a missing Embarked value on the DataFrame itself.
df.dropna(subset=["Embarked"], inplace=True)

In [None]:
pd.get_dummies(df["Embarked"],drop_first= True).head(10)

In [None]:
df.head(10)

### If a single feature has many categories, one-hot encode only its 10 most frequent ones

In [4]:
df= pd.read_csv("mercedes.csv" ,usecols =["X0","X1","X2","X3","X4","X5","X6"])
df.head()

<IPython.core.display.Javascript object>

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6
0,k,v,at,a,d,u,j
1,k,t,av,e,d,y,l
2,az,w,n,c,d,x,j
3,az,t,n,f,d,x,l
4,az,v,n,f,d,h,d


In [5]:
df["X0"].value_counts().head(10)

z     360
ak    349
y     324
ay    313
t     306
x     300
o     269
f     227
n     195
w     182
Name: X0, dtype: int64

In [6]:
# Print how many distinct (non-missing) values each column holds.
for category in df.columns:
    distinct_counts = df[category].value_counts()
    print(len(distinct_counts))
    

47
27
44
7
4
29
12


In [7]:
df["X1"].value_counts().head(10)

aa    833
s     598
b     592
l     590
v     408
r     251
i     203
a     143
c     121
o      82
Name: X1, dtype: int64

In [8]:
#lst_10=df.X1.value_counts().sort_values(ascending=False).head(10).index
#lst_10=list(lst_10)

In [9]:
#for categories in lst_10:
 #  df[categories]=np.where(df['X1']==categories,1,0)

In [10]:
#print(type(df['X1']))
#print(type(categories))

In [11]:
#lst_10.append('X1')
#df[lst_10]

In [12]:
# For every column, collect its (up to) ten most frequent labels, most
# frequent first; the result is one list per column, in df.columns order.
top_10_all = []
for i in df.columns:
    top_10 = df[i].value_counts().head(10).index.tolist()
    top_10_all.append(top_10)

In [13]:
top_10_all

[['z', 'ak', 'y', 'ay', 't', 'x', 'o', 'f', 'n', 'w'],
 ['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o'],
 ['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e'],
 ['c', 'f', 'a', 'd', 'g', 'e', 'b'],
 ['d', 'a', 'c', 'b'],
 ['v', 'w', 'q', 'r', 'd', 's', 'n', 'p', 'm', 'i'],
 ['g', 'j', 'd', 'i', 'l', 'a', 'h', 'k', 'c', 'b']]

In [14]:
def multi_encode(variable,top_10_all):
    """One-hot encode ``variable`` against the first label list in ``top_10_all``.

    For each label in ``top_10_all[0]`` a 0/1 column named after that label is
    written to the module-level ``df``. The label list and the new columns
    (followed by the source column) are printed, and the consumed list is then
    removed from the front of ``top_10_all`` so the next call sees the next
    list.

    Note: columns are named by label only, so a label that appears in the
    lists of two features is overwritten by the later call.

    Args:
        variable: name of the column in ``df`` to encode.
        top_10_all: list of label lists; its first entry is used and removed.
    """
    labels = top_10_all[0]
    print(labels)

    for value in labels:
        df[value] = np.where(df[variable] == value, 1, 0)

    # Build the display selection as a new list rather than appending the
    # column name to the caller's label list.
    print(df[labels + [variable]])

    # Consume this feature's labels in place; callers looping over several
    # features rely on the list shrinking from the front.
    top_10_all.pop(0)

In [15]:
# Each call uses and then removes the first entry of top_10_all, so the
# features must be listed in the same order in which their label lists were collected.
for variable in ["X0","X1","X2","X3","X4","X5","X6"]:
    multi_encode(variable,top_10_all)

['z', 'ak', 'y', 'ay', 't', 'x', 'o', 'f', 'n', 'w']


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

      z  ak  y  ay  t  x  o  f  n  w  X0
0     0   0  0   0  0  0  0  0  0  0   k
1     0   0  0   0  0  0  0  0  0  0   k
2     0   0  0   0  0  0  0  0  0  0  az
3     0   0  0   0  0  0  0  0  0  0  az
4     0   0  0   0  0  0  0  0  0  0  az
...  ..  .. ..  .. .. .. .. .. .. ..  ..
4204  0   1  0   0  0  0  0  0  0  0  ak
4205  0   0  0   0  0  0  0  0  0  0   j
4206  0   1  0   0  0  0  0  0  0  0  ak
4207  0   0  0   0  0  0  0  0  0  0  al
4208  1   0  0   0  0  0  0  0  0  0   z

[4209 rows x 11 columns]
['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o']


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

      aa  s  b  l  v  r  i  a  c  o X1
0      0  0  0  0  1  0  0  0  0  0  v
1      0  0  0  0  0  0  0  0  0  0  t
2      0  0  0  0  0  0  0  0  0  0  w
3      0  0  0  0  0  0  0  0  0  0  t
4      0  0  0  0  1  0  0  0  0  0  v
...   .. .. .. .. .. .. .. .. .. .. ..
4204   0  1  0  0  0  0  0  0  0  0  s
4205   0  0  0  0  0  0  0  0  0  1  o
4206   0  0  0  0  1  0  0  0  0  0  v
4207   0  0  0  0  0  1  0  0  0  0  r
4208   0  0  0  0  0  1  0  0  0  0  r

[4209 rows x 11 columns]
['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e']


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

      as  ae  ai  m  ak  r  n  s  f  e  X2
0      0   0   0  0   0  0  0  0  0  0  at
1      0   0   0  0   0  0  0  0  0  0  av
2      0   0   0  0   0  0  1  0  0  0   n
3      0   0   0  0   0  0  1  0  0  0   n
4      0   0   0  0   0  0  1  0  0  0   n
...   ..  ..  .. ..  .. .. .. .. .. ..  ..
4204   1   0   0  0   0  0  0  0  0  0  as
4205   0   0   0  0   0  0  0  0  0  0   t
4206   0   0   0  0   0  1  0  0  0  0   r
4207   0   0   0  0   0  0  0  0  0  1   e
4208   0   1   0  0   0  0  0  0  0  0  ae

[4209 rows x 11 columns]
['c', 'f', 'a', 'd', 'g', 'e', 'b']


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

      c  f  a  d  g  e  b X3
0     0  0  1  0  0  0  0  a
1     0  0  0  0  0  1  0  e
2     1  0  0  0  0  0  0  c
3     0  1  0  0  0  0  0  f
4     0  1  0  0  0  0  0  f
...  .. .. .. .. .. .. .. ..
4204  1  0  0  0  0  0  0  c
4205  0  0  0  1  0  0  0  d
4206  0  0  1  0  0  0  0  a
4207  0  1  0  0  0  0  0  f
4208  1  0  0  0  0  0  0  c

[4209 rows x 8 columns]
['d', 'a', 'c', 'b']


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

      d  a  c  b X4
0     1  0  0  0  d
1     1  0  0  0  d
2     1  0  0  0  d
3     1  0  0  0  d
4     1  0  0  0  d
...  .. .. .. .. ..
4204  1  0  0  0  d
4205  1  0  0  0  d
4206  1  0  0  0  d
4207  1  0  0  0  d
4208  1  0  0  0  d

[4209 rows x 5 columns]
['v', 'w', 'q', 'r', 'd', 's', 'n', 'p', 'm', 'i']


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

      v  w  q  r  d  s  n  p  m  i  X5
0     0  0  0  0  0  0  0  0  0  0   u
1     0  0  0  0  0  0  0  0  0  0   y
2     0  0  0  0  0  0  0  0  0  0   x
3     0  0  0  0  0  0  0  0  0  0   x
4     0  0  0  0  0  0  0  0  0  0   h
...  .. .. .. .. .. .. .. .. .. ..  ..
4204  0  0  0  0  0  0  0  0  0  0  aa
4205  0  0  0  0  0  0  0  0  0  0  aa
4206  0  0  0  0  0  0  0  0  0  0  aa
4207  0  0  0  0  0  0  0  0  0  0  aa
4208  0  0  0  0  0  0  0  0  0  0  aa

[4209 rows x 11 columns]
['g', 'j', 'd', 'i', 'l', 'a', 'h', 'k', 'c', 'b']


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

      g  j  d  i  l  a  h  k  c  b X6
0     0  1  0  0  0  0  0  0  0  0  j
1     0  0  0  0  1  0  0  0  0  0  l
2     0  1  0  0  0  0  0  0  0  0  j
3     0  0  0  0  1  0  0  0  0  0  l
4     0  0  1  0  0  0  0  0  0  0  d
...  .. .. .. .. .. .. .. .. .. .. ..
4204  0  0  1  0  0  0  0  0  0  0  d
4205  0  0  0  0  0  0  1  0  0  0  h
4206  1  0  0  0  0  0  0  0  0  0  g
4207  0  0  0  0  1  0  0  0  0  0  l
4208  1  0  0  0  0  0  0  0  0  0  g

[4209 rows x 11 columns]


### Ordinal Number Encoding

In [37]:
import datetime

In [38]:
# Refer to the module by the name it is imported under; no `dt` alias is
# defined in the visible cells.
datetime.time

<IPython.core.display.Javascript object>

datetime.time

In [41]:
today_date = datetime.datetime.today()

In [42]:
today_date

datetime.datetime(2020, 8, 14, 13, 21, 21, 427841)

In [43]:
today_date - datetime.timedelta(2)

datetime.datetime(2020, 8, 12, 13, 21, 21, 427841)

In [44]:
# 15 timestamps stepping back one day at a time: today, yesterday, ..., 14 days ago.
new_day = [today_date - datetime.timedelta(i) for i in range(0,15)]

In [45]:
new_day

[datetime.datetime(2020, 8, 14, 13, 21, 21, 427841),
 datetime.datetime(2020, 8, 13, 13, 21, 21, 427841),
 datetime.datetime(2020, 8, 12, 13, 21, 21, 427841),
 datetime.datetime(2020, 8, 11, 13, 21, 21, 427841),
 datetime.datetime(2020, 8, 10, 13, 21, 21, 427841),
 datetime.datetime(2020, 8, 9, 13, 21, 21, 427841),
 datetime.datetime(2020, 8, 8, 13, 21, 21, 427841),
 datetime.datetime(2020, 8, 7, 13, 21, 21, 427841),
 datetime.datetime(2020, 8, 6, 13, 21, 21, 427841),
 datetime.datetime(2020, 8, 5, 13, 21, 21, 427841),
 datetime.datetime(2020, 8, 4, 13, 21, 21, 427841),
 datetime.datetime(2020, 8, 3, 13, 21, 21, 427841),
 datetime.datetime(2020, 8, 2, 13, 21, 21, 427841),
 datetime.datetime(2020, 8, 1, 13, 21, 21, 427841),
 datetime.datetime(2020, 7, 31, 13, 21, 21, 427841)]

In [46]:
pd.DataFrame(new_day)

<IPython.core.display.Javascript object>

Unnamed: 0,0
0,2020-08-14 13:21:21.427841
1,2020-08-13 13:21:21.427841
2,2020-08-12 13:21:21.427841
3,2020-08-11 13:21:21.427841
4,2020-08-10 13:21:21.427841
5,2020-08-09 13:21:21.427841
6,2020-08-08 13:21:21.427841
7,2020-08-07 13:21:21.427841
8,2020-08-06 13:21:21.427841
9,2020-08-05 13:21:21.427841


In [51]:
# Wrap the timestamps in a one-column DataFrame and name that column "Day".
data= pd.DataFrame(new_day)
data.columns = ["Day"]
data.head()

<IPython.core.display.Javascript object>

Unnamed: 0,Day
0,2020-08-14 13:21:21.427841
1,2020-08-13 13:21:21.427841
2,2020-08-12 13:21:21.427841
3,2020-08-11 13:21:21.427841
4,2020-08-10 13:21:21.427841


In [57]:
# Series.dt.weekday_name was removed in pandas 1.0; dt.day_name() returns the
# same English weekday names.
data["Weekday"] =  data["Day"].dt.day_name()
data.head(10)

Unnamed: 0,Day,Weekday
0,2020-08-14 13:21:21.427841,Friday
1,2020-08-13 13:21:21.427841,Thursday
2,2020-08-12 13:21:21.427841,Wednesday
3,2020-08-11 13:21:21.427841,Tuesday
4,2020-08-10 13:21:21.427841,Monday
5,2020-08-09 13:21:21.427841,Sunday
6,2020-08-08 13:21:21.427841,Saturday
7,2020-08-07 13:21:21.427841,Friday
8,2020-08-06 13:21:21.427841,Thursday
9,2020-08-05 13:21:21.427841,Wednesday


In [64]:
# Ordinal codes for the weekday names: Monday=1 through Sunday=7.
dictionary = {'Monday':1, 'Tuesday':2, 'Wednesday':3,'Thursday':4,'Friday':5,'Saturday':6,'Sunday':7}

In [65]:
# Replace each weekday name with its code from `dictionary`.
data["Weekday_ordinal"] = data["Weekday"].map(dictionary)

In [66]:
data.head()

Unnamed: 0,Day,Weekday,Weekday_ordinal
0,2020-08-14 13:21:21.427841,Friday,5
1,2020-08-13 13:21:21.427841,Thursday,4
2,2020-08-12 13:21:21.427841,Wednesday,3
3,2020-08-11 13:21:21.427841,Tuesday,2
4,2020-08-10 13:21:21.427841,Monday,1
