In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import seaborn as sns 
sns.set()

In [6]:
## Reading the dataset
train = pd.read_csv('F:/ML_DataScience_ImageProcessing/DataSets/Income_Qualification_Dataset/train.csv')
test = pd.read_csv('F:/ML_DataScience_ImageProcessing/DataSets/Income_Qualification_Dataset/test.csv')

In [7]:
print('Shape of train dataset {}'.format(train.shape))
print('Shape of test dataset {}'.format(test.shape))

Shape of train dataset (9557, 143)
Shape of test dataset (23856, 142)


In [8]:
train.head()

Unnamed: 0,Id,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q,v18q1,r4h1,...,SQBescolari,SQBage,SQBhogar_total,SQBedjefe,SQBhogar_nin,SQBovercrowding,SQBdependency,SQBmeaned,agesq,Target
0,ID_279628684,190000.0,0,3,0,1,1,0,,0,...,100,1849,1,100,0,1.0,0.0,100.0,1849,4
1,ID_f29eb3ddd,135000.0,0,4,0,1,1,1,1.0,0,...,144,4489,1,144,0,1.0,64.0,144.0,4489,4
2,ID_68de51c94,,0,8,0,1,1,0,,0,...,121,8464,1,0,0,0.25,64.0,121.0,8464,4
3,ID_d671db89c,180000.0,0,5,0,1,1,1,1.0,0,...,81,289,16,121,4,1.777778,1.0,121.0,289,4
4,ID_d56d6f5f5,180000.0,0,5,0,1,1,1,1.0,0,...,121,1369,16,121,4,1.777778,1.0,121.0,1369,4


In [9]:
test.head()

Unnamed: 0,Id,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q,v18q1,r4h1,...,age,SQBescolari,SQBage,SQBhogar_total,SQBedjefe,SQBhogar_nin,SQBovercrowding,SQBdependency,SQBmeaned,agesq
0,ID_2f6873615,,0,5,0,1,1,0,,1,...,4,0,16,9,0,1,2.25,0.25,272.25,16
1,ID_1c78846d2,,0,5,0,1,1,0,,1,...,41,256,1681,9,0,1,2.25,0.25,272.25,1681
2,ID_e5442cf6a,,0,5,0,1,1,0,,1,...,41,289,1681,9,0,1,2.25,0.25,272.25,1681
3,ID_a8db26a79,,0,14,0,1,1,1,1.0,0,...,59,256,3481,1,256,0,1.0,0.0,256.0,3481
4,ID_a62966799,175000.0,0,4,0,1,1,1,1.0,0,...,18,121,324,1,0,1,0.25,64.0,,324


In [10]:
## Understand the Types of Features 

print(train.dtypes.value_counts())

int64      130
float64      8
object       5
dtype: int64


In [13]:
## Identify the Target variable When We dont Know Which OneTo PRedict from Train and Test 

for i in train.columns:
    if i not in test.columns:
        print('Our target variable is ============ {}'.format(i))



In [14]:
## Exploring the Object Type Features From Dataset 
for i in train.columns:
    a = train[i].dtype
    if a == 'object':
        print(i)

Id
idhogar
dependency
edjefe
edjefa


* Id = Unique ID's which is alpha Numeric
* idhogar = Household level identifier
* dependency = Dependency rate, calculated = (number of members of the household younger than 19 or older than 64)/(number of member of household between 19 and 64)
* edjefe = years of education of male head of household, based on the interaction of escolari (years of education), head of household and gender, yes=1 and no=0
* edjefa = years of education of female head of household, based on the interaction of escolari (years of education), head of household and gender, yes=1 and no=0

In [15]:
## Drop the unnecessary Variables 
train.drop(['Id','idhogar'],axis=1,inplace=True)

In [16]:
## value Count for Dependency Feature 
train['dependency'].value_counts()

yes          2192
no           1747
.5           1497
2             730
1.5           713
.33333334     598
.66666669     487
8             378
.25           260
3             236
4             100
.75            98
.2             90
1.3333334      84
.40000001      84
2.5            77
5              24
3.5            18
1.25           18
.80000001      18
2.25           13
.71428573      12
1.75           11
1.2            11
.83333331      11
.22222222      11
.2857143        9
.60000002       8
1.6666666       8
.16666667       7
6               7
Name: dependency, dtype: int64

In [17]:
## mapping function for Features Which Contains 'Yes' and 'No' as Categorical Variables 
def mapper(i):
    
    if i=='yes':
        return (float(1))
    elif i == 'no':
        return (float(0))
    else:
        return (float(i))

In [18]:
train['dependency']=train['dependency'].apply(mapper)

In [19]:
train['edjefe']=train['edjefe'].apply(mapper)

train['edjefa']=train['edjefa'].apply(mapper)

In [20]:
## Identifying the variables with Zero Variance 
var_df = pd.DataFrame(np.var(train,0),columns=['variance'])
var_df.sort_values(by='variance').head(15)

print('Below are the columns with variance 0')

col = list((var_df[var_df['variance']==0]).index)

print(col)

Below are the columns with variance 0
['elimbasu5']


In [21]:
var_df.head()

Unnamed: 0,variance
v2a1,22628960000.0
hacdor,0.03663663
rooms,2.155917
hacapo,0.02308838
v14a,0.005204396


In [22]:
# Check If there are Any Biases in Dataset 
contingency_tab = pd.crosstab(train['r4t3'],train['hogar_total'])

observed_Val = contingency_tab.values

import scipy.stats


b = scipy.stats.chi2_contingency(contingency_tab)

expected_values = b[3]

no_of_rows = len(contingency_tab.iloc[0:2,0])
no_of_columns = len(contingency_tab.iloc[0,0:2])

df = (no_of_rows-1)*(no_of_columns-1)

print('Degree of Freedom:',df)

from scipy.stats import chi2

chi_square = sum([(o-e)**2./e for o,e in zip(observed_Val,expected_values)])
chi_square_statistic = chi_square[0]+chi_square[1]

print('chi_square_statistic: ', chi_square_statistic)
alpha = 0.05

critical_value = chi2.ppf(q=1-alpha,df=df)

p_value = 1-chi2.cdf(x=chi_square_statistic,df=df)

print('p-value',p_value)
print('Significance Level:',alpha)
print('Degree of Freedom:',df)
print('chi_square_statistic',chi_square_statistic)
print('critical_value',critical_value)

if chi_square_statistic>=critical_value:
    print('Reject H0, There is a relationship between 2 categorical variables')
else:
    print('Retain H0, There is no relationship between 2 categorical variables')

if p_value<=alpha:
    print('Reject H0, There is a relationship between 2 categorical variables')
else:
    print('Retain H0, There is no relationship between 2 categorical variables')

Degree of Freedom: 1
chi_square_statistic:  17022.072400560897
p-value 0.0
Significance Level: 0.05
Degree of Freedom: 1
chi_square_statistic 17022.072400560897
critical_value 3.841458820694124
Reject H0, There is a relationship between 2 categorical variables
Reject H0, There is a relationship between 2 categorical variables


In [23]:
contingency_tab = pd.crosstab(train['tipovivi3'],train['v2a1'])

observed_Val = contingency_tab.values

import scipy.stats


b = scipy.stats.chi2_contingency(contingency_tab)

expected_values = b[3]

no_of_rows = len(contingency_tab.iloc[0:2,0])
no_of_columns = len(contingency_tab.iloc[0,0:2])

df = (no_of_rows-1)*(no_of_columns-1)

print('Degree of Freedom:',df)

from scipy.stats import chi2

chi_square = sum([(o-e)**2./e for o,e in zip(observed_Val,expected_values)])
chi_square_statistic = chi_square[0]+chi_square[1]

print('chi_square_statistic: ', chi_square_statistic)
alpha = 0.05

critical_value = chi2.ppf(q=1-alpha,df=df)

p_value = 1-chi2.cdf(x=chi_square_statistic,df=df)

print('p-value',p_value)
print('Significance Level:',alpha)
print('Degree of Freedom:',df)
print('chi_square_statistic',chi_square_statistic)
print('critical_value',critical_value)

if chi_square_statistic>=critical_value:
    print('Reject H0, There is a relationship between 2 categorical variables')
else:
    print('Retain H0, There is no relationship between 2 categorical variables')

if p_value<=alpha:
    print('Reject H0, There is a relationship between 2 categorical variables')
else:
    print('Retain H0, There is no relationship between 2 categorical variables')

Degree of Freedom: 1
chi_square_statistic:  54.04781105990782
p-value 1.9562129693895258e-13
Significance Level: 0.05
Degree of Freedom: 1
chi_square_statistic 54.04781105990782
critical_value 3.841458820694124
Reject H0, There is a relationship between 2 categorical variables
Reject H0, There is a relationship between 2 categorical variables


In [24]:
contingency_tab = pd.crosstab(train['v18q'],train['v18q1'])

observed_Val = contingency_tab.values

import scipy.stats


b = scipy.stats.chi2_contingency(contingency_tab)

expected_values = b[3]

no_of_rows = len(contingency_tab.iloc[0:2,0])
no_of_columns = len(contingency_tab.iloc[0,0:2])

df = (no_of_rows-1)*(no_of_columns-1)

print('Degree of Freedom:',df)

from scipy.stats import chi2

chi_square = sum([(o-e)**2./e for o,e in zip(observed_Val,expected_values)])
chi_square_statistic = chi_square[0]+chi_square[1]

print('chi_square_statistic: ', chi_square_statistic)
alpha = 0.05

critical_value = chi2.ppf(q=1-alpha,df=df)

p_value = 1-chi2.cdf(x=chi_square_statistic,df=df)

print('p-value',p_value)
print('Significance Level:',alpha)
print('Degree of Freedom:',df)
print('chi_square_statistic',chi_square_statistic)
print('critical_value',critical_value)

if chi_square_statistic>=critical_value:
    print('Reject H0, There is a relationship between 2 categorical variables')
else:
    print('Retain H0, There is no relationship between 2 categorical variables')

if p_value<=alpha:
    print('Reject H0, There is a relationship between 2 categorical variables')
else:
    print('Retain H0, There is no relationship between 2 categorical variables')

Degree of Freedom: 0
chi_square_statistic:  0.0
p-value nan
Significance Level: 0.05
Degree of Freedom: 0
chi_square_statistic 0.0
critical_value nan
Retain H0, There is no relationship between 2 categorical variables
Retain H0, There is no relationship between 2 categorical variables


 * Conclusion There Is Bias In Dataset 

In [25]:
train['parentesco1'].value_counts()

0    6584
1    2973
Name: parentesco1, dtype: int64

In [26]:
pd.crosstab(train['edjefe'],train['edjefa'])

edjefa,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,12.0,13.0,14.0,15.0,16.0,17.0,18.0,19.0,20.0,21.0
edjefe,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,435,69,84,152,136,176,947,179,217,237,...,72,52,120,188,113,76,3,4,2,5
1.0,123,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2.0,194,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3.0,307,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4.0,137,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5.0,222,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6.0,1845,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7.0,234,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8.0,257,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9.0,486,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


* Count How many Values Are  null in the colummns 

In [27]:
train.isna().sum().value_counts()

0       136
5         2
7928      1
6860      1
7342      1
dtype: int64

In [28]:
train['Target'].isna().sum()

0

In [29]:
float_col = []

for i in train.columns:
    a = train[i].dtype
    
    if a == 'float64':
        float_col.append(i)

print(float_col)

['v2a1', 'v18q1', 'rez_esc', 'dependency', 'edjefe', 'edjefa', 'meaneduc', 'overcrowding', 'SQBovercrowding', 'SQBdependency', 'SQBmeaned']


In [30]:
train[float_col].isna().sum()

v2a1               6860
v18q1              7342
rez_esc            7928
dependency            0
edjefe                0
edjefa                0
meaneduc              5
overcrowding          0
SQBovercrowding       0
SQBdependency         0
SQBmeaned             5
dtype: int64

In [31]:
train['v18q1'].value_counts()

1.0    1586
2.0     444
3.0     129
4.0      37
5.0      13
6.0       6
Name: v18q1, dtype: int64

In [32]:
pd.crosstab(train['tipovivi1'],train['v2a1'])

v2a1,0.0,12000.0,13000.0,14000.0,15000.0,16000.0,17000.0,20000.0,23000.0,25000.0,...,570540.0,600000.0,620000.0,684648.0,700000.0,770229.0,800000.0,855810.0,1000000.0,2353477.0
tipovivi1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,29,3,4,3,3,2,4,22,5,21,...,25,11,3,3,7,3,4,11,7,2


In [33]:
pd.crosstab(train['v18q1'],train['v18q'])

v18q,1
v18q1,Unnamed: 1_level_1
1.0,1586
2.0,444
3.0,129
4.0,37
5.0,13
6.0,6


* 'v2a1','v18q1','rez_esc' have more than 50% null values.

* Because of v18q1 there are families with their own house so they won't pay rent

* Dropping tipovivi=1 & vi8q

In [34]:
train.drop(['tipovivi3','v18q','rez_esc','elimbasu5'],axis=1,inplace=True)

## Filing the Na values with 0 or the mean values of the Columns accoding to the data 

In [35]:
train['v2a1'].fillna(0,inplace=True)
train['v18q1'].fillna(0,inplace=True)
train['meaneduc'].fillna(np.mean(train['meaneduc']),inplace=True)
train['SQBmeaned'].fillna(np.mean(train['SQBmeaned']),inplace=True)

In [36]:
int_col = []

for i in train.columns:
    a = train[i].dtype
    
    if a == 'int64':
        int_col.append(i)

print(int_col)

['hacdor', 'rooms', 'hacapo', 'v14a', 'refrig', 'r4h1', 'r4h2', 'r4h3', 'r4m1', 'r4m2', 'r4m3', 'r4t1', 'r4t2', 'r4t3', 'tamhog', 'tamviv', 'escolari', 'hhsize', 'paredblolad', 'paredzocalo', 'paredpreb', 'pareddes', 'paredmad', 'paredzinc', 'paredfibras', 'paredother', 'pisomoscer', 'pisocemento', 'pisoother', 'pisonatur', 'pisonotiene', 'pisomadera', 'techozinc', 'techoentrepiso', 'techocane', 'techootro', 'cielorazo', 'abastaguadentro', 'abastaguafuera', 'abastaguano', 'public', 'planpri', 'noelec', 'coopele', 'sanitario1', 'sanitario2', 'sanitario3', 'sanitario5', 'sanitario6', 'energcocinar1', 'energcocinar2', 'energcocinar3', 'energcocinar4', 'elimbasu1', 'elimbasu2', 'elimbasu3', 'elimbasu4', 'elimbasu6', 'epared1', 'epared2', 'epared3', 'etecho1', 'etecho2', 'etecho3', 'eviv1', 'eviv2', 'eviv3', 'dis', 'male', 'female', 'estadocivil1', 'estadocivil2', 'estadocivil3', 'estadocivil4', 'estadocivil5', 'estadocivil6', 'estadocivil7', 'parentesco1', 'parentesco2', 'parentesco3', 'pa

In [37]:
train[int_col].isna().sum().value_counts()

0    127
dtype: int64

In [38]:
train['Target'].value_counts()

4    5996
2    1597
3    1209
1     755
Name: Target, dtype: int64

In [39]:
povlvl = train[train['v2a1'] != 0]

In [40]:
plevel = povlvl.groupby('area1')['v2a1'].apply(np.median)

In [41]:
plevel

area1
0     80000.0
1    140000.0
Name: v2a1, dtype: float64

* For rural area ppl paying less than 80000 is under poverty level
* For urban area ppl paying less than 140000 is under poverty level

In [42]:
def povl(x):
    if x<80000:
        return('Below Poverty Level')
    elif x>140000:
        return('Above Poverty Level')
    elif x<140000:
        return('Below Poverty Level: Urban, Above Poverty Level: Rural')

In [43]:
c = povlvl['area1'].apply(povl)

In [44]:
c.shape

(2668,)

In [45]:
pd.crosstab(c,povlvl['area1'])

area1,0,1
area1,Unnamed: 1_level_1,Unnamed: 2_level_1
0,445,0
1,0,2223


In [46]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [47]:
X_data = train.drop('Target',axis=1)
Y_data = train['Target']

In [48]:
X_data_col = X_data.columns

In [49]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

X_data_1 = ss.fit_transform(X_data)
X_data_1 = pd.DataFrame(X_data_1,columns=X_data_col)

In [54]:
#### Now we will proceed on to model fitting

X_train,X_test,y_train,y_test = train_test_split(X_data_1,Y_data,test_size=0.3,random_state=42)

## Grid Search and Pipeline will Help in model Selection in iterative Fashion  

In [55]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [56]:
rfc = RandomForestClassifier(random_state=0)


parameters = {'n_estimators':[10,50,100,300],'max_depth':[3,5,10,15]}

grid = zip([rfc],[parameters])

best_ = None

for i, j in grid:
    a = GridSearchCV(i,param_grid=j,cv=3,n_jobs=1)
    a.fit(X_train,y_train)
    if best_ is None:
        best_=a
    elif a.best_score_ > best_.best_score_:
        best_=a

print('Best CV score: ',best_.best_score_)
print('Model Parameters: ', best_.best_params_)
print('Best Estimator: ', best_.best_estimator_)



Best CV score:  0.8415303503685955
Model Parameters:  {'max_depth': 15, 'n_estimators': 100}
Best Estimator:  RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=15, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)


* According to Best Cross Validation Score the model parameters Will be Selected. 

In [57]:
RFC = best_.best_estimator_
model = RFC.fit(X_train,y_train)
pred = model.predict(X_test)

In [58]:
print('Model score of training data : {}'.format(model.score(X_train,y_train)))
print('Model score of test data : {}'.format(model.score(X_test,y_test)))

Model score of training data : 0.983704589624757
Model score of test data : 0.8783124128312413


In [59]:
feature_importance = pd.DataFrame(model.feature_importances_,X_data_col,columns=['feature_importance'])

In [60]:
Top50Features = feature_importance.sort_values(by='feature_importance',ascending=False).head(50)

In [61]:
Top50Features

Unnamed: 0,feature_importance
meaneduc,0.054295
SQBmeaned,0.051607
SQBdependency,0.031485
dependency,0.029908
SQBovercrowding,0.02635
overcrowding,0.022996
qmobilephone,0.022713
SQBhogar_nin,0.022431
SQBedjefe,0.022054
hogar_nin,0.021978


In [62]:
from sklearn.metrics import accuracy_score,confusion_matrix

In [63]:
confusion_matrix(y_test,pred)

array([[ 179,   12,    0,   53],
       [   5,  361,    9,   97],
       [   2,   20,  222,  136],
       [   0,   11,    4, 1757]], dtype=int64)

In [64]:
accuracy_score(y_test,pred)

0.8783124128312413