# Costa Rican Household Poverty Level Prediction Problem

In [2]:
# import the libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Load the train and test CSV files.
# Raw strings keep every backslash literal. The original literals mixed escaped
# backslashes with a bare "\C", which is an invalid escape sequence that recent
# Python versions warn about. The resulting paths are unchanged.
# NOTE(review): these are absolute paths on one specific machine — consider
# reading from a configurable data directory instead.
train_data = pd.read_csv(r'C:\Users\CityComp\Desktop\train.csv')
test_data  = pd.read_csv(r'C:\Users\CityComp\Desktop\test.csv')

In [4]:
# Report the (rows, columns) shape of the training frame
train_shape = train_data.shape
print("Train Dataset: Rows, Columns: ", train_shape)

Train Dataset: Rows, Columns:  (9557, 143)


In [5]:
# Quick overview of the training data.
# info() prints its summary itself; of the bare expressions below only the
# last one (the first rows) is rendered as the cell output.
train_data.info()
train_data.describe()
train_data.columns[train_data.dtypes.eq(object)]
train_data.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9557 entries, 0 to 9556
Columns: 143 entries, Id to Target
dtypes: float64(8), int64(130), object(5)
memory usage: 10.4+ MB


Unnamed: 0,Id,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q,v18q1,r4h1,...,SQBescolari,SQBage,SQBhogar_total,SQBedjefe,SQBhogar_nin,SQBovercrowding,SQBdependency,SQBmeaned,agesq,Target
0,ID_279628684,190000.0,0,3,0,1,1,0,,0,...,100,1849,1,100,0,1.0,0.0,100.0,1849,4
1,ID_f29eb3ddd,135000.0,0,4,0,1,1,1,1.0,0,...,144,4489,1,144,0,1.0,64.0,144.0,4489,4
2,ID_68de51c94,,0,8,0,1,1,0,,0,...,121,8464,1,0,0,0.25,64.0,121.0,8464,4
3,ID_d671db89c,180000.0,0,5,0,1,1,1,1.0,0,...,81,289,16,121,4,1.777778,1.0,121.0,289,4
4,ID_d56d6f5f5,180000.0,0,5,0,1,1,1,1.0,0,...,121,1369,16,121,4,1.777778,1.0,121.0,1369,4


# We have to correct the errors in the data to get better results

In [7]:


# Group by household and check whether all of its members share one Target value.
# Result: one boolean per household id (True = labels are consistent).
all_equal = train_data.groupby('idhogar')['Target'].apply(lambda x: x.nunique() == 1)

# Households where targets are not all equal
not_equal = all_equal[~all_equal]
print('There are {} households where the family members do not all have the same target.'.format(len(not_equal)))

# Iterate through each inconsistent household
for household in not_equal.index:
    in_household = train_data['idhogar'] == household

    # The label of the head of household (parentesco1 == 1) is taken as the correct one
    head_targets = train_data.loc[in_household & (train_data['parentesco1'] == 1.0), 'Target']
    if head_targets.empty:
        # No head of household recorded: there is no reference label, so the
        # household is left untouched instead of failing on int(<empty Series>).
        continue
    # .iloc[0] picks a scalar explicitly; calling int() on a Series is deprecated
    # and raises when the selection does not hold exactly one row.
    true_target = int(head_targets.iloc[0])

    # Set the correct label for all members in the household
    train_data.loc[in_household, 'Target'] = true_target

There are 0 households where the family members do not all have the same target.


In [8]:
# Deal with missing data (for non-object dtypes)
# Step 1: flag every column that holds at least one missing value (NaN).
# (Bare expression in the middle of the cell: evaluated but not displayed.)
train_data.isnull().any()
# Count of missing entries in the 'rez_esc' column — shown as the cell output
train_data['rez_esc'].isna().sum()


7928

In [9]:
# Rank the training columns by their number of missing values, largest first
print("Top Columns having missing values")
train_missing_counts = train_data.isnull().sum()
missmap = train_missing_counts.to_frame().sort_values(by=0, ascending=False)
missmap.head()

Top Columns having missing values


Unnamed: 0,0
rez_esc,7928
v18q1,7342
v2a1,6860
SQBmeaned,5
meaneduc,5


In [10]:
# Missing-value handling for the training data.
#  - rez_esc, v18q1 and v2a1 are missing in most of the 9557 rows
#    (7928, 7342 and 6860 missing values respectively), so they are dropped.
#  - meaneduc and SQBmeaned have only a handful of gaps, which are filled with zero.
from sklearn.impute import SimpleImputer

# `sklearn.preprocessing.Imputer` has been removed from scikit-learn;
# `SimpleImputer` is its replacement. It always works column-wise, so the old
# `axis=0` argument has no equivalent, and NaN is passed as np.nan instead of 'NaN'.
# Mean imputer, available to any later cell that needs mean imputation.
imputer1 = SimpleImputer(missing_values=np.nan, strategy='mean')

# The original cell imputed v2a1 / v18q1 / rez_esc and then dropped those same
# columns a few lines later; the imputation had no effect on the final frame,
# so only the drop is kept (as a single call).
train_data = train_data.drop(columns=['v18q1', 'rez_esc', 'v2a1'])

# Columns with few missing values: fill the gaps with zero
train_data['meaneduc'] = train_data['meaneduc'].fillna(0.0)
train_data['SQBmeaned'] = train_data['SQBmeaned'].fillna(0.0)

# For object data types, we have 3 columns with categorical feature data: dependency, edjefe, edjefa


In [11]:
# Encode the 'yes' / 'no' markers of the three mixed-type columns as 1 / 0.
# Any other value in these columns is left as it is.
for mixed_column in ('dependency', 'edjefe', 'edjefa'):
    train_data[mixed_column] = train_data[mixed_column].replace('yes', 1).replace('no', 0)
# Pre-processing of the training data ends here

In [12]:
# Test data: rank the columns by their number of missing values, largest first
print ("Top Columns having missing values")
test_missing_counts = test_data.isnull().sum()
missmap_test = test_missing_counts.to_frame().sort_values(by=0, ascending=False)
missmap_test.head()


Top Columns having missing values


Unnamed: 0,0
rez_esc,19653
v18q1,18126
v2a1,17403
meaneduc,31
SQBmeaned,31


In [13]:
# Missing-value handling for the test data, mirroring the training data.
# The three mostly-missing columns are dropped, exactly as for the training frame.
test_data = test_data.drop(columns=['v18q1', 'rez_esc', 'v2a1'])

# The training frame fills the few gaps in meaneduc / SQBmeaned with 0.0.
# The test frame previously used the test-set mean instead, so a missing value
# was encoded differently at fit time and at predict time. Use the same zero
# fill here; this also removes the dependency on an imputer object created in
# another cell.
test_data['meaneduc'] = test_data['meaneduc'].fillna(0.0)
test_data['SQBmeaned'] = test_data['SQBmeaned'].fillna(0.0)

# For object data types, we have 3 columns with categorical features: dependency, edjefe, edjefa


In [14]:
# Encode the 'yes' / 'no' markers of the three mixed-type test columns as 1 / 0.
# Any other value in these columns is left as it is.
for mixed_column in ('dependency', 'edjefe', 'edjefa'):
    test_data[mixed_column] = test_data[mixed_column].replace('yes', 1).replace('no', 0)

In [15]:
# Arrange the dataset: separate the features from the label.
# Columns are selected by name rather than by position (the original used the
# magic indices 1:139 / 139, which silently break if the column layout changes).
# Building new frames with drop() also avoids the in-place drop on an iloc
# slice, which can trigger pandas' SettingWithCopyWarning.
# 'Id' is the row identifier and 'idhogar' the household identifier; neither is
# used as a feature. 'Target' is the label.
identifier_columns = ['Id', 'idhogar']

X_train = train_data.drop(columns=identifier_columns + ['Target'])
y_train = train_data['Target']

X_test = test_data.drop(columns=identifier_columns)

# The classifier matches features by position, so both frames must expose the
# same columns in the same order.
assert list(X_train.columns) == list(X_test.columns), "train/test feature columns differ"

# Run Naive Bayes algorithm With accuracy = 0.375

In [16]:
# Run Naive Bayes algorithm
# NOTE: this experiment is disabled. The code is wrapped in a triple-quoted string,
# so the cell only evaluates a string literal (echoed back as the cell output);
# no model is trained and `classifier` / `y_pred` are not assigned here.
# The text inside the string fits a GaussianNB on X_train / y_train and predicts X_test.

'''from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)


# Predicting the Test set results
y_pred = classifier.predict(X_test)'''


'from sklearn.naive_bayes import GaussianNB\nclassifier = GaussianNB()\nclassifier.fit(X_train, y_train)\n\n\n# Predicting the Test set results\ny_pred = classifier.predict(X_test)'

# Run K-neighbors Algorithm With accuracy = 0.316

In [17]:
# --- Run K-neighbors Algorithm ---
# NOTE: this experiment is disabled. The code is wrapped in a triple-quoted string,
# so the cell only evaluates a string literal (echoed back as the cell output);
# no model is trained and `classifier` / `y_pred` are not assigned here.
# The text inside the string fits a 5-neighbor KNeighborsClassifier
# (metric 'minkowski', p = 2) on X_train / y_train and predicts X_test.
'''from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5, metric= 'minkowski', p = 2)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)'''

"from sklearn.neighbors import KNeighborsClassifier\nclassifier = KNeighborsClassifier(n_neighbors=5, metric= 'minkowski', p = 2)\nclassifier.fit(X_train, y_train)\ny_pred = classifier.predict(X_test)"

# Random forest Algorithm With accuracy = 0.380

In [20]:
# --- Run Random Forest algorithm ---
# Random Forest is a supervised learning algorithm usable for both classification
# and regression. It builds an ensemble ("forest") of decision trees, usually
# trained with the bagging method, and merges their votes to obtain a more
# accurate and stable prediction than a single tree.
from sklearn.ensemble import RandomForestClassifier

# A fixed random_state makes the bootstrap samples and feature choices, and
# therefore the predictions written to the submission, reproducible across
# "Restart & Run All". The original cell was unseeded.
classifier = RandomForestClassifier(n_estimators=5, criterion='entropy', random_state=42)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
# The author reported an accuracy of 0.380 for the original, unseeded run;
# the score for this fixed seed has not been re-measured.

In [21]:
# Build the submission file: one row per test record, pairing its Id with the
# predicted Target, written to 'submission.csv' without the index column.
Y_id = test_data['Id']
sbt = pd.DataFrame({'Id': Y_id}).assign(Target=y_pred)
sbt.to_csv(path_or_buf='submission.csv', index=False)