In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

#import numpy as np # linear algebra
#import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/costa-rican-household-poverty-prediction/test.csv.zip
/kaggle/input/costa-rican-household-poverty-prediction/sample_submission.csv
/kaggle/input/costa-rican-household-poverty-prediction/test.csv
/kaggle/input/costa-rican-household-poverty-prediction/train.csv.zip
/kaggle/input/costa-rican-household-poverty-prediction/codebook.csv
/kaggle/input/costa-rican-household-poverty-prediction/train.csv
/kaggle/input/costa-rican-household-poverty-prediction/sample_submission.csv.zip
/kaggle/input/costa-rican-household-poverty-prediction/codebook.xlsx


# 1. Import Libraries and Data

In [2]:
# Import Libraries
import os,sys
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
import matplotlib.cm as cm
#sb.set() # set the default Seaborn style for graphics

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_sample_weight
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV

import xgboost as xgb
from xgboost.sklearn import XGBClassifier

In [3]:
#print(os.listdir("../input"))

In [4]:
# Load the training set, then split the label column off the feature frame:
# pop() returns the 'Target' column and removes it from train_data in one step.
train_data = pd.read_csv('/kaggle/input/costa-rican-household-poverty-prediction/train.csv')
train_target = train_data.pop('Target')


In [5]:
# Preview the first rows of the loaded features (the 'Target' column has already been removed above).
train_data.head()

Unnamed: 0,Id,v2a1,hacdor,rooms,hacapo,v14a,refrig,v18q,v18q1,r4h1,...,age,SQBescolari,SQBage,SQBhogar_total,SQBedjefe,SQBhogar_nin,SQBovercrowding,SQBdependency,SQBmeaned,agesq
0,ID_279628684,190000.0,0,3,0,1,1,0,,0,...,43,100,1849,1,100,0,1.0,0.0,100.0,1849
1,ID_f29eb3ddd,135000.0,0,4,0,1,1,1,1.0,0,...,67,144,4489,1,144,0,1.0,64.0,144.0,4489
2,ID_68de51c94,,0,8,0,1,1,0,,0,...,92,121,8464,1,0,0,0.25,64.0,121.0,8464
3,ID_d671db89c,180000.0,0,5,0,1,1,1,1.0,0,...,17,81,289,16,121,4,1.777778,1.0,121.0,289
4,ID_d56d6f5f5,180000.0,0,5,0,1,1,1,1.0,0,...,37,121,1369,16,121,4,1.777778,1.0,121.0,1369


# 2. Feature Engineering based on Data Exploration

In [6]:
# Find the columns that contain NaN (missing) values, optionally printing each column's missing-value count
def findNanCol(orig_df,print_flag=False):
    """Return the names of the columns of ``orig_df`` that contain missing values.

    Parameters
    ----------
    orig_df : pandas.DataFrame
        Frame to inspect; it is not modified.
    print_flag : bool, default False
        When True, print one line per affected column with its missing-value count.

    Returns
    -------
    list
        Column names, in the frame's column order, that hold at least one
        NaN/None value. Columns that are entirely missing are included.
    """
    # Count missing cells per column directly. The previous implementation indexed
    # value_counts() with ``[1]`` inside a bare ``except``, which depends on
    # positional-vs-label lookup rules and could skip a column made only of NaNs
    # or report the non-missing count instead of the missing one.
    missing_counts = orig_df.isnull().sum().tolist()
    column_names = orig_df.columns.values.tolist()

    col_list_missing_data = []
    for col, n_missing in zip(column_names, missing_counts):
        if n_missing == 0:
            continue
        if print_flag:
            print("Column {}: {} missing value counts".format(col,n_missing))
        col_list_missing_data.append(col)
    return col_list_missing_data


# Collect all object-dtype columns: they may hold more than one value type and need encoding (e.g. with LabelEncoder)
def findObjectTypeCol(orig_df,print_flag=False):
    """Return the names of the columns whose dtype is ``object``.

    Parameters
    ----------
    orig_df : pandas.DataFrame
        Frame to inspect; it is not modified.
    print_flag : bool, default False
        When True, print each object-typed column name on its own line.

    Returns
    -------
    list
        Object-typed column names in the frame's column order.
    """
    obj_col_list = [
        name
        for name in orig_df.columns.values.tolist()
        if orig_df[name].dtypes == "object"
    ]
    if print_flag:
        for name in obj_col_list:
            print(name)
    return obj_col_list

# Fill NA values in dataset appropriately
def fillMissingValues(train_data):
    """Replace the NaNs of the known incomplete columns with 0 and cast them to float64.

    The frame is modified in place and also returned.

    Rationale per column (as recorded by the notebook author):
      * v2a1      - missing rent is treated as no rent payment (most own their house).
      * v18q1     - NaN only occurs where v18q is 0, i.e. nothing to count.
      * rez_esc   - only defined for ages 7 to 19, so missing is treated as 0.
      * meaneduc / SQBmeaned - NaN where the household has no members aged 18+.
    """
    zero_filled_columns = ('v2a1', 'v18q1', 'rez_esc', 'meaneduc', 'SQBmeaned')
    for column in zero_filled_columns:
        train_data[column] = train_data[column].fillna(0).astype('float64')

    return train_data

# Calculate the dependency rate for rows whose 'dependency' value is the string 'yes' or 'no'
def getDependencyRate(train_data):
    """Turn the mixed string/number 'dependency' column into a float64 column.

    * 'no'  becomes 0.
    * 'yes' becomes (hogar_nin + hogar_mayor) / (hogar_total - hogar_nin - hogar_mayor),
      computed from the same row.
    * every other value is converted with ``astype('float64')`` (so numeric strings
      such as '.5' are parsed, and unparseable text still raises).

    The frame is modified in place and also returned.

    The computation is vectorized and aligned on the frame's index, so it is correct
    for any index. The previous row loop used the positional counter of ``enumerate``
    as a ``.loc`` label, which only worked for a default 0..n-1 index.
    A 'yes' row whose denominator is zero yields inf/NaN (float division semantics)
    instead of raising ZeroDivisionError.
    """
    dependency = train_data['dependency']
    is_no = dependency == 'no'
    is_yes = dependency == 'yes'

    # Dependants (children + elderly) relative to the remaining household members.
    dependants = train_data['hogar_nin'].astype('float64') + train_data['hogar_mayor'].astype('float64')
    non_dependants = train_data['hogar_total'].astype('float64') - dependants
    computed_rate = dependants / non_dependants

    # 'no' maps to 0; 'yes' rows get a temporary 0 so the cast succeeds, then the computed rate.
    numeric = dependency.mask(is_no | is_yes, 0).astype('float64')
    numeric = numeric.mask(is_yes, computed_rate)

    train_data['dependency'] = numeric.astype('float64')
    return train_data

# Encodes features with more than one data type
def encoder(train_data):
    """Encode the mixed-type columns of the frame numerically.

    Steps, applied to the caller's frame in place:
      1. drop the 'Id' column,
      2. replace 'idhogar' with integer codes from a freshly fitted LabelEncoder,
      3. convert 'dependency' to float via getDependencyRate,
      4. map 'yes'/'no' in 'edjefe' and 'edjefa' to '1'/'0' and cast both to float64.

    Returns the encoded frame.
    """
    yes_no_codes = {'yes' : '1', 'no' : '0'}

    train_data.drop(columns=['Id'],inplace=True)

    household_encoder = LabelEncoder()
    train_data['idhogar'] = household_encoder.fit_transform(train_data['idhogar'])

    train_data = getDependencyRate(train_data)

    for head_education_col in ('edjefe', 'edjefa'):
        train_data[head_education_col] = train_data[head_education_col].replace(yes_no_codes).astype('float64')

    return train_data

# Converts every column that is neither object-typed nor missing data to float64
def convertToFloat(train_data,obj_col_list,col_list_missing_data):
    """Cast to float64 every column that is not listed in either exclusion list.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Frame to convert; modified in place and returned.
    obj_col_list : list
        Column names to leave untouched (object-typed columns).
    col_list_missing_data : list
        Column names to leave untouched (columns with missing values).

    Returns
    -------
    pandas.DataFrame
        The same frame with the remaining columns cast to float64.

    Neither input list is modified: the exclusions are merged into a local set
    (the previous version extended the caller's ``obj_col_list`` as a side effect).
    """
    excluded_columns = set(obj_col_list) | set(col_list_missing_data)
    for col in train_data.columns.values.tolist():
        if col not in excluded_columns:
            train_data[col] = train_data[col].astype('float64')
    return train_data

# Adds and modifies new features
def addAndModifyFeatures(train_data):
    """Add ratio features, rescale count features to proportions and drop the raw totals.

    Column meanings used here (per the notebook author's notes):
      * r4t3        - people living in the house, including non-household members
                      (domestic employees/friends/tenants).
      * hogar_total - household members only (tamhog and hhsize carry the same value).
      * tamviv      - unclear definition, therefore dropped.

    The frame is modified in place and also returned. New columns are appended in the
    order they are created below.
    """
    household_size = train_data['hogar_total']
    occupants = train_data['r4t3']

    ### New features
    # Rent paid per room of the house.
    train_data['v2a1_per_room'] = train_data['v2a1'].div(train_data['rooms'])

    # Household resources shared among household members only (non-household
    # occupants are deliberately left out of the denominator).
    per_member_features = (
        ('v18q1_per_household_member', 'v18q1'),
        ('qmobileph_per_household_member', 'qmobilephone'),
        ('rooms_per_household_member', 'rooms'),
        ('bedroom_per_household_member', 'bedrooms'),
    )
    for new_column, source_column in per_member_features:
        train_data[new_column] = train_data[source_column].div(household_size)

    # People in the house who are not household members.
    train_data['Num_of_non_household_members'] = occupants.sub(household_size)
    # Share of household members that are adults but not elderly: (adults - elderly) / total.
    # Must be computed before hogar_adul / hogar_mayor are rescaled below.
    working_age_adults = train_data['hogar_adul'].sub(train_data['hogar_mayor'])
    train_data['hogar_adul_btw_19_and_65'] = working_age_adults.div(household_size)

    ### Counts turned into proportions
    # These counts include non-household members, so they are divided by everyone in the house.
    for column in ('r4h1', 'r4h2', 'r4h3', 'r4m1', 'r4m2', 'r4m3', 'r4t1', 'r4t2'):
        train_data[column] = train_data[column].div(occupants)

    # These counts cover household members only, so they are divided by the household size.
    for column in ('hogar_nin', 'hogar_adul', 'hogar_mayor'):
        train_data[column] = train_data[column].div(household_size)

    ### Drop duplicates and raw totals that are now represented by the proportions
    obsolete_columns = ['idhogar','v18q1','rooms','bedrooms','r4t3','hogar_total','agesq','hhsize','tamviv','tamhog']
    train_data.drop(columns=obsolete_columns,inplace=True)

    return train_data

In [7]:
# Applies feature engineering in one function
def applyFE(train_data):
    """Run the complete feature-engineering pipeline on a raw frame and return the result.

    Order of operations:
      0. cast to float64 every column that is neither object-typed nor missing data,
      1. fill the missing values (fillMissingValues),
      2. encode the object-typed columns (encoder),
      3. add the new features and rescale the existing ones (addAndModifyFeatures).

    The stages modify the frame they are given, so the caller's frame is changed too.
    """
    # Columns to keep away from the blanket float conversion.
    nan_columns = findNanCol(train_data)
    object_columns = findObjectTypeCol(train_data)
    train_data = convertToFloat(train_data, object_columns, nan_columns)

    # Stages 1-3, each taking and returning the frame.
    for stage in (fillMissingValues, encoder, addAndModifyFeatures):
        train_data = stage(train_data)

    return train_data

# Get components of predicted labels
def getConstituentsOfPredicted(cf,pred_label,angle=45):
    """Plot, as a pie chart, which actual classes make up one predicted class.

    Parameters
    ----------
    cf : indexable as ``cf[actual-1][predicted-1]``
        Confusion-matrix-like table with four rows.
    pred_label : int
        1-based predicted label whose column is plotted.
    angle : int, default 45
        Start angle handed to the pie chart.

    Returns
    -------
    tuple
        The four plotted counts, ordered by actual label 1 to 4.
    """
    column = pred_label-1
    # One entry per actual class (rows 0..3) taken from the predicted-label column.
    p13, p23, p33, p43 = (cf[actual][column] for actual in range(4))

    pred_labels = ['extreme poverty', 'moderate poverty', 'vulnerable households','non vulnerable households']
    slice_colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue']

    plt.pie(
        [p13, p23, p33, p43],
        labels=pred_labels,
        colors=slice_colors,
        autopct='%1.0f%%',
        shadow=False,
        startangle=angle,
        textprops={'fontsize': 14},
    )
    plt.title("Constituents of Costa Rican Household Poverty Levels predicted as "+pred_labels[column],fontsize=20)

    plt.axis('equal')
    plt.show()

    return p13,p23,p33,p43

# Get components of actual labels
def getPredictionOfActualLabels(p,cf,actual_label,angle=45):
    """Plot, as a pie chart, how the rows of one actual class were predicted.

    Parameters
    ----------
    p : indexable with at least four entries
        Counts to plot, one per poverty level (entries 0..3).
    cf : any
        Accepted for signature compatibility; not used by this function.
    actual_label : int
        1-based actual label, used only to build the chart title.
    angle : int, default 45
        Start angle handed to the pie chart.
    """
    labels = ['extreme poverty', 'moderate poverty', 'vulnerable households','non vulnerable households']
    slice_colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue']
    sizes = [p[position] for position in range(4)]

    plt.pie(
        sizes,
        labels=labels,
        colors=slice_colors,
        autopct='%1.0f%%',
        shadow=False,
        startangle=angle,
        textprops={'fontsize': 14},
    )
    plt.title("Predictions of Costa Rican Household Poverty Levels with actual labels as "+labels[actual_label-1],fontsize=20)

    plt.axis('equal')
    plt.show()
    

In [8]:
# Feature Engineering for train data
# NOTE: this rebinds train_data to the engineered frame. applyFE drops the 'Id' column
# (inside encoder), so re-running this cell without first re-running the CSV load cell
# fails on the already-removed column.
train_data = applyFE(train_data)

In [9]:
# Independent copies of the engineered features and of the labels; these are what the model is fitted on.
x_train = train_data.copy()
y_train = train_target.copy()

# 3. Model: RandomForest

In [10]:
# Candidate values handed to GridSearchCV. Each list holds a single value, so the
# search only cross-validates and refits that one configuration.
param_grid = {
    'n_estimators': [270],
    'max_depth' : [14]
}

# Model initializer. The n_estimators / max_depth given here are overridden by
# param_grid during the search. random_state is fixed so that the cross-validated
# score, the fitted forest and the resulting predictions are reproducible on
# "Restart & Run All" (an unseeded forest gives different numbers on every run).
rfc=RandomForestClassifier(class_weight = 'balanced_subsample', n_estimators = 200,max_depth = 16, random_state = 0)

# Specific scoring method: F1 averaged over classes weighted by support
scorer = metrics.make_scorer(metrics.f1_score, average = 'weighted')
# GridSearch with 3-fold cross-validation
rf_cv = GridSearchCV(estimator=rfc, param_grid=param_grid, scoring=scorer, cv= 3)
# Fit train data
rf_cv.fit(x_train, y_train)

GridSearchCV(cv=3,
             estimator=RandomForestClassifier(class_weight='balanced_subsample',
                                              max_depth=16, n_estimators=200),
             param_grid={'max_depth': [14], 'n_estimators': [270]},
             scoring=make_scorer(f1_score, average=weighted))

In [11]:
## View the best cross-validated score.
## The search was scored with weighted F1 over cv=3 folds, so this value is a mean
## cross-validation F1 of the best candidate - not accuracy, and (despite the printed
## label) not a score measured on the full training set.
print('Best score for training data:', rf_cv.best_score_,"\n") 

## View the best parameters for the model found using grid search
print('Best n_estimators:',rf_cv.best_estimator_.n_estimators,"\n") 
print('Best max_depth:',rf_cv.best_estimator_.max_depth,"\n")

Best score for training data: 0.5674948247064505 

Best n_estimators: 270 

Best max_depth: 14 



In [12]:
# Predict on x_train - the same rows rf_cv was fitted on - and print the per-class report.
# NOTE(review): despite the "validate" naming, no held-out data is used here, so these
# figures describe the fit to the training set and are not a validation estimate.
y_pred_validate = rf_cv.predict(x_train)
print("Classification report for rf model %s:\n%s\n"
      % (rf_cv, metrics.classification_report(y_train, y_pred_validate)))

Classification report for rf model GridSearchCV(cv=3,
             estimator=RandomForestClassifier(class_weight='balanced_subsample',
                                              max_depth=16, n_estimators=200),
             param_grid={'max_depth': [14], 'n_estimators': [270]},
             scoring=make_scorer(f1_score, average=weighted)):
              precision    recall  f1-score   support

           1       1.00      0.99      0.99       755
           2       0.99      0.99      0.99      1597
           3       0.97      1.00      0.98      1209
           4       1.00      0.99      1.00      5996

    accuracy                           0.99      9557
   macro avg       0.99      0.99      0.99      9557
weighted avg       0.99      0.99      0.99      9557




# 4. Submission: RandomForest predictions

In [13]:
# Test
# Read the test CSV Data and run it through the same feature-engineering pipeline as the train data
path = '/kaggle/input/costa-rican-household-poverty-prediction/test.csv'
test_data = pd.read_csv(path)
test_data = applyFE(test_data)
# Predict with the estimator refitted by the grid search
y_pred = rf_cv.predict(test_data)
# Read submission file (note: `path` is reused for a different file from here on)
path = '/kaggle/input/costa-rican-household-poverty-prediction/sample_submission.csv'
test = pd.read_csv(path)
# NOTE(review): predictions are assigned by position, which assumes sample_submission.csv
# lists its rows in the same order as test.csv - confirm against the data files.
test['Target'] = y_pred
# Written to the current working directory, without the index column
test.to_csv("submission.csv", index= False)
#gc.collect()