In [1]:
#Import the needed libraries: numpy, pandas and matplotlib
import numpy as np
import pandas as pd
import matplotlib as plt

In [3]:
#Load the training and test sets from CSV files in the working directory
datatrain = pd.read_csv(filepath_or_buffer="train.csv")
datatest = pd.read_csv(filepath_or_buffer="test.csv")

In [8]:
#Raise the pandas display limits to 150 columns and 150 rows
pd.set_option("display.max_columns", 150)
pd.set_option("display.max_rows", 150)

In [4]:
#Show a summary of datatest: index range, column count, dtypes and memory usage
datatest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23856 entries, 0 to 23855
Columns: 142 entries, Id to agesq
dtypes: float64(8), int64(129), object(5)
memory usage: 25.8+ MB


In [9]:
#The summary reports five object columns; now count the missing values in each column
datatest.isna().sum()

Id                     0
v2a1               17403
hacdor                 0
rooms                  0
hacapo                 0
v14a                   0
refrig                 0
v18q                   0
v18q1              18126
r4h1                   0
r4h2                   0
r4h3                   0
r4m1                   0
r4m2                   0
r4m3                   0
r4t1                   0
r4t2                   0
r4t3                   0
tamhog                 0
tamviv                 0
escolari               0
rez_esc            19653
hhsize                 0
paredblolad            0
paredzocalo            0
paredpreb              0
pareddes               0
paredmad               0
paredzinc              0
paredfibras            0
paredother             0
pisomoscer             0
pisocemento            0
pisoother              0
pisonatur              0
pisonotiene            0
pisomadera             0
techozinc              0
techoentrepiso         0
techocane              0


In [10]:
#Five columns have missing values; they are handled one at a time below.
#'v18q1': replace NaN with 0.
#The filled column is assigned back to the frame instead of calling
#replace(..., inplace=True) on datatrain['v18q1']: that chained in-place call is
#not guaranteed to update the DataFrame under pandas Copy-on-Write, and the
#np.NaN alias it used no longer exists in NumPy 2.0.
datatrain['v18q1'] = datatrain['v18q1'].fillna(0)
datatest['v18q1'] = datatest['v18q1'].fillna(0)

In [11]:
#Impute missing 'meaneduc' values with a constant chosen by the 'instlevel9' flag:
def impute_meaneduc(coulmns):
    """Return the row's ``meaneduc``, substituting a constant when it is missing.

    Parameters
    ----------
    coulmns : sequence
        ``(meaneduc, instlevel9)`` in that order, e.g. one row passed by
        ``DataFrame.apply(..., axis=1)``. Extra trailing values are ignored.

    Returns
    -------
    The original ``meaneduc`` when it is not NaN. Otherwise 9.121911 when
    ``instlevel9 == 0``, 16.244444 when ``instlevel9 == 1``, and the original
    (missing) value for any other ``instlevel9``.
    """
    # Unpack by position rather than ``coulmns[0]``: integer keys on a
    # label-indexed Series are a deprecated positional lookup in pandas.
    meaneduc, instlevel9, *_ = coulmns
    if pd.isna(meaneduc):
        # NOTE(review): the two constants look like per-group means of
        # 'meaneduc' computed offline -- confirm their origin.
        if instlevel9 == 0:
            meaneduc = 9.121911
        elif instlevel9 == 1:
            meaneduc = 16.244444
    return meaneduc

In [12]:
#Run impute_meaneduc row by row over ('meaneduc', 'instlevel9') and store the
#result back in 'meaneduc', for the training frame first and then the test frame
for frame in (datatrain, datatest):
    frame['meaneduc'] = frame[['meaneduc', 'instlevel9']].apply(impute_meaneduc, axis=1)

In [13]:
#Fill missing 'SQBmeaned' values with the square of 'meaneduc'.
#fillna aligns the replacement Series on the index, so every missing row receives
#its own meaneduc**2. The previous replace(np.NaN, <Series>, inplace=True) call
#passed a Series as the replacement value (not an element-wise fill), relied on
#a chained in-place update, and used the np.NaN alias removed in NumPy 2.0.
datatrain['SQBmeaned'] = datatrain['SQBmeaned'].fillna(datatrain['meaneduc'] ** 2)
datatest['SQBmeaned'] = datatest['SQBmeaned'].fillna(datatest['meaneduc'] ** 2)

In [14]:
#Impute missing 'rez_esc' values from 'age' and 'escolari':
def impute_rez_esc(coulmns):
    """Return the row's ``rez_esc``, computing a substitute when it is missing.

    Parameters
    ----------
    coulmns : sequence
        ``(rez_esc, age, escolari)`` in that order, e.g. one row passed by
        ``DataFrame.apply(..., axis=1)``. Extra trailing values are ignored.

    Returns
    -------
    The original ``rez_esc`` when it is not NaN. Otherwise ``0`` when
    ``age < 7``, else ``age - escolari - 7``.
    """
    # Unpack by position rather than ``coulmns[0]``: integer keys on a
    # label-indexed Series are a deprecated positional lookup in pandas.
    rez_esc, age, escolari, *_ = coulmns
    if not pd.isna(rez_esc):
        return rez_esc
    if age < 7:
        return 0
    # NOTE(review): this value is not bounded; it grows with age and can be
    # negative when escolari > age - 7 -- confirm that is intended.
    return age - escolari - 7

In [15]:
#Run impute_rez_esc row by row over ('rez_esc', 'age', 'escolari') and store the
#result back in 'rez_esc', for the training frame first and then the test frame
for frame in (datatrain, datatest):
    frame['rez_esc'] = frame[['rez_esc', 'age', 'escolari']].apply(impute_rez_esc, axis=1)

In [16]:
#Impute missing 'v2a1' values from a lookup on 'rooms', 'paredpreb' and 'pisonotiene':

# Fill values for a missing v2a1, nested as rooms -> paredpreb -> pisonotiene.
# A plain number at any level means the remaining keys are not consulted.
# NOTE(review): the amounts look like per-group statistics of v2a1 computed
# offline -- confirm their origin.
_V2A1_FILL_BY_ROOMS = {
    1: {0: {0: 76667, 1: 15000}, 1: 55378},
    2: {0: 93342, 1: 95342},
    3: {0: {0: 122383, 1: 65000}, 1: 97795},
    4: {0: 126042, 1: 101857},
    5: {0: 157052, 1: 137609},
    6: {0: 230395, 1: 93108},
    7: 287232,
    8: {0: 434370, 1: 200000},
    9: 605221,
    10: 570540,
}
# Used when 'rooms' is not one of the keys above.
_V2A1_FILL_OTHER_ROOMS = 600000


def impute_v2a1(coulmns):
    """Return the row's ``v2a1``, looking up a substitute when it is missing.

    Parameters
    ----------
    coulmns : sequence
        ``(v2a1, rooms, paredpreb, pisonotiene)`` in that order, e.g. one row
        passed by ``DataFrame.apply(..., axis=1)``. Extra trailing values are
        ignored.

    Returns
    -------
    The original ``v2a1`` when it is not NaN. Otherwise the amount found in
    ``_V2A1_FILL_BY_ROOMS`` for the row's rooms / paredpreb / pisonotiene,
    ``600000`` when ``rooms`` is not in 1..10, and the original (missing)
    value when ``paredpreb`` or ``pisonotiene`` has no entry for that room
    count.
    """
    # Unpack by position rather than ``coulmns[0]``: integer keys on a
    # label-indexed Series are a deprecated positional lookup in pandas.
    v2a1, rooms, paredpreb, pisonotiene, *_ = coulmns
    if not pd.isna(v2a1):
        return v2a1

    fill = _V2A1_FILL_BY_ROOMS.get(rooms, _V2A1_FILL_OTHER_ROOMS)
    # Descend one level per key until a plain amount is reached.
    for key in (paredpreb, pisonotiene):
        if not isinstance(fill, dict):
            break
        fill = fill.get(key)
        if fill is None:
            # No entry for this combination: leave the value missing.
            return v2a1
    return fill

In [17]:
#Run impute_v2a1 row by row over ('v2a1', 'rooms', 'paredpreb', 'pisonotiene') and
#store the result back in 'v2a1', for the training frame first and then the test frame
for frame in (datatrain, datatest):
    frame['v2a1'] = frame[['v2a1', 'rooms', 'paredpreb', 'pisonotiene']].apply(impute_v2a1, axis=1)

In [18]:
#Keep the test set's 'Id' column before it is dropped; the submission file needs it.
#The column is selected by name rather than by position 0 so the result does not
#depend on the column order of the CSV.
y_id = datatest['Id']

In [19]:
#Remove the 'Id' and 'idhogar' columns from both frames so they are not used as features
for frame in (datatrain, datatest):
    frame.drop(columns=['Id', 'idhogar'], inplace=True)

In [20]:
#Convert the three remaining object columns to float: 'yes' -> 1, 'no' -> 0, and
#every other entry is parsed as a number.
#Each column is addressed by name and assigned back as a whole:
# - the previous df[col].replace(..., inplace=True) calls were chained in-place
#   updates, which are not guaranteed to reach the DataFrame under Copy-on-Write;
# - the previous df.iloc[:, 98:99] = ... assignments depended on hard-coded column
#   positions and, on pandas 2.x, write the floats into the existing object
#   column instead of changing its dtype.
for frame in (datatrain, datatest):
    for column in ('dependency', 'edjefa', 'edjefe'):
        # Map the two words to numeric strings first so the whole column can be
        # cast to float in one step.
        frame[column] = frame[column].replace({'no': '0', 'yes': '1'}).astype(float)

In [21]:
#Separate the training frame into features and target: the last column is the
#target (y_train) and every column before it is a feature (x_train)
target_position = datatrain.shape[1] - 1
x_train = datatrain.iloc[:, :target_position]
y_train = datatrain.iloc[:, target_position]

In [None]:
#Fit an XGBoost classifier on the training features and target
from xgboost import XGBClassifier
#NOTE(review): max_depth=100 together with learning_rate=1.0 is unusually
#aggressive -- confirm these values were chosen deliberately
xgb_settings = {'max_depth': 100, 'n_estimators': 500, 'learning_rate': 1.0}
classifier = XGBClassifier(**xgb_settings)
classifier.fit(x_train, y_train)

In [None]:
#Predict the target for every row of datatest with the fitted classifier
y_pred = classifier.predict(datatest)

In [None]:
#Build the submission table (the saved 'Id' values next to the predicted 'Target')
#and write it to 'submission.csv' without the index column
sbmt = pd.DataFrame({'Id': y_id}).assign(Target=y_pred)

sbmt.to_csv(path_or_buf='submission.csv', index=False)

In [None]:
#Finish the project