# Import Library
## All Dependencies


In [1]:
!pip install lightgbm

import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

from lightgbm import LGBMClassifier
from lightgbm import early_stopping, log_evaluation



# Data Loading


In [2]:
# Read the train and test CSVs from the mounted Google Drive folder.
train_path = "/content/drive/MyDrive/Zindi Women/Train (3).csv"
test_path = "/content/drive/MyDrive/Zindi Women/Test (2).csv"

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)



# Data Preprocessing

In [3]:
# Tag every test row with the sentinel label 'test' so the rows can be
# separated again after the shared preprocessing steps.
test_df = test_df.assign(target='test')

# Stack train and test into one frame with a fresh 0..n-1 index so the
# preprocessing only has to be written once.
data = pd.concat([train_df, test_df], ignore_index=True)

# Categorical Data Conversion


In [4]:
# One-hot encode the categorical columns with pandas.get_dummies;
# drop_first=True omits the first level of each encoded column.
categorical_cols = [
    'gender', 'race', 'dwelling', 'dwelling_type',
    'province_code', 'metro_code', 'nationality', 'RTH',
    'marital_st', 'Lang_inside', 'Lang_outside', 'Education', 'lw_work',
    'lw_business', 'help_on_household', 'job_or_business', 'nature_of_work',
]
data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

In [5]:
# Dummy (one-hot) columns that are removed from the combined frame once the
# categorical columns have been converted to numeric indicators.
bad_cols = [
    "Education_Grade 7/Standard 5/AET 3",
    "Education_Higher degree (Master's; Doctorate)",
    "metro_code_WC - Non Metro",
    "help_on_household_No",
    "nature_of_work_Casual",
    "Lang_outside_Tshivenda",
    "Lang_inside_Xitsonga",
    "Education_Grade 2/Sub B/Class 2",
    "RTH_Grandchild/great grandchild of person 01",
    "province_code_Limpopo",
    "dwelling_Dwelling/house/flat/room in backyard",
    "Lang_inside_Sesotho",
    "Lang_inside_Sepedi",
    "job_or_business_Unspecified",
    "Lang_inside_SiSwati",
    "Lang_outside_SiSwati",
    "Lang_inside_Tshivenda",
    "Lang_inside_Other (Specify )",
    "Education_Do not know",
    "metro_code_MP - Non Metro",
    "lw_business_No",
    "lw_business_Unspecified",
    "RTH_Father/mother/stepfather/stepmother of person 01",
    "metro_code_NW - Non Metro",
    "help_on_household_Unspecified",
    "help_on_household_Yes",
    "metro_code_NC - Non Metro",
    "metro_code_GP - City of Johannesburg",
    "metro_code_LP - Non Metro",
    "lw_work_Unspecified",
    "province_code_Western Cape",
    "nature_of_work_Do not know",
    "nature_of_work_Not applicable",
    "province_code_Free State",
    "nature_of_work_Seasonal",
    "dwelling_Other (specify)",
    "RTH_Grandparent/great grandparent of person 01",
    "marital_st_Unspecified",
    "RTH_Unspecified",
    "Lang_inside_IsiNdebele",
    "Education_Certificate with less than Grade 12/Std 10",
    "Lang_outside_Unspecified",
    "Lang_outside_Khoi, Nama and San languages",
    "Lang_outside_IsiNdebele",
    "Lang_inside_Sign language",
    "Education_Grade R/0",
    "Lang_inside_Khoi, Nama and San languages",
    "Education_N4/NTC 4",
    "Education_Diploma with less than Grade 12/Std 10",
    "Education_N5/NTC 5",
    "Education_N6/NTC 6",
    "Education_NTC 1/N1/NC (V) /Level 2",
    "Education_NTC 2/N2/NC (V)/Level 3",
    "Education_NTC 3/N3/NC (V)/Level 4",
    "Education_Other",
    "Education_Post-Higher Diploma (Technikon/University of Technology; Master's; Doctoral)",
    "nature_of_work_Unspecified",
]

In [6]:
# Remove the listed dummy columns; `columns=` already identifies the axis,
# so no separate `axis` argument is needed.
data = data.drop(columns=bad_cols)

In [7]:
# Separate the combined frame back into train and test using the 'test'
# sentinel stored in the target column; each part gets a fresh 0..n-1 index.
is_test = data['target'] == 'test'
train_df = data.loc[~is_test].reset_index(drop=True)
test_df = data.loc[is_test].reset_index(drop=True)

In [8]:
# Names of the model input columns: every train column except the label
# ("target") and the row identifier ("ID"). train_df itself is not modified.
features = train_df.columns.drop(["target", "ID"])

# Modelling


In [9]:
def metric(y, pred):
    """Return the ROC AUC of the scores ``pred`` against the true labels ``y``.

    Thin wrapper around ``sklearn.metrics.roc_auc_score`` called with
    ``labels=[0, 1]``.
    """
    auc = roc_auc_score(y_true=y, y_score=pred, labels=[0, 1])
    return auc
# Model inputs: all training columns other than the label and the identifier.
X = train_df.drop(columns=["target", "ID"])
# Labels: each entry of the target column converted with int().
y = train_df["target"].map(int)

In [10]:
# Cross-validation setup: stratified K-fold with 12 splits.
n_skf = 12
# NOTE(review): `seed` is not referenced by any other visible cell — confirm
# whether it was meant to be passed to the splitter or the model.
seed = 2020
kf = StratifiedKFold(n_splits=n_skf)


In [11]:
score_list = []  # validation AUC of each fold
test_oofs = []   # one vector of test-set probabilities per fold

# Stratified K-fold training: fit one LightGBM model per fold, score it on the
# held-out fold, and keep its predictions for the test set.
for i, (tr_idx, vr_idx) in enumerate(kf.split(X, y)):
    # kf.split yields positional row numbers, so rows are selected with .iloc
    # rather than by index label.
    X_train, y_train = X.iloc[tr_idx][features], y.iloc[tr_idx]
    xval, yval = X.iloc[vr_idx][features], y.iloc[vr_idx]

    # model fitting
    model = LGBMClassifier(
        num_leaves=25,
        max_depth=14,
        reg_alpha=1,
        reg_lambda=2,
        subsample=0.7,
        subsample_freq=1,
        colsample_bytree=0.3,
        n_estimators=4000,
        learning_rate=0.005,
    )
    # Early stopping and periodic logging are passed as callbacks: the
    # `early_stopping_rounds` and `verbose` keyword arguments of fit() were
    # removed in LightGBM 4.0, so the old call fails on a current install.
    # (log_evaluation requires lightgbm >= 3.3.)
    model.fit(
        X_train,
        y_train,
        eval_set=[(xval, yval)],
        eval_metric='auc',
        callbacks=[
            early_stopping(stopping_rounds=300),
            log_evaluation(period=100),
        ],
    )

    # validation-fold probability of the positive class and its AUC
    p = model.predict_proba(xval)[:, 1]
    sc = metric(yval, p)
    score_list.append(sc)

    # positive-class probabilities for the test rows from this fold's model
    pred = model.predict_proba(test_df[features])[:, 1]
    test_oofs.append(pred)

    print('Fold {} : {}'.format(i, sc))

# Mean validation AUC across folds (metric() returns ROC AUC, not log loss).
score = np.mean(score_list)

print()
print()
print('Avg AUC :', score)

Training until validation scores don't improve for 300 rounds.
[100]	valid_0's auc: 0.651597	valid_0's binary_logloss: 0.269296
[200]	valid_0's auc: 0.664773	valid_0's binary_logloss: 0.26668
[300]	valid_0's auc: 0.661109	valid_0's binary_logloss: 0.265552
[400]	valid_0's auc: 0.659261	valid_0's binary_logloss: 0.264972
[500]	valid_0's auc: 0.657513	valid_0's binary_logloss: 0.264869
Early stopping, best iteration is:
[204]	valid_0's auc: 0.665345	valid_0's binary_logloss: 0.266631
Fold 0 : 0.6653445378151261
Training until validation scores don't improve for 300 rounds.
[100]	valid_0's auc: 0.580756	valid_0's binary_logloss: 0.27026
[200]	valid_0's auc: 0.584605	valid_0's binary_logloss: 0.269095
[300]	valid_0's auc: 0.585311	valid_0's binary_logloss: 0.268847
Early stopping, best iteration is:
[3]	valid_0's auc: 0.645092	valid_0's binary_logloss: 0.272494
Fold 1 : 0.6450924369747899
Training until validation scores don't improve for 300 rounds.
[100]	valid_0's auc: 0.650857	valid_0's

# Creating Submission File

In [12]:
# Start the submission frame with the test-set identifiers.
submission_df = pd.DataFrame({"ID": test_df["ID"]})

In [13]:
# Average the per-fold test predictions element-wise, then write the
# submission file without the index column.
fold_predictions = np.asarray(test_oofs)
submission_df["target"] = fold_predictions.mean(axis=0)
submission_df.to_csv("hopes.csv", index=False)

# Blending The Two Best Models

In [14]:
# Load the two submission files to be blended.
# 'umoja' is the second-best model's file; 'hopes' is the best-performing one.
umoja_path = '/content/drive/MyDrive/Colab Notebooks/Umoja.csv'
hopes_path = "hopes.csv"

umoja = pd.read_csv(umoja_path)
hopes = pd.read_csv(hopes_path)

In [15]:
# Build the blend frame with one row per ID from 'umoja' and the two models'
# predictions side by side in columns 'light' (from hopes) and 'umoja'.
#
# The predictions are joined on ID rather than by row position, so the blend
# stays correct even if the two CSVs list their rows in a different order.
# validate="one_to_one" makes pandas raise if either file repeats an ID.
blend = umoja[["ID", "target"]].rename(columns={"target": "umoja"}).merge(
    hopes[["ID", "target"]].rename(columns={"target": "light"}),
    on="ID",
    how="left",
    validate="one_to_one",
)

# Every ID in 'umoja' must have a matching prediction in 'hopes'.
assert blend["light"].notna().all(), "some IDs in umoja have no prediction in hopes"

# Keep the column order ID, light, umoja.
blend = blend[["ID", "light", "umoja"]]

In [16]:
# Weighted blend: the best model ('light') gets weight 0.9 and the second-best
# ('umoja') gets weight 0.08. Note that these weights sum to 0.98, not 1.
light_weight = 0.9
umoja_weight = 0.08
blend["target"] = blend["light"] * light_weight + blend["umoja"] * umoja_weight

# The per-model columns are no longer needed once the blend is computed.
blend = blend.drop(columns=["light", "umoja"])

# Write the blended predictions to 'Blend_umoja8.csv' without the index column.
blend.to_csv("Blend_umoja8.csv", index=False)