In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import ExtraTreesClassifier

In [2]:
# Country A training households, keyed by the 'id' column.
a_train_csv = 'C://Users//saish//Documents//driven_data//A_hhold_train.csv'
input_df1 = pd.read_csv(a_train_csv, index_col='id')
print('A records', input_df1.shape)

A records (8203, 345)


In [3]:
# Country B training households, keyed by the 'id' column.
b_train_csv = 'C://Users//saish//Documents//driven_data//B_hhold_train.csv'
input_df2 = pd.read_csv(b_train_csv, index_col='id')
print('B records', input_df2.shape)

B records (3255, 442)


In [4]:
# Country C training households, keyed by the 'id' column.
c_train_csv = 'C://Users//saish//Documents//driven_data//C_hhold_train.csv'
input_df3 = pd.read_csv(c_train_csv, index_col='id')
print('C records', input_df3.shape)

C records (6469, 164)


# Data imbalance check

In [5]:
# Class balance of the 'poor' label for country A.
input_df1['poor'].value_counts()

False    4500
True     3703
Name: poor, dtype: int64

In [6]:
# Class balance of the 'poor' label for country B.
input_df2['poor'].value_counts()

False    3004
True      251
Name: poor, dtype: int64

In [7]:
# Class balance of the 'poor' label for country C.
input_df3['poor'].value_counts()

False    5496
True      973
Name: poor, dtype: int64

# Pre-process

In [8]:
def data_preprocess(input_df1, enforce_cols=None):
    """Standardize numeric columns and one-hot encode the remaining ones.

    Parameters
    ----------
    input_df1 : pandas.DataFrame
        Raw feature frame. It is copied before any change is made, so the
        caller's frame is left untouched.
    enforce_cols : sequence of column labels, optional
        Columns the result must expose, typically the columns of an already
        processed training frame. When given, encoded columns that are not
        listed are dropped, listed columns that are missing are added and
        filled with 0, and the result is ordered exactly like ``enforce_cols``.

    Returns
    -------
    pandas.DataFrame
        The standardized, one-hot encoded frame.
    """
    # Work on a copy: the standardization below assigns into the frame and
    # would otherwise overwrite the caller's raw data.
    input_df1 = input_df1.copy()
    print('Initial Input Shape', input_df1.shape)

    numeric = input_df1.select_dtypes(include=['int64', 'float64'])
    if len(numeric.columns):
        # A constant column has a standard deviation of 0; dividing by 1
        # instead leaves it at 0 rather than turning every value into NaN.
        scale = numeric.std().replace(0, 1)
        input_df1[numeric.columns] = (numeric - numeric.mean()) / scale
    print('After standardization Input Shape', input_df1.shape)

    input_df1 = pd.get_dummies(input_df1)
    print('After encoding Input Shape', input_df1.shape)

    if enforce_cols is not None:
        # reindex drops the columns that are not in enforce_cols, adds the
        # missing ones filled with 0, and also puts the columns in the same
        # order as enforce_cols. The order matters for anything that consumes
        # the frame positionally (e.g. a transformer fitted on another frame).
        input_df1 = input_df1.reindex(columns=enforce_cols, fill_value=0)

    return input_df1

In [9]:
# For each country: features go through data_preprocess with the label column
# removed, and the 'poor' label is flattened into a 1-D array.

# Country A
aX_train = data_preprocess(input_df1.drop(columns='poor'))
ay_train = np.ravel(input_df1['poor'])

# Country B
bX_train = data_preprocess(input_df2.drop(columns='poor'))
by_train = np.ravel(input_df2['poor'])

# Country C
cX_train = data_preprocess(input_df3.drop(columns='poor'))
cy_train = np.ravel(input_df3['poor'])

Initial Input Shape (8203, 344)
After standardization Input Shape (8203, 344)
After encoding Input Shape (8203, 859)
Initial Input Shape (3255, 441)
After standardization Input Shape (3255, 441)
After encoding Input Shape (3255, 1432)
Initial Input Shape (6469, 163)
After standardization Input Shape (6469, 163)
After encoding Input Shape (6469, 795)


In [10]:
# Report the processed training shape for each country in turn.
for message, frame in (
    ('shape of input 1 after pre-processing', aX_train),
    ('shape of input 2 after pre-processing', bX_train),
    ('shape of input 3 after pre-processing', cX_train),
):
    print(message, frame.shape)

shape of input 1 after pre-processing (8203, 859)
shape of input 2 after pre-processing (3255, 1432)
shape of input 3 after pre-processing (6469, 795)


# Imputing Null Values

In [11]:
# Replace each missing value in country B's features with its column mean.
b_column_means = bX_train.mean()
bX_train = bX_train.fillna(b_column_means)
#bX_train.isnull().sum().sort_values(ascending=False)

In [None]:
#cX_train.isnull().sum().sort_values(ascending=False)

# OverSampling

In [None]:
from collections import Counter

from imblearn.over_sampling import SMOTE


def _smote_oversample(features, labels, random_state=42):
    """Oversample the minority class with SMOTE and return (X, y).

    Parameters
    ----------
    features : array-like
        Feature matrix; converted with ``np.asarray`` before resampling.
    labels : array-like
        Class labels; converted with ``np.asarray`` before resampling.
    random_state : int, optional
        Seed handed to SMOTE so repeated runs give the same synthetic rows.

    Returns
    -------
    tuple
        The resampled features and labels, as returned by the sampler.
    """
    sampler = SMOTE(random_state=random_state)
    # Newer imbalanced-learn releases expose fit_resample and no longer
    # provide fit_sample; fall back to fit_sample only when fit_resample
    # does not exist (older releases).
    if hasattr(sampler, 'fit_resample'):
        run_sampler = sampler.fit_resample
    else:
        run_sampler = sampler.fit_sample
    return run_sampler(np.asarray(features), np.asarray(labels))


# NOTE(review): only bX_train is mean-imputed in the cells above; confirm that
# cX_train has no missing values before oversampling it.
b_in, b_out = _smote_oversample(bX_train, by_train)
c_in, c_out = _smote_oversample(cX_train, cy_train)

# Class counts after oversampling.
print(sorted(Counter(b_out).items()))
print(sorted(Counter(c_out).items()))

# DownSample

# NOTE(review): this down-sampling step looks unfinished - resample() is called
# with no arrays or options, yet its result is unpacked into two names.
# TODO: confirm the intended inputs (and whether b_in/b_out from the
# oversampling step should really be overwritten) before running this.
from sklearn.utils import resample
b_in, b_out = resample()

# Test Data creation

In [12]:
# Load the raw test households of each country, keyed by the 'id' column.
a_test_csv = 'C://Users//saish//Documents//driven_data//A_hhold_test.csv'
test_df1 = pd.read_csv(a_test_csv, index_col='id')
print('A records', test_df1.shape)

b_test_csv = 'C://Users//saish//Documents//driven_data//B_hhold_test.csv'
test_df2 = pd.read_csv(b_test_csv, index_col='id')
print('B records', test_df2.shape)

c_test_csv = 'C://Users//saish//Documents//driven_data//C_hhold_test.csv'
test_df3 = pd.read_csv(c_test_csv, index_col='id')
print('C records', test_df3.shape)

# Country A: preprocess and restrict to the training feature columns.
a_test = data_preprocess(test_df1, enforce_cols=aX_train.columns)

# Country B: same, then fill remaining gaps with the test column means.
b_test = data_preprocess(test_df2, enforce_cols=bX_train.columns)
b_test_means = b_test.mean()
b_test = b_test.fillna(b_test_means)
# Null check (disabled): b_test.isnull().sum().sort_values(ascending=False)

# Country C: same treatment as country B.
c_test = data_preprocess(test_df3, enforce_cols=cX_train.columns)
c_test_means = c_test.mean()
c_test = c_test.fillna(c_test_means)
# Null check (disabled): c_test.isnull().sum().sort_values(ascending=False)

A records (4041, 344)
B records (1604, 441)
C records (3187, 163)
Initial Input Shape (4041, 344)
After standardization Input Shape (4041, 344)
After encoding Input Shape (4041, 851)
Initial Input Shape (1604, 441)
After standardization Input Shape (1604, 441)
After encoding Input Shape (1604, 1419)
Initial Input Shape (3187, 163)
After standardization Input Shape (3187, 163)
After encoding Input Shape (3187, 773)


# Model

# Dimensionality reduction

In [None]:
# Each country gets its own TruncatedSVD, fitted on the training features and
# then applied unchanged to that country's test features.

# Country A
a_svd = TruncatedSVD(
    n_components=500,
    algorithm='randomized',
    n_iter=10,
    random_state=42,
)
a_tr_svd = a_svd.fit_transform(aX_train, ay_train)
a_ts_svd = a_svd.transform(a_test)
a_explained = a_svd.explained_variance_ratio_.sum()
print('A - country variance', a_explained)
print('A - Input Dimension', a_tr_svd.shape)

# Country B
b_svd = TruncatedSVD(
    n_components=1000,
    algorithm='randomized',
    n_iter=10,
    random_state=42,
)
b_tr_svd = b_svd.fit_transform(bX_train, by_train)
b_ts_svd = b_svd.transform(b_test)
b_explained = b_svd.explained_variance_ratio_.sum()
print('B - country variance', b_explained)
print('B - Input Dimension', b_tr_svd.shape)

# Country C
c_svd = TruncatedSVD(
    n_components=600,
    algorithm='randomized',
    n_iter=10,
    random_state=42,
)
c_tr_svd = c_svd.fit_transform(cX_train, cy_train)
c_ts_svd = c_svd.transform(c_test)
c_explained = c_svd.explained_variance_ratio_.sum()
print('C - country variance', c_explained)
print('C - Input Dimension', c_tr_svd.shape)

In [None]:
# Hold out 20% of country A's training rows for validation. Stratifying keeps
# the poor / non-poor ratio the same in both parts of the split.
tr_x, ts_x, tr_y, ts_y = train_test_split(
    a_tr_svd, ay_train, test_size=0.2, random_state=42, stratify=ay_train
)

# Other estimators that can be swapped in:
#model_a = RandomForestClassifier(n_estimators=300, random_state=42)
#model_a = LogisticRegression()
#model_a = GradientBoostingClassifier()
#model_a = tree.DecisionTreeClassifier()
#model_a = GaussianNB()
#model_a = ExtraTreesClassifier()
# Seeded so that repeated runs give the same model.
model_a = AdaBoostClassifier(n_estimators=500, random_state=42)
model_a.fit(tr_x, tr_y)

# Predict the held-out rows once and reuse the predictions for both metrics.
tr_pred_a = model_a.predict(ts_x)
accuracy = accuracy_score(ts_y, tr_pred_a)
# This score comes from the held-out 20%, not from the rows used for fitting.
print("Hold-out accuracy:", accuracy)
print(classification_report(ts_y, tr_pred_a))

# Class probabilities for country A's test set, used for the submission.
a_pred = model_a.predict_proba(a_ts_svd)

In [None]:
#bX_train_int, by_train_int = shuffle(b_tr_svd, by_train, random_state=0)
# Hold out 20% of country B's training rows for validation. Country B has few
# poor households (251 of 3255), so the split is stratified to keep the same
# class ratio in both parts.
tr_x, ts_x, tr_y, ts_y = train_test_split(
    b_tr_svd, by_train, test_size=0.2, random_state=42, stratify=by_train
)

# Other estimators that can be swapped in:
#model_b = RandomForestClassifier(n_estimators=200, random_state=42)
#model_b = LogisticRegression()
#model_b = GradientBoostingClassifier()
#model_b = tree.DecisionTreeClassifier()
#model_b = GaussianNB()
#model_b = ExtraTreesClassifier()
# Seeded so that repeated runs give the same model.
model_b = AdaBoostClassifier(n_estimators=500, random_state=42)
model_b.fit(tr_x, tr_y)

# Predict the held-out rows once and reuse the predictions for both metrics.
tr_pred_b = model_b.predict(ts_x)
accuracy = accuracy_score(ts_y, tr_pred_b)
# This score comes from the held-out 20%, not from the rows used for fitting.
print("Hold-out accuracy:", accuracy)
print(classification_report(ts_y, tr_pred_b))

# Class probabilities for country B's test set, used for the submission.
b_pred = model_b.predict_proba(b_ts_svd)

In [None]:
#cX_train_int, cy_train_int = shuffle(c_in, c_out, random_state=0)
# Hold out 20% of country C's training rows for validation. Country C is
# imbalanced (973 poor of 6469), so the split is stratified to keep the same
# class ratio in both parts.
tr_x, ts_x, tr_y, ts_y = train_test_split(
    c_tr_svd, cy_train, test_size=0.2, random_state=42, stratify=cy_train
)

# Other estimators that can be swapped in. The AdaBoost line used to be live
# but was immediately overwritten by ExtraTreesClassifier, so it never had any
# effect; it is kept here as an option only.
#model_c = RandomForestClassifier(n_estimators=200, random_state=42)
#model_c = LogisticRegression()
#model_c = AdaBoostClassifier(n_estimators= 900)
#model_c = GradientBoostingClassifier()
#model_c = tree.DecisionTreeClassifier()
#model_c = GaussianNB()
# Seeded so that repeated runs give the same model.
model_c = ExtraTreesClassifier(random_state=42)
model_c.fit(tr_x, tr_y)

# Predict the held-out rows once and reuse the predictions for both metrics.
tr_pred_c = model_c.predict(ts_x)
accuracy = accuracy_score(ts_y, tr_pred_c)
# This score comes from the held-out 20%, not from the rows used for fitting.
print("Hold-out accuracy:", accuracy)
print(classification_report(ts_y, tr_pred_c))

# Class probabilities for country C's test set, used for the submission.
c_pred = model_c.predict_proba(c_ts_svd)

# Change prediction format

In [None]:
def make_country_sub(preds, test_feat, country):
    """Build the submission rows for one country.

    Parameters
    ----------
    preds : 2-D array
        Per-row class probabilities. Column 1 is used as the 'poor'
        probability (the p=1 column).
    test_feat : pandas.DataFrame
        Test features for the country; only its index is used, to label the
        rows of the result.
    country : str
        Country code, one of 'A', 'B' or 'C'.

    Returns
    -------
    pandas.DataFrame
        Frame indexed like ``test_feat`` with the columns
        ``['country', 'poor']``.

    Raises
    ------
    ValueError
        If ``country`` is not one of the known country codes.
    """
    # make sure we code the country correctly
    country_codes = ['A', 'B', 'C']
    if country not in country_codes:
        raise ValueError(
            'unknown country code %r, expected one of %s' % (country, country_codes)
        )

    # get just the poor probabilities
    country_sub = pd.DataFrame(data=preds[:, 1],  # proba p=1
                               columns=['poor'],
                               index=test_feat.index)

    # add the country code for joining later
    country_sub["country"] = country
    return country_sub[["country", "poor"]]

In [None]:
# One submission frame per country: probabilities plus the matching test index.
a_sub = make_country_sub(preds=a_pred, test_feat=a_test, country='A')
b_sub = make_country_sub(preds=b_pred, test_feat=b_test, country='B')
c_sub = make_country_sub(preds=c_pred, test_feat=c_test, country='C')

In [None]:
# Stack the three per-country frames into one submission table.
country_frames = [a_sub, b_sub, c_sub]
sub_fl = pd.concat(country_frames)

In [None]:
# Preview the first five submission rows.
sub_fl.head(n=5)

In [None]:
# Write the combined submission, index included, to the working directory.
sub_fl.to_csv(path_or_buf='submission_ab.csv')