# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn .pipeline import Pipeline
from sklearn.metrics import roc_auc_score
# magic word for producing visualizations in notebook
%matplotlib inline

# Loading the data

In [2]:
train_data=pd.read_csv('../capstone_data/Udacity_MAILOUT_052018_TRAIN.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
test_data=pd.read_csv('../capstone_data/Udacity_MAILOUT_052018_TEST.csv')

In [4]:
train_data.head()

Unnamed: 0,LNR,AGER_TYP,AKT_DAT_KL,ALTER_HH,ALTER_KIND1,ALTER_KIND2,ALTER_KIND3,ALTER_KIND4,ALTERSKATEGORIE_FEIN,ANZ_HAUSHALTE_AKTIV,...,VK_DHT4A,VK_DISTANZ,VK_ZG11,W_KEIT_KIND_HH,WOHNDAUER_2008,WOHNLAGE,ZABEOTYP,RESPONSE,ANREDE_KZ,ALTERSKATEGORIE_GROB
0,1763,2,1.0,8.0,,,,,8.0,15.0,...,5.0,2.0,1.0,6.0,9.0,3.0,3,0,2,4
1,1771,1,4.0,13.0,,,,,13.0,1.0,...,1.0,2.0,1.0,4.0,9.0,7.0,1,0,2,3
2,1776,1,1.0,9.0,,,,,7.0,0.0,...,6.0,4.0,2.0,,9.0,2.0,3,0,1,4
3,1460,2,1.0,6.0,,,,,6.0,4.0,...,8.0,11.0,11.0,6.0,9.0,1.0,3,0,2,4
4,1783,2,1.0,9.0,,,,,9.0,53.0,...,2.0,2.0,1.0,6.0,9.0,3.0,3,0,1,3


# Preprocessing Data

In [5]:
def Attribute_Unknown_Dict(path,df):
    '''
    Purpose: Map attributes to the value codes that mean "unknown".
    
    Input:
    path: PATH TO DIAS Attributes - Values 2017.xlsx file, in str format
    df: DataFrame whose column names decide which attributes are kept
    
    Output:
    new_dict: Dictionary with column names of df as keys and the list of
              integer codes that mean "unknown" as values. Attributes that
              are not columns of df (even after removing a trailing '_RZ')
              are left out, so every key can be used to index df.
    '''
    #reading the xlsx file and storing it in a dataframe
    attributes=pd.read_excel(path,skiprows=1)
    
    #formatting
    attributes=attributes.drop(['Unnamed: 0'],axis=1,inplace=False)
    
    #Dropping all the rows with nan values. Presumably only the first row of each
    #attribute has every cell filled in, and that row holds the "unknown" codes.
    attributes=attributes.dropna()
    
    raw_dict={}
    for attribute,value,meaning in zip(attributes['Attribute'],attributes['Value'],attributes['Meaning']):
        meaning=str(meaning)
        #checking if it corresponds to the unknown value
        if ('unknown' in meaning.split()) or (meaning == 'no transaction known'):
            #str() lets a single integer code and a "a, b" string share one code path
            raw_dict[attribute]=[int(code) for code in str(value).split(',')]
    
    #These columns are not present in  the actual dataset
    for absent in ('BIP_FLAG','GEOSCORE_KLS7','HAUSHALTSSTRUKTUR','SOHO_FLAG','WACHSTUMSGEBIET_NB'):
        raw_dict.pop(absent,None)
    
    #Removing name '_RZ' from the end of each column name.
    #A new dict is built because renaming keys while iterating over the same dict
    #is an error in Python (and used to truncate keys repeatedly down to '').
    new_dict={}
    for attribute,codes in raw_dict.items():
        column=str(attribute)
        if column not in df.columns and column.endswith('_RZ'):
            column=column[:-3]
        if column in df.columns:
            new_dict[column]=codes
    
    return new_dict

In [6]:
def Map_unknown_to_NAN(df,attribute_dict):
    '''
    Turn every "unknown" code into NAN, column by column.
    
    ARGS:
    df: Dataframe to transform (e.g. azdias); its columns are overwritten in place
    attribute_dict: Dict mapping a column name to the list of codes that are
                    to be replaced with NAN in that column
    
    Output:
    df: the same dataframe object, now holding more null values
    '''
    for column_name in attribute_dict:
        for unknown_code in attribute_dict[column_name]:
            #one code at a time; the cleaned column is written back onto df
            cleaned=df[column_name].replace(unknown_code,np.nan)
            df[column_name]=cleaned
    
    return df

In [7]:
def cat_to_num(df):
    '''
    converting columns which are categorical to numerical and droping other categorical columns
    Input: 
    df: DataFrame to be processed; it is not modified
    
    Output:
    df: New DataFrame in which CAMEO_INTL_2015 and CAMEO_DEUG_2015 are float
        columns (moved to the end of the frame) and the columns LNR,
        VERDICHTUNGSRAUM and LP_FAMILIE_GROB are removed
    
    '''
    
    #converting CAMEO_INTL_2015 AND CAMEO_DEUG_2015 into numerical values.
    #Each column is converted on its own, so a placeholder such as 'X'/'XX' in one
    #column no longer depends on what the other column holds in the same row.
    #errors='coerce' turns every non-numeric entry into NaN.
    cameo_cols=['CAMEO_INTL_2015','CAMEO_DEUG_2015']
    converted={col:pd.to_numeric(df[col],errors='coerce').astype(float).to_numpy()
               for col in cameo_cols}
    
    #droping the original columns
    df=df.drop(cameo_cols,axis=1,inplace=False)
    #Adding new columns (plain arrays, so the values are assigned by position)
    for col in cameo_cols:
        df[col]=converted[col]
    
    #droping 'LNR' AND 'VERDICHTUNGSRAUM' columns
    #droping the 'LP_FAMILIE_GROB' column because it very similar 'LP_FAMILIE_FEIN'
    df=df.drop(['LNR','VERDICHTUNGSRAUM','LP_FAMILIE_GROB'],axis=1,inplace=False)
    
    return df

In [8]:
def mixed_categories(df):
    '''
    Spliting mixed attributes into individual features
    Input:
    df: DataFrame to be processed; it is not modified
    
    Output:
    new_df: New DataFrame with
        WEALTH     - tens digit of CAMEO_INTL_2015
        LIFE_CYCLE - units digit of CAMEO_INTL_2015
        MOVEMENT   - 1.0 for the mainstream codes and 2.0 for the avantgarde
                     codes of PRAEGENDE_JUGENDJAHRE, NaN otherwise
        OST_WEST_KZ with 'O' replaced by 1.0 and 'W' by 2.0
        CAMEO_DEU_2015 one-hot encoded (without the 'XX' category)
    and without CAMEO_INTL_2015, PRAEGENDE_JUGENDJAHRE, EINGEFUEGT_AM and
    D19_LETZTER_KAUF_BRANCHE
    '''
    
    mainstream=[1.0, 3.0, 5.0, 8.0, 10.0, 12.0, 14.0]
    avantgarde=[2.0, 4.0, 6.0, 7.0, 9.0, 11.0, 13.0, 15.0]
    
    cameo_intl=df['CAMEO_INTL_2015']
    youth_codes=df['PRAEGENDE_JUGENDJAHRE']
    
    #codes that belong to neither list (including NaN) stay NaN
    movement=np.where(youth_codes.isin(mainstream),1.0,
                      np.where(youth_codes.isin(avantgarde),2.0,np.nan))
    
    #floor division keeps only the tens digit (45 -> 4); a plain division would
    #give 4.5 and leak the units digit into WEALTH.
    #assign() returns a new frame, so the caller's df keeps its original columns.
    df=df.assign(WEALTH=cameo_intl//10,
                 LIFE_CYCLE=cameo_intl%10,
                 MOVEMENT=movement)
    
    df=df.drop(['CAMEO_INTL_2015','PRAEGENDE_JUGENDJAHRE', 'EINGEFUEGT_AM','D19_LETZTER_KAUF_BRANCHE'],axis=1,inplace=False)
    
    df['OST_WEST_KZ'] = df['OST_WEST_KZ'].replace({'O':1.0, 'W':2.0})
    
    new_df=pd.get_dummies(df,columns=['CAMEO_DEU_2015'])
    
    #'XX' does not have to occur in every dataset, so its absence is not an error
    new_df=new_df.drop(['CAMEO_DEU_2015_XX'],axis=1,inplace=False,errors='ignore')
    
    return new_df

In [9]:
def impute(df):
    '''
    Fill the null values of every column with the mean of that column
    
    Input:
    df: DataFrame to be processed
    
    Output:
    A new DataFrame in which each null value is replaced by its column mean
    '''
    
    def _fill_with_column_mean(column):
        #the mean is taken over the non-null entries of this column only
        return column.fillna(column.mean())
    
    return df.apply(_fill_with_column_mean,axis=0)

# Cleaning Training data

In [10]:
#Removing RESPONSE column from Training_data and storing it in train_labels
train_labels=train_data['RESPONSE']
train_data=train_data.drop(['RESPONSE'],axis=1,inplace=False)

In [11]:
#Implementing cleaning functions (same as preprocessing)
attribute_dict=Attribute_Unknown_Dict('../capstone_data/DIAS Attributes - Values 2017.xlsx',train_data)
train_data=Map_unknown_to_NAN(train_data,attribute_dict)

In [12]:
#Collecting the names of the columns in which more than 50% of the rows are null,
#so that they can be dropped from training_df (null counts are computed once)
drop_cols=[col for col,n_null in train_data.isnull().sum(axis=0).items()
           if n_null>0.50*(train_data.shape[0])]

In [13]:
#Implementing cleaning functions (same as preprocessing)
train_data=train_data.drop(columns=drop_cols,axis=1,inplace=False)
train_data=cat_to_num(train_data)
train_data=mixed_categories(train_data)
train_data=impute(train_data)

In [14]:
train_data.head()

Unnamed: 0,AGER_TYP,AKT_DAT_KL,ALTER_HH,ALTERSKATEGORIE_FEIN,ANZ_HAUSHALTE_AKTIV,ANZ_HH_TITEL,ANZ_KINDER,ANZ_PERSONEN,ANZ_STATISTISCHE_HAUSHALTE,ANZ_TITEL,...,CAMEO_DEU_2015_7E,CAMEO_DEU_2015_8A,CAMEO_DEU_2015_8B,CAMEO_DEU_2015_8C,CAMEO_DEU_2015_8D,CAMEO_DEU_2015_9A,CAMEO_DEU_2015_9B,CAMEO_DEU_2015_9C,CAMEO_DEU_2015_9D,CAMEO_DEU_2015_9E
0,2.0,1.0,8.0,8.0,15.0,0.0,0.0,1.0,13.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,4.0,13.0,13.0,1.0,0.0,0.0,2.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,1.0,1.0,9.0,7.0,0.0,0.049574,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,2.0,1.0,6.0,6.0,4.0,0.0,0.0,2.0,4.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,2.0,1.0,9.0,9.0,53.0,0.0,0.0,1.0,44.0,0.0,...,0,0,0,0,0,0,0,0,0,0


# Cleaning Testing data

In [15]:
#Storing the LNR column for submission on Kaggle
test_lnr=test_data['LNR']

In [16]:

test_data=Map_unknown_to_NAN(test_data,attribute_dict)

In [17]:
#Droping the same columns from testing data 
test_data=test_data.drop(columns=drop_cols,axis=1,inplace=False)

In [18]:
#Implementing cleaning functions (same as preprocessing)
test_data=cat_to_num(test_data)
test_data=mixed_categories(test_data)
test_data=impute(test_data)

In [19]:
test_data.head()

Unnamed: 0,AGER_TYP,AKT_DAT_KL,ALTER_HH,ALTERSKATEGORIE_FEIN,ANZ_HAUSHALTE_AKTIV,ANZ_HH_TITEL,ANZ_KINDER,ANZ_PERSONEN,ANZ_STATISTISCHE_HAUSHALTE,ANZ_TITEL,...,CAMEO_DEU_2015_7E,CAMEO_DEU_2015_8A,CAMEO_DEU_2015_8B,CAMEO_DEU_2015_8C,CAMEO_DEU_2015_8D,CAMEO_DEU_2015_9A,CAMEO_DEU_2015_9B,CAMEO_DEU_2015_9C,CAMEO_DEU_2015_9D,CAMEO_DEU_2015_9E
0,2.0,1.0,7.0,6.0,2.0,0.0,0.0,2.0,2.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,1.651514,1.0,12.433248,0.0,20.0,0.0,0.0,1.0,21.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,2.0,9.0,16.0,11.0,2.0,0.0,0.0,4.0,2.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,1.651514,7.0,12.433248,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,1.0,1.0,21.0,13.0,1.0,0.0,0.0,4.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
#Checking that test and train data have same columns 
sum(train_data.columns==test_data.columns)

367

# Supervised Model

In [21]:
#Checking total proportion of positive response from training data
train_labels.sum()/train_labels.shape[0]

0.012383036171500396

    Since only about 1.2% of the training responses are positive (i.e. customers), every train/test split of the training data should preserve this ratio across all folds. To do this, we can use StratifiedKFold.

In [None]:
from sklearn.model_selection import StratifiedKFold
SF1=StratifiedKFold(n_splits=7)
SF1.get_n_splits(train_data,train_labels)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor,AdaBoostRegressor
from xgboost.sklearn import XGBRegressor # Extreme Gradient Boosting
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR

In [None]:
pipeline1=Pipeline([('scaler',StandardScaler()),
                   ('clf',GradientBoostingRegressor())])
pipeline2=Pipeline([('scaler',StandardScaler()),
                   ('clf',AdaBoostRegressor())])
pipeline3=Pipeline([('scaler',StandardScaler()),
                   ('clf',XGBRegressor())])
pipeline4=Pipeline([('scaler',StandardScaler()),
                   ('clf',MLPRegressor())])

In [None]:
def predict_score(pipeline,column_name,train_data,train_labels,SF=None):
    '''
    Cross-validate a pipeline and collect the ROC AUC score of every fold
    
    Input: 
    pipeline : model to fit; it is re-fitted on the training part of each fold
    column_name : model name in str, used as the column name of the result
    train_data : DataFrame with the features
    train_labels : Series with the labels, aligned by position with train_data
    SF : splitter object with a split(X, y) method yielding (train, test)
         positional indices. Optional; when omitted a StratifiedKFold with
         7 splits is used, so calls that pass only four arguments work.
    
    Output:
    df : DataFrame with one row per fold and a single column (column_name)
         holding the ROC AUC scores. Each score is also printed.
    '''
    if SF is None:
        from sklearn.model_selection import StratifiedKFold
        SF=StratifiedKFold(n_splits=7)
    
    scores=[]

    for train,test in SF.split(train_data,train_labels):

        pipeline.fit(train_data.iloc[train],train_labels.iloc[train])

        #the raw predictions are used as ranking scores for the AUC
        y_pred=pipeline.predict(train_data.iloc[test])

        score=roc_auc_score(train_labels.iloc[test],y_pred)

        scores.append(score)

        print(score)
    
    df=pd.DataFrame(scores,columns=[column_name])
    return df

In [None]:
 df1=predict_score(pipeline1,'Gradient',train_data,train_labels,SF1)

In [None]:
#SF1 is the 7-fold stratified splitter; predict_score needs it as fifth argument
df2=predict_score(pipeline2,'AdaBoostRegressor',train_data,train_labels,SF1)

In [None]:
#SF1 is the 7-fold stratified splitter; predict_score needs it as fifth argument
df3=predict_score(pipeline3,'XGBRegressor',train_data,train_labels,SF1)

In [None]:
#SF1 is the 7-fold stratified splitter; predict_score needs it as fifth argument
df4=predict_score(pipeline4,'MLPRegressor',train_data,train_labels,SF1)

In [None]:
scores=pd.concat([df1,df2,df3,df4],axis=1)

In [None]:
scores.describe()

In [None]:
pipeline2.get_params()

In [None]:
#2-fold stratified splitter for the grid search; the fixed random_state makes the
#shuffled folds (and therefore the search result) the same on every run
sk=StratifiedKFold(n_splits=2,shuffle=True,random_state=42)
#Grid for the pipeline step named 'clf'
parameters_1={'clf__learning_rate':[0.001,0.01,1],
             'clf__n_estimators' : [50,100,200,300]}

In [None]:
cv=GridSearchCV(pipeline2,param_grid=parameters_1,scoring='roc_auc',cv=sk,return_train_score=True,n_jobs=-1)
cv.fit(train_data,train_labels)

In [None]:
cv.best_params_

In [None]:
cv.best_score_

In [None]:
train_labels.sum()/train_labels.shape[0]

In [None]:
customers = pd.read_csv('../capstone_data/Udacity_CUSTOMERS_052018.csv')

In [None]:
customers.head()

In [None]:
customers=customers.drop(['CUSTOMER_GROUP', 'ONLINE_PURCHASE', 'PRODUCT_GROUP'], axis=1)
customers=Map_unknown_to_NAN(customers,attribute_dict)
customers=customers.drop(columns=drop_cols,axis=1,inplace=False)
customers=cat_to_num(customers)
customers=mixed_categories(customers)
customers=impute(customers)

In [None]:
(customers.columns == train_data.columns).sum()

In [None]:
#Marking every row of the customer dataframe as a positive response (RESPONSE=1).
#Later cells append the first 20000 of these rows to the training dataframe.

customers['RESPONSE']=1

In [None]:
customers.head()

In [None]:
cust_labels=customers['RESPONSE']
customers=customers.drop(['RESPONSE'],axis=1,inplace=False)

In [None]:
customers_part=customers.iloc[:20000]

In [None]:
train_data_2=pd.concat([train_data,customers_part]).reset_index(drop=True)

In [None]:
train_labels_2=pd.concat([train_labels,cust_labels[:20000]]).reset_index(drop=True)

In [None]:
train_data_2.shape[0]==train_labels_2.shape[0]

In [None]:
train_data_2.shape

In [None]:
train_labels_2.sum()/train_labels_2.shape[0]

In [None]:
#Index labels of the rows that duplicated() does not flag as a repeat
unique_indices=train_data_2.index[~train_data_2.duplicated()].tolist()

In [None]:
train_labels_2=train_labels_2.iloc[unique_indices]

In [None]:
train_data_2=train_data_2.iloc[unique_indices]

In [None]:
train_labels_2.sum()/train_labels_2.shape[0]

In [None]:
SF2=StratifiedKFold(n_splits=7)
SF2.get_n_splits(train_data_2,train_labels_2)

In [None]:
 df1=predict_score(pipeline1,'Gradient',train_data_2,train_labels_2,SF2)

In [None]:
df2=predict_score(pipeline2,'AdaBoostRegressor',train_data_2,train_labels_2,SF2)

In [None]:
df3=predict_score(pipeline3,'XGBRegressor',train_data_2,train_labels_2,SF2)

In [None]:
scores_2=pd.concat([df1,df2,df3],axis=1)
scores_2.describe()

In [None]:
pipeline3.get_params()

In [None]:
#2-fold stratified splitter for the grid search; the fixed random_state makes the
#shuffled folds (and therefore the search result) the same on every run
sk=StratifiedKFold(n_splits=2,shuffle=True,random_state=42)
#Grid for the pipeline step named 'clf'
parameters_2={'clf__learning_rate':[0.001,0.01,0.3,1],
             'clf__n_estimators' : [100,200,300],
             'clf__gamma':[0.1,0,1]}

In [None]:
cv=GridSearchCV(pipeline3,param_grid=parameters_2,scoring='roc_auc',cv=sk,return_train_score=True,n_jobs=-1)
cv.fit(train_data_2,train_labels_2)

In [None]:
cv.best_params_

In [None]:
cv.best_score_

# Final Model

In [None]:
xgb=XGBRegressor(learning_rate=0.001,n_estimators=300,gamma=0.1)
ada=AdaBoostRegressor(n_estimators=200,learning_rate=0.001)
final_pipeline=Pipeline([('scaler',StandardScaler()),
                   ('clf',xgb)])

In [None]:
final_pipeline.fit(train_data_2,train_labels_2)

In [None]:
test_data.head()

In [None]:
y_pred=final_pipeline.predict(test_data)

In [None]:
y_pred.shape

In [None]:
test_lnr.shape[0]

In [None]:
sample=pd.read_csv('../capstone_data/Arvato_Capstone_Example_Submission.csv')

In [None]:
sample.shape

In [None]:
#Writing the submission file: a header line followed by one "LNR,RESPONSE" line
#per test record
name ='../capstone_data/Submission_4.csv'

#every stored LNR id needs exactly one prediction
assert len(y_pred)==test_lnr.shape[0]

#the with-block closes the file even when a write fails part-way
with open(name,'w') as f:
    f.write('LNR,RESPONSE\n')
    for i in range(test_lnr.shape[0]):
        f.write(str(test_lnr.iloc[i])+','+str(y_pred[i])+'\n')