In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import gc; gc.enable()

from utilities import DfLowMemory, DfLowMemoryTest
from utilities import CleanData

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from category_encoders import WOEEncoder, TargetEncoder
from xgboost import XGBClassifier
from imxgboost.imbalance_xgb import imbalance_xgboost as imb_xgb

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

from collections import defaultdict

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the training CSV through the project helper `DfLowMemory`.
# NOTE(review): presumably the helper downcasts dtypes to reduce memory use — confirm in utilities.
df_train = DfLowMemory('train_ver2.csv')

In [None]:
# Pass the training frame through the project helper `CleanData` and rebind the result.
# NOTE(review): the "no missing values" guarantee depends on the helper — verify in utilities.
df_train = CleanData(df_train)

In [None]:
# Load the test set with plain pandas defaults (the training set goes through DfLowMemory instead).
df_test = pd.read_csv('test_ver2.csv')

In [None]:
# Impute every test column with its most frequent value.
# The result is assigned back to the frame: calling fillna(..., inplace=True)
# on the `df_test[c]` selection operates on an intermediate object and is not
# guaranteed to update `df_test` itself.
for c in df_test.columns:
    modes = df_test[c].mode()
    # mode() is empty when the column has no non-null value at all; leave such
    # a column untouched instead of failing on the first-element lookup.
    if not modes.empty:
        df_test[c] = df_test[c].fillna(modes.iloc[0])


In [None]:
# Three monthly snapshots of the training data, selected by their 'fecha_dato' value.
df_train_may15, df_train_June15, df_train_may16 = (
    df_train.loc[df_train['fecha_dato'] == snapshot_date]
    for snapshot_date in ('2015-05-28', '2015-06-28', '2016-05-28')
)

In [None]:
# Stack the three snapshots in this order: May 2015, June 2015, May 2016.
df = pd.concat([df_train_may15, df_train_June15, df_train_may16])

In [None]:
# Keep a single row per customer id ('ncodpers'), modifying `df` in place.
# keep='last' retains the row that comes last in the concatenated frame,
# i.e. the one from the latest snapshot in which the customer appears.
df.drop_duplicates(subset='ncodpers',keep='last', inplace=True)

In [None]:
# Target columns: every column of `df` whose name contains '_ult1'.
targets = [c for c in df.columns.tolist() if '_ult1' in c]

In [None]:
# Feature columns used to train the model: every column of `df` except the
# identifier/date columns and the product indicator columns listed here.
excluded_cols = {
    'ncodpers', 'fecha_dato', 'fecha_alta',
    'ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1',
    'ind_cder_fin_ult1', 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1',
    'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 'ind_ctpp_fin_ult1',
    'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1',
    'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1',
    'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1',
    'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 'ind_viv_fin_ult1',
    'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1',
}
used_cols = [col for col in df.columns if col not in excluded_cols]

In [None]:
# Feature columns for the test and the training frame.
# The product indicator columns ('*_ult1') are the prediction targets, so they
# must be excluded here as well: keeping them in `used_cols` feeds the label to
# the model as a feature and gives the training matrix columns that the test
# matrix is not expected to have.
id_and_date_cols = ['ncodpers', 'fecha_dato', 'fecha_alta']

used_test_cols = [c for c in df_test.columns.tolist()
                  if c not in id_and_date_cols and '_ult1' not in c]

used_cols = [c for c in df.columns.tolist()
             if c not in id_and_date_cols and '_ult1' not in c]

## Model Training

In [None]:
# instantiating a dictionary to collect the model results per customer
# (a missing key starts out as an empty list)
id_preds = defaultdict(list)

# getting the ids of the customers in the test dataset as a NumPy array
ids = df_test['ncodpers'].values

In [None]:
# Give `df` a clean 0..n-1 index after the concat / drop_duplicates steps.
# drop=True discards the old row labels instead of inserting them as a new
# 'index' column: that extra column shifts every positional column lookup by
# one and makes this cell raise "cannot insert index, already exists" when it
# is executed a second time.
df = df.reset_index(drop=True)

In [None]:
# Missing incomes are written in the test file as a blank-padded 'NA' string.
# Series.replace(..., value=None) does not mean "replace with a missing value",
# and an in-place call on the `df_test['renta']` selection is not guaranteed to
# reach `df_test`, so convert explicitly and assign the result back:
# entries that cannot be parsed as numbers (including the padded 'NA') become NaN.
# NOTE(review): this runs after the mode-imputation cell, so these NaN values
# are not imputed there — fill them here if the model requires complete data.
df_test['renta'] = pd.to_numeric(df_test['renta'], errors='coerce')

In [None]:
predictions = []

# The encoder and the model receive bare arrays, so training and test columns
# are matched purely by position. Use exactly the columns present in BOTH
# feature lists, in one shared order, and never a product column (a target
# used as a feature would leak the label).
feature_cols = [c for c in used_test_cols if c in used_cols and c not in targets]

# Loop-invariant inputs: extracted once instead of once per product.
X_train_raw = df[feature_cols].values
X_test_raw = df_test[feature_cols].values

# One binary classifier per product column in `targets`.
for target in targets:
    y = df[target]

    # Target-encode the features against this product's labels.
    encoder = TargetEncoder()
    X = encoder.fit_transform(X_train_raw, y)

    model = LogisticRegression()
    model.fit(X.values, y)

    # Encode the test rows with the encoder fitted on the training rows.
    X_test = encoder.transform(X_test_raw)

    # Second column of predict_proba (presumably the probability of label 1).
    y_pred = model.predict_proba(X_test.values)[:, 1]

    # Collect one probability vector (one entry per test row) per product,
    # in the order of `targets`.
    predictions.append(y_pred)

In [None]:
# Customer ids of the test rows, kept as a pandas Series this time
# (an earlier cell stores the same column in `ids` as a NumPy array).
ids = df_test['ncodpers']

In [None]:
# `predictions` holds one probability vector per product. Arrange the scores
# as (customers x products): one row per customer, columns in the order of
# `targets`. np.asarray is needed because a plain Python list has no `.T`.
scores = np.asarray(predictions)
if scores.shape[0] == len(targets):
    scores = scores.T

# For every customer, the product indices sorted by ASCENDING probability
# (the most likely product ends up in the last column).
testing = np.argsort(scores, axis=1)

In [None]:
# Wrap the argsort result in a DataFrame: one row per row of `testing`.
user_frame = pd.DataFrame(testing)

In [None]:
# Lookup table: position of a product in `targets` -> product column name.
target_dict_names = dict(enumerate(targets))

In [None]:
# Display the frame with every product index replaced by the product name.
# The mapped frame is only shown; `user_frame` itself keeps the numeric indices.
user_frame.apply(lambda column: column.map(target_dict_names))

In [None]:
# Add the customer ids to the frame and move that column to the first position.
# NOTE(review): `df_new` is first created in a later cell of this notebook, so
# this cell only works once that cell has been run.
df_new['user_id'] = df_test['ncodpers']

# New column order: 'user_id' first, every other column in its current order.
cols = ['user_id'] + [c for c in df_new.columns if c != 'user_id']
# `.ix` no longer exists in pandas; `.loc` is the label-based selector.
df_new = df_new.loc[:, cols]

In [None]:
# Convert `predictions` to a 2-D array and swap its axes.
# NOTE(review): not idempotent — each execution flips the orientation again,
# and the next cell repeats this exact statement.
predictions = np.array(predictions).T

In [None]:
# Convert `predictions` to a 2-D array and swap its axes.
# NOTE(review): the previous cell contains this exact statement; running both
# restores the orientation `predictions` had before them — confirm which
# orientation the following cells expect.
predictions = np.array(predictions).T

In [None]:
# Store the customer ids in a 'user_id' column of `df_new`.
# NOTE(review): the cell that creates `df_new` comes after this one; run in
# notebook order this raises NameError — confirm the intended order.
df_new['user_id'] = ids

In [None]:
# Start `df_new` as an empty frame (any earlier `df_new` is discarded).
df_new = pd.DataFrame()

In [None]:
predictions[0]

In [None]:
# Add one '<product>_pred' column per product to `df_new`.
# The column name must use the product of the current iteration; writing to
# targets[0] + '_pred' every time overwrites the same column again and again.
product_scores = np.asarray(predictions)
if product_scores.shape[0] != len(targets):
    # Scores are stored as (customers x products): iterate over products instead.
    product_scores = product_scores.T

for i, prediction in enumerate(product_scores):
    df_new[targets[i] + '_pred'] = prediction

In [None]:
df_new.head(2)

In [None]:
# Name of the first target column.
y0 = targets[0]

In [None]:
# Column name under which the scores of the first product are stored.
p0 = 'pred0'

In [None]:
# Column 0 of `predictions`.
# NOTE(review): requires `predictions` to be a 2-D array; presumably oriented
# (customers x products) so that this is the score vector of the first product — confirm.
pred0 = predictions[:,0]

In [None]:
predictions[0]

In [None]:
predictions[0]

In [None]:
# Rank the products for every customer.
# `predictions` is rebound by the transpose cells above, so its orientation
# depends on how often they were run; normalise to (customers x products)
# before sorting instead of assuming one particular orientation.
scores = np.asarray(predictions)
if scores.shape[0] == len(targets):
    scores = scores.T

# Product indices per customer in ASCENDING order of probability
# (the most likely product is in the last column).
testing = np.argsort(scores, axis=1)

In [None]:
testing

In [None]:
# Reset `df_new` to an empty frame (the next cell rebinds it again).
df_new = pd.DataFrame()

In [None]:
# One row per customer, one column per product (column i <-> targets[i]).
# Switching the axes needs a transpose: reshape() only re-chunks the flattened
# values and would mix scores of different customers and products in a row.
scores = np.asarray(predictions)
if scores.shape[0] == len(targets):
    scores = scores.T
df_new = pd.DataFrame(scores)

In [None]:
df_new.head()

In [None]:
# adding the customer ids of the test set to the dataframe as column 'user_id'
# (pandas aligns the assigned Series with `df_new` on the row index)
df_new['user_id'] = df_test['ncodpers']

In [None]:
# Move 'user_id' to the first position, keeping the other columns in order.
cols = list(df_new)
# move the column to head of list using index, pop and insert
cols.insert(0, cols.pop(cols.index('user_id')))
# `.ix` no longer exists in pandas; `.loc` is the label-based selector.
df_new = df_new.loc[:, cols]

In [None]:
df_new.head()

In [None]:
df_new.head(2)

In [None]:
# Shorthands for the first seven products: label column name (y*), score
# vector taken from column k of `predictions` (pred*), and the name of the
# score column added to `df` (p*).
# NOTE(review): the training loop computes `predictions` from df_test rows,
# while the scores are attached to `df` here — confirm that lengths and row
# order match.
y0, y1, y2, y3, y4, y5, y6 = (targets[k] for k in range(7))
pred0, pred1, pred2, pred3, pred4, pred5, pred6 = (predictions[:, k] for k in range(7))
p0, p1, p2, p3, p4, p5, p6 = ('pred' + str(k) for k in range(7))

for column_name, column_scores in zip(
    (p0, p1, p2, p3, p4, p5, p6),
    (pred0, pred1, pred2, pred3, pred4, pred5, pred6),
):
    df[column_name] = column_scores

# Labels and scores side by side for a first look.
df[[y0, y1, y2, y3, y4, y5, y6, p0, p1, p2, p3, p4, p5, p6]].head()

In [None]:
df[[y0, y1,y2,y3,y4,y5,y6, p0, p1,p2,p3,p4,p5,p6]].describe()

In [None]:
# ROC AUC of the first product: true labels first, predicted scores second,
# the same argument order as the other roc_auc_score calls in this notebook.
# Both the label column (y0) and the score column (p0) live in `df`; `pred0`
# is an array of scores, not a column name, so it cannot be used as a key.
roc_auc_score(df[y0], df[p0])

In [None]:
roc_auc_score(df[y1], df[p1])

In [None]:
# Print the ROC AUC of the first seven products: labels come from `df`,
# scores from column i of `predictions`.
for i, target in enumerate(targets[:7]):
    print(target)
    print(roc_auc_score(df[target], predictions[:, i]))

In [None]:
# balanced_accuracy_score is what the next cell calls, so it has to be
# imported here; accuracy_score is kept so existing uses keep working.
from sklearn.metrics import accuracy_score, balanced_accuracy_score

In [None]:
balanced_accuracy_score(df[y0], df[p0] > 0.107)

In [None]:
df.columns[24:]

In [None]:
# Products each customer already holds:
#   customer id -> list of product column names whose indicator is > 0.
# Columns are selected by name from a single frame. The previous version
# indexed with `train_size` (which only appears in commented-out code), took
# the rows from one frame and the column names from another, and located the
# product columns by position.
already_active = {}
for cust_id, flags in zip(df['ncodpers'].values, df[targets].values):
    already_active[cust_id] = [name for name, flag in zip(targets, flags) if flag > 0]

In [None]:
from sklearn.metrics import classification_report, balanced_accuracy_score, precision_score, auc

In [None]:
balanced_accuracy_score(df[y0], df[p0] > 0.107)

In [None]:
precision_score(df[y0], df[p0] > 0.2)

In [None]:
# Remove the p0 and p1 score columns from `df`, then trigger a garbage collection.
# NOTE(review): the other score columns added together with them (p2..p6) stay in `df`.
del df[p0]; del df[p1]; gc.collect()

In [None]:
# Plain-dict copy of `id_preds`.
# NOTE(review): the only code that appends to `id_preds` is commented out in
# the training loop, so as the notebook stands this dict is empty.
preds = dict(id_preds)

In [None]:
predictions

In [None]:
# Map every test customer id to that customer's vector of product
# probabilities (one entry per product, in the order of `targets`).
# The previous cell body ended in an empty subscript, which is a syntax error.
scores = np.asarray(predictions)
if scores.shape[0] == len(targets):
    # stored as (products x customers): switch to one row per customer
    scores = scores.T

customer_and_probabilities = dict(zip(df_test['ncodpers'].values, scores))

In [None]:
# For every customer in `preds`: indices that sort the customer's scores in ascending order.
list_of_prods = {
    customer: np.argsort(customer_scores)
    for customer, customer_scores in preds.items()
}

In [None]:
list_of_prods

In [None]:
# Human-readable product names.
# NOTE(review): presumably listed in the same order as `targets` — confirm before mapping by position.
target_names = [
    'Saving Account',
    'Guarantees',
    'Current Accounts',
    'Derivada Account',
    'Payroll Account',
    'Junior Account',
    'More Popular Account',
    'Particular Account',
    'Particular Plus Account',
    'Short-term deposits',
    'Medium-term deposits',
    'Long-term deposits',
    'e-account',
    'Funds',
    'Mortgage',
    'Pensions',
    'Loans',
    'Taxes',
    'Credit Card',
    'Securities',
    'Home Account',
    'Payroll',
    'Pensions2',
    'Direct Debit',
]

In [None]:
# Lookup table: position of a product in `targets` -> product column name.
target_dict_names = {position: name for position, name in enumerate(targets)}

In [None]:
# Wrap the argsort result in a DataFrame: one row per row of `testing`.
user_frame = pd.DataFrame(testing)

In [None]:
user_frame.head(2)

In [None]:
# Display the frame with product indices translated to product names
# (`user_frame` itself is not modified).
user_frame.apply(lambda column: column.map(target_dict_names))

In [None]:
# Look up each value of the 'user' column in `list_of_prods` and store the result in 'prods'.
# NOTE(review): `user_frame` is built from `testing` with default integer
# column labels; no visible cell creates a 'user' column — confirm where it comes from.
user_frame['prods'] = user_frame['user'].map(list_of_prods)

In [None]:
# Second name for the same DataFrame object (no copy is made).
trial_frame = user_frame

In [None]:
# Copy the 'user' column of `user_frame` into `b` as 'user_id'.
# NOTE(review): `b` is not defined in any visible cell — confirm how it is created.
b['user_id'] = user_frame['user']

In [None]:
# Move 'user_id' to the first position, keeping the other columns in order.
cols = list(b)
# move the column to head of list using index, pop and insert
cols.insert(0, cols.pop(cols.index('user_id')))
# `.ix` no longer exists in pandas; `.loc` is the label-based selector.
b = b.loc[:, cols]

In [None]:
# Columns at positions 1..7 of `b` (seven columns, skipping the first one).
df_final = b.iloc[:,1:8]

In [None]:
df_final.head()

In [None]:
# Write `df_final` to 'final.csv' without the index, using a space as separator.
# NOTE(review): quotechar is set to a space as well, i.e. the same character
# as the separator — confirm this produces the intended file.
df_final.to_csv('final.csv',index=False,sep=' ', quotechar= ' ')

In [None]:
# First column of `a`, kept as a one-column DataFrame.
# NOTE(review): `a` is not defined in any visible cell — confirm how it is created.
ncodpers = a.iloc[:,0:1]

In [None]:
# Write the one-column frame to 'ncodpers.csv' without the index.
ncodpers.to_csv('ncodpers.csv', index=False)

In [None]:
df_final.shape

In [None]:
a.head()

In [None]:
# Load 'test_data.csv' with pandas defaults (a different file from 'test_ver2.csv' used above).
df_test1 = pd.read_csv('test_data.csv')

In [None]:
df_test1.shape

In [None]:
df_train[:train_size].columns[24:],row[24:]

In [None]:
row[1]

In [None]:
# Products each customer already holds:
#   customer id -> list of product column names whose indicator is > 0.
# Columns are selected by name from `df` only. The previous version sliced
# with `train_size` (which only appears in commented-out code), read the
# customer id from a fixed position in the row and paired the row values of
# one frame with the column names of another.
already_active = {}
for cust_id, flags in zip(df['ncodpers'].values, df[targets].values):
    already_active[cust_id] = [name for name, flag in zip(targets, flags) if flag > 0]

In [None]:
# Up to 7 products per customer that the customer does not hold yet,
# highest probability first -> train_preds
# NOTE(review): the code that fills `id_preds` is commented out in the
# training loop, so this loop has nothing to iterate over until it is restored.
train_preds = {}
for cust_id, p in id_preds.items():
    owned = already_active[cust_id]
    # Pair each probability with its product column name. The probabilities
    # are per product, so they are matched with `targets`; the leading columns
    # of df_train are not product columns.
    candidates = [(name, prob) for name, prob in zip(targets, p) if name not in owned]
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    # Store the result (the earlier version computed the list but never kept it).
    train_preds[cust_id] = [name for name, _ in candidates[:7]]

In [None]:

# Up to 7 products per customer that the customer does not hold yet,
# highest probability first -> train_preds
# NOTE(review): the code that fills `id_preds` is commented out in the
# training loop, so this loop has nothing to iterate over until it is restored.
train_preds = {}
for cust_id, p in id_preds.items():
    owned = already_active[cust_id]
    # Pair each probability with its product column name. The probabilities
    # are per product, so they are matched with `targets`; the leading columns
    # of df_train are not product columns.
    candidates = [(name, prob) for name, prob in zip(targets, p) if name not in owned]
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    train_preds[cust_id] = [name for name, _ in candidates[:7]]

# One space-separated recommendation string per row of `sample`
# (the customer id is read from the first column of each row).
# NOTE(review): `sample` is not defined in any visible cell — confirm its source.
test_preds = []
for row in sample.values:
    test_preds.append(' '.join(train_preds[row[0]]))

In [None]:
# Draw a 10% random sample of the rows of every 'fecha_dato' value.
# random_state makes the sample (and everything derived from it, including the
# exported CSV) reproducible across kernel restarts.
frames = []
for dato in df_train['fecha_dato'].unique().tolist():
    # sample() already returns a new frame, so no explicit copy is needed
    df_dato = df_train.loc[df_train['fecha_dato'] == dato].sample(frac=0.10, random_state=42)

    # add to frames list
    frames.append(df_dato)
    gc.collect()

In [None]:
# Combine the per-date samples into a single frame.
trial = pd.concat(frames)

In [None]:
# Parse both date columns from 'YYYY-MM-DD' text into datetimes.
for date_col in ('fecha_dato', 'fecha_alta'):
    trial[date_col] = pd.to_datetime(trial[date_col], format="%Y-%m-%d")

In [None]:
# Year and month of 'fecha_dato' combined into one number: 100 * year + month (e.g. May 2015 -> 201505).
trial['YearMonth'] = trial['fecha_dato'].map(lambda x: 100*x.year + x.month)

In [None]:
import seaborn as sns

In [None]:
# Number of distinct customers in the sample for each reporting month.
# Passing 'ncodpers' itself as y would make seaborn's barplot draw the MEAN
# customer id per month (its default aggregation), not a count, which does not
# match the 'Number of customers' axis label.
customers_per_month = trial.groupby('YearMonth')['ncodpers'].nunique()

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(16, 6))
fig.patch.set_facecolor('#f2efe5')
ax.patch.set_facecolor('#f2efe5')

# Draw on the axes created above so the background colours apply to this plot.
g = sns.barplot(x=customers_per_month.index, y=customers_per_month.values, ax=ax)
g.tick_params(axis='x', labelrotation=90)

g.set(xlabel='Date of transaction', ylabel='Number of customers');


In [None]:
# Year and month of 'fecha_alta' combined into one number: 100 * year + month.
trial['YearMonth_fecha_alta'] = trial['fecha_alta'].map(lambda x: 100*x.year + x.month)

In [None]:
df_train.iloc[:,0:24].head(2)

In [None]:
gc.collect()

In [None]:
# Re-read the raw training CSV with pandas defaults into `d`
# (separate from `df_train`, which was loaded through DfLowMemory and CleanData).
d = pd.read_csv('train_ver2.csv')

In [None]:
d.iloc[:,0:24].isnull().sum()

In [None]:
# Rebuild `targets` from the columns of `df_train` (earlier it was derived from `df`):
# every column whose name contains '_ult1'.
targets = [c for c in df_train.columns.tolist() if '_ult1' in c]

In [None]:
# Save the sampled frame to 'trial.csv' without the index.
trial.to_csv('trial.csv', index=False)