In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import gc; gc.enable()

from utilities import DfLowMemory, DfLowMemoryTest
from utilities import CleanData, CleanTestData

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from category_encoders import WOEEncoder, TargetEncoder
from xgboost import XGBClassifier
from imxgboost.imbalance_xgb import imbalance_xgboost as imb_xgb

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
df_train = DfLowMemory('train_ver2.csv')

In [3]:
df_train = CleanData(df_train)

In [68]:
df_test = pd.read_csv('test_data.csv')

In [69]:
df_test = CleanTestData(df_test)

From BreakfastPirate
I’m only training on about 37,000 accounts – only the accounts that added a new product in June 2015. I found that the distribution of products added varied a lot by month, and June seemed to be an unusual month. Since we are predicting June 2016, I trained on only June 2015.

I only used accounts that added a new product in June 2015. We are not trying to determine who will add a new product, we are only trying to predict which products they would add if they did. So I excluded all accounts that didn’t add a product.

In [4]:
# Selecting only May2015, June2015, May2016 

df_train_may15 = df_train[df_train['fecha_dato'] == '2015-05-28']

In [5]:
df_train_June15 = df_train[df_train['fecha_dato'] == '2015-06-28']

In [6]:
df_train_may16 = df_train[df_train['fecha_dato'] == '2016-05-28']

In [7]:
df15 = pd.merge(df_train_June15, df_train_may15, how='left', on='ncodpers', suffixes=('','_may'))

In [8]:
#dropping the features with _prev
todrop = [c for c in df15.columns if '_may' in c and '_ult1' not in c]

In [9]:
df15.drop(columns=todrop, inplace=True)

In [10]:
may_targets = [c for c in df15.columns if '_ult1_may' in c]

In [11]:
targets = [c for c in df15.columns if '_ult' in c and '_ult1_may' not in c]

In [None]:
# Now, using the idea proposed by BreakFastPirate, I am going to drop any rows that did not add a new product in June2015

In [12]:
#df15.isnull().sum()
# There are some null values on some customers in May. This is because in May they were not customers yet. So I am going to fill those with 0

may_customers = [col for col in df15.columns.tolist() if col in may_targets]
june_customers = [col for col in df15.columns.tolist() if col in targets]

In [13]:
# Now, using the idea proposed by BreakFastPirate, I am going to keep only the accounts that added a new product in June 2015
# it makes sense to me that most customers have chosen to keep the services that they had without acquiring new ones. It went from 630248 to 33312
def new_product(x):
    for col in june_customers:
        # dropping when they are equal and when they dropped the product
        if x[col+'_may'] < x[col]:
            return True
    return False

df15['new_product'] = df15.apply(new_product, axis=1)

In [14]:
df15= df15[df15['new_product'] == True]

In [15]:
df15.shape

(33312, 73)

In [19]:
df15.drop(columns=may_targets, inplace=True)

In [21]:
df15.drop(columns='new_product', inplace=True)

In [32]:
df15 = df15.drop_duplicates(['ncodpers'], keep='last')

In [159]:
df15.shape

(33312, 48)

## Merging may16

In [177]:
df16 = pd.merge(df_train_may16, df15, how='left', on='ncodpers', suffixes=('','_may16'))

In [178]:
#dropping the features with _may16
todrop16 = [c for c in df16.columns if '_may16' in c and '_ult1' not in c]
df16.drop(columns=todrop16, inplace=True)

In [179]:
may16_targets = [c for c in df16.columns if '_ult1_may16' in c]

In [180]:
# There are some null values on some customers in May16. This is because in May they were not customers yet. 
#So I am going to fill those with 0
for column in may16_targets:
    df16[column].fillna(0, inplace=True)

In [181]:
def new_product(x):
    for col in targets:
        # dropping when they are equal and when they dropped the product
        if x[col+'_may16'] < x[col]:
            return True
    return False

df16['new_product'] = df16.apply(new_product, axis=1)

In [182]:
df16= df16[df16['new_product'] == True]

In [183]:
df16.drop(columns=may16_targets, inplace=True)

In [184]:
df16.drop(columns='new_product', inplace=True)

In [185]:
df16 = df16.drop_duplicates(['ncodpers'], keep='last')

In [186]:
df16.shape

(673087, 48)

## Making sure that all June users are in the database
### Preparing the test data with the items that the users already have and 

In [187]:
df_test_trial = pd.merge(df_test, df16, how='left', on='ncodpers', suffixes=('','_June16'))

In [201]:
#dropping the features with _may16
todropj16 = [c for c in df_test_trial.columns if '_June16' in c and '_ult1' not in c]
df_test_trial.drop(columns=todropj16, inplace=True)

In [202]:
june16_targets = [c for c in df_test_trial.columns if '_ult1_June16' in c]

In [205]:
# There are some null values on some customers in May16. This is because in May they were not customers yet. 
#So I am going to fill those with 0
for column in df_test_trial.columns.tolist():
    df_test_trial[column].fillna(0, inplace=True)

ValueError: fill value must be in categories

In [206]:
df_test_trial.isnull().sum()

fecha_dato                    0
ncodpers                      0
ind_empleado                  0
pais_residencia               0
sexo                          0
age                           0
fecha_alta                    0
ind_nuevo                     0
antiguedad                    0
indrel                        0
ult_fec_cli_1t                0
indrel_1mes                   0
tiprel_1mes                   0
indresi                       0
indext                        0
conyuemp                      0
canal_entrada                 0
indfall                       0
tipodom                       0
cod_prov                      0
nomprov                       0
ind_actividad_cliente         0
renta                         0
segmento                      0
ind_ahor_fin_ult1        256589
ind_aval_fin_ult1        256589
ind_cco_fin_ult1         256589
ind_cder_fin_ult1        256589
ind_cno_fin_ult1         256589
ind_ctju_fin_ult1        256589
ind_ctma_fin_ult1        256589
ind_ctop

In [None]:
# making sure that all the June users are in the database

## Training the Model

In [207]:
cols = [c for c in df16.columns.tolist() if c not in ['fecha_dato', 'fecha_alta']]

In [208]:
from collections import defaultdict

In [209]:
id_preds = defaultdict(list)

In [210]:
id_preds

defaultdict(list, {})

In [213]:
predictions = {}
models = {}
ids = df16['ncodpers'].values
encoder = TargetEncoder()

for target in targets:
    used_cols = [c for c in cols if c not in target]
    X = df16[used_cols].values
    y = df16[target]
    
    X = encoder.fit_transform(X,y)
    
    model = imb_xgb(special_objective='focal', focal_gamma=2.0)
    model.fit(X.values, y.values)
    models[target] = model
    
    #predict the probability that the customer is going to buy the product
    y_pred = model.predict_sigmoid(X.values)
    predictions[target] = y_pred
    
    for id, p in zip(ids, y_pred):
        id_preds[id].append(y_pred)
        
    print(target)
    print(roc_auc_score(y, y_pred))
    
    

KeyboardInterrupt: 

In [None]:
## Submission

In [None]:
# check if customer already have each product or not. 
already_active = {}
for row in df16.values:
    row = list(row)
    id = row.pop(0)
    active = [c[0] for c in zip(df16.columns[1:], row) if c[1] > 0]
    already_active[id] = active

# add 7 products(that user don't have yet), higher probability first -> train_pred   
train_preds = {}
for id, p in id_preds.items():
    # Here be dragons
    preds = [i[0] for i in sorted([i for i in zip(df16.columns[1:], p) if i[0] not in already_active[id]],
                                  key=lambda i:i [1], 
                                  reverse=True)[:7]]
    train_preds[id] = preds
    
test_preds = []
for row in sample.values:
    id = row[0]
    p = train_preds[id]
    test_preds.append(' '.join(p))

In [None]:
sample = pd.read_csv('df_sample.csv')

In [None]:
sample['added_products'] = test_preds
sample.to_csv('kaggle_test.csv', index=False)

In [None]:
df_test = DfLowMemoryTest('test_ver2.csv')

In [None]:
df_test.groupby('nomprov')['renta'].mean()

In [None]:
import

In [None]:
df_test = pd.re

In [None]:
# since we have more users than items it is easier to estimate item-item similarity

In [None]:
#need to save the cleaned dataframe

In [None]:
for row in csv.DictReader(in_file_name):
		# use only the four months as specified by breakfastpirate #
		if row['fecha_dato'] not in ['2015-05-28', '2015-06-28', '2016-05-28', '2016-06-28']:

In [None]:
for i, row in customer_id.iterrows():
    cust = row['ncodpers']
    used_products = set(target[np.array(row[1:])==1])
    customer_dict[cust] = used_products

In [None]:
# need to have the customer in the test set
# drop customers who mantained the product (this is only during modeling.)

In [None]:
targets = ['ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1', 'ind_cder_fin_ult1', 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1', 'ind_ctma_fin_ult1',
 'ind_ctop_fin_ult1', 'ind_ctpp_fin_ult1', 'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1', 'ind_ecue_fin_ult1', 'ind_fond_fin_ult1',
 'ind_hip_fin_ult1', 'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1', 'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 'ind_viv_fin_ult1',
 'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1']

In [None]:
used_cols = [c for c in trial.columns.tolist() if c not in [target, 'fecha_dato', 'fecha_alta']]

In [None]:
predictions = []
models = []
encoder = TargetEncoder()

for t in target:
    X = trial[used_cols].values
    y = trial[t]
    
    X = encoder.fit_transform(X,y)
    
    train_size = int(len(X) * 0.7)
    y_train = int(len(y) * 0.7)
    X_train, X_test = X[0:train_size], X[train_size:len(X)]
    y_train, y_test = y[0:train_size], y[train_size:len(X)]
    
    model = imb_xgb(special_objective='weighted', imbalance_alpha=2)
    model.fit(X_train.values, y_train.values)
    models.append(model)
    
    y_pred = model.predict(X_test.values)
    predictions.append(y_pred)