In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import gc; gc.enable()

from utilities import DfLowMemory
from utilities import CleanData

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from category_encoders import WOEEncoder, TargetEncoder
from xgboost import XGBClassifier
from imxgboost.imbalance_xgb import imbalance_xgboost as imb_xgb

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
from sklearn import metrics

import warnings
warnings.filterwarnings('ignore')

In [2]:
df_train = DfLowMemory('train_ver2.csv')

In [3]:
df_train = CleanData(df_train)

From BreakfastPirate
I’m only training on about 37,000 accounts – only the accounts that added a new product in June 2015. I found that the distribution of products added varied a lot by month, and June seemed to be an unusual month. Since we are predicting June 2016, I trained on only June 2015.

I only used accounts that added a new product in June 2015. We are not trying to determine who will add a new product, we are only trying to predict which products they would add if they did. So I excluded all accounts that didn’t add a product.

In [4]:
# Selecting only May2015, June2015, May2016 

df_train_may15 = df_train[df_train['fecha_dato'] == '2015-05-28']

In [5]:
df_train_June15 = df_train[df_train['fecha_dato'] == '2015-06-28']

In [None]:
df_train_may16 = df_train[df_train['fecha_dato'] == '2016-05-28']

In [6]:
df15 = pd.merge(df_train_June15, df_train_may15, how='left', on='ncodpers', suffixes=('','_may'))

In [7]:
#dropping the features with _prev
todrop = [c for c in df15.columns if '_may' in c and '_ult1' not in c]

In [8]:
df15.drop(columns=todrop, inplace=True)

In [9]:
may_targets = [c for c in df15.columns if '_ult1_may' in c]

In [10]:
targets = [c for c in df15.columns if '_ult' in c and '_ult1_may' not in c]

In [None]:
# Now, using the idea proposed by BreakFastPirate, I am going to drop any rows that did not add a new product in June2015

In [11]:
#df15.isnull().sum()
# There are some null values on some customers in May. This is because in May they were not customers yet. So I am going to fill those with 0

may_customers = [col for col in df15.columns.tolist() if col in may_targets]
june_customers = [col for col in df15.columns.tolist() if col in targets]

In [12]:
# Now, using the idea proposed by BreakFastPirate, I am going to keep only the accounts that added a new product in June 2015
# it makes sense to me that most customers have chosen to keep the services that they had without acquiring new ones. It went from 630248 to 33312
def new_product(x):
    for col in june_customers:
        # dropping when they are equal and when they dropped the product
        if x[col+'_may'] < x[col]:
            return True
    return False

df15['new_product'] = df15.apply(new_product, axis=1)

In [13]:
df15= df15[df15['new_product'] == True]

In [14]:
df15.shape

(33312, 73)

## Getting only the users that were present on the test set

In [None]:
df_test = DfLowMemoryTest('test_ver2.csv')

In [None]:
df_test.groupby('nomprov')['renta'].mean()

In [None]:
import

In [None]:
df_test = pd.re

In [None]:
#need to save the cleaned dataframe

In [None]:
for row in csv.DictReader(in_file_name):
		# use only the four months as specified by breakfastpirate #
		if row['fecha_dato'] not in ['2015-05-28', '2015-06-28', '2016-05-28', '2016-06-28']:

In [None]:
for i, row in customer_id.iterrows():
    cust = row['ncodpers']
    used_products = set(target[np.array(row[1:])==1])
    customer_dict[cust] = used_products

In [None]:
# need to have the customer in the test set
# drop customers who mantained the product (this is only during modeling.)

In [None]:
targets = ['ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1', 'ind_cder_fin_ult1', 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1', 'ind_ctma_fin_ult1',
 'ind_ctop_fin_ult1', 'ind_ctpp_fin_ult1', 'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1', 'ind_ecue_fin_ult1', 'ind_fond_fin_ult1',
 'ind_hip_fin_ult1', 'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1', 'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 'ind_viv_fin_ult1',
 'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1']

In [None]:
used_cols = [c for c in trial.columns.tolist() if c not in [target, 'fecha_dato', 'fecha_alta']]

In [None]:
predictions = []
models = []
encoder = TargetEncoder()

for t in target:
    X = trial[used_cols].values
    y = trial[t]
    
    X = encoder.fit_transform(X,y)
    
    train_size = int(len(X) * 0.7)
    y_train = int(len(y) * 0.7)
    X_train, X_test = X[0:train_size], X[train_size:len(X)]
    y_train, y_test = y[0:train_size], y[train_size:len(X)]
    
    model = imb_xgb(special_objective='weighted', imbalance_alpha=2)
    model.fit(X_train.values, y_train.values)
    models.append(model)
    
    y_pred = model.predict(X_test.values)
    predictions.append(y_pred)