# Problem Statement

Santander wants to find which customers will make a specific transaction in the future, irrespective of the amount of money transacted.

#### This kernel consists of:

* Loading the preprocessed testing dataset
* Run the predictions using the saved models:
     * Decision Tree Regressor


In [13]:
import gc
import os
import time
import math
import subprocess
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import cross_val_score, train_test_split, KFold, StratifiedKFold

from sklearn.metrics import roc_auc_score
#from bayes_opt import BayesianOptimization
from sklearn.model_selection import StratifiedKFold

# Importing all models

# Classification
from sklearn.linear_model import LogisticRegression, ElasticNet, Lasso, Ridge
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, \
    RandomForestClassifier, VotingClassifier
from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor, GradientBoostingRegressor, \
    RandomForestRegressor, VotingRegressor


#import lightgbm as lgb
#import xgboost as xgb
#import catboost as cat
#from catboost import Pool, CatBoostClassifier

import warnings
#print(os.listdir("../input"))
warnings.simplefilter('ignore')

In [14]:
from datetime import datetime

# Capture the moment this run started. Its POSIX timestamp, rendered as text,
# is what the saving cell appends to the output file name.
now = datetime.now()
timestamp = str(now.timestamp())
#print("timestamp =", timestamp)

In [15]:
# Wall-clock start of the run; the final cell subtracts it from time.time()
# to print the total elapsed seconds.
total_start_time = time.time()
#total_start_time

# Importing Data and Reducing Memory

In [16]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce its memory footprint.

    For every column:

    * ``object`` columns are converted to ``category``.
    * Signed-integer, unsigned-integer and float columns are cast to the
      narrowest NumPy dtype of the same family whose range contains the
      column's minimum and maximum (bounds inclusive).
    * All other dtypes (bool, datetime, category, pandas extension dtypes)
      are left untouched.

    Float downcasting is decided on value range only, so values may be
    rounded when a narrower float type is chosen.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate dtypes per NumPy kind code, narrowest first, together with the
    # function that reports each candidate's representable range.
    candidates_by_kind = {
        'i': ((np.int8, np.int16, np.int32, np.int64), np.iinfo),
        'u': ((np.uint8, np.uint16, np.uint32, np.uint64), np.iinfo),
        'f': ((np.float16, np.float32, np.float64), np.finfo),
    }

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Extension dtypes (category, nullable ints, ...) and non-numeric NumPy
        # kinds (bool, datetime, ...) have no meaningful numeric downcast.
        if not isinstance(col_type, np.dtype) or col_type.kind not in candidates_by_kind:
            continue

        candidates, limits_of = candidates_by_kind[col_type.kind]
        c_min = df[col].min()
        c_max = df[col].max()

        # Inclusive bounds: a column spanning exactly -128..127 fits in int8.
        # If nothing matches (empty or all-NaN column, infinities) the column
        # keeps its current dtype.
        for candidate in candidates:
            limits = limits_of(candidate)
            if c_min >= limits.min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero-byte frame so the percentage never divides by zero.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

def import_data(file):
    """Read a CSV file into a DataFrame and downcast its columns.

    Parameters
    ----------
    file : str or path-like or file-like
        Anything ``pandas.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        The parsed frame after it has been passed through ``reduce_mem_usage``.
    """
    # ``keep_date_col`` is no longer passed: it only matters when several
    # columns are combined into one date, which this call never requests, and
    # pandas deprecated the keyword in 2.2.
    df = pd.read_csv(file, parse_dates=True)
    df = reduce_mem_usage(df)
    return df

In [17]:
#train = import_data("train_ver2.csv")
# Load the preprocessed testing CSV from S3; import_data also downcasts the
# column dtypes and prints the memory usage before and after.
test = import_data("s3://santander-starter-train/PreProcessedtrainingdata/testingdatapreprocessed/Data_testing_PreProcessed.csv")
#sub = import_data("sample_submission.csv")

# Report the (rows, columns) shape of the loaded frame.
print("\n\nTest Size : \t{}".format(test.shape))


Memory usage of dataframe is 163.13 MB
Memory usage after optimization is: 39.01 MB
Decreased by 76.1%


Test Size : 	(929615, 23)


In [18]:
# Preview the first rows (up to 50) of the loaded test frame.
test.head(50)
#test

Unnamed: 0,date,customer_code,emp_status,residence_code,sex,age,date_first_prod,new_customer,seniority_inmonths,prime_customer,...,is_residence,is_foreigner,spouse,channel,deceased,primary_address,province_code,status_ai,gincome,social_segment
0,20160628,15889,4,0,1,56,19950116,0,256,1,...,2,1,1,5,0,1,28.0,1,326124.90625,1
1,20160628,1170544,1,0,2,36,20130828,0,34,1,...,2,1,2,5,0,1,3.0,0,134254.0,3
2,20160628,1170545,1,0,1,22,20130828,0,34,1,...,2,1,2,1,0,1,15.0,1,134254.0,3
3,20160628,1170547,1,0,2,22,20130828,0,34,1,...,2,1,2,1,0,1,8.0,0,148402.984375,3
4,20160628,1170548,1,0,2,22,20130828,0,34,1,...,2,1,2,1,0,1,7.0,0,106885.796875,3
5,20160628,1170550,1,0,1,22,20130828,0,34,1,...,2,1,2,1,0,1,8.0,0,134254.0,3
6,20160628,1170552,1,0,2,51,20130828,0,34,1,...,2,1,2,4,0,1,35.0,1,96395.882812,3
7,20160628,1170553,1,0,2,22,20130828,0,34,1,...,2,1,2,1,0,1,45.0,0,134254.0,3
8,20160628,1170555,1,0,1,22,20130828,0,34,1,...,2,1,2,1,0,1,28.0,0,134254.0,3
9,20160628,1170557,1,0,2,22,20130828,0,34,1,...,2,1,2,1,0,1,15.0,1,68322.71875,3


In [19]:
test.columns  # PrinceG: the test data holds only the columns that are used as features

Index(['date', 'customer_code', 'emp_status', 'residence_code', 'sex', 'age',
       'date_first_prod', 'new_customer', 'seniority_inmonths',
       'prime_customer', 'prime_lastdate', 'customer_type',
       'customer_relation', 'is_residence', 'is_foreigner', 'spouse',
       'channel', 'deceased', 'primary_address', 'province_code', 'status_ai',
       'gincome', 'social_segment'],
      dtype='object')

In [20]:
# Rebuild the feature frame from the raw values of `test`, naming the columns
# explicitly. Going through a single NumPy array gives every column one common
# dtype, which is why the values display as floats further down.
traintesting = pd.DataFrame(
    data=test.to_numpy(),
    columns=[
        'date', 'customer_code', 'emp_status', 'residence_code', 'sex', 'age',
        'date_first_prod', 'new_customer', 'seniority_inmonths',
        'prime_customer', 'prime_lastdate', 'customer_type',
        'customer_relation', 'is_residence', 'is_foreigner', 'spouse',
        'channel', 'deceased', 'primary_address', 'province_code', 'status_ai',
        'gincome', 'social_segment',
    ],
)

In [21]:
## Count the Null / NaN entries in each column
traintesting.isna().sum()

date                  0
customer_code         0
emp_status            0
residence_code        0
sex                   0
age                   0
date_first_prod       0
new_customer          0
seniority_inmonths    0
prime_customer        0
prime_lastdate        0
customer_type         0
customer_relation     0
is_residence          0
is_foreigner          0
spouse                0
channel               0
deceased              0
primary_address       0
province_code         0
status_ai             0
gincome               0
social_segment        0
dtype: int64

In [22]:
import boto3
import numpy as np
import pickle

# Download the product-recommendation dictionary from S3 to a local file and
# load it back as a Python object.
s3_resource = boto3.resource('s3')
filename = 'ProdRecommed.npy'
# The download's return value stays bound to its original name; nothing in the
# cells shown here reads it afterwards.
dicttprodrec = s3_resource.Object('santander-starter-train','dictionaries/ProdRecommed.npy').download_file(filename)
# NOTE(security): allow_pickle=True unpickles the file's contents, which can run
# arbitrary code. Only load files from a trusted bucket.
# allow_pickle takes a real boolean; the previous 'TRUE' string only worked
# because any non-empty string is truthy.
ProdRecommed_dict = np.load(filename, allow_pickle=True).item()
#ProdRecommed_dict[0]

In [23]:
import boto3
import numpy as np
import pickle

# Download the channel dictionary from S3 to a local file and load it back as
# a Python object.
s3_resource = boto3.resource('s3')
filename = 'Channel.npy'
# The download's return value stays bound to its original name; nothing in the
# cells shown here reads it afterwards.
dictchannel = s3_resource.Object('santander-starter-train','dictionaries/Channel.npy').download_file(filename)
# NOTE(security): allow_pickle=True unpickles the file's contents, which can run
# arbitrary code. Only load files from a trusted bucket.
# allow_pickle takes a real boolean; the previous 'TRUE' string only worked
# because any non-empty string is truthy.
Channel_dict = np.load(filename, allow_pickle=True).item()
#Channel_dict[0]

In [24]:
# Download the residence-code dictionary from S3 to a local file and load it
# back as a Python object. Relies on `s3_resource` created in an earlier cell.
filename = 'Residence_Code.npy'
# The download's return value stays bound to its original name; nothing in the
# cells shown here reads it afterwards.
dicttprodrec = s3_resource.Object('santander-starter-train','dictionaries/Residence_Code.npy').download_file(filename)
# NOTE(security): allow_pickle=True unpickles the file's contents, which can run
# arbitrary code. Only load files from a trusted bucket.
# allow_pickle takes a real boolean; the previous 'TRUE' string only worked
# because any non-empty string is truthy.
Residence_Code = np.load(filename, allow_pickle=True).item()
#Residence_Code[0]

## Predictions using the Decision Tree Regressor Algorithm Model

In [25]:
## Download the Model and run Prediction
import boto3
import pickle

s3_resource = boto3.resource('s3')
filename = 'dtr.sav'
# Copy the saved model from S3 into the working directory.
objectfile = s3_resource.Object('santander-starter-train','modelsagemaker/dtr.sav').download_file(filename)
# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only unpickle model artefacts from a trusted bucket.
# The context manager closes the file handle once the model is loaded; the
# previous bare open() left it open.
with open(filename, 'rb') as model_file:
    dtrmodel = pickle.load(model_file)
# Predict for every row of the prepared feature frame.
y_predict = dtrmodel.predict(traintesting)
#print(y_predict)

In [26]:

# The commented-out lines below sketch how a model could be pickled to
# 'dtr.sav' and reloaded later; they are kept for reference and do not run.
#import pickle
#filename = 'dtr.sav'
#pickle.dump(reg, open(filename, 'wb'))
# some time later...
 
# load the model from disk
#loaded_model = pickle.load(open(filename, 'rb'))
#result = loaded_model.score(x_test,y_test)
#y_predict= loaded_model.predict(traintesting)
# Print the prediction array produced by the previous cell.
print(y_predict)

[694.   0.   0. ... 492.   0. 436.]


In [27]:

# Attach the raw model output as a new column.
traintesting['ProdRecommedPredict'] = y_predict
# Translate each predicted code into its product description through the
# lookup dictionary. Series.map yields NaN for any prediction that has no
# matching key, so unmapped rows end up blank rather than raising.
# (The earlier `.astype('str')` call was removed: its result was discarded, so
# it never affected the column.)
traintesting['ProdRecommedPredict'] = traintesting['ProdRecommedPredict'].map(ProdRecommed_dict[0])

In [28]:
# Display the feature frame together with the new ProdRecommedPredict column.
traintesting

Unnamed: 0,date,customer_code,emp_status,residence_code,sex,age,date_first_prod,new_customer,seniority_inmonths,prime_customer,...,is_foreigner,spouse,channel,deceased,primary_address,province_code,status_ai,gincome,social_segment,ProdRecommedPredict
0,20160628.0,15889.0,4.0,0.0,1.0,56.0,19950116.0,0.0,256.0,1.0,...,1.0,1.0,5.0,0.0,1.0,28.0,1.0,326124.906250,1.0,"Current Accounts, particular Plus Account, Cre..."
1,20160628.0,1170544.0,1.0,0.0,2.0,36.0,20130828.0,0.0,34.0,1.0,...,1.0,2.0,5.0,0.0,1.0,3.0,0.0,134254.000000,3.0,Current Accounts
2,20160628.0,1170545.0,1.0,0.0,1.0,22.0,20130828.0,0.0,34.0,1.0,...,1.0,2.0,1.0,0.0,1.0,15.0,1.0,134254.000000,3.0,Current Accounts
3,20160628.0,1170547.0,1.0,0.0,2.0,22.0,20130828.0,0.0,34.0,1.0,...,1.0,2.0,1.0,0.0,1.0,8.0,0.0,148402.984375,3.0,Current Accounts
4,20160628.0,1170548.0,1.0,0.0,2.0,22.0,20130828.0,0.0,34.0,1.0,...,1.0,2.0,1.0,0.0,1.0,7.0,0.0,106885.796875,3.0,Current Accounts
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
929610,20160628.0,660237.0,1.0,0.0,1.0,55.0,19990421.0,0.0,206.0,1.0,...,1.0,2.0,5.0,0.0,1.0,28.0,1.0,128643.570312,1.0,"Current Accounts, Payroll Account, e-account, ..."
929611,20160628.0,660238.0,1.0,0.0,1.0,30.0,20061129.0,0.0,115.0,1.0,...,1.0,2.0,4.0,0.0,1.0,26.0,0.0,134254.000000,3.0,
929612,20160628.0,660240.0,1.0,0.0,1.0,52.0,20061129.0,0.0,115.0,1.0,...,1.0,2.0,67.0,0.0,1.0,33.0,1.0,72765.273438,3.0,"Current Accounts, particular Account"
929613,20160628.0,660243.0,1.0,0.0,1.0,32.0,20061129.0,0.0,115.0,1.0,...,1.0,2.0,4.0,0.0,1.0,33.0,0.0,147488.875000,3.0,Current Accounts


## Saving the file with ProductRecommendations

In [30]:
# Build a per-run S3 destination by inserting the start timestamp into the file
# name, write the frame there without its index, then report total run time.
name = ''.join([
    's3://santander-starter-train/RecommendedProductTestingDatSet/testdata_Results_ProdRecommnd_dtr',
    timestamp,
    '.csv',
])
traintesting.to_csv(name, index=False)
print('Done. Time elapsed: {:.2f}s'.format(time.time() - total_start_time))

Done. Time elapsed: 255.00s
