# Problem Statement

Santander wants to find which customers will make a specific transaction in the future, irrespective of the amount of money transacted.

In [1]:
import boto3
import numpy as np
import pickle
s3_resource = boto3.resource('s3')
filename = 'ProdRecommed.npy'
dicttprodrec = s3_resource.Object('santander-starter-train','dictionaries/ProdRecommed.npy').download_file(filename)
ProdRecommed_dict= np.load("ProdRecommed.npy",allow_pickle='TRUE').item()
#ProdRecommed_dict[0]

In [2]:
import boto3
import numpy as np
import pickle
s3_resource = boto3.resource('s3')
filename = 'Channel.npy'
dictchannel = s3_resource.Object('santander-starter-train','dictionaries/Channel.npy').download_file(filename)
Channel_dict= np.load("Channel.npy",allow_pickle='TRUE').item()
#Channel_dict[0]

In [3]:
filename = 'Residence_Code.npy'
dicttprodrec = s3_resource.Object('santander-starter-train','dictionaries/Residence_Code.npy').download_file(filename)
Residence_Code= np.load('Residence_Code.npy',allow_pickle='TRUE').item()
#Residence_Code[0]

In [4]:
import gc
import os
import time
import math
import subprocess
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import cross_val_score, train_test_split, KFold, StratifiedKFold

from sklearn.metrics import roc_auc_score
#from bayes_opt import BayesianOptimization
from sklearn.model_selection import StratifiedKFold

# Importing all models

# Classification
from sklearn.linear_model import LogisticRegression, ElasticNet, Lasso, Ridge
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, \
    RandomForestClassifier, VotingClassifier
from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor, GradientBoostingRegressor, \
    RandomForestRegressor, VotingRegressor


#import lightgbm as lgb
#import xgboost as xgb
#import catboost as cat
#from catboost import Pool, CatBoostClassifier

import warnings
#print(os.listdir("../input"))
warnings.simplefilter('ignore')

In [5]:
total_start_time = time.time()

# Importing Data and Reducing Memory

In [6]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

def import_data(file):
    """create a dataframe and optimize its memory usage"""
    df = pd.read_csv(file, parse_dates=True, keep_date_col=True)
    df = reduce_mem_usage(df)
    return df

In [7]:
#train = import_data("train_ver2.csv")
test = import_data("s3://santander-starter-train/testingdatasets/test_ver2.csv")
#sub = import_data("sample_submission.csv")

#print("\n\nTrain Size : \t{}\nTest Size : \t{}".format(train.shape, test.shape))


Memory usage of dataframe is 170.22 MB
Memory usage after optimization is: 58.01 MB
Decreased by 65.9%


In [8]:
traintesting = pd.DataFrame(test.values,columns=['date', 'customer_code', 'emp_status', 'residence_code', 'sex', 'age', 'date_first_prod', 'new_customer', 'seniority_inmonths',
'prime_customer', 'prime_lastdate', 'customer_type', 'customer_relation', 'is_residence', 'is_foreigner', 'spouse', 'channel',
'deceased', 'primary_address', 'province_code', 'province_name', 'status_ai', 'gincome', 'social_segment'])

In [9]:
traintesting.drop(columns=['province_name'],axis=1,inplace=True) #Province Code shall be used instead

In [10]:
## verify which all columns have Null / NaN values
traintesting.isnull().sum()

date                       0
customer_code              0
emp_status                 0
residence_code             0
sex                        5
age                        0
date_first_prod            0
new_customer               0
seniority_inmonths         0
prime_customer             0
prime_lastdate        927932
customer_type             23
customer_relation         23
is_residence               0
is_foreigner               0
spouse                929511
channel                 2081
deceased                   0
primary_address            0
province_code           3996
status_ai                  0
gincome                    0
social_segment          2248
dtype: int64

In [11]:
traintesting['date'] = traintesting['date'].replace('-','',regex=True).str.strip(' ,')

In [12]:
traintesting['date_first_prod'] = traintesting['date_first_prod'].replace('-','',regex=True).str.strip(' ,')
traintesting['prime_lastdate'] = traintesting['prime_lastdate'].replace('-','',regex=True).str.strip(' ,')



In [13]:
traintesting['date_first_prod']

0         19950116
1         20130828
2         20130828
3         20130828
4         20130828
            ...   
929610    19990421
929611    20061129
929612    20061129
929613    20061129
929614    20061129
Name: date_first_prod, Length: 929615, dtype: object

In [14]:
## Load Dictionaries
#Channel_dict = np.load('Channel.npy',allow_pickle='TRUE').item()
#Residence_Code= np.load('Residence_Code.npy',allow_pickle='TRUE').item()
#ProdRecommed_dict= np.load('ProdRecommed.npy',allow_pickle='TRUE').item()
#Residence_Code[0]

In [14]:
##Create a new dictionary for ProductRecommendation Column and map it: Prince G
#daf1 = pd.DataFrame(traintesting['residence_code'].unique())
#di1= daf1.to_dict()
#di1[0].items()
new_dict_res_code = dict([(value, key) for key, value in Residence_Code[0].items()]) 
#new_dict_res_code
traintesting['residence_code'] = traintesting['residence_code'].map(new_dict_res_code)
traintesting['residence_code']
#new_dict_res_code
#traintesting['residence_code'].unique()

0         0
1         0
2         0
3         0
4         0
         ..
929610    0
929611    0
929612    0
929613    0
929614    0
Name: residence_code, Length: 929615, dtype: int64

In [15]:
##Create a new dictionary for ProductRecommendation Column and map it: Prince G
#daf2 = pd.DataFrame(traintesting['channel'].unique())
#di2= daf2.to_dict()
#di1[0].items()
new_dict_channel_code = dict([(value, key) for key, value in Channel_dict[0].items()]) 
#new_dict_res_code
traintesting['channel'] = traintesting['channel'].map(new_dict_channel_code)
traintesting['channel']
#new_dict_res_code
#traintesting['residence_code'].unique()

0          5
1          5
2          1
3          1
4          1
          ..
929610     5
929611     4
929612    67
929613     4
929614     5
Name: channel, Length: 929615, dtype: int64

In [16]:
##Mapping the columns to be used for algorithm to process
traintesting['customer_type'] = traintesting['customer_type'].map({'1.0':1, '1':1, '2.0':2, '2':2, '3.0':3, '3':3, '4.0':4, '4':4, 'P':'5'})
traintesting['emp_status'] = traintesting['emp_status'].map({'N':1, 'A':2, 'B':3, 'F':4, 'S':5})
traintesting['sex'] = traintesting['sex'].map({'V':'1', 'H':'2'})
traintesting['deceased'] = traintesting['deceased'].map({'N':'0', 'S':'1'})
traintesting['is_foreigner'] = traintesting['is_foreigner'].map({'N':'1', 'S':'2'})
traintesting['is_residence'] = traintesting['is_residence'].map({'N':'1', 'S':'2'})
traintesting['spouse'] = traintesting['spouse'].map({'N':'1', 'S':'2'})
traintesting['social_segment'] = traintesting['social_segment'].map({'01 - TOP':'1', '02 - PARTICULARE':'2', '03 - UNIVERSITARIO':'3'})
traintesting['customer_relation'] = traintesting['social_segment'].map({'A':'1', 'I':'2', 'P':'3', 'R':'4', 'N':'5'})
#traintesting.replace(' NA',"0",inplace=True)

In [17]:
traintesting.replace(' NA',"NaN",inplace=True)
traintesting.replace('     NA',"NaN",inplace=True)
#traintesting.replace('',"0",inplace=True)

In [18]:
#traintesting.dropna(axis=1)
traintesting['spouse'].replace('NaN','2',inplace=True)
traintesting['spouse'].unique()
traintesting['spouse'].fillna('2',inplace=True)


In [19]:
traintesting['social_segment'].fillna('3',inplace=True)



In [20]:

#traintesting['age'].fillna(traintesting['age'].mean(),inplace=True)#Gross Income is considered for Mean of Age
traintesting['emp_status'].fillna('1',inplace=True)
traintesting['residence_code'].fillna('1',inplace=True)
traintesting['deceased'].fillna('0',inplace=True)
traintesting['new_customer'].fillna('0.0',inplace=True)
traintesting['status_ai'].fillna('0.0',inplace=True)
traintesting['channel'].fillna('1',inplace=True)
traintesting['prime_customer'].fillna('1.0',inplace=True)
traintesting['customer_type'].fillna('1',inplace=True)
traintesting['sex'].fillna('1',inplace=True)
traintesting['is_residence'].fillna('1',inplace=True) #Replace NaN with value 1 means he is not a resident
traintesting['is_foreigner'].fillna('1',inplace=True) #Replace NaN with value 1 means he is not a foreigner
traintesting['customer_relation'].fillna('0',inplace=True)
traintesting['primary_address'].fillna('0',inplace=True)#Replace NaN with 0 as customer has not given Primary address
traintesting['province_code'].fillna('0.0',inplace=True)#Replace NaN with 0.0 , the value not existing in Province Code list
#traintesting['province_name'].fillna('NotPROVIDED',inplace=True)#Replace NaN with NotPROVIDED , the value not existing in Province Name list
traintesting['date_first_prod'].fillna('0',inplace=True)#Replace NaN with 0 , to identfy that date not existing, in such cases the product offering may differ
traintesting['prime_lastdate'].fillna('0',inplace=True)#Replace NaN with 0 , to identfy that date not existing, in such cases the product offering may differ


In [21]:
#traintesting['gincome'].fillna(traintesting['gincome'].mean(),inplace=True)#Gross Income is considered for Mean of incomes. This may be wrong assumption
#traintesting['gincome'].astype('float64')
traintesting['gincome'].replace('         NA','134254',inplace=True)
traintesting['gincome'].fillna('134254.3',inplace=True)
traintesting['gincome']

0           326124.90
1              134254
2              134254
3           148402.98
4           106885.80
             ...     
929610      128643.57
929611         134254
929612       72765.27
929613      147488.88
929614         134254
Name: gincome, Length: 929615, dtype: object

In [22]:
traintesting['age'].value_counts()

24     50706
23     49604
22     47674
21     46322
25     41429
       ...  
114        6
127        1
117        1
118        1
164        1
Name: age, Length: 118, dtype: int64

In [23]:
traintesting['age'].fillna('25',inplace=True)#May be a wromg assumption from Business prespective
traintesting['age'].replace('NaN','25',inplace=True)#May be a wromg assumption from Business prespective

In [24]:
traintesting['seniority_inmonths'].fillna('100',inplace=True)#May be a wromg assumption from Business prespective
traintesting['seniority_inmonths'].replace('NaN','100',inplace=True)#May be a wromg assumption from Business prespective
traintesting['seniority_inmonths'].unique()

array([    256,      34,      11,       5,      33,       4,      30,
            32,      15,      18,      25,      23,      16,      26,
            31,      13,      27,      29,      20,       8,       7,
            24,      14,       3,       1,      10,      28,      22,
            21,       2,      19,      17,      12,       9,       6,
            37,      35,      36,      38,     150,      41,      40,
            39,       0,      47,      48,      45,      42,      44,
            46,      49,      43,      51,      50,      52,      55,
            54,      53,      56,     186,     168,     128,     120,
           220,      75,     172,     103,     159,     145,     135,
           160,      93,     156,     157,     170,     148,     175,
           176,     167,     104,     123,     133,     147,     112,
            87,      99,     129,     171,     134,     144,     169,
            72,     149,     106,     166,     113,     184,     139,
           187,     

In [25]:
traintesting['sex'].unique()

array(['1', '2'], dtype=object)

In [26]:
traintesting['age'].unique()

array([ 56,  36,  22,  51,  41,  33,  23,  43,  63,  62,  32,  58,  71,
        31,  30,  59,  45,  37,  39,  38,  46,  34,  35,  42,  29,  88,
        64,  48,  72,  47,  27,  24,  50,  49,  57,  67,  25,  28,  13,
        40,  53,  54,  26,  11,  21,  60,  44,  55,   7,  52,  66,  90,
        73,  78,  79,  61,  69,  12,   9,  65,  77,  83,  81,  10,   5,
        18,  17,  87,  70,  80,  75,  68,  74,  16,   4,  82,  91,  76,
        19,  15,   6,  97,  89,  85,  86,  14,  20,   8,  84,  95,  93,
       100,  96,  92,  98,  94, 105, 102, 101, 104, 103,  99,   3,   2,
       116, 106, 107, 109, 110, 117, 108, 113, 111, 112, 114, 164, 118,
       127])

In [27]:
traintesting['social_segment'].value_counts()

3    893654
1     35961
Name: social_segment, dtype: int64

In [28]:
traintesting['gincome'].value_counts()

134254         227965
  451931.22       354
  463625.16       111
  128318.52        91
  181042.20        91
                ...  
  253983.03         1
   65224.68         1
   68072.52         1
  131335.38         1
   76026.81         1
Name: gincome, Length: 516403, dtype: int64

In [29]:
traintesting['gincome'] = traintesting['gincome'].astype(np.float64)
traintesting['gincome'].unique()

array([326124.9 , 134254.  , 148402.98, ..., 139164.12, 100647.45,
        72765.27])

In [30]:
## verify which all columns have Null / NaN values
traintesting.isnull().sum()

date                  0
customer_code         0
emp_status            0
residence_code        0
sex                   0
age                   0
date_first_prod       0
new_customer          0
seniority_inmonths    0
prime_customer        0
prime_lastdate        0
customer_type         0
customer_relation     0
is_residence          0
is_foreigner          0
spouse                0
channel               0
deceased              0
primary_address       0
province_code         0
status_ai             0
gincome               0
social_segment        0
dtype: int64

In [31]:
#traintesting = traintesting.astype('float64')

In [32]:
traintesting.to_csv(r's3://santander-starter-train/PreProcessedtrainingdata/testingdatapreprocessed/Data_testing_PreProcessed.csv',index = False)

## Preprocessed data is loaded sucessfully