# Problem Statement

- Predict the value of transaction for potential customers
- This is a <b>Regression</b> problem
- URL of the problem: https://www.kaggle.com/c/santander-value-prediction-challenge

### Importing Libraries

In [1]:
# Linear algebra
import numpy as np  

# For EDA and cleaning the data
import pandas as pd

# For visualizations
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# For building a model
from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics
from sklearn.cross_validation import train_test_split
from sklearn.feature_selection import SelectKBest, chi2

import warnings
warnings.filterwarnings('ignore')



### Loading the data

In [2]:
santander_df = pd.read_csv('train.csv')

<b>There are no null values in the dataset</b>

### Miscellaneous

In [3]:
santander_df.head()

Unnamed: 0,ID,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000d6aaf2,38000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
1,000fbd867,600000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
2,0027d6b71,10000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
3,0028cbf45,2000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4,002a68644,14400000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0


<b>There are 12 important features in this data</b><br>
<b>We'll use SelectKBest to select those 12 important features</b>

### Splitting the data

In [4]:
X = santander_df.drop(['ID', 'target'], axis=1)
y = santander_df.target

In [5]:
chi_select = SelectKBest(score_func=chi2, k=12)

<b>Before applying the fit method, all the labels in the target variable must be of the same dtype</b>

In [6]:
y = np.array(y).astype('int')

In [7]:
y.dtype

dtype('int64')

In [8]:
chi_select.fit(X, y)

SelectKBest(k=12, score_func=<function chi2 at 0x7fcd08f438c8>)

In [9]:
chi_support = chi_select.get_support() # Contains the values either True or False, True means the feature has been
                                       # selected

In [10]:
chi_features = X.loc[:, chi_support].columns.tolist() # Storing the selected features

### 12 Important Features

In [11]:
chi_features

['f1c272f04',
 '2b85882ad',
 'c059f2574',
 '07cb6041d',
 'ede70bfea',
 '3be4dad48',
 '4c835bd02',
 'df6a71cc7',
 '3d23e8abd',
 'ff3b49c1d',
 '4ceef6dbd',
 '9fa984817']

In [12]:
X = santander_df[chi_features]  # Limiting our X to only 12 selected features

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=6)

### Building the model

In [14]:
reg_tree = DecisionTreeRegressor(random_state=2) # No HyperParameters

In [15]:
reg_tree.fit(X_train, y_train)  # Training the model

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=2, splitter='best')

In [16]:
predictions = reg_tree.predict(X_test) # Predicting on the unseen data

<b>Making the dtype of predictions as int</b>

In [17]:
predictions = np.array(predictions).astype('int')

In [18]:
mse = metrics.mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 96030186281232.66


### Submission

In [19]:
test_df = pd.read_csv('test.csv')

In [20]:
test_df.head()

Unnamed: 0,ID,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000137c73,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00021489f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0004d7953,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,00056a333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,00056d8eb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
chi_features

['f1c272f04',
 '2b85882ad',
 'c059f2574',
 '07cb6041d',
 'ede70bfea',
 '3be4dad48',
 '4c835bd02',
 'df6a71cc7',
 '3d23e8abd',
 'ff3b49c1d',
 '4ceef6dbd',
 '9fa984817']

In [22]:
X = test_df[chi_features]

In [23]:
X

Unnamed: 0,f1c272f04,2b85882ad,c059f2574,07cb6041d,ede70bfea,3be4dad48,4c835bd02,df6a71cc7,3d23e8abd,ff3b49c1d,4ceef6dbd,9fa984817
0,0.000000,0.0,0.000000,0.0,0.0,834202.531199,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00
1,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00
2,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,3004000.0,0.0,0.0,0.000000e+00,0.000000e+00
3,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00
4,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00
5,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00
6,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00
7,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00
8,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00
9,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00


In [24]:
target = reg_tree.predict(X)

In [25]:
target

array([ 400000.       , 6227533.8925421, 3750000.       , ...,
        465000.       , 6227533.8925421, 6227533.8925421])

In [26]:
target = pd.Series(target)

In [27]:
target

0        4.000000e+05
1        6.227534e+06
2        3.750000e+06
3        6.227534e+06
4        6.227534e+06
5        6.227534e+06
6        6.227534e+06
7        6.227534e+06
8        6.227534e+06
9        6.227534e+06
10       6.227534e+06
11       6.227534e+06
12       6.227534e+06
13       6.227534e+06
14       6.227534e+06
15       6.227534e+06
16       6.227534e+06
17       6.227534e+06
18       6.227534e+06
19       6.227534e+06
20       6.227534e+06
21       6.227534e+06
22       6.227534e+06
23       6.227534e+06
24       6.227534e+06
25       6.227534e+06
26       6.227534e+06
27       6.227534e+06
28       6.227534e+06
29       4.800000e+05
             ...     
49312    6.227534e+06
49313    6.227534e+06
49314    6.440000e+05
49315    6.227534e+06
49316    6.227534e+06
49317    4.000000e+04
49318    6.227534e+06
49319    6.227534e+06
49320    6.227534e+06
49321    5.000000e+05
49322    6.227534e+06
49323    6.227534e+06
49324    6.227534e+06
49325    6.227534e+06
49326    7

In [28]:
ID = test_df.ID

In [29]:
ID

0        000137c73
1        00021489f
2        0004d7953
3        00056a333
4        00056d8eb
5        0005fc190
6        000787e86
7        0008510a0
8        000895faf
9        000986fba
10       0009efcc5
11       000dd8f00
12       000e1cdc2
13       00103739c
14       00134b367
15       001788d0d
16       0019c8ad7
17       001a267b9
18       001a85486
19       001c4de06
20       001f9014b
21       00216fb94
22       00235690e
23       00240739d
24       0024c3747
25       00297052d
26       002ab8f85
27       002bed5c0
28       002bf3974
29       002d23a26
           ...    
49312    ffd52b8f2
49313    ffd6dea20
49314    ffd6e2cf8
49315    ffd6e56de
49316    ffd849299
49317    ffda13e24
49318    ffdbd30e2
49319    ffdbedc99
49320    ffe462df4
49321    ffe4c3a4b
49322    ffe511bf1
49323    ffe77e420
49324    ffe868610
49325    ffe8ed3d6
49326    ffea1065e
49327    ffea9a398
49328    ffec4707c
49329    ffecfe7f8
49330    ffed9dd0f
49331    ffee0237c
49332    ffef8aa08
49333    fff

In [30]:
submit_df = pd.DataFrame()

In [31]:
submit_df['ID'] = ID
submit_df['target'] = target

In [32]:
submit_df

Unnamed: 0,ID,target
0,000137c73,4.000000e+05
1,00021489f,6.227534e+06
2,0004d7953,3.750000e+06
3,00056a333,6.227534e+06
4,00056d8eb,6.227534e+06
5,0005fc190,6.227534e+06
6,000787e86,6.227534e+06
7,0008510a0,6.227534e+06
8,000895faf,6.227534e+06
9,000986fba,6.227534e+06


In [34]:
submit_df.to_csv('submission.csv', index=False)