# Task 1. Finding the probabilities.

In [1]:
# Importing required packages
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix

### Training the model.

In [2]:
# Importing train.csv data
data = pd.read_csv("train.csv", index_col='ID')

In [3]:
data.head()

Unnamed: 0_level_0,variable_0,variable_1,variable_2,variable_3,variable_4,variable_5,variable_6,variable_7,variable_8,variable_9,variable_10,variable_11,variable_12,variable_13,variable_14,variable_15,variable_16,variable_17,State,target
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,120,2.51,6.6,0,0,9.3,16.69,186.7,4,15.87,98.2,0,146.7,85,99,96,408,2,GA,0
1,94,1.81,8.13,0,0,6.7,30.82,182.4,2,15.5,181.3,0,180.6,108,135,103,415,0,OH,0
2,82,3.0,4.93,0,0,11.1,24.46,194.9,2,16.57,143.9,0,109.6,105,61,94,510,1,DC,0
3,170,2.27,6.05,0,1,8.4,44.18,245.0,3,20.83,259.9,0,134.4,122,68,121,510,3,VA,0
4,93,3.38,3.2,19,0,12.5,23.26,179.5,3,15.26,136.8,1,71.1,105,113,95,510,2,CA,0


In [4]:
# Checking if there are missing values.
data.isna().sum()

variable_0     0
variable_1     0
variable_2     0
variable_3     0
variable_4     0
variable_5     0
variable_6     0
variable_7     0
variable_8     0
variable_9     0
variable_10    0
variable_11    0
variable_12    0
variable_13    0
variable_14    0
variable_15    0
variable_16    0
variable_17    0
State          0
target         0
dtype: int64

In [5]:
# The column named State is categorical. Counting its values.
data["State"].value_counts()

WV    100
NY     74
MN     74
WI     73
WY     73
VA     73
AL     72
OH     70
CT     68
TX     68
KS     66
OR     66
MI     66
VT     64
IN     63
NV     62
MD     62
UT     61
MS     60
ID     60
NJ     60
AZ     60
MT     60
MO     59
NC     59
WA     58
ME     58
NM     58
FL     57
DE     57
CO     56
RI     56
MA     56
OK     55
NE     55
KY     54
SC     54
SD     53
ND     53
NH     53
IL     51
TN     51
HI     49
LA     47
GA     47
DC     46
AK     45
AR     44
PA     41
IA     41
CA     32
Name: State, dtype: int64

In [6]:
# Looks like Variable_16 is catgorical too.(variable_17 looks like categorical too, but when I left it like that accuracy was higher)
data['variable_16'].value_counts()

415    1491
510     756
408     753
Name: variable_16, dtype: int64

In [7]:
# Since there are no states with low counts, we will make dummies them all.
data = pd.get_dummies(data, columns = ["State", "variable_16"])

In [8]:
# Spliting the data into train and validation sets
train, val = train_test_split(data, test_size = 0.3, random_state = 42)

In [9]:
# Spliting train and validation sets into targets(Y) and variables(X).
X_train = train.drop("target", axis = 1)
Y_train = train["target"]
X_val = val.drop("target", axis = 1)
Y_val = val["target"]

In [10]:
X_train.head()

Unnamed: 0_level_0,variable_0,variable_1,variable_2,variable_3,variable_4,variable_5,variable_6,variable_7,variable_8,variable_9,...,State_UT,State_VA,State_VT,State_WA,State_WI,State_WV,State_WY,variable_16_408,variable_16_415,variable_16_510
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
611,20,1.92,7.45,35,0,7.1,29.16,153.1,3,13.01,...,0,0,0,0,0,0,0,0,1,0
530,107,3.16,8.35,0,0,11.7,41.12,126.9,6,10.79,...,0,0,0,0,0,0,0,0,1,0
2787,128,2.92,8.61,32,0,10.8,37.89,262.0,4,22.27,...,0,0,1,0,0,0,0,0,0,1
49,131,2.94,11.44,25,0,10.9,32.76,225.9,6,19.2,...,0,0,0,0,0,0,0,0,1,0
1883,161,2.11,10.06,0,0,7.8,17.92,214.8,5,18.26,...,0,0,0,0,0,0,0,0,1,0


In [11]:
# Standardizing the data.
X_train = StandardScaler().fit(X_train).transform(X_train)
X_val = StandardScaler().fit(X_val).transform(X_val)

In [12]:
# Setting Support Vector Machine model with default hyperparameters.
SVM = SVC(random_state = 42)

In [13]:
# Training with default hyperparameters, and pronting the result.
SVM.fit(X_train, Y_train)
print("Train accuracy: " + str(accuracy_score(Y_train, SVM.predict(X_train))))
print("Validation accuracy: " + str(accuracy_score(Y_val, SVM.predict(X_val))))

Train accuracy: 0.8890476190476191
Validation accuracy: 0.86


In [14]:
# Here, with the help of confusion matrix, we can see that there are lot of false negative values.
confusion_matrix(Y_train, SVM.predict(X_train))

array([[1786,    0],
       [ 233,   81]], dtype=int64)

In [15]:
# Here is the same for the validation set
confusion_matrix(Y_val, SVM.predict(X_val))

array([[765,   4],
       [122,   9]], dtype=int64)

### Don't execute the cell bellow, it runs too long

In [317]:
# Using GridSearchCV to find best hyperparameters for SVM model.
params = {'C' : np.arange(1,51),
          'kernel' : ['linear', 'poly', 'rbf', 'sigmoid'],}

grid = GridSearchCV(SVM, params)
grid.fit(X_train, Y_train)

GridSearchCV(estimator=SVC(random_state=42),
             param_grid={'C': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50]),
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']})

In [318]:
# The best parameters are.
grid.best_params_

{'C': 3, 'kernel': 'rbf'}

In [16]:
# Setting model with our best parameters. (setting 'probability to True, for returning the probabilities')
# This is the best accuracy from SVM and Logistic Regression models.
SVM = SVC(random_state = 42, kernel = 'rbf', C = 3, probability=True)
SVM.fit(X_train, Y_train)
print("Train accuracy: " + str(accuracy_score(Y_train, SVM.predict(X_train))))
print("Validation accuracy: " + str(accuracy_score(Y_val, SVM.predict(X_val))))

Train accuracy: 0.9557142857142857
Validation accuracy: 0.8811111111111111


In [17]:
# As we can see, we have a better results now.
confusion_matrix(Y_train, SVM.predict(X_train))

array([[1783,    3],
       [  90,  224]], dtype=int64)

In [18]:
confusion_matrix(Y_val, SVM.predict(X_val))

array([[745,  24],
       [ 83,  48]], dtype=int64)

### Returning the probabilites.

In [19]:
# Importing test data.
test = pd.read_csv('test.csv', index_col= "ID")
ID = test.index

In [20]:
test.head()

Unnamed: 0_level_0,variable_0,variable_1,variable_2,variable_3,variable_4,variable_5,variable_6,variable_7,variable_8,variable_9,variable_10,variable_11,variable_12,variable_13,variable_14,variable_15,variable_16,variable_17,State
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
3000,54,2.62,5.21,0,0,9.7,27.17,264.0,7,22.44,159.8,0,115.7,64,99,70,408,2,KY
3001,116,3.35,5.86,0,0,12.4,23.36,120.0,2,10.2,137.4,0,130.3,94,126,64,510,3,RI
3002,92,3.32,10.78,0,0,12.3,31.4,262.0,6,22.27,184.7,0,239.5,73,60,120,415,2,CO
3003,66,3.24,5.95,32,0,12.0,31.93,129.8,3,11.03,187.8,1,132.3,90,117,113,408,2,MS
3004,136,2.65,10.08,0,0,9.8,34.77,208.8,2,17.75,204.5,0,224.0,95,63,119,510,0,DC


In [21]:
# Preprocessing Test data.
test = pd.get_dummies(test, columns = ["State", "variable_16"])
test = StandardScaler().fit(test).transform(test)

In [22]:
# Probabilities for all 333 customers, sorted by descending order.
probs = pd.DataFrame({"ID" : ID, "Probability" : SVM.predict_proba(test)[:,1]}).sort_values('Probability', ascending=False)

In [23]:
# Top 100 of them.
top_100 = probs[:100]

In [25]:
top_100.style.hide_index()

ID,Probability
3185,0.950155
3046,0.897144
3250,0.854872
3076,0.837644
3292,0.826767
3306,0.801864
3268,0.790584
3180,0.788464
3108,0.741
3324,0.734607
