In [8]:
#import all the necessary packages
import numpy as np
import pandas as pd
import sklearn
from sklearn import linear_model
import matplotlib.pyplot as plt
%matplotlib inline

In [9]:
#Check which version of scikit-learn is being used (this notebook was last run with 0.18.1)
#http://scikit-learn.org/stable/documentation.html
sklearn.__version__

'0.18.1'

In [10]:
#Get a first look at the training data set: load the CSV and preview its first rows
#NOTE(review): the preview shows a leading 'Unnamed: 0' column (presumably a row index that was
#saved with the CSV). It stays in the frame, so it is part of the feature matrix that is later
#built from every column except the last one - confirm this is intended.
Caravan=pd.read_csv('../../../Data/CaravanData.csv')
Caravan.head()

Unnamed: 0,MOSTYPE,MAANTHUI,MGEMOMV,MGEMLEEF,MOSHOOFD,MGODRK,MGODPR,MGODOV,MGODGE,MRELGE,...,APERSONG,AGEZONG,AWAOREG,ABRAND,AZEILPL,APLEZIER,AFIETS,AINBOED,ABYSTAND,CARAVAN
0,33,1,3,2,8,0,5,1,3,7,...,0,0,0,1,0,0,0,0,0,0
1,37,1,2,2,8,1,4,1,4,6,...,0,0,0,1,0,0,0,0,0,0
2,37,1,2,2,8,0,4,2,4,3,...,0,0,0,1,0,0,0,0,0,0
3,9,1,3,3,3,2,3,2,4,5,...,0,0,0,1,0,0,0,0,0,0
4,40,1,4,2,10,1,4,1,4,7,...,0,0,0,1,0,0,0,0,0,0


The meaning of the columns is given in a separate data dictionary, supplied with the data.
Some example fields:<br>
<b>MOSTYPE</b>: Customer subtype (examples: <b>33:</b> Lower class large families, <b>37:</b> Mixed small town dwellers)<br>
<b>MAANTHUI</b>: Number of houses<br>
<b>MGEMOMV</b>: Avg size household<br>

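Python passes arguments by object reference: a function call (or a plain assignment) hands over a reference to the same object, never an implicit copy, so changes made through one name are visible through the other. If you want to make an explicit <b>copy</b> of a pandas object, call its <code>.copy()</code> method, as done in the next cell.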

In [11]:
#create a copy of the training dataset, so that later steps work on `data`
#rather than on the frame `Caravan` that was read from disk
data=Caravan.copy()

In [13]:
#Separate the predictors from the target variable:
#  X (=features) -> every column except the last one
#  Y (=output)   -> the last column only
#The frame is first turned into a plain NumPy array so it can be sliced by position.
data = np.array(data)
X, Y = data[:, :-1], data[:, -1]


In [14]:
#Create an instance of the logistic regression classifier (not fitted yet).
#Only the solver is chosen explicitly ('newton-cg'); all other settings keep their defaults.
logreg=linear_model.LogisticRegression(solver='newton-cg')

In [15]:
#Train the model on the training features X and the outcomes Y
#(fit returns the estimator, so its repr is displayed as the cell output)
logreg.fit(X,Y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='newton-cg', tol=0.0001,
          verbose=0, warm_start=False)

In [16]:
#The logistic regression has been fitted to the training data.
#How well can the trained model forecast data it has not seen?

#load the test data
Caravan_Test=pd.read_csv('../../../Data/Caravan_Test.csv')

#work on a copy of the data, converted to a float NumPy array
#(the builtin `float` is used because the `np.float` alias was removed in NumPy 1.24;
# it was only ever an alias for the builtin, so the resulting float64 array is unchanged)
data_sol=Caravan_Test.copy()
data_np_sol=data_sol.astype(float).values

#same split as for the training data: every column but the last is a feature,
#the last column is the true outcome
X_Solution=data_np_sol[:,:-1]
Y_Solution=data_np_sol[:,-1]


In [17]:
#The fitted logistic regression yields class probabilities for every row of the test set;
#print the first five rows as a quick check
Y_Prob = logreg.predict_proba(X_Solution)
print(Y_Prob[:5])


[[ 0.98576394  0.01423606]
 [ 0.6932918   0.3067082 ]
 [ 0.91987157  0.08012843]
 [ 0.92615701  0.07384299]
 [ 0.98561759  0.01438241]]


In [22]:
#Y_Prob has 2 columns (one for each class); its last column corresponds to class = 1.
#Y_Solution is turned into a column vector of shape (n, 1), which is how the cells below index it.
#reshape() is used rather than assigning to .shape in place, which NumPy discourages.
Y_Solution = Y_Solution.reshape(-1, 1)

#Combine the predicted probability and the true outcome into a single pandas DataFrame object
Prob = pd.DataFrame({'Prob': Y_Prob[:, -1],
                     'Sol': Y_Solution[:, -1]})

#sort the dataframe by decreasing Prob, so the most likely policy holders come first
Prob = Prob.sort_values(by='Prob', ascending=False)


In [23]:
#If we take the 800 households with the highest predicted probability,
#how many of them actually had an insurance policy?
#v holds the true outcomes of those rows as an int32 column vector.
v = Prob['Sol'].values[:800].astype(np.int32).reshape(-1, 1)
print("The number of households discovered with an insurance policy were {k:0.0f}".format(k=np.sum(v)))

The number of households discovered with an insurance policy were 118


In [24]:
#How many households in the test set actually had an insurance policy?
#The count is printed as text and also shown as the cell's output value.
print("The number of households with an insurance policy were {k:0.0f}".format(
    k=np.sum(Y_Solution[:, -1])))
np.sum(Y_Solution)

The number of households with an insurance policy were 238


238.0

<h3>How do we compare with the competition?</h3> <br>
Our logistic regression found <b>118</b> policy owners among its 800 highest-ranked households (out of 238 in the test set). The winning solution was provided by Charles Elkan from the University of California (<u><b>121</b> policy owners</u>). The scores for academic participants versus industry participants were similar; 'Europe' scored on average <u>98 policy owners</u> versus <u>89 policy owners</u> for American submissions.