In [1]:
# Imports

# pandas
import pandas as pd
from pandas import Series,DataFrame

# numpy
import numpy as np

# machine learning
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score

#Importing LabelEncoder
from sklearn.preprocessing import LabelEncoder

In [2]:
dataset = pd.read_csv("dataset_classification.csv")

In [3]:
X_train, X_test, y_train, y_test = train_test_split(dataset.drop(['Response'], axis = 1), dataset['Response'], test_size=0.2, random_state=0)

In [4]:
# LabelEncoder and initializing
# To categrize all object variables
le=LabelEncoder()
for col in X_test.columns.values:
    # Encoding only categorical variables
    if X_test[col].dtypes=='object':
        # Using whole data to form an exhaustive list of levels
        data = X_train[col].append(X_test[col])
        le.fit(data.values)
        X_train[col]=le.transform(X_train[col])
        X_test[col]=le.transform(X_test[col])

In [5]:
X_train.head()

Unnamed: 0,State,Coverage,Education,EmploymentStatus,Gender,Location.Code,Marital.Status,Number.of.Open.Complaints,Number.of.Policies,Policy.Type,Policy,Renew.Offer.Type,Sales.Channel,Vehicle.Class,Vehicle.Size
1461,4,2,1,4,1,1,2,0,2,1,5,0,0,0,0
2521,0,0,0,1,0,1,2,0,2,1,4,0,2,0,2
9031,1,1,4,4,1,1,1,0,1,1,5,0,1,0,1
5045,0,0,1,4,1,0,0,0,1,1,4,3,1,0,1
4987,3,0,1,1,1,1,0,0,2,1,5,3,2,5,2


In [6]:
X_test.head()

Unnamed: 0,State,Coverage,Education,EmploymentStatus,Gender,Location.Code,Marital.Status,Number.of.Open.Complaints,Number.of.Policies,Policy.Type,Policy,Renew.Offer.Type,Sales.Channel,Vehicle.Class,Vehicle.Size
2182,1,0,1,1,0,1,0,0,5,1,4,2,0,5,1
7823,3,0,0,4,0,1,2,0,1,0,1,0,2,5,1
1651,4,2,1,1,1,1,1,0,6,1,5,0,1,0,2
888,0,1,0,2,1,1,1,2,1,1,4,1,3,2,1
3844,4,1,1,4,1,1,2,0,1,1,3,3,0,3,2


In [7]:
# Logistic Regression

logreg = LogisticRegression(class_weight="balanced")

logreg.fit(X_train, y_train)

scores_lg = cross_val_score(logreg, X_train, y_train, cv=5)

scores_lg.mean()

0.57342151169684963

In [8]:
# Accuracy of Logistic Regression model's prediction
y_pred_logreg = cross_val_predict(logreg, X_test, y_test, cv=5)
accuracy_score(y_test, y_pred_logreg)

0.61412151067323484

In [9]:
# Area Under Curve Evaluation
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, logreg.predict(X_train))
print (auc(false_positive_rate, true_positive_rate))

0.603052270231


In [10]:
# get Correlation Coefficient for each feature using Logistic Regression
coeff_df_lg = DataFrame(X_train.columns)
coeff_df_lg.columns = ['Features']
coeff_df_lg["Coefficient Estimate"] = pd.Series(logreg.coef_[0])

# preview
coeff_df_lg.sort_values(['Coefficient Estimate'], ascending = False)

Unnamed: 0,Features,Coefficient Estimate
9,Policy.Type,0.108399
4,Gender,0.062892
13,Vehicle.Class,0.050568
2,Education,0.042925
0,State,0.02488
10,Policy,-0.031471
5,Location.Code,-0.038223
8,Number.of.Policies,-0.040444
7,Number.of.Open.Complaints,-0.047194
3,EmploymentStatus,-0.061505


In [11]:
# Random Forests

random_forest = RandomForestClassifier(n_estimators=100)

random_forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [12]:
# Cross Validation
scores_rf = cross_val_score(random_forest, X_train, y_train, cv=5)

scores_rf.mean()

0.9577113394175516

In [13]:
y_pred_rf = cross_val_predict(random_forest, X_test, y_test, cv=5)
accuracy_score(y_test, y_pred_rf)

0.88943623426382046

In [14]:
# get Correlation Coefficient for each feature using Random Forest
coeff_df = DataFrame(X_train.columns)
coeff_df.columns = ['Features']
coeff_df["Coefficient Estimate"] = pd.Series(random_forest.feature_importances_)

# preview
coeff_df.sort_values(['Coefficient Estimate'], ascending = False)

Unnamed: 0,Features,Coefficient Estimate
8,Number.of.Policies,0.109535
3,EmploymentStatus,0.109027
11,Renew.Offer.Type,0.093936
2,Education,0.093257
13,Vehicle.Class,0.075884
10,Policy,0.0661
12,Sales.Channel,0.065162
0,State,0.061985
6,Marital.Status,0.059388
1,Coverage,0.058576
