In [2]:
%matplotlib inline 

import os
import numpy as np
import pandas as pd
import scipy.stats as stats
from matplotlib import pyplot as plt
import sklearn
import statsmodels.api as sm
from scipy import optimize

import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

# special matplotlib argument for improved plots
from matplotlib import rcParams

In [3]:
df=pd.read_csv("breast-cancer.csv", header=None, names=["col1","col2","col3","col4","col5","col6","col7","col8","col9","col10","Result"])

In [4]:
df.head(10)

Unnamed: 0,col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,Result
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
5,1017122,8,10,10,8,7,10,9,7,1,4
6,1018099,1,1,1,1,2,10,3,1,1,2
7,1018561,2,1,2,1,2,1,3,1,1,2
8,1033078,2,1,1,1,2,1,1,1,5,2
9,1033078,4,2,1,1,2,1,2,1,1,2


In [7]:
dataset = df[(df.col1 != '?') & (df.col2 != '?') & (df.col3 != '?') & (df.col4 != '?') & (df.col5 != '?') & (df.col6 != '?') & (df.col7 != '?') & (df.col8 != '?') & (df.col9 != '?') & (df.col10 != '?') & (df.Result != '?')]

In [8]:
Y = dataset.iloc[:, 10]
Y.head()

0    2
1    2
2    2
3    2
4    2
Name: Result, dtype: int64

In [9]:
X = dataset.iloc[:, 1:10]
X.head()

Unnamed: 0,col2,col3,col4,col5,col6,col7,col8,col9,col10
0,5,1,1,1,2,1,3,1,1
1,5,4,4,5,7,10,3,2,1
2,3,1,1,1,2,2,3,1,1
3,6,8,8,1,3,4,3,7,1
4,4,1,1,3,2,1,3,1,1


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn import metrics

In [11]:
x_train, x_cv, y_train, y_cv=train_test_split(X,Y, test_size=0.2, random_state = 0)

In [12]:
def evaluate_metric(model, x_cv, y_cv):
    return metrics.accuracy_score(y_cv, model.predict(x_cv))

In [13]:
def forward_feature_selection(x_train, x_cv, y_train, y_cv, n):
    feature_set = []
    for num_features in range(n):
        metric_list = [] # Choose appropriate metric based on business problem
        model = LogisticRegression(random_state = 0, solver='lbfgs')# You can choose any model you like, this technique is model agnostic
        for feature in x_train.columns:
            if feature not in feature_set:
                f_set = feature_set.copy()
                f_set.append(feature)
                model.fit(x_train[f_set], y_train)
                metric_list.append((evaluate_metric(model, x_cv[f_set], y_cv), feature))

        metric_list.sort(key=lambda x : x[0], reverse = True) # In case metric follows "the more, the merrier"
        feature_set.append(metric_list[0][1])
    return feature_set

In [14]:
f=forward_feature_selection(x_train, x_cv, y_train, y_cv, 4)

In [15]:
f

['col3', 'col2', 'col5', 'col10']

In [16]:
Y = dataset.iloc[:, 10].values

In [17]:
from sklearn.preprocessing import LabelEncoder
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)

In [18]:
X = dataset[['col2','col3','col5','col10']].values

In [28]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.30, random_state = 0)

L1 Regularization

In [47]:
L1 = LogisticRegression(penalty='l1', C=0.01)

In [48]:
L1.fit(X_train, Y_train)



LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [49]:
Y_pred = L1.predict(X_test)

In [50]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [51]:
print(confusion_matrix(Y_test, Y_pred))
print(accuracy_score(Y_test, Y_pred))

[[114  16]
 [  2  73]]
0.9121951219512195


L2 Regularization

In [53]:
L2 = LogisticRegression(penalty='l2', C=0.01)
L2.fit(X_train, Y_train)



LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [54]:
Y_pred = L2.predict(X_test)

In [55]:
print(confusion_matrix(Y_test, Y_pred))
print(accuracy_score(Y_test, Y_pred))

[[125   5]
 [ 13  62]]
0.9121951219512195
