### feature selection

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np

from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

from joblib import dump, load

In [3]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

In [4]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [5]:
from sklearn.preprocessing import StandardScaler

In [6]:
df = pd.read_csv('https://raw.githubusercontent.com/digipodium/Datasets/main/diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [7]:
df.rename({'DiabetesPedigreeFunction':'pedigree'},axis=1,inplace=True)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,pedigree,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [8]:
featSelector = SelectKBest(chi2, k=5)

In [9]:
X = df.iloc[:,:-1]  
y = df['Outcome']
print(X.shape,y.shape)

(768, 8) (768,)


In [10]:
featSelector.fit(X,y)

SelectKBest(k=5, score_func=<function chi2 at 0x00000228232AC4C0>)

In [11]:
np.set_printoptions(precision=2)

In [12]:
featSelector.scores_

array([ 111.52, 1411.89,   17.61,   53.11, 2175.57,  127.67,    5.39,
        181.3 ])

In [13]:
features = featSelector.transform(X)
print(features.shape)

(768, 5)


In [14]:
features

array([[  6. , 148. ,   0. ,  33.6,  50. ],
       [  1. ,  85. ,   0. ,  26.6,  31. ],
       [  8. , 183. ,   0. ,  23.3,  32. ],
       ...,
       [  5. , 121. , 112. ,  26.2,  30. ],
       [  1. , 126. ,   0. ,  30.1,  47. ],
       [  1. ,  93. ,   0. ,  30.4,  23. ]])

In [15]:
scaler = StandardScaler()
scaledX = scaler.fit_transform(features)
xtrain,xtest,ytrain,ytest = train_test_split(scaledX,y,test_size=.2)
#xtrain.shape,xtest.shape
m = KNeighborsClassifier()
m.fit(xtrain, ytrain)
ypred = m.predict(xtest)
cm= confusion_matrix(ytest, ypred)
print(cm)
print(classification_report(ytest,ypred))

[[83 20]
 [23 28]]
              precision    recall  f1-score   support

           0       0.78      0.81      0.79       103
           1       0.58      0.55      0.57        51

    accuracy                           0.72       154
   macro avg       0.68      0.68      0.68       154
weighted avg       0.72      0.72      0.72       154



### wraper method implementation

In [16]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [17]:
clf = LogisticRegression(solver='liblinear')
rfe = RFE(clf,)
rfe.fit(X,y)

RFE(estimator=LogisticRegression(solver='liblinear'))

In [18]:
print('Features selected',rfe.n_features_)

Features selected 4


In [19]:
rfe.support_

array([ True,  True, False, False, False,  True,  True, False])

In [20]:
rfe.ranking_

array([1, 1, 2, 4, 5, 1, 1, 3])

In [21]:
X.columns.tolist()

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'pedigree',
 'Age']

In [22]:
rfe.get_feature_names_out()

array(['Pregnancies', 'Glucose', 'BMI', 'pedigree'], dtype=object)

In [23]:
features = X[rfe.get_feature_names_out()]
scaler = StandardScaler()
scaledX = scaler.fit_transform(features)
xtrain,xtest,ytrain,ytest = train_test_split(scaledX,y,test_size=.2,random_state=1)
#xtrain.shape,xtest.shape
m = KNeighborsClassifier()
m.fit(xtrain, ytrain)
ypred = m.predict(xtest)
cm= confusion_matrix(ytest, ypred)
print(cm)
print(classification_report(ytest,ypred))

[[85 14]
 [23 32]]
              precision    recall  f1-score   support

           0       0.79      0.86      0.82        99
           1       0.70      0.58      0.63        55

    accuracy                           0.76       154
   macro avg       0.74      0.72      0.73       154
weighted avg       0.75      0.76      0.75       154



In [24]:
model_dict={
    'title':'classification model for ad click prediction',
    'classifier':m,
    'scaler':scaler,
}

dump(model_dict, "clf_ap.pkl")

['clf_ap.pkl']