In [1]:
import pandas as pd
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

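This notebook walks through a feature engineering and feature selection workflow on the breast cancer dataset: missing values are imputed, the categorical features are one-hot encoded, and a logistic regression is evaluated with and without chi-squared feature selection.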

In [2]:
# Read the breast cancer CSV from the path relative to the notebook.
data = pd.read_csv(filepath_or_buffer='datasets/breastcancer.csv')
# Preview a single row to confirm the columns were parsed.
data.head(n=1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,'40-49','premeno','15-19','0-2','yes','3','right','left_up','no','recurrence-events'


In [3]:
# Summarise the frame: column labels, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       286 non-null    object
 1   1       286 non-null    object
 2   2       286 non-null    object
 3   3       286 non-null    object
 4   4       278 non-null    object
 5   5       286 non-null    object
 6   6       286 non-null    object
 7   7       285 non-null    object
 8   8       286 non-null    object
 9   9       286 non-null    object
dtypes: object(10)
memory usage: 22.5+ KB


In [4]:
# Number of missing entries in each column.
data.isna().sum()

0    0
1    0
2    0
3    0
4    8
5    0
6    0
7    1
8    0
9    0
dtype: int64

In [5]:
# Frequency of each category in column '7' (one of the columns with a missing value).
data.loc[:, '7'].value_counts()

'left_low'     110
'left_up'       97
'right_up'      33
'right_low'     24
'central'       21
Name: 7, dtype: int64

In [6]:
# Show the first row again before separating the predictors from the label.
data.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,'40-49','premeno','15-19','0-2','yes','3','right','left_up','no','recurrence-events'


In [7]:
# Predictors: the first nine columns (positions 0-8).
X = data.iloc[:, :9]
# Target: the last column of the frame.
y = data.iloc[:, -1]

In [8]:
# First row of the predictor frame.
X.iloc[:1]

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,'40-49','premeno','15-19','0-2','yes','3','right','left_up','no'


In [9]:
# First entry of the target series.
y.iloc[:1]

0    'recurrence-events'
Name: 9, dtype: object

In [10]:
# Hold out 10% of the rows for evaluation; the fixed seed makes the split repeatable.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=1,
)

In [11]:
# Preprocessing for the categorical predictors:
#   1. 'si' fills missing entries with the most frequent value of each column,
#   2. 'oe' one-hot encodes, ignoring categories not seen during fit.
# The encoder keeps its default sparse output and the result is densified
# explicitly below. The `sparse=False` keyword was renamed to `sparse_output`
# in scikit-learn 1.2 and removed in 1.4, so passing either spelling breaks on
# one side of that rename; `.toarray()` yields the same dense array on both.
pipe = Pipeline([
    ('si', SimpleImputer(strategy='most_frequent')),
    ('oe', OneHotEncoder(handle_unknown='ignore')),
])
# Fit on the training split only so the imputer and encoder never see test rows.
pipe.fit(Xtrain)
Xtrain_pipe = pipe.transform(Xtrain).toarray()
Xtest_pipe = pipe.transform(Xtest).toarray()

In [12]:
# Turn the string class labels into integer codes. The mapping is learned from
# the training labels and then reused unchanged for the test labels.
le = LabelEncoder()
ytrain_le = le.fit_transform(ytrain)
ytest_le = le.transform(ytest)

# Baseline: without feature selection

In [16]:
# Baseline model: logistic regression trained on every one-hot encoded column.
lr_one = LogisticRegression()
lr_one.fit(Xtrain_pipe, ytrain_le)
pred_one = lr_one.predict(Xtest_pipe)
# accuracy_score is documented as (y_true, y_pred). The value is the same either
# way for accuracy, but the correct order keeps this cell safe to adapt to
# metrics where the order matters.
accuracy_one = accuracy_score(ytest_le, pred_one)
accuracy_one

0.6896551724137931

# With feature selection

In [22]:
# Keep the 12 encoded columns with the highest chi-squared score against the
# label. The selector is fitted on the training split and then applied,
# unchanged, to the test split.
featureselection = SelectKBest(chi2, k=12)
Xtrain_fs = featureselection.fit_transform(Xtrain_pipe, ytrain_le)
Xtest_fs = featureselection.transform(Xtest_pipe)

In [23]:
# Check the shape of the reduced training matrix: (rows, selected columns).
Xtrain_fs.shape

(257, 12)

In [25]:
# Same estimator as the baseline, trained on the selected columns only.
lr = LogisticRegression()
lr.fit(X=Xtrain_fs, y=ytrain_le)

LogisticRegression()

In [26]:
# Predict the encoded class for each held-out row, using the reduced feature set.
pred = lr.predict(X=Xtest_fs)

In [27]:
# Test accuracy of the model trained on the selected features.
# accuracy_score is documented as (y_true, y_pred); the true labels go first.
accuracy = accuracy_score(ytest_le, pred)
accuracy

0.7241379310344828