In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,precision_score,recall_score,f1_score

In [2]:
df = pd.read_csv('breast-cancer-dataset.csv')
df.head()

Unnamed: 0,S/N,Year,Age,Menopause,Tumor Size (cm),Inv-Nodes,Breast,Metastasis,Breast Quadrant,History,Diagnosis Result
0,1,2019,40,1,2,0,Right,0,Upper inner,0,Benign
1,2,2019,39,1,2,0,Left,0,Upper outer,0,Benign
2,3,2019,45,0,4,0,Left,0,Lower outer,0,Benign
3,4,2019,26,1,3,0,Left,0,Lower inner,1,Benign
4,5,2019,21,1,1,0,Right,0,Upper outer,1,Benign


In [3]:
fig = px.histogram(x =df['Age'], marginal='violin', text_auto=True)
fig.show()

In [4]:
fig = px.histogram(x =df['Menopause'], text_auto=True,color=df['Diagnosis Result'],barmode='group',labels={'x':'menopause', 'y':'count'})
fig.show()

In [5]:
df.isnull().sum()

S/N                 0
Year                0
Age                 0
Menopause           0
Tumor Size (cm)     0
Inv-Nodes           0
Breast              0
Metastasis          0
Breast Quadrant     0
History             0
Diagnosis Result    0
dtype: int64

In [6]:
df['Diagnosis Result'].value_counts()

Diagnosis Result
Benign       120
Malignant     93
Name: count, dtype: int64

In [7]:
encoder = LabelEncoder()
categorical = df.select_dtypes(include=['object', 'category'])
for x in categorical:
    df[x] = encoder.fit_transform(df[x])
df.head()

Unnamed: 0,S/N,Year,Age,Menopause,Tumor Size (cm),Inv-Nodes,Breast,Metastasis,Breast Quadrant,History,Diagnosis Result
0,1,1,40,1,5,1,2,1,3,1,0
1,2,1,39,1,5,1,1,1,4,1,0
2,3,1,45,0,7,1,1,1,2,1,0
3,4,1,26,1,6,1,1,1,1,2,0
4,5,1,21,1,1,1,2,1,4,2,0


In [8]:
x = df.drop(['S/N', 'Year', 'Diagnosis Result'], axis=1)
y = df['Diagnosis Result']
xtrain, xtest, ytrain, ytest = train_test_split(x,y, test_size=0.2,random_state=4)

In [9]:
model1 = LogisticRegression(solver='newton-cholesky')
model1.fit(xtrain,ytrain)

In [10]:
pred1 = model1.predict(xtest)

In [11]:
print(classification_report(ytest,pred1))

              precision    recall  f1-score   support

           0       0.88      1.00      0.94        22
           1       1.00      0.86      0.92        21

    accuracy                           0.93        43
   macro avg       0.94      0.93      0.93        43
weighted avg       0.94      0.93      0.93        43



In [12]:
cm = confusion_matrix(ytest, pred1)
cm

array([[22,  0],
       [ 3, 18]])

In [13]:
model2 = KNeighborsClassifier()
model2.fit(xtrain, ytrain)

In [14]:
pred2 = model2.predict(xtest)
print(classification_report(ytest,pred2))

              precision    recall  f1-score   support

           0       0.74      0.91      0.82        22
           1       0.88      0.67      0.76        21

    accuracy                           0.79        43
   macro avg       0.81      0.79      0.79        43
weighted avg       0.81      0.79      0.79        43



In [15]:
cm = confusion_matrix(ytest, pred2)
cm

array([[20,  2],
       [ 7, 14]])

In [16]:
model3 = DecisionTreeClassifier()
model3.fit(xtrain, ytrain)

In [18]:
pred3 = model3.predict(xtest)
print(classification_report(ytest,pred3))
cm = confusion_matrix(ytest, pred3)
cm


              precision    recall  f1-score   support

           0       0.83      0.91      0.87        22
           1       0.89      0.81      0.85        21

    accuracy                           0.86        43
   macro avg       0.86      0.86      0.86        43
weighted avg       0.86      0.86      0.86        43



array([[20,  2],
       [ 4, 17]])

In [19]:
model4 = RandomForestClassifier()
model4.fit(xtrain, ytrain)

In [20]:
pred4 = model4.predict(xtest)
print(classification_report(ytest,pred4))
cm = confusion_matrix(ytest, pred4)
cm

              precision    recall  f1-score   support

           0       0.91      0.91      0.91        22
           1       0.90      0.90      0.90        21

    accuracy                           0.91        43
   macro avg       0.91      0.91      0.91        43
weighted avg       0.91      0.91      0.91        43



array([[20,  2],
       [ 2, 19]])

In [21]:
x.head()

Unnamed: 0,Age,Menopause,Tumor Size (cm),Inv-Nodes,Breast,Metastasis,Breast Quadrant,History
0,40,1,5,1,2,1,3,1
1,39,1,5,1,1,1,4,1
2,45,0,7,1,1,1,2,1
3,26,1,6,1,1,1,1,2
4,21,1,1,1,2,1,4,2


In [23]:
model4.predict([[40, 1,	5,	1,	2,	1,	3,	1]])


X does not have valid feature names, but RandomForestClassifier was fitted with feature names



array([0])