In [19]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
df_lung_cancer = pd.read_csv("lungcancer.csv")
df_liver_disease = pd.read_csv("liver_disease.csv",encoding="Windows-1254")
df_dementia_disease = pd.read_csv("dementia_dataset.csv")
df_heart_disease = pd.read_csv("heart.csv")

In [21]:
df_dementia_disease.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 373 entries, 0 to 372
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Subject ID  373 non-null    object 
 1   MRI ID      373 non-null    object 
 2   Group       373 non-null    object 
 3   Visit       373 non-null    int64  
 4   MR Delay    373 non-null    int64  
 5   M/F         373 non-null    object 
 6   Hand        373 non-null    object 
 7   Age         373 non-null    int64  
 8   EDUC        373 non-null    int64  
 9   SES         354 non-null    float64
 10  MMSE        371 non-null    float64
 11  CDR         373 non-null    float64
 12  eTIV        373 non-null    int64  
 13  nWBV        373 non-null    float64
 14  ASF         373 non-null    float64
dtypes: float64(5), int64(5), object(5)
memory usage: 43.8+ KB


In [22]:
df_liver_disease = df_liver_disease[[    'Age of the patient',
    'Gender of the patient',
    'Total Bilirubin',
    'Direct Bilirubin',
    'Alkphos Alkaline Phosphotase',
    'Sgpt Alamine Aminotransferase',
    'Sgot Aspartate Aminotransferase',
    'Total Protiens',
    'ALB Albumin',
    'A/G Ratio Albumin and Globulin Ratio',
    'Result']]
df_dementia_disease = df_dementia_disease[[
    'Subject ID','MRI ID',
    'Visit',
    'MR Delay',
    'M/F',
    'Hand',
    'Age',
    'EDUC',
    'SES',
    'MMSE',
    'CDR',
    'eTIV',
    'nWBV',
    'ASF',
    'Group'
]]

In [23]:
df_heart_disease.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


# Preprocessing

In [24]:
df_liver_disease.info()
df_liver_disease=df_liver_disease.iloc[0:1000,:]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30689 entries, 0 to 30688
Data columns (total 11 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Age of the patient                    30687 non-null  float64
 1   Gender of the patient                 29787 non-null  object 
 2   Total Bilirubin                       30041 non-null  float64
 3   Direct Bilirubin                      30128 non-null  float64
 4   Alkphos Alkaline Phosphotase          29893 non-null  float64
 5   Sgpt Alamine Aminotransferase         30151 non-null  float64
 6   Sgot Aspartate Aminotransferase       30227 non-null  float64
 7   Total Protiens                        30226 non-null  float64
 8   ALB Albumin                           30195 non-null  float64
 9   A/G Ratio Albumin and Globulin Ratio  30130 non-null  float64
 10  Result                                30689 non-null  int64  
dtypes: float64(9), 

In [25]:
df_lung_cancer.drop(columns=['GENDER'],inplace=True)

In [26]:
df_liver_disease.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Age of the patient                    1000 non-null   float64
 1   Gender of the patient                 985 non-null    object 
 2   Total Bilirubin                       995 non-null    float64
 3   Direct Bilirubin                      994 non-null    float64
 4   Alkphos Alkaline Phosphotase          993 non-null    float64
 5   Sgpt Alamine Aminotransferase         995 non-null    float64
 6   Sgot Aspartate Aminotransferase       996 non-null    float64
 7   Total Protiens                        992 non-null    float64
 8   ALB Albumin                           989 non-null    float64
 9   A/G Ratio Albumin and Globulin Ratio  992 non-null    float64
 10  Result                                1000 non-null   int64  
dtypes: float64(9), int

In [27]:
df_liver_disease.drop(columns = ["Gender of the patient"],inplace=True )
for col in df_liver_disease.columns.values:
    df_liver_disease[col].fillna(df_liver_disease[col].mean(), inplace=True)


In [28]:
df_liver_disease.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Age of the patient                    1000 non-null   float64
 1   Total Bilirubin                       1000 non-null   float64
 2   Direct Bilirubin                      1000 non-null   float64
 3   Alkphos Alkaline Phosphotase          1000 non-null   float64
 4   Sgpt Alamine Aminotransferase         1000 non-null   float64
 5   Sgot Aspartate Aminotransferase       1000 non-null   float64
 6   Total Protiens                        1000 non-null   float64
 7   ALB Albumin                           1000 non-null   float64
 8   A/G Ratio Albumin and Globulin Ratio  1000 non-null   float64
 9   Result                                1000 non-null   int64  
dtypes: float64(9), int64(1)
memory usage: 78.3 KB


In [29]:
df_dementia_disease.head()

Unnamed: 0,Subject ID,MRI ID,Visit,MR Delay,M/F,Hand,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF,Group
0,OAS2_0001,OAS2_0001_MR1,1,0,M,R,87,14,2.0,27.0,0.0,1987,0.696,0.883,Nondemented
1,OAS2_0001,OAS2_0001_MR2,2,457,M,R,88,14,2.0,30.0,0.0,2004,0.681,0.876,Nondemented
2,OAS2_0002,OAS2_0002_MR1,1,0,M,R,75,12,,23.0,0.5,1678,0.736,1.046,Demented
3,OAS2_0002,OAS2_0002_MR2,2,560,M,R,76,12,,28.0,0.5,1738,0.713,1.01,Demented
4,OAS2_0002,OAS2_0002_MR3,3,1895,M,R,80,12,,22.0,0.5,1698,0.701,1.034,Demented


In [30]:
df_dementia_disease.drop(
    columns=[
        "Subject ID",
        "MRI ID",
    ],
    inplace=True,
)
df_dementia_disease["Group"] = df_dementia_disease["Group"].map(
    {"Demented": 1, "Nondemented": 0}
)
df_dementia_disease["M/F"] = df_dementia_disease["M/F"].map({"M": 1, "F": 0})
df_dementia_disease["Hand"] = df_dementia_disease["Hand"].map({"R": 1, "L": 0})
df_dementia_disease["Group"].fillna(df_dementia_disease["Group"].mode())
df_dementia_disease.replace(
    {"Group": {"Demented": 1, "Nondemented": 0, np.nan: 0}}, inplace=True
)
df_dementia_disease.replace(
    {"Group": {"Converted": df_dementia_disease["Group"].mode()}}, inplace=True
)
df_dementia_disease["SES"].fillna(df_dementia_disease["SES"].mean(), inplace=True)
df_dementia_disease["MMSE"].fillna(df_dementia_disease["MMSE"].mean(), inplace=True)

for col in df_dementia_disease.columns.values:
    df_dementia_disease[col].fillna(df_dementia_disease[col].mode(), inplace=True)
df_dementia_disease["Group"].fillna(df_dementia_disease["Group"].mode(), inplace=True)
df_dementia_disease[df_dementia_disease["Group"] != 1]

Unnamed: 0,Visit,MR Delay,M/F,Hand,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF,Group
0,1,0,1,1,87,14,2.0,27.0,0.0,1987,0.696,0.883,0.0
1,2,457,1,1,88,14,2.0,30.0,0.0,2004,0.681,0.876,0.0
5,1,0,0,1,88,18,3.0,28.0,0.0,1215,0.710,1.444,0.0
6,2,538,0,1,90,18,3.0,27.0,0.0,1200,0.718,1.462,0.0
7,1,0,1,1,80,12,4.0,28.0,0.0,1689,0.712,1.039,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
363,3,732,0,1,68,13,2.0,30.0,0.0,1506,0.740,1.165,0.0
364,4,2107,0,1,72,13,2.0,30.0,0.0,1510,0.723,1.162,0.0
370,1,0,0,1,61,13,2.0,30.0,0.0,1319,0.801,1.331,0.0
371,2,763,0,1,63,13,2.0,30.0,0.0,1327,0.796,1.323,0.0


In [31]:
for col in df_dementia_disease.columns.values:
    print(col,df_dementia_disease[col].isna().sum())

Visit 0
MR Delay 0
M/F 0
Hand 0
Age 0
EDUC 0
SES 0
MMSE 0
CDR 0
eTIV 0
nWBV 0
ASF 0
Group 0


In [32]:
# df_heart_disease = pd.get_dummies(df_heart_disease, columns=['ChestPainType'])
label_encoder = LabelEncoder()
df_heart_disease['ChestPainType'] = label_encoder.fit_transform(df_heart_disease['ChestPainType'])
df_heart_disease['ST_Slope'] = label_encoder.fit_transform(df_heart_disease['ST_Slope'])
df_heart_disease['ExerciseAngina'] = label_encoder.fit_transform(df_heart_disease['ExerciseAngina'])
df_heart_disease["RestingECG"] =label_encoder.fit_transform(df_heart_disease['RestingECG'])
df_heart_disease["Sex"] =label_encoder.fit_transform(df_heart_disease['Sex'])

In [33]:
# plt.figure(figsize=(8, 6))
# sns.heatmap(df.corr(), annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
# plt.title('Correlation Matrix Heatmap')
# plt.show()

In [34]:
def predict_output(algorithm, X_train, y_train, x_test, y_test):
    algorithm.fit(X_train, y_train)
    y_pred = algorithm.predict(x_test)
    return accuracy_score(y_test, y_pred).__round__(3)

In [35]:
from sklearn.model_selection import train_test_split


X_live_disease = pd.DataFrame(df_liver_disease.iloc[:, 0:len(df_liver_disease.columns) - 1])
y_live_disease = df_liver_disease.iloc[:, len(df_liver_disease.columns) - 1]
x_lung_cancer = pd.DataFrame(df_liver_disease.iloc[:, 0:len(df_liver_disease.columns) - 1])
y_lung_cancer = df_liver_disease.iloc[:, len(df_liver_disease.columns) - 1]
x_dementia_disease = pd.DataFrame(df_dementia_disease.iloc[:, 0:len(df_dementia_disease.columns) - 1])
y_dementia_disease = df_dementia_disease.iloc[:, len(df_dementia_disease.columns) - 1]
x_heart_disease = pd.DataFrame(df_heart_disease.iloc[:, 0:len(df_heart_disease.columns) - 1])
y_heart_disease = df_heart_disease.iloc[:, len(df_heart_disease.columns) - 1]


df_liver_disease_X_train,df_liver_disease_X_test,df_liver_disease_y_train,df_liver_disease_y_test = train_test_split(X_live_disease,y_live_disease,test_size=0.2,random_state=42)
df_dementia_disease_X_train,df_dementia_disease_X_test,df_dementia_disease_y_train,df_dementia_disease_y_test = train_test_split(x_dementia_disease,y_dementia_disease,test_size=0.2,random_state=42)
df_heart_disease_X_train,df_heart_disease_X_test,df_heart_disease_y_train,df_heart_disease_y_test = train_test_split(x_heart_disease,y_heart_disease,test_size=0.2,random_state=42)
df_lung_cancer_X_train,df_lung_cancer_X_test,df_lung_cancer_y_train,df_lung_cancer_y_test = train_test_split(x_lung_cancer,y_lung_cancer,test_size=0.2,random_state=42)


logistic_regression = LogisticRegression(max_iter=5000)
logistic_regression.fit(df_liver_disease_X_train,df_liver_disease_y_train)
y_pred = logistic_regression.predict(df_liver_disease_X_test)

print("Logistic Regression Accuracy: ", accuracy_score(df_liver_disease_y_test, y_pred).__round__(3))


support_vector_machine = LinearSVC(max_iter=10000, dual=False)
support_vector_machine.fit(df_dementia_disease_X_train,df_dementia_disease_y_train)
print("Support Vector Machine Accuracy: ", accuracy_score(df_dementia_disease_y_test, support_vector_machine.predict(df_dementia_disease_X_test)).__round__(3))

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(df_lung_cancer_X_train,df_lung_cancer_y_train)
y_pred = knn.predict(df_lung_cancer_X_test)
print("Decision Tree Accuracy: ", accuracy_score(df_lung_cancer_y_test, y_pred).__round__(3))

randomforest = RandomForestClassifier(n_estimators=100)
randomforest.fit(df_heart_disease_X_train,df_heart_disease_y_train)
randomforest.predict(df_heart_disease_X_test)

print("Random Forest Accuracy: ", accuracy_score(df_heart_disease_y_test, randomforest.predict(df_heart_disease_X_test)).__round__(3))

# print("Logistic Regression Accuracy: ", np.mean(LogisticRegression_accuracy))
# print("Decision Tree Accuracy: ", np.mean(DecisionTree_accuracy))
# print("Support Vector Machine Accuracy: ", np.mean(SupportVectorMachine_accuracy))
# print("K-Nearest Neighbors Accuracy: ", np.mean(KNearestNeighbors_accuracy))
# print("Random Forest Accuracy: ", np.mean(RandomForest_accuracy))

Logistic Regression Accuracy:  0.725
Support Vector Machine Accuracy:  0.96
Decision Tree Accuracy:  0.68
Random Forest Accuracy:  0.891


In [36]:
df_heart_disease.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289,0,1,172,0,0.0,2,0
1,49,0,2,160,180,0,1,156,0,1.0,1,1
2,37,1,1,130,283,0,2,98,0,0.0,2,0
3,48,0,0,138,214,0,1,108,1,1.5,1,1
4,54,1,2,150,195,0,1,122,0,0.0,2,0
