In [1]:
#importing necessary libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy as sc
import sklearn as sk
import seaborn as sns

In [None]:
#Reading data set

In [None]:
# Load the dataset from the CSV file in the working directory into `df`.
df = pd.read_csv('test_data.csv')

In [None]:
#Scanning the data frame

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
#dimensions

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Histogram of every numeric column, annotated with that column's skewness
import matplotlib.pyplot as plt

# Select the numeric columns once; the selection is reused for the skewness,
# the grid layout and the plotting loop
numeric_df = df.select_dtypes(include=['int64', 'float64'])
skewness = numeric_df.skew()

# Count the number of numerical columns
num_cols_count = len(numeric_df.columns)

# Determine the layout for subplots: at most 4 plots per row. Both dimensions
# are clamped to 1 so a frame without numeric columns still gives a valid grid.
num_rows = max(1, (num_cols_count + 3) // 4)
num_cols = max(1, min(4, num_cols_count))

# squeeze=False makes `axes` a 2-D array for every grid shape. Without it a
# single row/column comes back 1-D (or as a bare Axes for a 1x1 grid) and
# two-index access fails.
fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 10), squeeze=False)
flat_axes = axes.ravel()  # row-major, i.e. the same order the columns are laid out in

# Plot histograms for numerical columns to visualize distributions and identify anomalies
for ax, col in zip(flat_axes, numeric_df.columns):
    # Missing values are dropped so they cannot interfere with the binning
    ax.hist(df[col].dropna(), bins=15, color='green', alpha=0.7)
    ax.set_title(f'{col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

    # Skewness value in the centre of the plot (axes-relative coordinates)
    ax.text(0.5, 0.5, f'Skewness: {skewness[col]:.2f}', horizontalalignment='center',
            verticalalignment='center', transform=ax.transAxes, fontsize=10, color='red')

# Hide the grid cells left over when the column count is not a multiple of 4
for unused_ax in flat_axes[num_cols_count:]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

# Print skewness values
print("Skewness:")
print(skewness)

In [None]:
# One box per numeric column on shared axes; x tick labels rotated 45 degrees
df.plot.box(rot=45, color='green')

# Show the plot
plt.show()

In [None]:
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns

# One separate figure per numeric feature, to inspect its outliers individually
for column in numeric_cols:
    box_fig, box_ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(x=df[column], ax=box_ax)
    box_ax.set_title(f'Boxplot of {column}')
    plt.show()

In [None]:
# Column names, non-null counts and dtypes.
df.info()

In [None]:
# Render floats with three decimals from here on (a session-wide pandas option),
# then show the summary statistics with one row per column.
pd.set_option('display.float_format', '{:.3f}'.format)
df.describe().T

In [None]:
#Finding missing values

In [None]:
# Boolean mask of the frame: True wherever a cell is missing.
df.isnull()

In [None]:
#Which columns have missing values, and what percentage of each column is missing?

In [None]:
# Percentage of missing cells per column (mean of the boolean mask, times 100)
df.isnull().mean() * 100

In [None]:
import missingno as msno

In [None]:
# missingno bar chart of the frame's columns.
msno.bar(df)
plt.show()

In [None]:
# missingno matrix view of the frame.
msno.matrix(df)
plt.show()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
#Data processing

In [None]:
 cate_val =[]
cont_val = []
for column in df.columns:
    if df[column].nunique() <= 10:
        cate_val.append(column)
    else:
        cont_val.append(column)

In [None]:
# Columns classified as categorical (at most 10 distinct values).
cate_val

In [None]:
# Distinct values of the 'Disease' column.
df['Disease'].unique()

In [None]:
# One-hot encode the low-cardinality columns, dropping the first level of each.
# NOTE(review): the result is only displayed, never assigned -- `df` itself is
# unchanged and later cells keep using the un-encoded columns. Confirm this is intended.
pd.get_dummies(df,columns = cate_val, drop_first = True)

In [None]:
#Correlation matrix

In [None]:
import matplotlib.pyplot as plt

# Blood-marker columns included in the correlation heatmap
cnames = ['Glucose', 'Cholesterol', 'Hemoglobin', 'Platelets', 'White Blood Cells', 'Red Blood Cells']
f, ax = plt.subplots(figsize=(7, 5))

# Pairwise correlation matrix of the selected columns
df_corr = df[cnames]
corr = df_corr.corr()

# Annotated heatmap drawn on the axes created above
sns.heatmap(corr, annot=True, cmap='coolwarm', linewidths=.1, ax=ax)
plt.show()

In [None]:
#Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the high-cardinality ("continuous") columns of `df` in place.
# NOTE(review): the scaler is fitted on the whole frame, before the train/test
# split, so test-row statistics leak into the scaling. Consider fitting it on the
# training rows only.
st = StandardScaler()
df[cont_val] = st.fit_transform(df[cont_val])

In [None]:
# Preview the frame after scaling.
df.head()

In [None]:
#Splitting the dataset into training and test set

In [None]:
# Features: every column except the last. Target: the last column.
# assumes the label column ('Disease') is the last column of the CSV -- TODO confirm
X = df.iloc[:, :-1].values  
y = df.iloc[:, -1].values
from sklearn.model_selection import train_test_split

In [None]:
# 80/20 train/test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = 0.2, random_state = 42)

In [None]:
# Inspect the held-out feature matrix.
X_test

In [None]:
# Inspect the held-out labels.
y_test

In [None]:
#Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with scikit-learn's default hyper-parameters, fitted on the training split.
log = LogisticRegression()
log.fit(X_train,y_train)

In [None]:
# Logistic-regression predictions for the test split.
y_pred1 = log.predict(X_test)

In [None]:
#CONFUSION MATRIX

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the logistic-regression predictions
# (rows: true classes, columns: predicted classes), drawn with integer counts.
matrix= confusion_matrix(y_test, y_pred1)
sns.heatmap(matrix,annot = True, fmt = "d")

In [None]:
from sklearn.metrics import precision_score, recall_score

# Micro-averaged precision of the logistic-regression predictions.
# `pos_label` is only honoured with average='binary'; with average='micro'
# scikit-learn ignores it and emits a UserWarning, so it is not passed.
precision = precision_score(y_test, y_pred1, average='micro')
print("Precision: ",precision)

# Micro-averaged recall, using the same averaging as the precision.
recall = recall_score(y_test, y_pred1, average='micro')
print("Recall is: ",recall)

In [None]:
#F- Score
# Harmonic mean of the current `precision` and `recall`.
# When both are 0 the ratio is 0/0; report 0.0 for that case instead of
# raising or printing nan.
f_denominator = precision + recall
print((2*precision*recall)/f_denominator if f_denominator else 0.0)

In [None]:
from sklearn.metrics import accuracy_score
# Test accuracy of the logistic-regression predictions.
accuracy_score(y_test ,y_pred1)

In [None]:
#SVC

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with default hyper-parameters. The fitted
# estimator is kept under the name `svm`, which the following cells use.
svm = SVC()
svm.fit(X_train, y_train)

In [None]:
# SVC predictions for the test split and their accuracy.
y_pred2 = svm.predict(X_test)
accuracy_score(y_test ,y_pred2)

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the SVC predictions (rows: true, columns: predicted).
matrix= confusion_matrix(y_test, y_pred2)
sns.heatmap(matrix,annot = True, fmt = "d")

In [None]:
from sklearn.metrics import precision_score, recall_score

# Micro-averaged precision of the SVC predictions.
# `pos_label` is only honoured with average='binary'; with average='micro'
# scikit-learn ignores it and emits a UserWarning, so it is not passed.
precision = precision_score(y_test, y_pred2, average='micro')
print("Precision: ",precision)

# Micro-averaged recall, using the same averaging as the precision.
recall = recall_score(y_test, y_pred2, average='micro')
print("Recall is: ",recall)

In [None]:
#F- Score
# Harmonic mean of the current `precision` and `recall`.
# When both are 0 the ratio is 0/0; report 0.0 for that case instead of
# raising or printing nan.
f_denominator = precision + recall
print((2*precision*recall)/f_denominator if f_denominator else 0.0)

In [None]:
# K-Nearest Neighbors Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier with the default number of neighbours.
knn = KNeighborsClassifier()
knn.fit(X_train,y_train)

In [None]:
# Predictions of the default-k KNN model for the test split and their accuracy.
y_pred3 = knn.predict(X_test)
accuracy_score(y_test ,y_pred3)

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the default-k KNN predictions (rows: true, columns: predicted).
matrix= confusion_matrix(y_test, y_pred3)
sns.heatmap(matrix,annot = True, fmt = "d")

In [None]:
# Test accuracy for every neighbourhood size k = 1 .. 39; score[k - 1] belongs to k.
# NOTE(review): k is compared on the test split itself, so the best value is
# optimistically biased -- a validation split or cross-validation would avoid that.
score = []
for k in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    score.append(accuracy_score(y_test, y_pred))
    

In [None]:
# Accuracies from the k sweep, in order of k.
score

In [None]:
# Refit KNN with k=7 and report its test accuracy.
# NOTE(review): these predictions are stored in `y_pred`; the precision/recall
# cell and the final comparison table use `y_pred3` (the default-k model), not this one.
knn = KNeighborsClassifier(n_neighbors = 7)
knn.fit(X_train,y_train)
y_pred = knn.predict(X_test)
accuracy_score(y_test,y_pred)
    

In [None]:
#Precision and Recall score

In [None]:
from sklearn.metrics import precision_score, recall_score

# Micro-averaged precision of the KNN predictions stored in `y_pred3`.
# `pos_label` is only honoured with average='binary'; with average='micro'
# scikit-learn ignores it and emits a UserWarning, so it is not passed.
precision = precision_score(y_test, y_pred3, average='micro')
print("Precision: ",precision)

# Micro-averaged recall, using the same averaging as the precision.
recall = recall_score(y_test, y_pred3, average='micro')
print("Recall is: ",recall)

In [None]:
#F-Score
# Harmonic mean of the current `precision` and `recall`.
# When both are 0 the ratio is 0/0; report 0.0 for that case instead of
# raising or printing nan.
f_denominator = precision + recall
print((2*precision*recall)/f_denominator if f_denominator else 0.0)

In [None]:
#Non-linear ML algorithms

In [None]:
import pandas as pd
# Re-load the CSV, replacing the `df` whose continuous columns were standardised earlier.
df = pd.read_csv('test_data.csv')
df.head()

In [None]:
# Remove exact duplicate rows from the re-loaded frame.
df = df.drop_duplicates()

In [None]:
# Features: every column except the last. Target: the last column.
X = df.iloc[:, :-1].values  
y = df.iloc[:, -1].values


In [None]:
from sklearn.model_selection import train_test_split
# 80/20 split with the same seed, now on the re-loaded, de-duplicated and unscaled data.
# NOTE(review): this rebinds X_train/X_test/y_train/y_test, so the split that
# y_pred1..y_pred3 were produced from is no longer reachable under these names.
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = 0.2, random_state = 42)

In [None]:
#Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree with default hyper-parameters. No random_state is set, so results
# can vary between runs.
dt = DecisionTreeClassifier()
dt.fit(X_train,y_train)

In [None]:
# Decision-tree predictions for the test split and their accuracy.
y_pred4 = dt.predict(X_test)
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred4)

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the decision-tree predictions (rows: true, columns: predicted).
matrix= confusion_matrix(y_test, y_pred4)
sns.heatmap(matrix,annot = True, fmt = "d")

In [None]:
from sklearn.metrics import precision_score, recall_score

# Micro-averaged precision of the decision-tree predictions.
# `pos_label` is only honoured with average='binary'; with average='micro'
# scikit-learn ignores it and emits a UserWarning, so it is not passed.
precision = precision_score(y_test, y_pred4, average='micro')
print("Precision: ",precision)

# Micro-averaged recall, using the same averaging as the precision.
recall = recall_score(y_test, y_pred4, average='micro')
print("Recall is: ",recall)

In [None]:
#F-Score
# Harmonic mean of the current `precision` and `recall`.
# When both are 0 the ratio is 0/0; report 0.0 for that case instead of
# raising or printing nan.
f_denominator = precision + recall
print((2*precision*recall)/f_denominator if f_denominator else 0.0)

In [None]:
#Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with default hyper-parameters. No random_state is set, so results
# can vary between runs.
rf = RandomForestClassifier()
rf.fit(X_train,y_train)

In [None]:
# Random-forest predictions for the test split and their accuracy.
y_pred5 = rf.predict(X_test)
accuracy_score(y_test,y_pred5)

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the random-forest predictions (rows: true, columns: predicted).
matrix= confusion_matrix(y_test, y_pred5)
sns.heatmap(matrix,annot = True, fmt = "d")

In [None]:
from sklearn.metrics import precision_score, recall_score

# Micro-averaged precision of the random-forest predictions.
# `pos_label` is only honoured with average='binary'; with average='micro'
# scikit-learn ignores it and emits a UserWarning, so it is not passed.
precision = precision_score(y_test, y_pred5, average='micro')
print("Precision: ",precision)

# Micro-averaged recall, using the same averaging as the precision.
recall = recall_score(y_test, y_pred5, average='micro')
print("Recall is: ",recall)

In [None]:
#F-Score
# Harmonic mean of the current `precision` and `recall`.
# When both are 0 the ratio is 0/0; report 0.0 for that case instead of
# raising or printing nan.
f_denominator = precision + recall
print((2*precision*recall)/f_denominator if f_denominator else 0.0)

In [None]:
#Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting with default hyper-parameters, fitted on the training split.
gbc = GradientBoostingClassifier()
gbc.fit(X_train,y_train)


In [None]:
# Gradient-boosting predictions for the test split and their accuracy.
y_pred6 = gbc.predict(X_test)
accuracy_score(y_test,y_pred6)

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the gradient-boosting predictions (rows: true, columns: predicted).
matrix= confusion_matrix(y_test, y_pred6)
sns.heatmap(matrix,annot = True, fmt = "d")

In [None]:
from sklearn.metrics import precision_score, recall_score

# Micro-averaged precision of the gradient-boosting predictions.
# `pos_label` is only honoured with average='binary'; with average='micro'
# scikit-learn ignores it and emits a UserWarning, so it is not passed.
precision = precision_score(y_test, y_pred6, average='micro')
print("Precision: ",precision)

# Micro-averaged recall, using the same averaging as the precision.
recall = recall_score(y_test, y_pred6, average='micro')
print("Recall is: ",recall)

In [None]:
#F-Score
# Harmonic mean of the current `precision` and `recall`.
# When both are 0 the ratio is 0/0; report 0.0 for that case instead of
# raising or printing nan.
f_denominator = precision + recall
print((2*precision*recall)/f_denominator if f_denominator else 0.0)

In [None]:
# Test accuracy of every model, collected into one frame for comparison.
# NOTE(review): `y_test` here is the split of the re-loaded, de-duplicated data,
# while y_pred1..y_pred3 were predicted on the earlier split of the scaled frame.
# If drop_duplicates removed any rows the lengths differ and accuracy_score
# raises -- verify that the two splits line up.
_model_predictions = [
    ('LR', y_pred1),
    ('SVM', y_pred2),
    ('KNN', y_pred3),
    ('DT', y_pred4),
    ('RF', y_pred5),
    ('GB', y_pred6),
]
final_data = pd.DataFrame({
    'Models': [label for label, _ignored in _model_predictions],
    'ACC': [accuracy_score(y_test, predicted) for _ignored, predicted in _model_predictions],
})

In [None]:
# Accuracy comparison table.
final_data

In [None]:
import seaborn as sns


In [None]:
# Bar chart of test accuracy per model, one 'hls' palette colour per bar.
# `color='blue'` was dropped: it conflicts with `palette`, which takes
# precedence when both are given, so it never had an effect.
sns.barplot(x = final_data.Models, y = final_data.ACC, palette = 'hls')

In [None]:
#Prediction on new data

In [None]:
import pandas as pd

# Hand-written demo frame: 5 rows, 19 feature columns with values between 0 and 1,
# plus the 'Disease' label. It replaces the `df` loaded from the CSV.
# NOTE(review): every row is labelled 1, so a model trained on this frame only ever
# sees a single class.
data = {
    'Glucose': [0.001827, 0.436679, 0.545697, 0.172994, 0.758534],
    'Cholesterol': [0.033693, 0.972653, 0.324815, 0.050351, 0.739968],
    'Hemoglobin': [0.114755, 0.084998, 0.584467, 0.736, 0.597868],
    'Platelets': [0.997927, 0.180909, 0.475748, 0.782022, 0.772683],
    'White Blood Cells': [0.562604, 0.675736, 0.558596, 0.069435, 0.87572],
    'Red Blood Cells': [0.866499, 0.563889, 0.661007, 0.085219, 0.860265],
    'Hematocrit': [0.578042, 0.798382, 0.934056, 0.032907, 0.486189],
    'Mean Corpuscular Volume': [0.914615, 0.670361, 0.381782, 0.460619, 0.486686],
    'Mean Corpuscular Hemoglobin': [0.026864, 0.376092, 0.500342, 0.785448, 0.621048],
    'Mean Corpuscular Hemoglobin Concentration': [0.038641, 0.18489, 0.531829, 0.491495, 0.191756],
    'HbA1c': [0.65323, 0.83354, 0.678901, 0.3815, 0.993381],
    'LDL Cholesterol': [0.186104, 0.153001, 0.220479, 0.459396, 0.272338],
    'HDL Cholesterol': [0.430398, 0.458533, 0.817151, 0.420154, 0.663579],
    'ALT': [0.016678, 0.401845, 0.690981, 0.798537, 0.265227],
    'AST': [0.885352, 0.635969, 0.101633, 0.399236, 0.918847],
    'Heart Rate': [0.652733, 0.574425, 0.85574, 0.3246, 0.80491],
    'Creatinine': [0.788235, 0.047025, 0.551124, 0.499504, 0.571119],
    'Troponin': [0.054788, 0.607985, 0.413294, 0.436662, 0.188368],
    'C-reactive Protein': [0.031313, 0.594123, 0.070909, 0.242766, 0.750848],
    'Disease': [1, 1, 1, 1, 1]  # Assuming 1 means diseased
}

df = pd.DataFrame(data)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Encoding the target variable
# NOTE(review): LabelEncoder maps the sorted distinct labels to 0..n-1. Every row of
# the demo frame has Disease == 1, so the encoded target is all zeros (1 -> 0);
# anything that reads the raw model output must account for that.
label_encoder = LabelEncoder()
df['Disease'] = label_encoder.fit_transform(df['Disease'])

# Separate features and target
X = df.drop(columns=['Disease'])
y = df['Disease']

# Split the data (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn import tree
# Decision tree fitted on the training rows of the demo frame (features kept as a DataFrame).
dtc = tree.DecisionTreeClassifier()
dtc = dtc.fit(X_train,y_train)


In [None]:
# Example new patient: one value per feature, keyed by the same column names
# (and in the same order) as the training frame.
new_patient_data = {
    'Glucose': 0.5,
    'Cholesterol': 0.4,
    'Hemoglobin': 0.6,
    'Platelets': 0.7,
    'White Blood Cells': 0.6,
    'Red Blood Cells': 0.5,
    'Hematocrit': 0.8,
    'Mean Corpuscular Volume': 0.6,
    'Mean Corpuscular Hemoglobin': 0.5,
    'Mean Corpuscular Hemoglobin Concentration': 0.4,
    'HbA1c': 0.7,
    'LDL Cholesterol': 0.3,
    'HDL Cholesterol': 0.6,
    'ALT': 0.4,
    'AST': 0.7,
    'Heart Rate': 0.6,
    'Creatinine': 0.5,
    'Troponin': 0.4,
    'C-reactive Protein': 0.6
}

# Convert new patient data to a single-row DataFrame
new_patient_df = pd.DataFrame([new_patient_data])

# No extra preprocessing is applied here: the features are passed to the model
# exactly as entered.

# Make a prediction (an encoded class index, as produced by the LabelEncoder)
prediction = dtc.predict(new_patient_df)

# Map the encoded index back to the original 'Disease' label
predicted_disease = label_encoder.inverse_transform(prediction)

# The comparison is against the original label value, where 1 is taken to mean diseased
if predicted_disease[0] == 1:
    print("The patient is predicted to be diseased.")
else:
    print("The patient is predicted to be healthy.")


In [None]:
#Save model using Joblib

In [None]:
import joblib

In [None]:
# Persist the fitted decision tree to the file 'model_joblib_disease' in the working directory.
joblib.dump(dtc, 'model_joblib_disease')

In [None]:
# Load the persisted model back. joblib files are pickle-based, so only load files you trust.
model = joblib.load('model_joblib_disease')

In [None]:
#Graphical user Interface

In [None]:
import tkinter as tk
from tkinter import Label, Button, Entry
import joblib
import threading

def show_entry_fields():
    """Read the 19 entry widgets, run the saved model and display the verdict.

    Callback of the "Predict" button. Relies on the module-level globals
    ``master`` and ``e1`` .. ``e19`` that ``run_tkinter`` creates. The values
    are passed to the model in the order e1 .. e19.

    The message is placed in grid row 21 of ``master``. Any widget an earlier
    click left in that row is destroyed first, so messages of different
    lengths never overlap and labels do not pile up.

    A ``ValueError`` raised anywhere in the handler (non-numeric input, or the
    model rejecting the values) produces the "valid numeric values" message.
    Other errors, such as a missing model file, are not caught.

    NOTE(review): a raw prediction of 0 is reported as "healthy". The model
    file loaded here is written from a classifier trained on
    LabelEncoder-transformed targets, so 0 is an encoded class index rather
    than an original label -- confirm that index 0 really means "healthy".
    """
    def _show_message(text):
        # Replace, rather than stack on top of, whatever row 21 currently shows.
        for stale_widget in master.grid_slaves(row=21):
            stale_widget.destroy()
        Label(master, text=text).grid(row=21, columnspan=2)

    input_fields = (e1, e2, e3, e4, e5, e6, e7, e8, e9, e10,
                    e11, e12, e13, e14, e15, e16, e17, e18, e19)
    try:
        values = [float(field.get()) for field in input_fields]

        # The model is re-read from disk on every click.
        model = joblib.load('model_joblib_disease')
        result = model.predict([values])

        if result[0] == 0:
            _show_message("The patient is predicted to be healthy")
        else:
            _show_message("The patient is predicted to be diseased")
    except ValueError:
        _show_message("Please enter valid numeric values")

def run_tkinter():
    """Build the "Disease Prediction" window and run its event loop.

    Creates the Tk root, a header, one labelled entry per feature (rows 1-19)
    and a "Predict" button (row 20) wired to ``show_entry_fields``. The root
    and the 19 entries are published as the globals ``master`` and
    ``e1`` .. ``e19`` for that callback. Blocks until the window is closed.
    """
    global master, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19

    master = tk.Tk()
    master.title("Disease Prediction")

    header = Label(master, text="Disease Prediction System", bg="black", fg="white")
    header.grid(row=0, columnspan=2)

    # One prompt per model feature, in the order the values are fed to the model.
    prompts = (
        "Enter Glucose",
        "Enter Cholesterol",
        "Enter Hemoglobin",
        "Enter Platelets",
        "Enter White Blood Cells",
        "Enter Red Blood Cells",
        "Enter Hematocrit",
        "Enter Mean Corpuscular Volume",
        "Enter Mean Corpuscular Hemoglobin",
        "Enter Mean Corpuscular Hemoglobin Concentration",
        "Enter HbA1c",
        "Enter LDL Cholesterol",
        "Enter HDL Cholesterol",
        "Enter ALT",
        "Enter AST",
        "Enter Heart Rate",
        "Enter Creatinine",
        "Enter Troponin",
        "Enter C-reactive Protein",
    )

    fields = []
    for grid_row, prompt in enumerate(prompts, start=1):
        # Prompt in column 0, its entry box next to it in column 1.
        Label(master, text=prompt).grid(row=grid_row)
        field = Entry(master)
        field.grid(row=grid_row, column=1)
        fields.append(field)

    # Unpacking fails loudly if the number of prompts ever stops being 19.
    e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19 = fields

    predict_button = Button(master, text="Predict", command=show_entry_fields)
    predict_button.grid(row=20, columnspan=2)

    master.mainloop()

# Run the tkinter GUI in a separate thread so the cell returns immediately.
# NOTE(review): the Tk root is created and its mainloop runs entirely inside this
# worker thread. Some platforms only support GUI toolkits on the main thread --
# confirm this works on the target OS. The thread is started without daemon=True.
thread = threading.Thread(target=run_tkinter)
thread.start()

           