1. Loading libraries and dataset

In [None]:
# Import the module-level name `x` from the project package under the alias
# `x_coef` and echo its value as the cell output.
# NOTE(review): looks like an import smoke test for `asi_project` — confirm it is still needed.
from asi_project.main import x as x_coef
x_coef

In [None]:
# Manipulation and Visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Libraries for Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score

# Models
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import pickle

In [None]:
# Load the liver dataset; the path is relative to the notebook's working directory.
liver_csv_path = '../data/liver_data.csv'
df = pd.read_csv(liver_csv_path)

2. Data analysis

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

In [None]:
# Histogram of AlcoholConsumption. The plot is styled through the Axes object
# that pandas returns, so this cell needs no extra pyplot import and does not
# depend on whichever axes happen to be "current".
ax = df['AlcoholConsumption'].plot(kind='hist', bins=20, title='AlcoholConsumption')
ax.set_xlabel('AlcoholConsumption')  # label the x-axis with the plotted column
# Hide the top and right frame lines for a cleaner look.
ax.spines[['top', 'right']].set_visible(False)

In [None]:
# Histogram of BMI. The plot is styled through the Axes object that pandas
# returns, so this cell needs no extra pyplot import and does not depend on
# whichever axes happen to be "current".
ax = df['BMI'].plot(kind='hist', bins=20, title='BMI')
ax.set_xlabel('BMI')  # label the x-axis with the plotted column
# Hide the top and right frame lines for a cleaner look.
ax.spines[['top', 'right']].set_visible(False)

In [None]:
# Histogram of Age. The plot is styled through the Axes object that pandas
# returns, so this cell needs no extra pyplot import and does not depend on
# whichever axes happen to be "current".
ax = df['Age'].plot(kind='hist', bins=20, title='Age')
ax.set_xlabel('Age')  # label the x-axis with the plotted column
# Hide the top and right frame lines for a cleaner look.
ax.spines[['top', 'right']].set_visible(False)

In [None]:
# Print a summary of the frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Boxplots for the continuous features, one panel per column.
numeric_cols = ['BMI', 'AlcoholConsumption', 'PhysicalActivity', 'LiverFunctionTest']

# Size the grid from the number of columns (ceil division) so that every row is
# used; the previous 4x2 layout left the lower half of the figure empty.
n_grid_cols = 2
n_grid_rows = (len(numeric_cols) + n_grid_cols - 1) // n_grid_cols

# squeeze=False keeps `axes` two-dimensional even for a single row.
fig, axes = plt.subplots(n_grid_rows, n_grid_cols, figsize=(12, 6), squeeze=False)
panels = axes.ravel()

for ax, column in zip(panels, numeric_cols):
    sns.boxplot(x=df[column], ax=ax)

# Hide any leftover panel when the column count does not fill the grid.
for unused_ax in panels[len(numeric_cols):]:
    unused_ax.set_visible(False)

fig.suptitle('Boxplots for numerical variables')
fig.tight_layout()
plt.show()

In [None]:
# Pairwise scatter/density plots of the continuous features, coloured by the target.
pairplot_cols = ['BMI', 'AlcoholConsumption', 'PhysicalActivity', 'LiverFunctionTest', 'Diagnosis']
sns.pairplot(df[pairplot_cols], hue='Diagnosis')
plt.show()

In [None]:
# Correlation matrix of all columns, drawn as an annotated heatmap.
corr = df.corr()

# Keep the figure in its own names: assigning the result of `plt.figure(...)`
# back to `plt` would replace the pyplot module with a Figure object and break
# every later `plt.*` call in the notebook.
fig, ax = plt.subplots(figsize=(10, 10))

sns.heatmap(corr, annot=True, cmap="coolwarm", ax=ax)
plt.show()

In [None]:
# Separate the target column from the predictor columns.
target_col = "Diagnosis"
y = df[target_col]
X = df.drop(columns=[target_col])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Standardize the features: the scaling statistics are learned from the
# training split only and then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Random Forest: build the classifier and fit it on the scaled training split.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)

# Accuracy on the held-out split.
y_pred_rf = rf_model.predict(X_test_scaled)
rf_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
print(f"Random Forest Accuracy: {rf_accuracy:.2f}")

In [None]:
# XGBoost: build the classifier and fit it on the scaled training split.
xgboost_model = XGBClassifier(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)

# Accuracy on the held-out split.
y_pred_xgboost = xgboost_model.predict(X_test_scaled)
xgboost_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_xgboost)
print(f"XGBoost Accuracy: {xgboost_accuracy:.2f}")

In [None]:
# LightGBM: build the classifier and fit it on the scaled training split.
lgbm_model = LGBMClassifier(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)

# Accuracy on the held-out split.
y_pred_lgbm = lgbm_model.predict(X_test_scaled)
lgbm_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_lgbm)
print(f"LightGBM Accuracy: {lgbm_accuracy:.2f}")

In [None]:
# Persist the fitted Random Forest classifier as a pickle file.
with open('../models/rf_model.pkl', mode='wb') as model_file:
    pickle.dump(rf_model, model_file)

In [None]:
# Persist the fitted XGBoost classifier as a pickle file.
with open('../models/xgboost_model.pkl', mode='wb') as model_file:
    pickle.dump(xgboost_model, model_file)

In [None]:
# Persist the fitted LightGBM classifier as a pickle file.
with open('../models/lgbm_model.pkl', mode='wb') as model_file:
    pickle.dump(lgbm_model, model_file)

In [None]:
# NOTE(review): the three models in this notebook are fit on `scaler`-transformed
# features, but the scaler itself is never written to disk because the dump
# below is commented out. Confirm whether inference code needs the fitted
# scaler, then re-enable this cell or delete it.
# with open('scaler.pkl', 'wb') as f:
#     pickle.dump(scaler, f)