In [None]:
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
import os 
import pandas as pd

In [None]:
df = pd.read_csv("liver-disease.csv")

In [None]:
df

**EXPLORATORY DATA ANALYSIS :-**

In [None]:
df.info()

In [None]:
df.dtypes[df.dtypes=='object']

Distribution of Numberical Features

In [None]:
df.hist(figsize=(15,15), xrot=-45,bins=10)
plt.show()

In [None]:
df.describe()

In [None]:
def convertdataset(x):
    if x==2:
        return 0
    return 1
df['Dataset'] = df['Dataset'].map(convertdataset)

In [None]:
df.head()

In [None]:
df.Dataset.value_counts()

In [None]:
df.describe(include=['object'])

Bar plots for categorical features

In [None]:
import seaborn as sns

plt.figure(figsize=(5,5))
sns.countplot(y='Gender', data=df)

In [None]:
df[df['Gender'] == 'Male'][['Dataset','Gender']].head()

In [None]:
sns.catplot(x="Age", y="Gender", hue="Dataset", data=df)

In [None]:
df['Gender'].value_counts()

In [None]:
# Categorical Value Handling
def convertgender(x):
    if x== 'Male':
        return 0
    else:
        return 1
df['Gender'] = df['Gender'].map(convertgender)

In [None]:
df.head()


Correlation

In [None]:
df.corr()

Positive Correlation-> one feature increases other also increases
Negative Correlation-> one feature increases other decreases
closer to 0-> weak relationship

In [None]:
plt.figure(figsize=(10,10))
sns.heatmap(df.corr())

In [None]:
from matplotlib.colors import ListedColormap

mask = np.zeros_like(df.corr())
mask[np.triu_indices_from(mask)] = True
plt.figure(figsize=(10,10))
with sns.axes_style("white"):
    ax = sns.heatmap(df.corr()*100, mask=mask, fmt = ".0f", annot=True, lw=1, cmap=ListedColormap(['green','yellow','red','blue']))

In [None]:
df = df.drop_duplicates()

In [None]:
df.shape

Removing Outlier

In [None]:
df.columns

In [None]:
sns.boxplot(df.Aspartate_Aminotransferase)

In [None]:
sns.boxplot(df.Total_Bilirubin)

In [None]:
df.Aspartate_Aminotransferase.sort_values(ascending=False).head()

In [77]:
df = df[df.Aspartate_Aminotransferase<=3000]

In [78]:
df.shape

(569, 11)

In [79]:
df.Aspartate_Aminotransferase.sort_values(ascending=False).head()

117    2946
118    1600
207    1500
119    1050
199    1050
Name: Aspartate_Aminotransferase, dtype: int64

In [80]:
df = df[df.Aspartate_Aminotransferase<=2500]

In [81]:
df.shape

(568, 11)

In [82]:
df.isnull().sum()


Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    4
Dataset                       0
dtype: int64

In [83]:
df = df.dropna(how='any')

In [84]:
df.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,1,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,0,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,0,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,0,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,0,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [85]:
df.shape

(564, 11)

Machine Learning Model

In [86]:
# Data Preparation
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.svm import SVC

In [87]:
y=df.Dataset
X=df.drop('Dataset', axis=1)

In [88]:
# Encode categorical columns
label_encoder = LabelEncoder()
df["Gender"] = label_encoder.fit_transform(df["Gender"])

In [89]:
X_train, X_test, y_train , y_test = train_test_split(X,y, test_size=0.2, random_state=0, stratify=y)

Data Standardization

In [90]:
# Standardization
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test) 

In [91]:
# All models
models = {
    "Logistic Regression": LogisticRegression(),
    "Ridge Classifier": RidgeClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "SVM (Linear)": SVC(kernel='linear'),
    "SVM (RBF Kernel)": SVC(kernel='rbf')
}

MODEL EVALUATION

In [92]:
# Train & Evaluate Models
model_results = {}

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    model_results[name] = accuracy
    print(f"\n{name} Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))


Logistic Regression Accuracy: 0.7699
              precision    recall  f1-score   support

           0       0.69      0.34      0.46        32
           1       0.78      0.94      0.85        81

    accuracy                           0.77       113
   macro avg       0.74      0.64      0.66       113
weighted avg       0.76      0.77      0.74       113


Ridge Classifier Accuracy: 0.7257
              precision    recall  f1-score   support

           0       0.67      0.06      0.11        32
           1       0.73      0.99      0.84        81

    accuracy                           0.73       113
   macro avg       0.70      0.53      0.48       113
weighted avg       0.71      0.73      0.63       113


Decision Tree Accuracy: 0.6195
              precision    recall  f1-score   support

           0       0.37      0.50      0.43        32
           1       0.77      0.67      0.72        81

    accuracy                           0.62       113
   macro avg       0.57

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [93]:
# Hyperparameter Tuning for Random Forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}
grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_scaled, y_train)
best_rf = grid_search.best_estimator_

In [94]:
# Evaluate Tuned Random Forest
y_pred_rf = best_rf.predict(X_test_scaled)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
model_results["Tuned Random Forest"] = rf_accuracy
print("\nTuned Random Forest Accuracy:", rf_accuracy)
print(classification_report(y_test, y_pred_rf))


Tuned Random Forest Accuracy: 0.7079646017699115
              precision    recall  f1-score   support

           0       0.48      0.41      0.44        32
           1       0.78      0.83      0.80        81

    accuracy                           0.71       113
   macro avg       0.63      0.62      0.62       113
weighted avg       0.69      0.71      0.70       113



Model Comparison Visualization

In [95]:
# Convert model results into a DataFrame
model_comparison_df = pd.DataFrame(model_results.items(), columns=["Model", "Accuracy"])

# Sort models by accuracy in descending order
model_comparison_df = model_comparison_df.sort_values(by="Accuracy", ascending=False)

print(model_comparison_df)

                 Model  Accuracy
0  Logistic Regression  0.769912
1     Ridge Classifier  0.725664
7         SVM (Linear)  0.716814
8     SVM (RBF Kernel)  0.716814
9  Tuned Random Forest  0.707965
4    Gradient Boosting  0.690265
3        Random Forest  0.681416
5              XGBoost  0.681416
6  K-Nearest Neighbors  0.663717
2        Decision Tree  0.619469


In [96]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import pickle

# Load dataset
df = pd.read_csv('liver-disease.csv')

# Encode categorical features
label_encoders = {}
categorical_columns = ['Gender']

for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

X = df.drop(columns=['Dataset']) 
y = df['Dataset']

# Handle missing values by filling with the mean of each column
X = X.fillna(X.mean())

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Logistic Regression Model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = model.predict(X_test_scaled)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Save Model & Scaler
with open('model3.pkl', 'wb') as file:
    pickle.dump(model, file)

with open('scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

print("Model and Scaler Saved Successfully!")


# Save the model
with open('model3.pkl', 'wb') as file:
    pickle.dump(model, file)
with open('model3.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

# If you used StandardScaler, save it as well
scaler = StandardScaler()
scaler.fit(X_train)

with open('scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)
with open('scaler.pkl', 'rb') as file:
    loaded_scaler = pickle.load(file)
    
    
print(" Model and Scaler saved👌")

Accuracy: 0.7606837606837606
Classification Report:
               precision    recall  f1-score   support

           1       0.79      0.92      0.85        87
           2       0.56      0.30      0.39        30

    accuracy                           0.76       117
   macro avg       0.68      0.61      0.62       117
weighted avg       0.73      0.76      0.73       117

Model and Scaler Saved Successfully!
 Model and Scaler saved👌


In [97]:
import joblib
model = joblib.load('model3.pkl')
print(type(model))

<class 'sklearn.linear_model._logistic.LogisticRegression'>


In [98]:


# Test the loaded model
print(loaded_model.predict(X_test_scaled))

[2 1 1 1 1 1 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1
 1 1 1 1 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2
 1 1 2 1 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1
 1 1 1 1 2 2]


In [99]:
import pickle
import numpy as np

# Load model
with open('model3.pkl', 'rb') as file:
    model = pickle.load(file)

# Load scaler (if used)
try:
    with open('scaler.pkl', 'rb') as file:
        scaler = pickle.load(file)
except FileNotFoundError:
    scaler = None  # If no scaler, we'll proceed without it

print("Model Loaded:", model)
print("Scaler Loaded:", scaler)


Model Loaded: LogisticRegression()
Scaler Loaded: StandardScaler()


In [100]:
test_inputs = [
    [45, 1, 1.2, 0.5, 220, 50, 30, 6.8, 3.4, 1.1],  # Example 1
    [25, 0, 0.8, 0.3, 180, 40, 25, 7.2, 4.0, 1.3],  # Example 2
]

for i, test in enumerate(test_inputs):
    test_array = np.array(test).reshape(1, -1)
    
    if scaler:
        test_array = scaler.transform(test_array)
    
    pred = model.predict(test_array)[0]
    print(f"Test Case {i+1}: Prediction = {'Liver Disease' if pred == 1 else 'No Liver Disease'}")


Test Case 1: Prediction = Liver Disease
Test Case 2: Prediction = No Liver Disease


