## Importing the Relevant Library

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Importing data

In [None]:
# Read the Indian Liver Patient Records CSV and preview the first rows.
csv_path = '../input/indian-liver-patient-records/indian_liver_patient.csv'
data = pd.read_csv(csv_path)
data.head()

Dataset Column<br>
* 1 - Patient with liver disease
* 2 - Patient with no disease

#### Checking for Null Values

In [None]:
# Count the missing values in every column.
data.isnull().sum()

Look at the rows where `Albumin_and_Globulin_Ratio` is null.

In [None]:
# Show only the rows whose Albumin_and_Globulin_Ratio is missing.
ratio_is_missing = data['Albumin_and_Globulin_Ratio'].isna()
data.loc[ratio_is_missing]

Choose a suitable statistic to replace the null values.

In [None]:
# Summary statistics of the ratio column, used to pick an imputation value.
data['Albumin_and_Globulin_Ratio'].describe()

In [None]:
# Histogram of the ratio column to inspect the shape of its distribution.
data['Albumin_and_Globulin_Ratio'].plot.hist()

Here the data are roughly normally distributed but slightly right-skewed, so we can impute with either the mean or the median. Here I am going with the median, which is less sensitive to the skew.

In [None]:
# Median of the ratio column: the value used below to fill the missing entries.
data['Albumin_and_Globulin_Ratio'].median()

In [None]:
# Checkpoint: work on a copy so the raw `data` frame stays untouched.
df = data.copy()

In [None]:
# Fill the missing ratios with the column median computed on the raw data.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: that chained in-place form
# raises a FutureWarning on pandas 2.x and, with copy-on-write enabled,
# leaves `df` unchanged.
ratio_median = data['Albumin_and_Globulin_Ratio'].median()
df['Albumin_and_Globulin_Ratio'] = df['Albumin_and_Globulin_Ratio'].fillna(ratio_median)
# Confirm that no nulls remain.
df.isnull().sum()

#### Looking for data description

In [None]:
# Descriptive statistics with the 30th, 50th and 80th percentiles, rounded to 2 decimals.
df.describe(percentiles=[0.3,.5,.8]).round(2)

In [None]:
# Report the size of the working frame.
n_rows, n_cols = df.shape
print(f'Total number of Rows {n_rows}\nTotal number of Columns {n_cols}')

In [None]:
# Pairwise scatter plots of all features, coloured by the Dataset label.
sns.pairplot(df, hue='Dataset', palette='viridis')

#### Looking into univariate features

Lets look for Total Bilirubin and Direct Bilirubin
* Normal results for a total bilirubin test are 1.2 milligrams per deciliter (mg/dL) for adults and usually 1 mg/dL for those under 18.
* Normal results for direct bilirubin are generally 0.3 mg/dL.

*https://www.mayoclinic.org/tests-procedures/bilirubin/about/pac-20393041#:~:text=Normal%20results%20for%20a%20total,are%20generally%200.3%20mg%2FdL.*


In [None]:
# Vertical box plot of Total_Bilirubin on a tall, narrow canvas.
fig, ax = plt.subplots(figsize=(5,7))
sns.boxplot(data=df['Total_Bilirubin'], orient='v', ax=ax)
ax.set_ylabel("Total Bilirubin")

Here we can see that the dataset contains extreme outliers: a normal Total Bilirubin value is around 1.2, but the data go up to 75. These values are either erroneous or extreme outliers.

In [None]:
# Vertical box plot of Direct_Bilirubin, drawn in red.
fig, ax = plt.subplots(figsize=(5,7))
sns.boxplot(data=df['Direct_Bilirubin'], color='Red', orient='v', ax=ax)
ax.set_ylabel("Direct Bilirubin")

Here too we can see that the dataset contains extreme outliers: a normal Direct Bilirubin value is around 0.3, but the data go up to 20. These values are either erroneous or extreme outliers.

In [None]:
# Distribution of Alkaline_Phosphotase.
df['Alkaline_Phosphotase'].hist()

In [None]:
# Distribution of Aspartate_Aminotransferase.
df['Aspartate_Aminotransferase'].hist()

In [None]:
# Distribution of Total_Protiens (column name spelled as in the source CSV).
df['Total_Protiens'].hist()

In [None]:
# Distribution of Albumin.
df['Albumin'].hist()

In [None]:
# Distribution of Albumin_and_Globulin_Ratio after imputation.
df['Albumin_and_Globulin_Ratio'].hist()

In [None]:
# Count of patients per condition, split by gender, with the bar heights written above the bars.
plt.figure(figsize=(6,6))
condition = df['Dataset'].apply(lambda code: 'Liver Disease' if code == 1 else 'Non-Liver Disease')
ax = sns.countplot(x=condition, hue=df['Gender'])
ax.set_xlabel('Patient Condition')
for bar in ax.patches:
    bar_height = bar.get_height()
    # Offset the label slightly right of the bar's left edge and just above its top.
    ax.annotate(f'{bar_height}', (bar.get_x()+0.15, bar_height+3))

Let's look at the age categories in the dataset

In [None]:
# Share of patients per age bucket: 90 and over -> Old_Age, over 21 -> Adult_Age, otherwise Young_Age.
age = df['Age']
age_bucket = np.select([age >= 90, age > 21], ['Old_Age', 'Adult_Age'], default='Young_Age')
pd.Series(age_bucket).value_counts(normalize=True)

* 91.5% of data are from ages 22-89.
* 8% of data are from below 22 Age.
* 0.1% of data are from above 89 Age

In [None]:
# Preview the working frame.
df.head()

In [None]:
# Bar chart of Total_Protiens summed per gender.
protein_by_gender = df.groupby('Gender')['Total_Protiens'].sum()
protein_by_gender.plot.bar(color='#253660').set_ylabel('Total_Proteins')

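The summed Total_Protiens is higher for males than for females. Note that there are also far fewer females in the dataset, so a sum mostly reflects the group sizes rather than per-patient protein levels.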

In [None]:
# Checkpoint: copy the cleaned frame before encoding, so `df` keeps the original columns.
df2 = df.copy()

In [None]:
# Preview the checkpoint copy.
df2.head()

#### Lets Encode the nominal features

In [None]:
# One-hot encode the two nominal columns. drop_first=True drops the first level
# of each, leaving the Gender_Male and Dataset_2 indicator columns that the
# next cell renames and reorders.
df2 = pd.get_dummies(data=df2,columns=['Gender','Dataset'], drop_first=True)
df2.head()

In [None]:
# Rename the target indicator and put the columns in modelling order
# (gender flag first, target last).
# NOTE(review): Dataset_2 flags rows where Dataset == 2, which the legend near
# the top of this notebook describes as "Patient with no disease". As written,
# Have_Disease == 1 therefore marks patients WITHOUT liver disease, and the
# precision/recall printed later treat that class as the positive one.
# Confirm the intended polarity before interpreting the scores.
ordered_columns = [
    'Gender_Male', 'Age', 'Total_Bilirubin', 'Direct_Bilirubin',
    'Alkaline_Phosphotase', 'Alamine_Aminotransferase',
    'Aspartate_Aminotransferase', 'Total_Protiens', 'Albumin',
    'Albumin_and_Globulin_Ratio', 'Have_Disease',
]
df2 = df2.rename(columns={'Dataset_2': 'Have_Disease'})[ordered_columns]

In [None]:
# Preview the encoded, reordered frame.
df2.head()

#### Looking for Correlation between features

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of df2.
corr_fig, corr_ax = plt.subplots(figsize=(15,6))
feature_corr = df2.corr()
sns.heatmap(feature_corr, cmap='GnBu', annot=True, ax=corr_ax)

Here we can conclude that,
* Total_Bilirubin and Direct_Bilirubin are highly correlated; so are Alamine_Aminotransferase and Aspartate_Aminotransferase, and Total_Protiens and Albumin.
* We can delete one feature from each correlated pair to increase the model training speed and accuracy.
* But this is not always true. For reference, look into this:
https://datascience.stackexchange.com/questions/24452/in-supervised-learning-why-is-it-bad-to-have-correlated-features

In [None]:
# Checkpoint: drop one feature from each highly correlated pair.
correlated_features = ['Direct_Bilirubin', 'Aspartate_Aminotransferase', 'Albumin']
df3 = df2.drop(columns=correlated_features)

In [None]:
# Number of rows per target class, to check the class balance.
df3['Have_Disease'].value_counts()

### Here the dataset contains unbalanced classes, so we will try to resample our data to reduce the error.

In [None]:
# Descriptive statistics with the 30th, 60th and 90th percentiles, to compare feature scales before scaling.
df3.describe(percentiles=[0.30,0.60,0.90])

#### Scaling the data

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Standardise the six columns between the gender flag (first column) and the
# target (last column); those two binary columns are passed through unscaled.
numeric_cols = df3.columns[1:7]
# checkpoint
# Build the scaled frame on df3's own index and column names so the three
# pieces are concatenated by matching labels rather than by relying on df3
# having a default 0..n-1 index.
# NOTE(review): the scaler is fitted on all rows before ml_algorthims performs
# its train/test split, so test-set statistics influence the training features.
scaled = pd.DataFrame(sc.fit_transform(df3[numeric_cols]), index=df3.index, columns=numeric_cols)
df4 = pd.concat([df3['Gender_Male'], scaled, df3['Have_Disease']], axis=1)
df4.head()

### Building our models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split

We will look for metrics

In [None]:
from sklearn.metrics import confusion_matrix, precision_score, accuracy_score, recall_score

<img src="https://static.packt-cdn.com/products/9781838555078/graphics/C13314_06_05.jpg"/>&nbsp;&nbsp;&nbsp;&nbsp;


<img src="https://cdn.analyticsvidhya.com/wp-content/uploads/2020/04/Equation_Accuracy.png"/>&nbsp;&nbsp;&nbsp;&nbsp;
* Accuracy is defined as the ratio of correctly predicted examples by the total examples.
* Remember, accuracy is a very useful metric when all the classes are equally important. But this might not be the case if we are predicting if a patient has Liver Cancer. In this example, we can probably tolerate FPs but not FNs.

<img src="https://cdn.analyticsvidhya.com/wp-content/uploads/2020/04/Confusion-matrix_Precision.png"/>&nbsp;&nbsp;&nbsp;&nbsp;
* Precision tells us how many of the cases predicted as positive actually turned out to be positive.
* Precision is a useful metric in cases where False Positive is a higher concern than False Negatives.
* Precision is important in music or video recommendation systems, e-commerce websites, etc. Wrong results could lead to customer churn and be harmful to the business.

<img src="https://cdn.analyticsvidhya.com/wp-content/uploads/2020/04/Confusion-matrix_Recall.png"/>&nbsp;&nbsp;&nbsp;&nbsp;
* Recall tells us how many of the actual positive cases we were able to predict correctly with our model.
* Recall is a useful metric in cases where False Negative trumps False Positive.
* Recall is important in medical cases where it doesn’t matter whether we raise a false alarm but the actual positive cases should not go undetected!

So,here we will try to focus on recall score value.

In [None]:
def ml_algorthims(data, test_size=0.2, random_state=42):
    """Train six baseline classifiers on ``data`` and report their test metrics.

    The frame is split into train and test sets, each model is fitted with
    otherwise default hyper-parameters, and for every model the accuracy,
    precision and recall on the test set are printed and its confusion matrix
    is drawn on one panel of a 3x2 figure.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns plus a ``Have_Disease`` target column.
    test_size : float, default 0.2
        Fraction of the rows held out for evaluation.
    random_state : int, default 42
        Seed used for the split and for the randomised models (decision tree,
        random forest, XGBoost), so repeated runs print the same numbers.

    Returns
    -------
    None
        Results are printed and plotted only.
    """
    fig, axes = plt.subplots(3,2, figsize=(10,10))
    print("::::::::::::::::::: Splitting the dataset into train and test ::::::::::")
    x = data.drop('Have_Disease',axis=1).values
    # Pick the target by name so it cannot drift from the column dropped above.
    y = data['Have_Disease'].values
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, random_state=random_state)
    print(f"X_Train : {x_train.shape}\t X_Test : {x_test.shape}\nY_Train : {y_train.shape}\t Y_Test : {y_test.shape}\n")

    # (banner printed before the scores, confusion-matrix title, estimator)
    models = [
        (" :::::::::::::::Logistic Regression::::::::::::: ",
         "Logistic Confusion Matrix",
         LogisticRegression()),
        (" :::::::::::::::Decision Tree Classifier::::::::::::: ",
         "Decision Tree Confusion Matrix",
         DecisionTreeClassifier(random_state=random_state)),
        (" :::::::::::::::Random Forest Classifier::::::::::::: ",
         "Random Forest Confusion Matrix",
         RandomForestClassifier(random_state=random_state)),
        (" :::::::::::::::Xgboost::::::::::::: ",
         "Xgb Confusion Matrix",
         XGBClassifier(random_state=random_state)),
        (" :::::::::::::::K Nearest Neighbour::::::::::::: ",
         "KNN Confusion Matrix",
         KNeighborsClassifier()),
        (" :::::::::::::::Support Vector Machine (SVM)::::::::::::: ",
         "SVM Confusion Matrix",
         SVC()),
    ]

    # axes.flat walks the grid row by row, so the panels fill in model order.
    for (banner, title, model), panel in zip(models, axes.flat):
        print()
        print(banner)
        pred = model.fit(x_train, y_train).predict(x_test)
        # fmt='d' prints the counts as plain integers instead of the default
        # '.2g' format, which switches to scientific notation from 100 upward.
        ax = sns.heatmap(confusion_matrix(y_test, pred), annot=True, fmt='d', ax=panel)
        ax.set_title(title)
        print(f'Accuracy : {accuracy_score(y_test,pred):0.2f}')
        print(f'Precision : {precision_score(y_test,pred):0.2f}')
        print(f'Recall : {recall_score(y_test,pred):0.2f}')

    # Keep panel titles from overlapping the heatmaps in the row above.
    fig.tight_layout()

In [None]:
# Suppress all Python warnings for the rest of the notebook so the model
# output stays readable. This also hides any library warnings raised later.
import warnings  
warnings.filterwarnings('ignore')

### Unbalanced(Original) Dataset

Looking at model performance when the models are trained on the unbalanced dataset

In [None]:
# Baseline: evaluate all models on the scaled, unbalanced frame.
ml_algorthims(df4)

## Balancing the imbalanced Dataset

#### 1. Undersampling the majority class

In [None]:
# Split the scaled frame by target value.
df_class_0 = df4[df4['Have_Disease'] == 0].copy()
df_class_1 = df4[df4['Have_Disease'] == 1].copy()
# Randomly keep as many class-0 rows as there are class-1 rows. The fixed seed
# makes the drawn subset, and so the scores printed below, the same on every run.
undersample = df_class_0.sample(n=df_class_1.shape[0], random_state=42).reset_index(drop=True)

In [None]:
# Evaluate all models on the undersampled, balanced frame.
undersampled_df = pd.concat([undersample, df_class_1], axis=0)
ml_algorthims(undersampled_df)

#### 2. Oversampling minority class

In [None]:
# Draw class-1 rows with replacement until they match the class-0 count.
# The fixed seed makes the draw, and the scores printed below, reproducible.
# NOTE(review): the rows are duplicated before ml_algorthims splits into
# train and test, so copies of the same row can land on both sides of the
# split and inflate the reported scores.
oversample = df_class_1.sample(n=df_class_0.shape[0], replace=True, random_state=42)

In [None]:
# Evaluate all models on the oversampled, balanced frame.
oversampled_df = pd.concat([oversample, df_class_0], axis=0)
ml_algorthims(oversampled_df)

#### 3. SMOTE (Synthetic Minority Oversampling Technique)

In [None]:
from imblearn.over_sampling import SMOTE
# Synthesize new minority-class rows; the seed keeps the generated rows stable across runs.
smote = SMOTE(sampling_strategy = 'minority', random_state=42)
# fit_resample is the current imbalanced-learn API; the older fit_sample alias
# used here originally has been removed from the library.
x_sm, y_sm = smote.fit_resample(df4.drop('Have_Disease',axis=1),df4['Have_Disease'])

In [None]:
# Reassemble the resampled features and target into one frame carrying df4's column names.
smote_features = pd.DataFrame(x_sm)
smote_target = pd.DataFrame(y_sm)
nd = pd.concat([smote_features, smote_target], axis=1).set_axis(df4.columns, axis=1)

In [None]:
# Evaluate all models on the SMOTE-balanced frame.
ml_algorthims(nd)

Upvote if you like, Feedback and suggestions are always welcome😊