In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import LabelEncoder

In [None]:
df = pd.read_csv("/kaggle/input/ckdisease/kidney_disease.csv")

In [None]:
df.head()

In [None]:
df.columns

In [None]:
df.drop(["id"], axis=1, inplace=True)

In [None]:
df.columns

In [None]:
df.head()

In [None]:
df.columns = ['age', 'blood_pressure', 'specific_gravity', 'albumin', 'sugar', 'red_blood_cells', 'pus_cell', 'puss_cell_clumbs', 'bacteria', 'blood_glucose_random', 'blood_urea',
       'serum_creatinine', 'sodium', 'potassium', 'hemoglabin', 'packed_cell_volume', 'wide_blood_cell_count', 'red_blood_cell_count', 'hypertension', 'diabetes_mellitus', 'coronary_artery_disease',
       'appetite', 'peda_edema', 'anemia', 'class']

In [None]:
df.columns

In [None]:
len(df.columns)

In [None]:
df.isnull().sum()

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
df["packed_cell_volume"]

In [None]:
df["packed_cell_volume"] = pd.to_numeric(df["packed_cell_volume"], errors = "coerce")
df["wide_blood_cell_count"] = pd.to_numeric(df["wide_blood_cell_count"], errors = "coerce")
df["red_blood_cell_count"] = pd.to_numeric(df["red_blood_cell_count"], errors = "coerce")

In [None]:
df.info()

In [None]:
categorical_cols = [col for col in df.columns if df[col].dtype == "object"]

In [None]:
categorical_cols

In [None]:
len(categorical_cols)

In [None]:
num_cols = [col for col in df.columns if df[col].dtype != "object"]

In [None]:
len(num_cols)

In [None]:
for col in categorical_cols:
    print(f"{col}: {df[col].unique()}")

In [None]:
df["diabetes_mellitus"].replace(to_replace = {" yes":"yes", "\tno":"no", "\tyes":"yes"}, inplace=True)
df["coronary_artery_disease"].replace(to_replace = {"\tno":"no"}, inplace=True)
df["class"].replace(to_replace = {"ckd\t":"ckd"}, inplace=True)

In [None]:
for col in categorical_cols:
    print(f"{col}: {df[col].unique()}")

In [None]:
df["class"] = df["class"].map({"ckd":0, "notckd":1})

In [None]:
df["class"]

In [None]:
# Histogram (with KDE overlay) for each numeric column on a 3x5 subplot grid.
# Only the first 14 numeric columns are drawn.
plt.figure(figsize=(20,20))
for plotnumber, col in enumerate(num_cols[:14], start=1):
    ax = plt.subplot(3, 5, plotnumber)
    sns.histplot(df[col], kde=True)
    plt.xlabel(col)
plt.tight_layout()
plt.show()

In [None]:
float_df = df.select_dtypes(include=['float64','int64'])
plt.figure(figsize=(15,15))
sns.heatmap(float_df.corr(), annot=True, linecolor="white", linewidths=2, cmap="magma")
plt.show()

In [None]:
df.info()

In [None]:
def kde(col):
    """Draw kernel density estimates of ``df[col]``, one curve per value of "class".

    Reads the notebook-level ``df``. Returns nothing; the figure is produced
    as a side effect.
    """
    facets = sns.FacetGrid(df, hue="class", height=6, aspect=2)
    # map() draws one KDE per hue level and returns the grid, so the legend
    # call can be chained onto it.
    facets.map(sns.kdeplot, col).add_legend()
kde("hemoglabin")

In [None]:
kde("wide_blood_cell_count")

In [None]:
kde("packed_cell_volume")

In [None]:
kde("red_blood_cell_count")

In [None]:
df.isnull().sum().sort_values(ascending=False)

In [None]:
def solve_mv_random_value(feature, random_state=42):
    """Fill the missing entries of ``df[feature]`` with randomly drawn observed values.

    Operates in place on the notebook-level ``df``.

    Parameters
    ----------
    feature : str
        Name of the column to impute.
    random_state : int or None, default 42
        Seed forwarded to ``Series.sample`` so that the imputation is
        reproducible across runs. Pass ``None`` for unseeded sampling.

    Raises
    ------
    ValueError
        If the column has missing entries but no observed value to draw from.
    """
    missing = df[feature].isnull()
    n_missing = int(missing.sum())
    if n_missing == 0:
        # Nothing to impute.
        return

    observed = df[feature].dropna()
    if observed.empty:
        raise ValueError(f"cannot impute {feature!r}: column has no observed values")

    draws = observed.sample(
        n_missing,
        # Sampling without replacement is impossible when there are more gaps
        # than observed values; fall back to sampling with replacement then.
        replace=n_missing > len(observed),
        random_state=random_state,
    )
    # Re-label the drawn values with the positions of the gaps so that the
    # index-aligned assignment below fills exactly the missing rows.
    draws.index = df.index[missing]
    df.loc[missing, feature] = draws

In [None]:
for col in num_cols:
    solve_mv_random_value(col)

In [None]:
df[num_cols].isnull().sum()

In [None]:
def solve_mv_mode(feature):
    """Fill the missing entries of ``df[feature]`` with the column's mode.

    Operates on the notebook-level ``df``. When several values tie for the
    mode, the first one returned by ``Series.mode`` is used.
    """
    column = df[feature]
    fill_value = column.mode()[0]
    df[feature] = column.fillna(fill_value)

In [None]:
solve_mv_random_value("red_blood_cells")
solve_mv_random_value("pus_cell")

In [None]:
for col in categorical_cols:
    solve_mv_mode(col)

In [None]:
df[categorical_cols].isnull().sum()

In [None]:
for col in categorical_cols:
    print(f"{col}: {df[col].nunique()}")

In [None]:
encoder = LabelEncoder()
for col in categorical_cols:
    df[col] = encoder.fit_transform(df[col])

In [None]:
df

In [None]:
df.info()

In [None]:
independent_col = [col for col in df.columns if col != "class"]

In [None]:
independent_col

In [None]:
dependent_col = "class"

In [None]:
X = df[independent_col]
y = df[dependent_col]

In [None]:
y

In [None]:
X

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)
dtc_acc = accuracy_score(y_test, y_pred)

In [None]:
cm = confusion_matrix(y_test, y_pred)

In [None]:
cm

In [None]:
cr = classification_report(y_test, y_pred)

In [None]:
print("Classification Report", cr)

In [None]:
# Human-readable labels for the tree's classes, listed in ascending order of
# the encoded target (the notebook maps "ckd" -> 0 and "notckd" -> 1).
class_names = ["ckd", "notckd"]
plt.figure(figsize=(20,10))
# class_names was previously defined but never passed, so the nodes showed no
# predicted class; passing it labels each node with its majority class.
plot_tree(
    dtc,
    feature_names=independent_col,
    class_names=class_names,
    filled=True,
    rounded=True,
    fontsize=10,
)
plt.show()

In [None]:
feature_importance = pd.DataFrame({"Feature":independent_col, "Importance":dtc.feature_importances_})

In [None]:
feature_importance.sort_values(by="Importance", ascending=False)

In [None]:
print("The Most Important Feature: ", feature_importance.sort_values(by="Importance", ascending=False).iloc[0])

In [None]:
plt.figure()
sns.barplot(x = "Importance", y = "Feature", data = feature_importance)
plt.title("Feature Importance")
plt.show()