<a href="https://colab.research.google.com/github/SakshamSharma2006/Data-Science/blob/main/Saksham_Sharma_Diabetes_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import scale, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
# Use the "ggplot" matplotlib style for every figure in the notebook.
plt.style.use("ggplot")
import warnings
# NOTE(review): this silences *all* warnings for the whole session, including
# pandas/NumPy deprecation and chained-assignment warnings raised by later cells.
warnings.filterwarnings("ignore")

In [None]:
# Load the dataset. The path is absolute (/content/...), so the CSV has to be
# present at that location before this cell runs.
data = pd.read_csv("/content/diabetes.csv")

In [None]:
# Preview the first rows of the raw frame.
data.head()

In [None]:
# List the column labels of the raw frame.
data.columns

In [None]:
# Summary statistics for every numeric column.
data.describe()

In [None]:
# (rows, columns) of the raw frame.
data.shape

In [None]:
# Number of missing (NaN) entries per column in the freshly loaded frame.
data.isna().sum()

In [None]:
# Share of each Outcome class, expressed as a percentage of all rows.
data["Outcome"].value_counts() * 100 / len(data)

In [None]:
# Histogram of the Age column on an explicitly created 8x8-inch Axes.
fig, ax = plt.subplots(figsize=(8, 8))
data["Age"].hist(ax=ax, edgecolor="black", label='Age Distribution')
ax.set_xlabel("Age", fontsize=10)
ax.set_ylabel("Count", fontsize=10)
ax.legend()
plt.show()

In [None]:
# Report the largest and smallest value found in the Age column.
print(f"MAX AGE: {data['Age'].max()}")
print(f"MIN AGE: {data['Age'].min()}")

In [None]:
# One histogram (with KDE overlay) per feature, laid out on a 4x2 grid.
fig, ax = plt.subplots(4, 2, figsize=(20, 20))

feature_names = [
    "Pregnancies", "Glucose",
    "BloodPressure", "SkinThickness",
    "Insulin", "BMI",
    "DiabetesPedigreeFunction", "Age",
]
# ax.flat walks the grid row by row, which matches the order of feature_names.
for axis, feature in zip(ax.flat, feature_names):
    sns.histplot(data[feature], bins=20, ax=axis, color="red", kde=True, line_kws={'linewidth': 2})

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlation between all columns (pandas default method).
data.corr()

In [None]:
# Annotated heatmap of the pairwise correlation matrix.
f, ax = plt.subplots(figsize=[10, 10])
corr_matrix = data.corr()
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title("Correlation Matrix", fontsize=20)
plt.show()

In [None]:
# Treat a literal 0 as "not recorded" only for measurements where zero is not a
# plausible reading. Pregnancies is deliberately excluded: zero pregnancies is a
# valid count, and turning it into NaN would make the imputation step overwrite
# real data. DiabetesPedigreeFunction and Age are left untouched as well.
# NOTE(review): confirm the column choice against the dataset description.
# `np.nan` is used because the `np.NaN` alias no longer exists in NumPy >= 2.0.
zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
data[zero_as_missing] = data[zero_as_missing].replace(0, np.nan)

In [None]:
# Count the missing entries per column.
data.isnull().sum()

In [None]:
# Show the first ten rows.
data.head(10)

In [None]:
def median_target(var):
    """Return the per-Outcome median of column ``var``.

    Reads the module-level ``data`` frame, ignores rows where ``var`` is
    null, and returns a two-column frame (``Outcome``, ``var``) with one
    row per Outcome value.
    """
    observed = data.loc[data[var].notnull(), [var, 'Outcome']]
    return observed.groupby('Outcome')[[var]].median().reset_index()

In [None]:
# Fill every missing feature value with the median of that feature among rows
# sharing the same Outcome.
# NOTE(review): the fill value depends on the target label and is computed on
# the full frame, before the train/test split done later in the notebook.
columns = data.columns
columns = columns.drop("Outcome")
for i in columns:
    # Compute the class medians once per column and look them up by Outcome
    # value instead of by positional row label.
    class_medians = median_target(i).set_index("Outcome")[i]
    for outcome, median_value in class_medians.items():
        data.loc[(data["Outcome"] == outcome) & (data[i].isnull()), i] = median_value

In [None]:
# Recount the missing entries per column.
data.isnull().sum()

In [None]:
# Cap Insulin at the upper Tukey fence (Q3 + 1.5 * IQR).
Q1, Q3 = data["Insulin"].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower = Q1 - 1.5 * IQR  # computed for reference; only the upper fence is applied
upper = Q3 + 1.5 * IQR
data["Insulin"] = data["Insulin"].clip(upper=upper)

In [None]:
from sklearn.neighbors import LocalOutlierFactor
# Local Outlier Factor using 10 neighbours per point.
lof = LocalOutlierFactor(n_neighbors=10)
# Fitted on every column of `data` as it stands here (Outcome included);
# the returned per-row labels are displayed as the cell output.
lof.fit_predict(data)

In [None]:
# Per-row scores stored on the fitted LOF model.
data_scores = lof.negative_outlier_factor_
# Show the 20 smallest scores.
np.sort(data_scores)[0:20]

In [None]:
# Cut-off = the 6th-smallest LOF score (index 5 of the ascending sort).
# The spelling `thresold` is kept because later cells reference this name.
thresold = np.sort(data_scores)[5]

In [None]:
# Display the chosen cut-off value.
thresold

In [None]:
# Boolean mask that is True where the score is strictly greater than the
# cut-off. Despite its name, True marks the rows that are *kept* when the mask
# is used to index `data`.
outlier = data_scores>thresold

In [None]:
# Display the boolean mask.
outlier

In [None]:
# Keep only the rows selected by the mask. An explicit copy is taken so that
# the column assignments made on `data` in later cells operate on an
# independent frame rather than on a filtered slice of the original.
# Not idempotent: re-running this cell without recomputing `outlier` fails,
# because the mask length no longer matches the shortened frame.
data = data[outlier].copy()
data

In [None]:
# Label set for the BMI bands, stored as a categorical Series so that the
# labels can be looked up by position (NewBMI[0] ... NewBMI[5]).
NewBMI = pd.Series(["Underweight","Normal", "Overweight","Obesity 1", "Obesity 2", "Obesity 3"], dtype = "category")

In [None]:
# Display the BMI label Series.
NewBMI

In [None]:
# Bin BMI into the six labels held in NewBMI.
#
# Start from an all-missing categorical column that carries NewBMI's category
# set, so the later one-hot encoding produces the same column names. Assigning
# the 6-element NewBMI Series directly would align on index labels 0-5 and
# seed unrelated rows with arbitrary labels.
data["NewBMI"] = pd.Categorical([None] * len(data), categories=NewBMI.cat.categories)

# Each comparison is parenthesised: `&` binds tighter than `<=`, so
# `(a > x) & a <= y` is evaluated as `((a > x) & a) <= y`, which is not a
# range test. The lower bound of the "Normal" band is inclusive so that a BMI
# of exactly 18.5 is not left unlabelled.
data.loc[data["BMI"] < 18.5, "NewBMI"] = NewBMI[0]
data.loc[(data["BMI"] >= 18.5) & (data["BMI"] <= 24.9), "NewBMI"] = NewBMI[1]
data.loc[(data["BMI"] > 24.9) & (data["BMI"] <= 29.9), "NewBMI"] = NewBMI[2]
data.loc[(data["BMI"] > 29.9) & (data["BMI"] <= 34.9), "NewBMI"] = NewBMI[3]
data.loc[(data["BMI"] > 34.9) & (data["BMI"] <= 39.9), "NewBMI"] = NewBMI[4]
data.loc[data["BMI"] > 39.9, "NewBMI"] = NewBMI[5]

In [None]:
# Preview the frame including the new NewBMI column.
data.head()

In [None]:
def set_insuline(row):
    """Label a row by its ``Insulin`` value.

    Returns "Normal" when the value lies in the closed interval [16, 166]
    and "Abnormal" for anything else.
    """
    return "Normal" if 16 <= row["Insulin"] <= 166 else "Abnormal"

In [None]:
# Add a NewInsulinScore column by applying set_insuline to every row.
data = data.assign(
    NewInsulinScore=lambda frame: frame.apply(set_insuline, axis=1)
)

In [None]:
# Preview the frame including the new NewInsulinScore column.
data.head()

In [None]:
# Bin Glucose into labelled bands. The label texts are kept exactly as they
# are because later cells select the one-hot columns derived from them by name.
# The fifth label ("High") is never assigned by any of the rules below.
NewGlucose = pd.Series(["Low", "Normal", "Overweight", "Secret", "High"], dtype = "category")
data["NewGlucose"] = NewGlucose

glucose = data["Glucose"]
glucose_rules = [
    (glucose <= 70, NewGlucose[0]),
    ((glucose > 70) & (glucose <= 99), NewGlucose[1]),
    ((glucose > 99) & (glucose <= 126), NewGlucose[2]),
    (glucose > 126, NewGlucose[3]),
]
for band_mask, band_label in glucose_rules:
    data.loc[band_mask, "NewGlucose"] = band_label

In [None]:
# One-hot encode the three engineered categorical columns; drop_first=True
# omits the first category of each variable from the output.
data = pd.get_dummies(data, columns = ["NewBMI", "NewInsulinScore", "NewGlucose"], drop_first=True)

In [None]:
# Set the one-hot columns aside; they are re-attached after the numeric
# features have been scaled.
categorical_data = data[['NewBMI_Obesity 1',
       'NewBMI_Obesity 2', 'NewBMI_Obesity 3', 'NewBMI_Overweight',
       'NewBMI_Underweight', 'NewInsulinScore_Normal', 'NewGlucose_Low',
       'NewGlucose_Normal', 'NewGlucose_Overweight', 'NewGlucose_Secret']]

In [None]:
# Preview the one-hot columns.
categorical_data.head()

In [None]:
# Target vector.
y = data['Outcome']

# Numeric feature matrix: everything except the target and the one-hot columns.
X = data.drop(
    columns=[
        'Outcome',
        'NewBMI_Obesity 1', 'NewBMI_Obesity 2', 'NewBMI_Obesity 3',
        'NewBMI_Overweight', 'NewBMI_Underweight',
        'NewInsulinScore_Normal',
        'NewGlucose_Low', 'NewGlucose_Normal',
        'NewGlucose_Overweight', 'NewGlucose_Secret',
    ]
)

In [None]:
# Remember the column and row labels so they can be restored after scaling.
cols = X.columns
index = X.index

In [None]:
# Preview the numeric feature matrix.
X.head()

In [None]:
from sklearn.preprocessing import RobustScaler
# Scale the numeric features with RobustScaler and wrap the result back into a
# DataFrame carrying the original column and row labels.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# performed later in the notebook, so held-out rows influence the transform.
transformer = RobustScaler().fit(X)
X = pd.DataFrame(transformer.transform(X), columns=cols, index=index)

In [None]:
# Preview the scaled numeric features.
X.head()

In [None]:
# Re-attach the one-hot columns next to the scaled numeric features
# (column-wise concatenation, aligned on the row index).
X = pd.concat([X, categorical_data], axis=1)

In [None]:
# Preview the combined feature matrix.
X.head()

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train , y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [None]:
# Standardise all features: statistics are learned from the training rows only
# and the same fitted scaler is then applied to the test rows.
# NOTE(review): the numeric columns were already passed through RobustScaler in
# an earlier cell, so they are scaled twice.
scaler =StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Logistic regression with scikit-learn's default hyper-parameters.
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

In [None]:
# Logistic-regression predictions for the test rows.
y_pred = log_reg.predict(X_test)

In [None]:
# Accuracy on the training rows (displayed, not stored).
accuracy_score(y_train, log_reg.predict(X_train))

In [None]:
# Accuracy on the test rows; stored for the model comparison table.
log_reg_acc = accuracy_score(y_test, log_reg.predict(X_test))

In [None]:
# Confusion matrix of the test labels against the current y_pred.
confusion_matrix(y_test, y_pred)

In [None]:
# Random forest with hand-picked hyper-parameters. random_state is fixed (same
# seed as the train/test split) so that bootstrap sampling and feature
# sub-sampling, and therefore the reported scores, are reproducible.
rand_clf = RandomForestClassifier(
    criterion='entropy',
    max_depth=15,
    max_features=0.75,
    min_samples_leaf=2,
    min_samples_split=3,
    n_estimators=130,
    random_state=42,
)
rand_clf.fit(X_train, y_train)

In [None]:
# Random-forest predictions for the test rows; this rebinds y_pred, replacing
# whatever predictions it held before.
y_pred = rand_clf.predict(X_test)

In [None]:
# Evaluate the random forest. The test set is predicted once and the result is
# reused for both the accuracy and the confusion matrix.
y_pred = rand_clf.predict(X_test)
print(accuracy_score(y_train, rand_clf.predict(X_train)))  # training accuracy
rand_acc = accuracy_score(y_test, y_pred)                  # test accuracy, kept for the comparison table
print(rand_acc)
print(confusion_matrix(y_test, y_pred))

In [None]:
# Model Comparison
models = pd.DataFrame({
    'Model': ['Logistic Regression', 'Random Forest Classifier'],
    'Score': [100*round(log_reg_acc,4), 100*round(rand_acc,4)]
})
models.sort_values(by = 'Score', ascending = False)