In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error

In [2]:
filepath = r"Datasets/diabetes (1).csv"
dataset = pd.read_csv(filepath)

In [3]:
dataset.shape

(768, 9)

In [4]:
dataset.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [5]:
dataset.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [6]:
dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [7]:
dataset.tail()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.34,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1
767,1,93,70,31,0,30.4,0.315,23,0


In [8]:
x = dataset.drop(columns=["Outcome"])
y = dataset["Outcome"]

In [9]:
# Finding Optimum Number
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

In [10]:
pca = PCA()
pca.fit(x_scaled)
variance = pca.explained_variance_ratio_
cumulative_variances = np.cumsum(variance)
diff_cumulative_variance = np.diff(cumulative_variances,2)
elbow_index = np.argmax(diff_cumulative_variance) +1
Optimum_Number = elbow_index

In [11]:
Optimum_Number

6

In [12]:
pca = PCA(n_components=Optimum_Number)
x_pca = pca.fit_transform(x_scaled)

In [13]:
# Data Split for Training and Testing sets
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42)
x_pca_train, x_pca_test, y_pca_train, y_pca_test = train_test_split(x_pca,y,test_size=0.2,random_state=42)

In [14]:
# Complete Data Models
# Creating a Random Forest Base Model
RandomForestModel = RandomForestClassifier(random_state=42).fit(x_train,y_train)
ypred_rf = RandomForestModel.predict(x_test)

# Creating linear regression Model
LinearRegressionModel = LinearRegression().fit(x_train,y_train)
ypred_lr = LinearRegressionModel.predict(x_test)

In [15]:
# Reduced Data Models
# Following Random Forest model
RandomForestModelPCA = RandomForestClassifier(random_state=42).fit(x_pca_train,y_pca_train)
ypredpca_rf = RandomForestModelPCA.predict(x_pca_test)

# Following Linear Regresion model
LinearRegressionModelPCA = LinearRegression().fit(x_pca_train,y_pca_train)
ypredpca_lr = LinearRegressionModelPCA.predict(x_pca_test)

In [16]:
def metircscalculation(y_true,y_pred):
    r2 = r2_score(y_true,y_pred)
    MAE = mean_absolute_error(y_true,y_pred)
    MSE = mean_squared_error(y_true,y_pred)
    return r2,MAE,MSE

In [17]:
# Calculating Metrics for complete data
r2_lr,MAE_lr,MSE_lr = metircscalculation(y_test,ypred_lr)
r2_rf,MAE_rf,MSR_rf = metircscalculation(y_test,ypred_rf)

#Calculating Metrics for Reduced data
r2pca_lr,MAEpca_lr,MSEpca_lr = metircscalculation(y_pca_test,ypredpca_lr)
r2pca_rf,MAEpca_rf,MSRpca_rf = metircscalculation(y_pca_test,ypredpca_rf)

In [18]:
print("Linear Regression on complete data:")
print("R-Squared:",r2_lr)
print("Mean Absoulte Error:",MAE_lr)
print("Mean Square Error:",MSE_lr)

Linear Regression on complete data:
R-Squared: 0.2550028117674178
Mean Absoulte Error: 0.3481282599992823
Mean Square Error: 0.17104527280850096


In [19]:
print("Random Forest on Complete data:")
print("R-Squared:",r2_rf)
print("Mean Absoulte Error:",MAE_rf)
print("Mean Square Error:",MSR_rf)

Random Forest on Complete data:
R-Squared: -0.21616161616161644
Mean Absoulte Error: 0.2792207792207792
Mean Square Error: 0.2792207792207792


In [20]:
print("Linear Regression on Reduced data:")
print("R-Squared:",r2pca_rf)
print("Mean Absoulte Error:",MAEpca_rf)
print("Mean Square Error:",MSEpca_lr)

Linear Regression on Reduced data:
R-Squared: -0.21616161616161644
Mean Absoulte Error: 0.2792207792207792
Mean Square Error: 0.16847229218223028


In [21]:
print("Random Forest on Reduced data:")
print("R-Squared:",r2pca_rf)
print("Mean Absoulte Error:",MAEpca_rf)
print("Mean Square Error:",MSRpca_rf)

Random Forest on Reduced data:
R-Squared: -0.21616161616161644
Mean Absoulte Error: 0.2792207792207792
Mean Square Error: 0.2792207792207792
