In [16]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

In [18]:
# Load the datasets
uci_data = pd.read_csv('dataset/diabetes.csv')
pima_data = pd.read_csv('dataset/pima_indian_diabetes.csv')

In [19]:
# Helper function for univariate analysis
def univariate_analysis(data, dataset_name):
    print(f"Univariate Analysis for {dataset_name}")
    print(data.describe())  # Summary statistics
    
    for col in data.columns[:-1]:  # Exclude the target column
        print(f"\nColumn: {col}")
        print(f"Mean: {data[col].mean()}")
        print(f"Median: {data[col].median()}")
        print(f"Mode: {data[col].mode()[0]}")
        print(f"Variance: {data[col].var()}")
        print(f"Standard Deviation: {data[col].std()}")
        print(f"Skewness: {data[col].skew()}")
        print(f"Kurtosis: {data[col].kurtosis()}")

In [20]:
# Conduct univariate analysis
univariate_analysis(uci_data, "UCI Diabetes Dataset")
univariate_analysis(pima_data, "Pima Indians Diabetes Dataset")

Univariate Analysis for UCI Diabetes Dataset
       Diabetes_binary         HighBP       HighChol      CholCheck  \
count    253680.000000  253680.000000  253680.000000  253680.000000   
mean          0.139333       0.429001       0.424121       0.962670   
std           0.346294       0.494934       0.494210       0.189571   
min           0.000000       0.000000       0.000000       0.000000   
25%           0.000000       0.000000       0.000000       1.000000   
50%           0.000000       0.000000       0.000000       1.000000   
75%           0.000000       1.000000       1.000000       1.000000   
max           1.000000       1.000000       1.000000       1.000000   

                 BMI         Smoker         Stroke  HeartDiseaseorAttack  \
count  253680.000000  253680.000000  253680.000000         253680.000000   
mean       28.382364       0.443169       0.040571              0.094186   
std         6.608694       0.496761       0.197294              0.292087   
min        

In [21]:
# Bivariate Analysis: Linear Regression (for a numeric feature and binary target)
def bivariate_analysis(data, dataset_name):
    print(f"\nBivariate Analysis for {dataset_name}")
    X = pima_data['Glucose'].values.reshape(-1, 1)  # Use 'Glucose' as feature
    y = pima_data['Outcome']  # Use 'Outcome' as target (binary)

    # Linear Regression
    lin_reg = LinearRegression()
    lin_reg.fit(X, y)
    print(f"Linear Regression Coefficient: {lin_reg.coef_[0]}")
    print(f"Linear Regression Intercept: {lin_reg.intercept_}")

    # Logistic Regression
    log_reg = LogisticRegression(max_iter=200)
    log_reg.fit(X, y)
    y_pred = log_reg.predict(X)
    accuracy = accuracy_score(y, y_pred)
    print(f"Logistic Regression Accuracy: {accuracy:.2f}")
    print(classification_report(y, y_pred))


In [22]:
# Conduct bivariate analysis
bivariate_analysis(uci_data, "UCI Diabetes Dataset")
bivariate_analysis(pima_data, "Pima Indians Diabetes Dataset")



Bivariate Analysis for UCI Diabetes Dataset
Linear Regression Coefficient: 0.006960225751927552
Linear Regression Intercept: -0.49249489634012683
Logistic Regression Accuracy: 0.75
              precision    recall  f1-score   support

           0       0.76      0.89      0.82       500
           1       0.70      0.49      0.57       268

    accuracy                           0.75       768
   macro avg       0.73      0.69      0.70       768
weighted avg       0.74      0.75      0.73       768


Bivariate Analysis for Pima Indians Diabetes Dataset
Linear Regression Coefficient: 0.006960225751927552
Linear Regression Intercept: -0.49249489634012683
Logistic Regression Accuracy: 0.75
              precision    recall  f1-score   support

           0       0.76      0.89      0.82       500
           1       0.70      0.49      0.57       268

    accuracy                           0.75       768
   macro avg       0.73      0.69      0.70       768
weighted avg       0.74     

In [23]:
# Multiple Regression Analysis
def multiple_regression_analysis(data, dataset_name):
    print(f"\nMultiple Regression Analysis for {dataset_name}")
    X = pima_data.drop(columns=['Outcome'])
    y = pima_data['Outcome']

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Linear Regression
    lin_reg = LinearRegression()
    lin_reg.fit(X_train, y_train)
    y_pred = lin_reg.predict(X_test)
    print(f"Linear Regression Score (R²): {lin_reg.score(X_test, y_test):.2f}")

    # Logistic Regression
    log_reg = LogisticRegression(max_iter=200)
    log_reg.fit(X_train, y_train)
    y_pred_log = log_reg.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred_log)
    print(f"Logistic Regression Accuracy: {accuracy:.2f}")
    print(classification_report(y_test, y_pred_log))

In [24]:
# Conduct multiple regression analysis
multiple_regression_analysis(uci_data, "UCI Diabetes Dataset")
multiple_regression_analysis(pima_data, "Pima Indians Diabetes Dataset")


Multiple Regression Analysis for UCI Diabetes Dataset
Linear Regression Score (R²): 0.22
Logistic Regression Accuracy: 0.74
              precision    recall  f1-score   support

           0       0.80      0.79      0.80       151
           1       0.62      0.62      0.62        80

    accuracy                           0.74       231
   macro avg       0.71      0.71      0.71       231
weighted avg       0.74      0.74      0.74       231


Multiple Regression Analysis for Pima Indians Diabetes Dataset
Linear Regression Score (R²): 0.22
Logistic Regression Accuracy: 0.74
              precision    recall  f1-score   support

           0       0.80      0.79      0.80       151
           1       0.62      0.62      0.62        80

    accuracy                           0.74       231
   macro avg       0.71      0.71      0.71       231
weighted avg       0.74      0.74      0.74       231



In [25]:
def compare_datasets(pima_df, other_df):
    print("\nComparison of Datasets:")
    
    pima_mean = pima_df.mean()
    other_mean = other_df.mean()
    
    mean_comparison = pd.DataFrame({
        'Pima': pima_mean,
        'Other': other_mean
    })

    print("Mean Comparison:\n", mean_comparison)


In [26]:
compare_datasets(pima_df,other_df)


Comparison of Datasets:
Mean Comparison:
                                Pima       Other
Age                        8.032119   33.240885
AnyHealthcare              0.951053         NaN
BMI                       28.382364   31.992578
BloodPressure                   NaN   69.105469
CholCheck                  0.962670         NaN
DiabetesPedigreeFunction        NaN    0.471876
Diabetes_binary            0.139333         NaN
DiffWalk                   0.168224         NaN
Education                  5.050434         NaN
Fruits                     0.634256         NaN
GenHlth                    2.511392         NaN
Glucose                         NaN  120.894531
HeartDiseaseorAttack       0.094186         NaN
HighBP                     0.429001         NaN
HighChol                   0.424121         NaN
HvyAlcoholConsump          0.056197         NaN
Income                     6.053875         NaN
Insulin                         NaN   79.799479
MentHlth                   3.184772         N

In [27]:
print("\nComparison of Results:")
print("1. Univariate statistics provide an understanding of the distribution of each feature.")
print("2. Bivariate analysis shows the relationship between 'Glucose' and the binary target 'Outcome'.")
print("3. Multiple regression results give an overall model performance for predicting the binary outcome 'Outcome'.")


Comparison of Results:
1. Univariate statistics provide an understanding of the distribution of each feature.
2. Bivariate analysis shows the relationship between 'Glucose' and the binary target 'Outcome'.
3. Multiple regression results give an overall model performance for predicting the binary outcome 'Outcome'.
