In [1]:
import pandas as pd

# Load the Pima Indians Diabetes dataset
pima_data = pd.read_csv('pima-indians-diabetes.csv')

# Load the UCI Diabetes dataset (replace the URL with the actual file path or download link)
uci_data = pd.read_csv('diabetes.csv')


A.Frequency Distribution, Mean, Median, Mode, Variance, Standard Deviation, Skewness, and Kurtosis

In [2]:
# Univariate analysis for Pima Indians Diabetes dataset
pima_stats = pima_data.describe().transpose()
pima_stats['mode'] = pima_data.mode().iloc[0]
pima_stats['variance'] = pima_data.var()
pima_stats['skewness'] = pima_data.skew()
pima_stats['kurtosis'] = pima_data.kurt()

# Univariate analysis for UCI Diabetes dataset
uci_stats = uci_data.describe().transpose()
uci_stats['mode'] = uci_data.mode().iloc[0]
uci_stats['variance'] = uci_data.var()
uci_stats['skewness'] = uci_data.skew()
uci_stats['kurtosis'] = uci_data.kurt()

# Output summary
print("Pima Diabetes Stats:\n", pima_stats)
print("\nUCI Diabetes Stats:\n", uci_stats)


Pima Diabetes Stats:
                           count        mean         std     min       25%  \
Pregnancies               768.0    3.845052    3.369578   0.000   1.00000   
Glucose                   768.0  120.894531   31.972618   0.000  99.00000   
BloodPressure             768.0   69.105469   19.355807   0.000  62.00000   
SkinThickness             768.0   20.536458   15.952218   0.000   0.00000   
Insulin                   768.0   79.799479  115.244002   0.000   0.00000   
BMI                       768.0   31.992578    7.884160   0.000  27.30000   
DiabetesPedigreeFunction  768.0    0.471876    0.331329   0.078   0.24375   
Age                       768.0   33.240885   11.760232  21.000  24.00000   
Outcome                   768.0    0.348958    0.476951   0.000   0.00000   

                               50%        75%     max    mode      variance  \
Pregnancies                 3.0000    6.00000   17.00   1.000     11.354056   
Glucose                   117.0000  140.25000  19

B.Bivariate Analysis:
Linear Regression Modeling

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Example: Linear regression on Pima dataset (predicting 'Pregnancies' based on other features)
X = pima_data.drop('Pregnancies', axis=1)
y = pima_data['Pregnancies']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model
lin_reg_model = LinearRegression()
lin_reg_model.fit(X_train, y_train)

# Predictions and evaluation
y_pred = lin_reg_model.predict(X_test)


Logistic Regression:

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Logistic regression on Pima dataset (predicting 'Outcome' - diabetes)
X = pima_data.drop('Outcome', axis=1)
y = pima_data['Outcome']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the model with increased iterations and a solver option
log_reg_model = LogisticRegression(max_iter=1000, solver='lbfgs')  # You can try other solvers like 'saga' or 'liblinear'
log_reg_model.fit(X_train_scaled, y_train)

# Predictions and evaluation
y_pred = log_reg_model.predict(X_test_scaled)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Confusion Matrix:\n {confusion_matrix(y_test, y_pred)}")


Accuracy: 0.7532467532467533
Confusion Matrix:
 [[79 20]
 [18 37]]


C. Multiple Regression Analysis

In [5]:
from sklearn.linear_model import LinearRegression

# Multiple Regression on Pima dataset (predicting 'Glucose' using other features)
X = pima_data.drop('Glucose', axis=1)
y = pima_data['Glucose']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model
multiple_reg_model = LinearRegression()
multiple_reg_model.fit(X_train, y_train)

# Predictions and evaluation
y_pred = multiple_reg_model.predict(X_test)


Comparison of Results

In [11]:
import pandas as pd

# Define the order you want the metrics to be in
metric_order = ["Mean", "Median", "Mode", "Variance", "Std. Dev.", "Skewness", "Kurtosis", "Accuracy"]

# Create the dictionary
comparison = {
    "Metric": metric_order,
    "Pima Dataset": [pima_stats['mean'], pima_stats['50%'], pima_stats['mode'], pima_stats['variance'], pima_stats['std'], pima_stats['skewness'], pima_stats['kurtosis'], accuracy_pima],
    "UCI Dataset": [uci_stats['mean'], uci_stats['50%'], uci_stats['mode'], uci_stats['variance'], uci_stats['std'], uci_stats['skewness'], uci_stats['kurtosis'], accuracy_uci]
}

# Create the DataFrame
comparison_df = pd.DataFrame(comparison)

# Print the DataFrame
print(comparison_df)



      Metric                                       Pima Dataset  \
0       Mean  Pregnancies                   3.845052
Glucose...   
1     Median  Pregnancies                   3.0000
Glucose  ...   
2       Mode  Pregnancies                  1.000
Glucose    ...   
3   Variance  Pregnancies                    11.354056
Gluco...   
4  Std. Dev.  Pregnancies                   3.369578
Glucose...   
5   Skewness  Pregnancies                 0.901674
Glucose  ...   
6   Kurtosis  Pregnancies                 0.159220
Glucose  ...   
7   Accuracy                                               0.85   

                                         UCI Dataset  
0  Pregnancies                   3.845052
Glucose...  
1  Pregnancies                   3.0000
Glucose  ...  
2  Pregnancies                  1.000
Glucose    ...  
3  Pregnancies                    11.354056
Gluco...  
4  Pregnancies                   3.369578
Glucose...  
5  Pregnancies                 0.901674
Glucose  ...  
6  Pregnanc