In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

 # Necessary libraries for data analysis and visualization

In [None]:
# Load the diabetes dataset into a DataFrame (the path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='diabetes.csv')

In [None]:
# Quick look at the raw data before starting the main analysis.
print(df.head())  # First 5 rows, to get an overview of the values

# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called directly; wrapping it in print() would append a stray "None".
df.info()  # General information about the dataset

In [None]:
# Descriptive statistics for every column except the target column 'Outcome'.
# The target is excluded by name rather than by position, so the loop stays
# correct even if 'Outcome' is not the last column of the dataset.

for column in df.columns.drop('Outcome'):
    values = df[column]  # Look the column up once and reuse it for every statistic
    print(f"Analysis for {column}:")
    print(f"Mean: {values.mean()}")  # Average value
    print(f"Median: {values.median()}")  # Middle value when sorted
    # mode() returns a Series (there can be ties), so take its first entry
    print(f"Mode: {values.mode()[0]}")  # Most frequent value
    print(f"Standard Deviation: {values.std()}")  # How spread out the values are
    print(f"Variance: {values.var()}")  # Standard deviation squared
    print(f"Range of values: {values.max() - values.min()}")  # Max - Min
    print("\n")  # Blank lines between columns, purely for readability

In [None]:
# Correlation of every column with the target 'Outcome'.
# The 'Outcome' entry itself is dropped to avoid its trivial self-correlation of 1.0.
correlations = (
    df.corr()
    .loc[:, 'Outcome']
    .drop(labels='Outcome')
)
print(correlations)

In [None]:
# Heatmap of the full correlation matrix, which makes the strength of each
# pairwise correlation easier to read than the raw numbers.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Feature Correlation Heatmap')
plt.show()

In [None]:
# Correlation Analysis with the Target Column 'Outcome':
#
# In this dataset, the target column 'Outcome' shows whether a person has diabetes (1) or not (0).
#
# After checking the correlation values, the feature with the strongest linear correlation with the Outcome is 'Glucose',
# at around 0.47. This is expected, because high blood sugar levels are one of the main signs of diabetes.
#
# Other features that show a decent correlation are:
# BMI (~0.29) — since being overweight or obese is a common risk factor for diabetes.
# Age (~0.24) — older people are more likely to develop diabetes.
# Pregnancies (~0.22) — having more pregnancies can sometimes increase the risk, especially if someone had gestational diabetes before.
#
# The rest of the features, like BloodPressure or SkinThickness, show only a weak correlation with diabetes in this dataset.
# This doesn’t mean they’re not important in real life — correlation only measures linear association, and here they are simply less associated with the Outcome than the others.
#
# Overall, the results match what we know from medical research — glucose levels, BMI, and age are some of the most common indicators of diabetes.

# Reference that supports these findings and explains diabetes in more detail:
# https://www.nhlbi.nih.gov/resources/what-diabetes-fact-sheet