In [2]:
import pandas as pd
import numpy as np

# Task 1: Load and Explore the Dataset

# Location of the CSV, relative to the notebook's working directory.
# This is the single place to edit if the data lives somewhere else.
file_path = "diabetes.csv"

try:
    # Read from file_path rather than a repeated string literal, so the path
    # configured above is the one that is actually loaded and reported.
    df = pd.read_csv(file_path)
    print(f"Dataset '{file_path}' loaded successfully.")
except FileNotFoundError:
    print(f"Error: The file 'diabetes.csv' was not found at '{file_path}'.")
    # Fall back to a small hand-written sample so the rest of the cell can
    # still be demonstrated. It deliberately contains NaNs (Glucose,
    # BloodPressure) and 0 placeholders (SkinThickness, Insulin) so the
    # cleaning steps further down have something to work on.
    data = {'Pregnancies': [6, 1, 8, 1, 0, 5, 3],
            'Glucose': [148, 85, 183, np.nan, 137, 116, 78],
            'BloodPressure': [72, 66, 64, 66, np.nan, 74, 50],
            'SkinThickness': [35, 29, 0, 23, 35, 0, 32],
            'Insulin': [0, 0, 0, 94, 168, 0, 88],
            'BMI': [33.6, 26.6, 23.3, 28.1, 43.1, 25.6, 31.0],
            'DiabetesPedigreeFunction': [0.627, 0.351, 0.672, 0.167, 2.288, 0.201, 0.248],
            'Age': [50, 31, 32, 21, 33, 30, 26],
            'Outcome': [1, 0, 1, 0, 1, 0, 1]}
    df = pd.DataFrame(data)
    print("Using a sample DataFrame with missing values for demonstration.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    # Re-raise instead of calling exit(): inside a notebook exit() ends the
    # whole kernel session, whereas re-raising stops this cell (and a
    # "Run All") and keeps the full traceback for debugging.
    raise

# Preview the top of the table.
print("\n--- First 5 rows of the dataset ---")
preview = df.head()
print(preview)

# Column dtypes and non-null counts.
print("\n--- Dataset Info (Data types and non-null counts) ---")
df.info()

# Look for disguised missing values.
# Some diabetes datasets store 0 where a measurement is missing, for features
# such as Glucose, BloodPressure, SkinThickness, Insulin and BMI.
# Count those 'pseudo-missing' zeros for every affected column in one pass.
print("\n--- Counting 'pseudo-missing' values (0s in key columns) ---")
pseudo_missing_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
zeros_per_column = df[pseudo_missing_cols].eq(0).sum()
for column_name, n_zeros in zeros_per_column.items():
    summary = f"{n_zeros} zeros found" if n_zeros > 0 else "No zeros found"
    print(f"Column '{column_name}': {summary}.")

# Clean the dataset by handling missing values
# Strategy: Replace 0s with NaN, then fill NaNs with the mean of the column.
# This is a common practice for this dataset to avoid treating 0 as a valid measurement.
print("\n--- Cleaning the dataset ---")
for col in pseudo_missing_cols:
    df[col] = df[col].replace(0, np.nan)
    print(f"Replaced 0s with NaN in '{col}'.")

# Now, let's see the actual NaNs
print("\n--- Missing values (NaN) per column after converting 0s ---")
print(df.isnull().sum())

# Fill missing numerical values with the mean of their respective columns.
# The mean is taken over the non-missing values only, because the 0
# placeholders were turned into NaN above and pandas skips NaN by default.
print("\n--- Filling missing values with the mean ---")
numerical_cols = df.select_dtypes(include=np.number).columns
for col in numerical_cols:
    if df[col].isnull().any():
        # Assign the result back to the column. The previous
        # `df[col].fillna(..., inplace=True)` operated on a temporary
        # selection: it warns on recent pandas and, with Copy-on-Write
        # enabled, leaves `df` unchanged so the NaNs silently survive.
        df[col] = df[col].fillna(df[col].mean())
        print(f"Filled missing values in '{col}' with its mean.")

# Verify the cleaning
print("\n--- Dataset Info after cleaning ---")
df.info()

print("\n--- First 5 rows of the cleaned dataset ---")
print(df.head())

# Check for any remaining missing values
print("\n--- Final check for missing values ---")
print(df.isnull().sum())

Dataset 'diabetes.csv' loaded successfully.

--- First 5 rows of the dataset ---
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  

--- Dataset Info (Data types and non-null counts) ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ---

In [None]:
import pandas as pd
import numpy as np

# Load the diabetes.csv dataset.
try:
    df = pd.read_csv('diabetes.csv')
except FileNotFoundError:
    print("Error: The 'diabetes.csv' file was not found. Please make sure the file is in the same directory.")
    # Re-raise instead of calling exit(): inside a notebook exit() ends the
    # whole kernel session, whereas re-raising only stops this cell (and a
    # "Run All") and keeps the traceback visible.
    raise

print("Original DataFrame (first 5 rows):")
print(df.head())
print("-" * 50)

# The first cell of this notebook treats a 0 in these measurement columns as a
# placeholder for "not recorded". Because this cell reloads the raw file, the
# placeholders are back; mark them as NaN so describe() and mean() below skip
# them instead of counting them as real readings (which would pull the means
# and minimums down).
zero_placeholder_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[zero_placeholder_cols] = df[zero_placeholder_cols].replace(0, np.nan)

# Task 1: Compute basic statistics of the numerical columns.
# We will use the .describe() method on a selection of key numerical columns.
print("1. Basic Statistics of Key Numerical Columns:")
numerical_stats = df[['Pregnancies', 'Glucose', 'BMI', 'Age']].describe()
print(numerical_stats)
print("-" * 50)

# Task 2: Perform grouping on the 'Outcome' categorical column and compute the mean of 'Glucose' and 'BMI'.
# The 'Outcome' column indicates whether a person is diabetic (1) or not (0).
# This helps us compare key health indicators between the two groups.
print("2. Mean Glucose and BMI by Diabetes Outcome:")
mean_by_outcome = df.groupby('Outcome')[['Glucose', 'BMI']].mean()
print(mean_by_outcome)
print("-" * 50)

# Task 3: Identify any patterns or interesting findings.
# The findings are generated from the data instead of being hard-coded text,
# so they cannot disagree with what was actually computed. The earlier steps
# of this cell only cover Glucose and BMI per outcome and do not describe
# Insulin, so the extra numbers the findings rely on are computed here.
print("3. Findings from the Analysis:")

# Per-outcome means for every metric the findings compare.
findings_means = df.groupby('Outcome')[['Glucose', 'BMI', 'Age', 'Pregnancies']].mean()
if {0, 1}.issubset(findings_means.index):
    for metric in findings_means.columns:
        non_diabetic = findings_means.loc[0, metric]
        diabetic = findings_means.loc[1, metric]
        if diabetic > non_diabetic:
            relation = "higher than"
        elif diabetic < non_diabetic:
            relation = "lower than"
        else:
            relation = "equal to"
        print(f"- Mean {metric} for the diabetic group (Outcome=1) is {diabetic:.2f}, "
              f"{relation} the non-diabetic group (Outcome=0) at {non_diabetic:.2f}.")
else:
    # A comparison needs both classes; say so rather than print a claim.
    print("- Only one Outcome class is present, so the two groups cannot be compared.")

# Spread of the columns the findings describe as wide-ranging.
for metric in ['Glucose', 'Insulin']:
    print(f"- {metric} ranges from {df[metric].min():.1f} to {df[metric].max():.1f} "
          f"(standard deviation {df[metric].std():.2f}).")


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the diabetes.csv dataset.
try:
    df = pd.read_csv('diabetes.csv')
except FileNotFoundError:
    print("Error: The 'diabetes.csv' file was not found. Please make sure the file is in the same directory.")
    # Re-raise instead of calling exit(): inside a notebook exit() ends the
    # whole kernel session, whereas re-raising only stops this cell (and a
    # "Run All") and keeps the traceback visible.
    raise

print("Original DataFrame (first 5 rows):")
print(df.head())
print("-" * 50)

# The first cell of this notebook treats a 0 in these measurement columns as a
# placeholder for "not recorded". Because this cell reloads the raw file, the
# placeholders are back; mark them as NaN so the statistics and the plots
# further down do not treat them as real readings.
zero_placeholder_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[zero_placeholder_cols] = df[zero_placeholder_cols].replace(0, np.nan)

# Task 1: Compute basic statistics of the numerical columns.
# We will use the .describe() method on a selection of key numerical columns.
print("1. Basic Statistics of Key Numerical Columns:")
numerical_stats = df[['Pregnancies', 'Glucose', 'BMI', 'Age']].describe()
print(numerical_stats)
print("-" * 50)

# Task 2: Perform grouping on the 'Outcome' categorical column and compute the mean of 'Glucose' and 'BMI'.
# The 'Outcome' column indicates whether a person is diabetic (1) or not (0).
# This helps us compare key health indicators between the two groups.
print("2. Mean Glucose and BMI by Diabetes Outcome:")
mean_by_outcome = df.groupby('Outcome')[['Glucose', 'BMI']].mean()
print(mean_by_outcome)
print("-" * 50)

# Task 3: Data Visualization
# Four figures, each drawn on its own explicitly created Axes.

# Bar chart: mean Glucose and BMI for each Outcome group.
print("3. Data Visualization:")
fig, ax = plt.subplots(figsize=(10, 6))
# rot=0 keeps the Outcome tick labels horizontal.
mean_by_outcome.plot.bar(ax=ax, color=['skyblue', 'salmon'], rot=0)
ax.set(
    title='Mean Glucose and BMI by Diabetes Outcome',
    xlabel='Outcome (0: Non-Diabetic, 1: Diabetic)',
    ylabel='Mean Value',
)
ax.legend(title='Metric')
ax.grid(axis='y', linestyle='--')
fig.tight_layout()
plt.show()

# Histogram with a KDE overlay: distribution of Glucose levels.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['Glucose'], kde=True, bins=20, color='teal', ax=ax)
ax.set(
    title='Distribution of Glucose Levels',
    xlabel='Glucose Level',
    ylabel='Frequency',
)
ax.grid(axis='y', linestyle='--')
fig.tight_layout()
plt.show()

# Scatter plot: BMI against Glucose, coloured by Outcome.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='BMI', y='Glucose', hue='Outcome', data=df, palette='viridis', s=50, ax=ax)
ax.set(
    title='Relationship between BMI and Glucose',
    xlabel='Body Mass Index (BMI)',
    ylabel='Glucose Level',
)
ax.legend(title='Outcome')
ax.grid(True, linestyle='--')
fig.tight_layout()
plt.show()

# Line chart (stand-in for time-series data).
# The diabetes dataset has no time axis, so BMI is drawn against the row
# index purely to demonstrate the plot type.
fig, ax = plt.subplots(figsize=(10, 6))
df['BMI'].plot.line(ax=ax, color='darkgreen')
ax.set(
    title='BMI Values Across the Dataset',
    xlabel='Sample Index',
    ylabel='BMI',
)
ax.grid(True, linestyle='--')
fig.tight_layout()
plt.show()

# Task 4: Identify any patterns or interesting findings from all analyses.
# The findings are generated from the data instead of being hard-coded text,
# so they cannot disagree with what was actually computed. The earlier steps
# of this cell only cover Glucose and BMI per outcome, do not describe
# Insulin, and never measure correlation or distribution shape, so those
# numbers are computed here.
print("-" * 50)
print("4. Findings from the Analysis:")

# Per-outcome means for every metric the findings compare (Glucose and BMI
# are the two series shown in the bar chart).
findings_means = df.groupby('Outcome')[['Glucose', 'BMI', 'Age', 'Pregnancies']].mean()
if {0, 1}.issubset(findings_means.index):
    for metric in findings_means.columns:
        non_diabetic = findings_means.loc[0, metric]
        diabetic = findings_means.loc[1, metric]
        if diabetic > non_diabetic:
            relation = "higher than"
        elif diabetic < non_diabetic:
            relation = "lower than"
        else:
            relation = "equal to"
        print(f"- Mean {metric} for the diabetic group (Outcome=1) is {diabetic:.2f}, "
              f"{relation} the non-diabetic group (Outcome=0) at {non_diabetic:.2f}.")
else:
    # A comparison needs both classes; say so rather than print a claim.
    print("- Only one Outcome class is present, so the two groups cannot be compared.")

# Spread of the columns the findings describe as wide-ranging.
for metric in ['Glucose', 'Insulin']:
    print(f"- {metric} ranges from {df[metric].min():.1f} to {df[metric].max():.1f} "
          f"(standard deviation {df[metric].std():.2f}).")

# Linear relationship shown in the scatter plot, quantified with Pearson's r
# (rows where either value is missing are ignored by Series.corr).
bmi_glucose_corr = df['BMI'].corr(df['Glucose'])
if bmi_glucose_corr > 0:
    direction = "a positive"
elif bmi_glucose_corr < 0:
    direction = "a negative"
else:
    direction = "no measurable"
print(f"- BMI and Glucose show {direction} linear correlation (Pearson r = {bmi_glucose_corr:.2f}).")

# Shape of the Glucose histogram: a positive skewness means a longer tail
# towards higher glucose levels, a negative one towards lower levels.
glucose_skew = df['Glucose'].skew()
print(f"- Glucose levels have a skewness of {glucose_skew:.2f} "
      f"(0 would be symmetric; positive means a tail towards higher values).")
