# Diabetes Dataset Analysis

## 1. Loading and Inspecting the Data

In [None]:
import pandas as pd

# Read the diabetes CSV into a DataFrame
csv_path = '/mnt/data/diabetes.csv'
data = pd.read_csv(csv_path)

# Preview the first rows to get a feel for the columns and their values
data.head()

## 2. Checking Dataset Shape

In [None]:
# (number of rows, number of columns) of the loaded DataFrame
data.shape

## 3. Summary Statistics

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
data.describe()

## 4. Distribution of Target Variable

In [None]:
# Number of rows for each distinct value of the 'Outcome' column
data['Outcome'].value_counts()

## 5. Checking for Missing Data

In [None]:
# Count the NaN entries in each column.
# NOTE(review): this only detects values pandas parsed as NaN. If the file
# encodes missing measurements with a sentinel such as 0, they are not
# counted here — confirm against the dataset's documentation.
data.isnull().sum()

## 6. Detecting Outliers with Boxplots

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Draw a boxplot of the DataFrame's columns on one set of axes so that
# extreme values are easy to spot
fig, ax = plt.subplots(figsize=(15, 10))
sns.boxplot(data=data, ax=ax)

# Rotate the column names so they do not overlap
plt.setp(ax.get_xticklabels(), rotation=90)
plt.show()

## 7. Removing Outliers (Optional)

In [None]:
# Compute the IQR fences on numeric feature columns only. The target column
# ('Outcome') is a class label, not a measurement: if it were included, every
# minority-class row would be dropped whenever its IQR collapses to 0.
# Non-numeric columns are excluded because quantile() cannot handle them.
numeric_features = data.select_dtypes(include='number').drop(columns='Outcome', errors='ignore')
Q1 = numeric_features.quantile(0.25)
Q3 = numeric_features.quantile(0.75)
IQR = Q3 - Q1

# Flag rows where any feature lies more than 1.5*IQR below Q1 or above Q3
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
is_outlier = ((numeric_features < lower_fence) | (numeric_features > upper_fence)).any(axis=1)

# Keep all original columns, but only the rows that were not flagged
data_cleaned = data[~is_outlier]
data_cleaned.shape  # Check the new shape after removing outliers

## 8. Correlation Heatmap

In [None]:
# Correlate numeric columns only. Later cells add categorical columns to
# `data`, and DataFrame.corr() no longer silently drops non-numeric columns
# in pandas >= 2.0, so re-running this cell after them would otherwise fail.
numeric_corr = data.select_dtypes(include='number').corr()

plt.figure(figsize=(10,8))
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm', linewidths=0.5)
plt.show()

## 9. Data Visualization (Distributions)

In [None]:
# Histogram with a KDE overlay for each selected feature, side by side
plt.figure(figsize=(15, 5))

for position, column in enumerate(('Glucose', 'BMI', 'Age'), start=1):
    plt.subplot(1, 3, position)
    sns.histplot(data[column], kde=True)
    plt.title(f'{column} Distribution')

plt.show()

## 10. Pair Plot for Feature Relationships

In [None]:
# Pairwise plots of the columns, coloured by 'Outcome', with KDE curves on
# the diagonal
pair_grid = sns.pairplot(data=data, hue='Outcome', diag_kind='kde')
plt.show()

## 11. Feature Engineering (BMI Categories)

In [None]:
# Standard adult BMI classes, expressed as left-closed intervals:
#   [0, 18.5) Underweight, [18.5, 25) Normal, [25, 30) Overweight, [30, inf) Obese
# The last edge is open-ended so that BMI values above 40 are labelled
# 'Obese' instead of silently becoming NaN.
bmi_bins = [0, 18.5, 25, 30, float('inf')]
bmi_labels = ['Underweight', 'Normal', 'Overweight', 'Obese']

# A non-positive BMI is not a real measurement; mask it so it stays
# uncategorised (NaN) rather than being labelled 'Underweight'.
valid_bmi = data['BMI'].where(data['BMI'] > 0)

data['BMI_Category'] = pd.cut(valid_bmi, bins=bmi_bins, labels=bmi_labels, right=False)

# Check the first few rows to verify the new column
data[['BMI', 'BMI_Category']].head()

## 12. Feature Engineering (Age Grouping)

In [None]:
# Create four age groups using right-closed intervals:
#   (0, 30] Young, (30, 40] Middle-Aged, (40, 50] Senior, (50, inf) Elderly
# The interior boundaries (30, 40, 50) are unchanged. The outer edges are
# open-ended so that ages of 20 or below, or above 100, receive a label
# instead of silently becoming NaN.
age_bins = [0, 30, 40, 50, float('inf')]
age_labels = ['Young', 'Middle-Aged', 'Senior', 'Elderly']

data['Age_Group'] = pd.cut(data['Age'], bins=age_bins, labels=age_labels)

# Check the first few rows to verify the new column
data[['Age', 'Age_Group']].head()

## 13. Feature Importance using Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Build the feature matrix. `data` may contain categorical columns (for
# example the engineered 'BMI_Category' and 'Age_Group'), and
# RandomForestClassifier cannot fit on raw string labels. One-hot encode
# them; numeric columns pass through unchanged, and a missing category
# becomes an all-zero row in its indicator columns.
X = pd.get_dummies(data.drop(columns='Outcome'))
y = data['Outcome']

# Fixed random_state keeps the fitted forest (and its importances) reproducible
rf = RandomForestClassifier(random_state=42)
rf.fit(X, y)

# Rank the features from most to least important
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]

# Plot the feature importances as a bar chart in ranked order
n_features = X.shape[1]
plt.figure(figsize=(10,6))
plt.title('Feature Importance')
plt.bar(range(n_features), importances[indices], align='center')
plt.xticks(range(n_features), X.columns[indices], rotation=90)
plt.show()

## 14. Train-Test Split Variations

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train_70, X_test_30, y_train_70, y_test_30 = split_parts

# Report the size of each partition
for caption, partition in (('Training set:', X_train_70), ('Testing set:', X_test_30)):
    print(caption, partition.shape)