<a href="https://www.kaggle.com/code/varun7709yy/maths-project-probability-and-statistics?scriptVersionId=221675942" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import os
# Show what is mounted under the Kaggle input directory.
input_entries = os.listdir('/kaggle/input')
print(input_entries)

In [None]:
# Import necessary libraries
import pandas as pd  

# Read the diabetes CSV from the Kaggle input mount into a DataFrame.
DATA_PATH = '/kaggle/input/pima-indians-diabetes-database/diabetes.csv'
df = pd.read_csv(DATA_PATH)

# Preview the first five rows as the cell's rich output.
df.head(5)

In [None]:
# Dimensions of the DataFrame as a (number of rows, number of columns) tuple.
df.shape

In [None]:
# Print a concise summary of the DataFrame: column names, non-null counts and dtypes.
df.info()

In [None]:
# Count missing (NaN/None) entries in each column.
# NOTE(review): this only detects NaN/None; if the dataset encodes missing
# measurements as placeholder zeros they will not be counted here — confirm.
df.isna().sum()

In [None]:
# Display summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Histogram of the Glucose column (30 bins) with a kernel-density overlay.
glucose = df['Glucose']
ax = sns.histplot(glucose, bins=30, kde=True)

# Label the axes object directly so the figure stands on its own.
ax.set_title("Glucose Level Distribution")
ax.set_xlabel("Glucose Level (mg/dL)")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
import numpy as np

# Pairwise correlation between every pair of columns (pandas default method).
corr_matrix = df.corr()

# Draw the matrix as a heatmap, annotating each cell with its value to two decimals.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()

In [None]:
# Boolean masks for the two events used in the Bayes computation.
is_diabetic = df['Outcome'] == 1
is_high_glucose = df['Glucose'] > 140

# Prior: P(Diabetes=1), the mean of the Outcome column.
p_diabetes = df['Outcome'].mean()

# Likelihood: P(Glucose > 140 | Diabetes=1), the share of high readings among diabetic rows.
p_glucose_given_diabetes = is_high_glucose[is_diabetic].mean()

# Evidence: P(Glucose > 140) over all rows.
p_glucose = is_high_glucose.mean()

# Posterior via Bayes' theorem: P(D | G) = P(G | D) * P(D) / P(G).
numerator = p_glucose_given_diabetes * p_diabetes
p_diabetes_given_glucose = numerator / p_glucose

print(f"Probability of having diabetes given high glucose levels: {p_diabetes_given_glucose:.2f}")

In [None]:
from scipy.stats import ttest_ind

# Split ages by outcome: diabetic (Outcome == 1) vs. non-diabetic (Outcome == 0).
age_diabetes = df.loc[df['Outcome'] == 1, 'Age']
age_no_diabetes = df.loc[df['Outcome'] == 0, 'Age']

# Two-sample t-test on mean age. equal_var=False selects Welch's t-test, which
# does not assume the two groups share a variance; the default (pooled-variance
# Student's test) can give misleading p-values when group sizes and variances
# differ, and nothing here checks that they are equal.
t_stat, p_value = ttest_ind(age_diabetes, age_no_diabetes, equal_var=False)

print(f"T-Statistic: {t_stat:.2f}, P-Value: {p_value:.4f}")

# Significance level for the decision below.
alpha = 0.05

# The test compares group means only, so the conclusion is phrased as a
# difference in mean age between groups rather than as an effect of age on diabetes.
if p_value < alpha:
    print("Mean age differs significantly between the diabetic and non-diabetic groups.")
else:
    print("No significant difference in mean age between the diabetic and non-diabetic groups.")

In [None]:
from sklearn.model_selection import train_test_split

# Define features (X) and target variable (y)
X = df.drop(columns=['Outcome'])  # All columns except 'Outcome'
y = df['Outcome']  # Target variable (0 or 1)

# Split into 80% training and 20% testing data.
# stratify=y keeps the proportion of each Outcome class the same in both
# splits, so the test metrics are not skewed by a split that happens to
# over- or under-represent one class. random_state fixes the shuffle so the
# split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training Data: {X_train.shape}, Testing Data: {X_test.shape}")

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Fit a logistic-regression classifier on the training split
# (fit() returns the estimator itself, so `model` is the fitted classifier).
model = LogisticRegression(max_iter=500).fit(X_train, y_train)

# Predicted class labels for the held-out test features.
y_pred = model.predict(X_test)

# Overall fraction of test rows classified correctly.
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Per-class precision, recall, F1 and support.
report = classification_report(y_test, y_pred)
print(report)