# Oral Health Risk Analysis
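This notebook explores a dataset of encoded dental and dietary health features. It plots a correlation heatmap and per-feature distributions, tests each feature for association with `risk_level` (Kruskal-Wallis for numeric features, chi-square for categorical ones), and summarizes the resulting p-values.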

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import f_oneway, kruskal, chi2_contingency
from pandas.api.types import is_numeric_dtype
from warnings import filterwarnings
filterwarnings('ignore')

In [None]:
# Read the training set, then echo its (rows, columns) shape and column names
df = pd.read_csv('training_data.csv')
(df.shape, list(df.columns))

## Correlation Matrix and Heatmap

In [None]:
# Pairwise correlations between the numeric columns, rendered as a heatmap.
# Non-numeric columns are excluded before computing the correlation matrix.
numeric_df = df.select_dtypes(include=np.number)
correlation_matrix = numeric_df.corr()

# Draw on an explicit figure/axes pair instead of the implicit pyplot state
fig, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=False, ax=ax)
ax.set_title('Pearson Correlation Heatmap')
fig.tight_layout()
plt.show()

## Distribution Plots

In [None]:
# One histogram (20 bins, KDE overlay) per column of numeric_df,
# each drawn on its own figure.
for column in numeric_df.columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(df[column], kde=True, bins=20, ax=ax)
    ax.set_title(f'Distribution of {column}')
    ax.set_xlabel(column)
    fig.tight_layout()
    plt.show()

## Statistical Tests Against Risk Level

In [None]:
# Test every feature for association with risk_level and collect
# (variable, test name, p-value) tuples in `results`.
# Numeric features use Kruskal-Wallis across the risk levels; all other
# features use a chi-square test on the feature x risk_level contingency table.

# Rows with a missing risk_level cannot be assigned to a group, so NaN is
# not treated as a level.
risk_groups = df['risk_level'].dropna().unique()
results = []
for column in df.columns:
    if column == 'risk_level':
        continue

    # Keep only rows where both the feature and the label are present;
    # otherwise a single NaN propagates into a NaN p-value.
    observed = df[[column, 'risk_level']].dropna()

    if is_numeric_dtype(observed[column]):
        groups = [
            observed.loc[observed['risk_level'] == level, column]
            for level in risk_groups
        ]
        # Levels with no remaining observations carry no information
        groups = [group for group in groups if len(group) > 0]
        try:
            _, p = kruskal(*groups)
        except ValueError:
            # Raised for fewer than two groups or (in some SciPy versions)
            # when all values are identical; record the test as undefined
            # instead of aborting the whole loop.
            p = np.nan
        results.append((column, 'Kruskal-Wallis', p))
    else:
        contingency = pd.crosstab(observed[column], observed['risk_level'])
        try:
            _, p, _, _ = chi2_contingency(contingency)
        except ValueError:
            # Raised for an empty table or zero expected frequencies
            p = np.nan
        results.append((column, 'Chi-square', p))

## Summary Table with p-values

In [None]:
# Tabulate the collected test results, flag p-values below 0.05,
# and list the smallest p-values first.
summary_df = pd.DataFrame(results, columns=['Variable', 'Test', 'p-value'])
is_significant = summary_df['p-value'] < 0.05
summary_df['Significant (<0.05)'] = is_significant
summary_df = summary_df.sort_values(by='p-value')
summary_df