In [None]:
# Important Libraries Required
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import io
import matplotlib.ticker as mtick
from scipy.stats import skew
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from scipy.stats import shapiro

In [None]:
# Load the final dataset from its CSV file into `df`.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact location exists.
csv_path = '/Users/rubyc/Desktop/Capstone_2025/Office-of-Catholic-Schools-MSDS-25.3-/Data/final_dataset.csv'
df = pd.read_csv(csv_path)

In [None]:
# Basic information on the data (dtypes, non-null counts, memory usage).
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
df.info()

In [None]:
# Count missing values per column, largest first, and show only the columns
# that actually have at least one missing value.
missing = (
    df.isna()
    .sum()
    .sort_values(ascending=False)
)
missing.loc[missing > 0]

In [None]:
# Replace missing values with 0 in the selected columns, then print the
# remaining null count per column as a check (all should be 0).
# NOTE(review): 0 is imputed for every column listed, including the
# household-size and median-income columns - confirm that is intended.
columns_to_fill = [
    'fulcrum_grant_mean', 'fulcrum_total_app', 'fulcrum_grant_count',
    'Nonfamily Avg Household Size', 'Med inc_Med income families',
]
df[columns_to_fill] = df[columns_to_fill].fillna(value=0)
print(df[columns_to_fill].isna().sum())

In [None]:
# Summary statistics for the numeric columns, transposed so that each
# column of df becomes one row of the summary table.
df.describe(include='number').transpose()

In [None]:
# Number of distinct schools (by school_id) present in each Year
schools_per_year = (
    df.groupby('Year')['school_id']
    .nunique()
    .rename('Total Schools')
    .reset_index()
)
print(schools_per_year)

In [None]:
# Number of distinct values in every column, fewest first
unique_vals = df.nunique().sort_values()
unique_vals

In [None]:
# Number of distinct schools (by school_id) in each Region
schools_per_region = (
    df.groupby('Region')['school_id']
    .nunique()
    .rename('Total Schools')
    .reset_index()
)
print(schools_per_region)

In [None]:
# Split the column names by dtype: numeric columns vs. object (string) columns
num_cols = list(df.select_dtypes(include='number').columns)
cat_cols = list(df.select_dtypes(include='object').columns)
print("Numerical Columns:", num_cols, sep="\n")
print("\nCategorical Columns:", cat_cols, sep="\n")

In [None]:
# Hand-picked columns for the analysis. These assignments replace the
# dtype-derived cat_cols / num_cols lists built in the previous cell.
# The section comments below group entries by their names only.
cat_cols = [
    'type',
    'Region',
    'location_type',
    'District',
    'parish',
    'level',
]

num_cols = [
    # revenue lines
    'tuition_fees_finaid_scholarships',
    'parish_support_direct',
    'neighboring_par_support',
    'fundraising_parents_club',
    'bequests',
    'gift_revenue_44xx',
    'business_revenue_45xx',
    'all_other_revenue',
    # expense lines and totals
    'salaries_51xx',
    'benefits_52xx',
    'supplies',
    'repairs_maintenance_58xx',
    'program_expenses',
    'contracted_services',
    'interest_expense_6106_6106',
    'fundraising_expense_6180',
    'utilities',
    'depreciation_bad_debts_62xx',
    'all_other_expenses',
    'net_program_80',
    'total_revenue_80',
    'total_expenses_80',
    # TS-* / TPS-* columns
    'TS-Enrollment',
    'TS-Capacity',
    'TS-Catholic',
    'TS-Non-Catholic',
    'TS-Hispanic',
    'TPS-Catholic',
    'TPS-Non-Catholic',
    'TPS-Employment',
    # fulcrum_* columns
    'fulcrum_total_app',
    'fulcrum_grant_count',
    'fulcrum_grant_mean',
    # household / family columns
    'Total Households',
    'Total Families',
    'Households with Children Under 18',
    'Owner-Occupied Units',
    'Renter-Occupied Units',
    'Nonfamily Total Households',
    'Avg Household Size',
    'Avg Family Size',
    'Nonfamily Avg Household Size',
    'Married Avg Household Size',
    'Married Avg Family Size',
    # poverty-level columns
    'Income Below Poverty Level (Male)',
    'Income Below Poverty Level (Female)',
    'Income At or Above Poverty Level',
    'Income At or Above Poverty Level (Male)',
    'Income At or Above Poverty Level (Female)',
    # population columns
    'Total Population',
    'Male (Male)',
    'Female (Female)',
    'child_15_17',
    'child_10_14',
    'child_5_9',
    'child_under_5',
    'male_young_adults',
    'male_middle_age',
    'male_older_adults',
    'female_young_adults',
    'female_middle_age',
    'female_older_adults',
    # median income columns
    'Med inc_all households',
    'Med inc_Med income families',
    # pub_enroll_* columns
    'pub_enroll_all_students',
    'pub_enroll_female',
    'pub_enroll_male',
    'pub_enroll_americanindian_alaskanative',
    'pub_enroll_asian',
    'pub_enroll_black_africanamerican',
    'pub_enroll_hispanic_latino',
    'pub_enroll_nativehawaiian_pacificislander',
    'pub_enroll_two_more',
    'pub_enroll_white',
    # Percent* ELA / Math columns
    'PercentMetStandard_ELA',
    'PercentMetStandard_Math',
    'PercentLevel1_ELA',
    'PercentLevel1_Math',
    'PercentLevel2_ELA',
    'PercentLevel2_Math',
    'PercentLevel3_ELA',
    'PercentLevel3_Math',
    'PercentLevel4_ELA',
    'PercentLevel4_Math',
]

In [None]:
# Histogram (with KDE overlay) of every numeric column except identifiers.
# Note: this rebinds num_cols to all numeric columns of df (minus `excluded`),
# replacing the hand-picked list defined in the previous cell.
excluded = ['school_id', 'zip_code']
num_cols = df.select_dtypes(include='number').drop(columns=excluded, errors='ignore').columns

num_cols_grid = 5
# Ceiling division: exactly as many rows as needed. The previous
# `len // 5 + 1` reserved an extra, empty row whenever the number of columns
# was a multiple of 5. max(1, ...) keeps the figure height positive.
num_rows = max(1, -(-len(num_cols) // num_cols_grid))

# Plotting histograms
plt.figure(figsize=(15, num_rows * 3))

for i, col in enumerate(num_cols, 1):
    plt.subplot(num_rows, num_cols_grid, i)
    sns.histplot(df[col].dropna(), kde=True)
    plt.title(col, fontsize=10)

# tight_layout() computes the subplot spacing itself, so the earlier
# subplots_adjust(hspace=0.5) call had no lasting effect and was dropped.
plt.tight_layout()
plt.show()

In [None]:
# Skewness of each numeric column; print only those with |skew| > 1
skewness = df[num_cols].skew()
skewness_df = skewness.rename_axis('col').reset_index(name='skew')
print(skewness_df[skewness_df['skew'].abs() > 1])

#### Visualizations

In [None]:
# Total TS-Enrollment summed over all rows of each Year, drawn as a line
total_enrollment_by_year = df.groupby('Year')['TS-Enrollment'].sum()

fig, ax = plt.subplots(figsize=(10, 6))
total_enrollment_by_year.plot(ax=ax, kind='line', marker='o', color='purple', linewidth=2)
ax.set_title('Enrollment Trends')
ax.set_xlabel('Academic Year')
ax.set_ylabel('Total Enrollment')
plt.xticks(rotation=45)
ax.grid(True)
plt.show()

In [None]:
# Fulcrum application trend: total fulcrum_total_app per Year
enrollment_by_year = df.groupby('Year')['fulcrum_total_app'].sum().reset_index()

plt.figure(figsize=(10, 6))
plt.plot(enrollment_by_year['Year'], enrollment_by_year['fulcrum_total_app'], marker='o', color='purple')
# Anchor every label at the x value of the point it annotates. Using the
# enumerate() position (0, 1, 2, ...) only lined up when Year was plotted as a
# string/categorical axis; with numeric years the labels landed far from the
# data. Passing the Year value itself works for both dtypes and matches the
# percentage plot in the next cell.
for year, value in zip(enrollment_by_year['Year'], enrollment_by_year['fulcrum_total_app']):
    plt.text(year, value, f'{value:.0f}', ha='center', va='bottom')
plt.xlabel('Year')
plt.ylabel('Total Fulcrum Applications')
plt.title('Total Fulcrum Applications Across Years')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Fulcrum applications expressed as a percentage of total enrollment, per Year
df_year = df.groupby('Year')[['fulcrum_total_app', 'TS-Enrollment']].sum().reset_index()

# Yearly applications divided by yearly enrollment, scaled to percent
df_year['percent'] = df_year['fulcrum_total_app'].div(df_year['TS-Enrollment']).mul(100)

# Trend line with a value label above every point
plt.plot(df_year['Year'], df_year['percent'], marker='o', color='purple')
for row in df_year.itertuples(index=False):
    plt.text(row.Year, row.percent, f'{row.percent:.1f}%', ha='center', va='bottom')
plt.title('Fulcrum Applications (% of Total Enrollment)')
plt.xlabel('Year')
plt.ylabel('Percentage')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Table behind the percentage trend: yearly enrollment, yearly Fulcrum
# applications, and applications as a percentage of enrollment.
data_by_year = (
    df.groupby('Year')[['TS-Enrollment', 'fulcrum_total_app']]
    .sum()
    .assign(Percentage=lambda totals: (totals['fulcrum_total_app'] / totals['TS-Enrollment']) * 100)
)
print(data_by_year)

In [None]:
# Correlation analysis: heatmap of the 15 columns whose strongest correlation
# with some OTHER column is highest, restricted to |correlation| > 0.4.
corr_matrix = df[num_cols].corr()

# Every column correlates 1.0 with itself, so the diagonal must be masked
# before filtering and ranking. Without the mask the "> 0.4" test passed for
# every column and the top-15 ranking was a tie at 1.0.
off_diag = corr_matrix.abs().mask(np.eye(len(corr_matrix), dtype=bool))

# Filtering: keep columns with at least one off-diagonal |r| above 0.4
filtered_cols = off_diag.columns[(off_diag > 0.4).any()]
filtered_corr = corr_matrix.loc[filtered_cols, filtered_cols]

# Rank the kept columns by their strongest off-diagonal correlation
top_cols = (
    off_diag.loc[filtered_cols, filtered_cols]
    .max()
    .sort_values(ascending=False)
    .head(15)
    .index
)
top_corr = filtered_corr.loc[top_cols, top_cols]

# Creating the heatmap (skipped when nothing passes the threshold, because
# seaborn cannot draw an empty matrix)
if len(top_cols) == 0:
    print('No pair of numerical columns has |correlation| > 0.4')
else:
    plt.figure(figsize=(12, 10))
    sns.heatmap(top_corr, cmap='coolwarm', center=0, annot=True, fmt='.2f', square=True)
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.title('Numerical Columns with |Correlation| > 0.4')
    plt.tight_layout()
    plt.show()

In [None]:
# Box plot of enrollment for each categorical variable in cat_vars, with the
# categories ordered by their median enrollment (outliers hidden).
cat_vars = ['Region']
num_var = 'TS-Enrollment'

plt.figure(figsize=(16, 6))

for i, cat in enumerate(cat_vars, 1):
    # One panel per entry of cat_vars. The grid was hard-coded to two columns,
    # which left half of the figure empty when only one variable is listed.
    plt.subplot(1, len(cat_vars), i)
    order = df.groupby(cat)[num_var].median().sort_values().index
    sns.boxplot(
        x=cat,
        y=num_var,
        data=df,
        hue=cat,  # colour each box by its own category
        palette='Set2',
        order=order,
        showfliers=False,
        legend=False
    )
    plt.title(f'{num_var} by {cat}', fontsize=12)
    plt.xticks(rotation=30, ha='right')
    plt.xlabel(cat, fontsize=10)
    plt.ylabel(num_var, fontsize=10)

plt.tight_layout()
plt.show()

In [None]:
# Enrollment trend per Region: one stacked line panel per region, each with
# its own y scale.
df_sorted = df.sort_values(by='Year')
g = sns.relplot(
    data=df_sorted,
    x='Year', y='TS-Enrollment',
    col='Region', col_wrap=1,
    kind='line', marker='o', color='purple',
    height=4, aspect=2.5,
    facet_kws=dict(sharey=False),
)
# Turn the x tick labels on for every panel, not only the bottom one
for panel in g.axes.flat:
    panel.tick_params(labelbottom=True)

plt.tight_layout()
plt.show()


In [None]:
# Yearly totals of Catholic and Non-Catholic students, drawn as two lines
df_grouped = df.groupby('Year')[['TS-Catholic', 'TS-Non-Catholic']].sum().reset_index()

plt.figure(figsize=(10, 6))
series_specs = (
    ('TS-Catholic', 'Catholic Enrollment', 'purple'),
    ('TS-Non-Catholic', 'Non-Catholic Enrollment', 'blue'),
)
for column, legend_label, line_color in series_specs:
    sns.lineplot(data=df_grouped, x='Year', y=column, label=legend_label, marker='o', color=line_color)
plt.title('Catholic vs Non-Catholic Enrollment Over Years')
plt.tight_layout()
plt.show()

In [None]:
# Catholic and Non-Catholic students as a percentage of total enrollment, per Year
df_grouped = df.groupby('Year')[['TS-Catholic', 'TS-Non-Catholic', 'TS-Enrollment']].sum().reset_index()
for source_col, pct_col in (('TS-Catholic', 'Catholic_%'), ('TS-Non-Catholic', 'Non_Catholic_%')):
    df_grouped[pct_col] = (df_grouped[source_col] / df_grouped['TS-Enrollment']) * 100

# Plotting: one line per group, then a value label slightly above each point
plt.figure(figsize=(10, 6))
for pct_col, legend_label, line_color in (
    ('Catholic_%', 'Catholic %', 'purple'),
    ('Non_Catholic_%', 'Non-Catholic %', 'blue'),
):
    sns.lineplot(data=df_grouped, x='Year', y=pct_col, label=legend_label, marker='o', color=line_color)
for year, catholic_pct, non_catholic_pct in zip(
    df_grouped['Year'], df_grouped['Catholic_%'], df_grouped['Non_Catholic_%']
):
    plt.text(year, catholic_pct + 0.5, f"{catholic_pct:.1f}%", ha='center', color='purple')
    plt.text(year, non_catholic_pct + 0.5, f"{non_catholic_pct:.1f}%", ha='center', color='blue')

plt.title('Catholic vs Non-Catholic Enrollment Percentage Over Years')
plt.xlabel('Year')
plt.ylabel('Enrollment Percentage')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Comparison of the population of children younger than 15 with enrollment.
# Both yearly totals are rescaled to a percentage of their own maximum so the
# two series share one axis.
df['child_under_15'] = df['child_under_5'].add(df['child_5_9']).add(df['child_10_14'])

child_enroll_trend = df.groupby('Year')[['child_under_15', 'TS-Enrollment']].sum().reset_index()
for column in ('child_under_15', 'TS-Enrollment'):
    child_enroll_trend[f'{column}_pct'] = (child_enroll_trend[column] / child_enroll_trend[column].max()) * 100

# Plotting the percentage line chart
plt.figure(figsize=(10, 6))
sns.lineplot(
    data=child_enroll_trend, x='Year', y='child_under_15_pct',
    label='Children Under 15 (%)', marker='o', color='purple',
)
sns.lineplot(
    data=child_enroll_trend, x='Year', y='TS-Enrollment_pct',
    label='Total Enrollment (%)', linestyle='--', color='blue', marker='o',
)
plt.title('Children Under 15 vs Enrollment')
plt.ylabel('Percentage of Max Value')
plt.tight_layout()
plt.show()

In [None]:
# Total enrollment per Region and Year, reshaped to one row per Region and
# one column per Year.
region_enrollment_trends = df.groupby(['Region', 'Year'], as_index=False)['TS-Enrollment'].sum()
pivot_table = region_enrollment_trends.pivot(
    index='Region',
    columns='Year',
    values='TS-Enrollment',
)
print(pivot_table)

In [None]:
# Trend analysis on child populations: change per school between consecutive
# rows for each age band, averaged by Region.
# diff() subtracts the previous ROW within each school, so the rows must be in
# Year order first; the diff was previously taken in file order. The result
# keeps the original index labels, so assigning it back to df aligns row by
# row (this relies on df having a unique index).
df_by_school_year = df.sort_values(['school_id', 'Year'])
for age_col in ['child_under_5', 'child_5_9', 'child_10_14']:
    df[f'{age_col}_change'] = df_by_school_year.groupby('school_id')[age_col].diff()

age_group_columns = ['child_under_5_change', 'child_5_9_change', 'child_10_14_change']
demographic_change_by_region = df.groupby('Region')[age_group_columns].mean().reset_index()

plt.figure(figsize=(12, 6))
for col in age_group_columns:
    plt.plot(demographic_change_by_region['Region'], demographic_change_by_region[col], label=col)
plt.title('Average Demographic Shifts by Region (Line Plot)')
plt.xlabel('Region')
plt.ylabel('Average Change in Population')
plt.xticks(rotation=45)
plt.grid(True)
plt.legend(title='Age Group')
plt.tight_layout()
plt.show()


In [None]:
# Enrollment trend vs. population under 15, one dual-axis figure per region
df['child_under_15'] = df['child_under_5'].add(df['child_5_9']).add(df['child_10_14'])
trend_data = df.groupby(['Region', 'Year'])[['child_under_15', 'TS-Enrollment']].sum().reset_index()
regions = trend_data['Region'].unique()

# Left axis: children under 15. Right axis: TS-Enrollment.
for region in regions:
    region_data = trend_data.loc[trend_data['Region'] == region]
    years = region_data['Year']

    fig, ax1 = plt.subplots(figsize=(12, 6))
    ax1.plot(years, region_data['child_under_15'], label='Children Under 15', marker='o', color='purple')
    ax1.set_xlabel('Year')
    ax1.set_ylabel('Children Under 15', color='purple')
    ax1.tick_params(axis='y', labelcolor='purple')

    # Second axis shares the x axis with the first
    ax2 = ax1.twinx()
    ax2.plot(years, region_data['TS-Enrollment'], label='TS-Enrollment', marker='x', linestyle='--', color='blue')
    ax2.set_ylabel('TS-Enrollment', color='blue')
    ax2.tick_params(axis='y', labelcolor='blue')

    ax2.set_title(f'Children Under 15 vs Enrollment in {region}')
    fig.tight_layout()
    plt.show()


In [None]:
# Region-wise enrollment trend per school. A school is drawn dashed when its
# most recent year-over-year enrollment change is negative, solid otherwise.
# diff() works on consecutive rows, so rows are ordered by Year within each
# school before differencing; the result is aligned back to df by index label
# (this relies on df having a unique index).
df['enrollment_change'] = (
    df.sort_values(['Name', 'Year'])
    .groupby('Name')['TS-Enrollment']
    .diff()
)

colors = sns.color_palette('Set2', n_colors=df['Name'].nunique())
color_map = dict(zip(df['Name'].unique(), colors))

# Plotting by region
for region in df['Region'].unique():
    plt.figure(figsize=(12, 6))
    # Sorting by Year makes iloc[-1] below the latest year and draws each
    # line in chronological order; both depended on file order before.
    region_data = df[df['Region'] == region].sort_values('Year')

    for name, group in region_data.groupby('Name'):
        change = group['enrollment_change'].iloc[-1]
        # A NaN change (only one row for the school) compares False: solid
        style = '--' if change < 0 else '-'
        plt.plot(group['Year'], group['TS-Enrollment'], style, label=name, color=color_map[name])

    plt.title(f'Enrollment Trends in {region}')
    plt.xlabel('Year')
    plt.ylabel('Enrollment')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


In [None]:
# Comparison between tuition income and salaries: salaries as a percentage of
# tuition income, per Year.
year_data = df.groupby('Year')[['tuition_fees_finaid_scholarships', 'salaries_51xx']].sum()
year_data['salary_percentage'] = (year_data['salaries_51xx'] / year_data['tuition_fees_finaid_scholarships']) * 100

# Plot against integer positions 0..n-1 and label the ticks with the years.
# The line used to be plotted against the index values while the labels and
# ticks used positions, which only coincided when Year was a string axis.
x = np.arange(len(year_data))

# Plotting
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(x, year_data['salary_percentage'], marker='o', color='purple', label='Salaries % of Tuition Fees')
max_percentage = year_data['salary_percentage'].max()
for xpos, v in zip(x, year_data['salary_percentage']):
    ax.text(xpos, v + 1, f'{v:.1f}%', ha='center', va='bottom')
ax.set_ylim(0, max_percentage + 10)  # headroom so the top label is not clipped
ax.set_title('Percentage of Tuition Fees Used for Salaries Over Time')
ax.set_xlabel('Year')
ax.set_ylabel('Percentage (%)')
ax.set_xticks(x)
ax.set_xticklabels(year_data.index, rotation=45)
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Tuition income vs. home ownership: grouped bars of owner/renter-occupied
# units per Year (left axis) with the tuition total as a line (right axis).
year_data = df.groupby('Year')[
    ['tuition_fees_finaid_scholarships', 'Owner-Occupied Units', 'Renter-Occupied Units']
].sum()
x = np.arange(len(year_data))
bar_width = 0.35
colors = sns.color_palette('Set2', 3)
owner_color = colors[0]
renter_color = colors[1]
tuition_color = 'purple'  # colour of the tuition trend line and its axis

# Primary axis: the two bar series, offset half a bar width either side
fig, ax1 = plt.subplots(figsize=(10, 6))
for offset, column, bar_color in (
    (-bar_width / 2, 'Owner-Occupied Units', owner_color),
    (bar_width / 2, 'Renter-Occupied Units', renter_color),
):
    ax1.bar(x + offset, year_data[column], width=bar_width, color=bar_color, label=column)
ax1.set_xlabel('Year')
ax1.set_ylabel('Number of Units')
ax1.set_xticks(x)
ax1.set_xticklabels(year_data.index, rotation=45)
ax1.grid(False)  # no grid lines on the primary axis

# Secondary axis for tuition fees
ax2 = ax1.twinx()
ax2.plot(x, year_data['tuition_fees_finaid_scholarships'], marker='o', color=tuition_color, label='Tuition Fees')
ax2.set_ylabel('Tuition Fees, FinAid, Scholarships ($)', color=tuition_color)
ax2.tick_params(axis='y', colors=tuition_color)
ax2.grid(False)

# One combined legend holding the entries of both axes
bar_handles, bar_labels = ax1.get_legend_handles_labels()
line_handles, line_labels = ax2.get_legend_handles_labels()
lines = bar_handles + line_handles
labels = bar_labels + line_labels
ax1.legend(lines, labels, loc='upper left', bbox_to_anchor=(1.05, 1))

plt.title('Owner/Renter-Occupied Units with Tuition Fees Trend Over Time')
plt.tight_layout()
plt.show()

In [None]:
# Total revenue vs. total expenses per Year as side-by-side bars
df_grouped = df.groupby('Year')[['total_revenue_80', 'total_expenses_80']].sum().reset_index()
x = np.arange(len(df_grouped))
width = 0.35
colors = sns.color_palette('Set2', 2)
revenue_color, expense_color = colors

# Each series is shifted half a bar width away from the tick position
fig, ax = plt.subplots(figsize=(10, 6))
for offset, column, legend_label, bar_color in (
    (-width / 2, 'total_revenue_80', 'Revenue', revenue_color),
    (width / 2, 'total_expenses_80', 'Expenses', expense_color),
):
    ax.bar(x + offset, df_grouped[column], width, label=legend_label, color=bar_color)
ax.set(
    xlabel='Year',
    ylabel='Amount ($)',
    title='Total Revenue vs Total Expenses Over Time',
)
ax.set_xticks(x)
ax.set_xticklabels(df_grouped['Year'], rotation=45)
ax.legend()
plt.tight_layout()
plt.show()

In [None]:
# Comparison between revenue and expenses: expenses as a percentage of
# revenue, per Year.
df_grouped = df.groupby('Year')[['total_revenue_80', 'total_expenses_80']].sum().reset_index()
df_grouped['expense_pct'] = df_grouped['total_expenses_80'].div(df_grouped['total_revenue_80']).mul(100)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(df_grouped['Year'], df_grouped['expense_pct'], marker='o', color='purple', label='Expense % of Revenue')
# Value label just above every point
for row in df_grouped.itertuples(index=False):
    ax.text(row.Year, row.expense_pct + 0.17, f'{row.expense_pct:.1f}%', ha='center', va='bottom', fontsize=9)
ax.set(
    xlabel='Year',
    ylabel='Expenses as % of Revenue',
    title='Percentage of Expenses from Revenue Over Time',
)
ax.yaxis.set_major_formatter(mtick.PercentFormatter())
ax.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# Comparison between TS enrollment and public-school enrollment across years,
# one figure per region. Each series is divided by its own maximum within the
# region so both fit on one axis.
sns.set_theme(style="white")

# Year becomes a string; values without an underscore get one inserted after
# the fourth character. Note that this changes df['Year'] for later cells too.
df['Year'] = df['Year'].astype(str).map(
    lambda label: label if '_' in label else f"{label[:4]}_{label[4:]}"
)

regions = df['Region'].unique()

for region in regions:
    region_data = df.loc[df['Region'] == region].copy()
    # Sort on the numeric value of the first four characters of Year
    region_data['Year'] = region_data['Year'].str[:4].astype(int)
    region_data = region_data.sort_values('Year')
    for source_col, norm_col in (
        ('pub_enroll_all_students', 'pub_enroll_norm'),
        ('TS-Enrollment', 'ts_enroll_norm'),
    ):
        region_data[norm_col] = region_data[source_col] / region_data[source_col].max()
    # Back to strings for plotting
    region_data['Year'] = region_data['Year'].astype(str)

    plt.figure(figsize=(10, 6))
    for norm_col, legend_label, marker_style, line_color in (
        ('pub_enroll_norm', 'Public School Enrollments', 'o', 'purple'),
        ('ts_enroll_norm', 'Enrollment across Archdiocese', 's', 'blue'),
    ):
        sns.lineplot(data=region_data, x='Year', y=norm_col, label=legend_label, marker=marker_style, color=line_color)
    plt.title(f'Normalized Enrollment Trends in {region}')
    plt.xlabel('Year')
    plt.ylabel('Enrollment (Normalized at a scale of 0 to 1)')
    plt.xticks(rotation=45)
    plt.legend()
    plt.grid(False)
    plt.tight_layout()
    plt.show()

In [None]:
def analyze_market_share_trend(df, output_path='ts_market_share_trend.png'):
    """Plot TS enrollment as a percentage of private enrollment for each year.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Year', 'TS-Enrollment' and 'private_enroll'.
    output_path : str or None, optional
        File the figure is saved to. Defaults to the file name this function
        has always written; pass None (or an empty string) to skip saving.

    Returns
    -------
    pandas.DataFrame
        One row per Year with the summed 'TS-Enrollment' and 'private_enroll'
        and the derived 'TS_market_share' percentage.
    """
    # Grouping by year for analysis
    yearly_data = df.groupby('Year').agg({
        'TS-Enrollment': 'sum',
        'private_enroll': 'sum'
    }).reset_index()

    # Calculating share
    yearly_data['TS_market_share'] = yearly_data['TS-Enrollment'] / yearly_data['private_enroll'] * 100

    # Plotting market share trend
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(yearly_data['Year'], yearly_data['TS_market_share'], 'o-', color='purple', linewidth=2)
    ax.set_title('TS Enrollment Market Share Over Time')
    ax.set_xlabel('Year')
    ax.set_ylabel('Market Share (%)')
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    if output_path:
        fig.savefig(output_path)
    plt.show()

    return yearly_data

yearly_data = analyze_market_share_trend(df)

In [None]:
def compare_enrollment_trends(df, output_path='ts_vs_private_enrollment.png'):
    """Plot yearly TS enrollment and yearly private enrollment on one chart.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Year', 'TS-Enrollment' and 'private_enroll'.
    output_path : str or None, optional
        File the figure is saved to. Defaults to the file name this function
        has always written; pass None (or an empty string) to skip saving.

    Returns
    -------
    pandas.DataFrame
        One row per Year with the summed 'TS-Enrollment' and 'private_enroll'.
    """
    # Grouping by year
    yearly_data = df.groupby('Year').agg({
        'TS-Enrollment': 'sum',
        'private_enroll': 'sum'
    }).reset_index()

    # Comparing enrollments
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(yearly_data['Year'], yearly_data['TS-Enrollment'], 'o-',
            label='TS Enrollment', color='purple', linewidth=2)
    ax.plot(yearly_data['Year'], yearly_data['private_enroll'], 's-',
            label='Total Private Enrollment', color='blue', linewidth=2)
    ax.set_title('TS_Enrollment vs Total Private Enrollment Over Time')
    ax.set_xlabel('Year')
    ax.set_ylabel('Number of Students')
    ax.legend()
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    if output_path:
        fig.savefig(output_path)
    plt.show()

    return yearly_data

enrollment_data = compare_enrollment_trends(df)

In [None]:
def analyze_students_per_school(df, output_path='students_per_school_trend.png'):
    """Plot private enrollment per private school for each year.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Year', 'private_count' and 'private_enroll'.
    output_path : str or None, optional
        File the figure is saved to. Defaults to the file name this function
        has always written; pass None (or an empty string) to skip saving.

    Returns
    -------
    pandas.DataFrame
        One row per Year with the summed 'private_count' and 'private_enroll'
        and the derived 'students_per_school' ratio.
    """
    # Grouping by year. Both columns use the same reducer: dividing a SUM of
    # enrollment by a MEAN of school counts scaled the ratio by the number of
    # rows in each year, so the level and the trend reflected row counts.
    # NOTE(review): assumes private_count and private_enroll describe the same
    # unit on each row - confirm against the data dictionary.
    yearly_data = df.groupby('Year').agg({
        'private_count': 'sum',
        'private_enroll': 'sum'
    }).reset_index()

    # Students per school
    yearly_data['students_per_school'] = yearly_data['private_enroll'] / yearly_data['private_count']

    # Plotting students per school
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(yearly_data['Year'], yearly_data['students_per_school'], 'o-', linewidth=2)
    ax.set_title('Average Students per Private School Over Time')
    ax.set_xlabel('Year')
    ax.set_ylabel('Students per School')
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    if output_path:
        fig.savefig(output_path)
    plt.show()

    return yearly_data

students_per_school_data = analyze_students_per_school(df)

In [None]:
def plot_enrollment_trends_by_region(df, output_path='enrollment_by_region.png'):
    """Plot TS and private enrollment over time, one stacked panel per region.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Region', 'Year', 'TS-Enrollment' and
        'private_enroll'.
    output_path : str or None, optional
        File the figure is saved to. Defaults to the file name this function
        has always written; pass None (or an empty string) to skip saving.

    Returns
    -------
    pandas.DataFrame
        One row per (Region, Year) with the summed 'TS-Enrollment' and
        'private_enroll'.
    """
    region_enrollment = df.groupby(['Region', 'Year']).agg({
        'TS-Enrollment': 'sum',
        'private_enroll': 'sum'
    }).reset_index()

    regions = region_enrollment['Region'].unique()
    if len(regions) == 0:
        # Nothing to draw; plt.subplots() rejects a grid with zero rows.
        return region_enrollment

    # Subplot for each region. squeeze=False always returns a 2-D array of
    # axes, so a single region needs no special case.
    fig, axes = plt.subplots(len(regions), 1, figsize=(10, 4 * len(regions)), squeeze=False)
    axes = axes[:, 0]

    # Plots
    for ax, region in zip(axes, regions):
        data = region_enrollment[region_enrollment['Region'] == region]
        ax.plot(data['Year'], data['TS-Enrollment'], 'o-', color='purple', label='TS Enrollment')
        ax.plot(data['Year'], data['private_enroll'], 's-', color='blue', label='Private Enrollment')
        ax.set_title(f'Region: {region}')
        ax.set_ylabel('Students')
        ax.grid(alpha=0.3)
        ax.legend()

    # As before, only the bottom panel carries the x-axis label
    axes[-1].set_xlabel('Year')
    fig.tight_layout()
    if output_path:
        fig.savefig(output_path)
    plt.show()

    return region_enrollment

region_data = plot_enrollment_trends_by_region(df)

In [None]:
# Year-over-year percentage change of the three enrollment measures.
# Rows are ordered by Parish and then Year so that consecutive rows of one
# parish are consecutive years.
df = df.sort_values(['Parish', 'Year'])

enrollment_changes_cols = ['TS-Enrollment', 'pub_enroll_all_students', 'private_enroll']

# Standardised copy of the three columns (zero mean, unit variance).
# NOTE(review): this array is not used by any cell visible here - confirm it
# is still needed.
scaler = StandardScaler()

enroll_changes_cols_scaled = scaler.fit_transform(df[enrollment_changes_cols])

# pct_change() compares each row with the previous row of the SAME group, so
# the grouping key must match the sort key. Grouping by Region on rows sorted
# by Parish computed "changes" between different parishes wherever one
# parish's rows ended and the next one's began.
# NOTE(review): assumes one row per Parish and Year - confirm; if a parish
# holds several schools, group by the school identifier instead.
for col in enrollment_changes_cols:
    df[f'{col}_pct_change'] = df.groupby('Parish')[col].pct_change()


In [None]:
# Display the current column index of df (includes the *_pct_change columns
# created in the previous cell)
df.columns

In [None]:
# Spot check: enrollment figures and their percentage changes for the rows
# whose Parish is 'All Saints'
all_saints_cols = [
    'TS-Enrollment',
    'pub_enroll_all_students',
    'private_enroll',
    'pub_enroll_all_students_pct_change',
    'private_enroll_pct_change',
]
df.loc[df['Parish'] == 'All Saints', all_saints_cols]

In [None]:

# Long format: one row per (Year, Parish, measure), where 'School Type' holds
# the name of the *_pct_change column and 'Enrollment' holds its value.
df_melted = df.melt(
    id_vars=['Year', 'Parish'],
    value_vars=['TS-Enrollment_pct_change',
       'pub_enroll_all_students_pct_change', 'private_enroll_pct_change'],
    var_name='School Type',
    value_name='Enrollment'
)

# Plotting with one panel per Parish.
# sns.relplot only accepts kind='scatter' or kind='line' and raises on 'bar';
# catplot is the figure-level function for bar charts. It takes sharey
# directly, and the line-only marker argument does not apply to bars.
g = sns.catplot(
    data=df_melted,
    x='Year', y='Enrollment',
    hue='School Type',
    kind='bar',
    col='Parish',
    col_wrap=2,
    sharey=False,
    height=4, aspect=1.5,
    legend=True
)

# The legend only exists when seaborn drew one
if g._legend is not None:
    g._legend.set_title("School Type")
    g._legend.set_bbox_to_anchor((1.05, 1))
plt.subplots_adjust(right=0.85)