In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import io

In [None]:
# Read the CSV into the DataFrame that the rest of the notebook analyses.
df = pd.read_csv(filepath_or_buffer='final_dataset.csv')

In [None]:
# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called on its own; wrapping it in print() appended a stray "None".
print("Basic Info:")
df.info()

In [None]:
# Count missing values per column (largest first) and show only the affected columns.
missing = df.isna().sum().sort_values(ascending=False)
missing.loc[missing.gt(0)]

In [None]:
# Columns whose missing entries are replaced with zero.
# NOTE(review): several of these are averages (family / household size); a 0
# placeholder lowers any mean or correlation computed from them later -
# confirm that this is the intended treatment.
columns_to_fill = [
    'fulcrum_grant_mean', 'fulcrum_total_app', 'fulcrum_grant_count',
    'Nonfamily Avg Household Size',
    'Married Avg Family Size', 'Married Avg Household Size',
    'Avg Family Size', 'Avg Household Size',
]

# Zero-fill only the listed columns; all other columns keep their NaNs.
df[columns_to_fill] = df[columns_to_fill].fillna(value=0)

# Confirm that nothing is still missing in the filled columns.
print("Missing Values After Filling with 0:")
print(df[columns_to_fill].isna().sum())

In [None]:
# Descriptive statistics for the numeric columns, one row per column.
df.describe(include='number').transpose()

In [None]:
# Number of distinct school_id values observed in each year,
# returned as a two-column frame: 'Year' and 'Total Schools'.
schools_per_year = (
    df.groupby('Year')['school_id']
    .nunique()
    .reset_index(name='Total Schools')
)

print(schools_per_year)

In [None]:
# Distinct-value count for every column, smallest first (ascending is the default order).
unique_vals = df.nunique().sort_values()
unique_vals

In [None]:
# Number of distinct school_id values in each region,
# returned as a two-column frame: 'Region' and 'Total Schools'.
schools_per_region = (
    df.groupby('Region')['school_id']
    .nunique()
    .reset_index(name='Total Schools')
)

print(schools_per_region)

In [None]:
# Split the column names by dtype: numeric columns vs object (text) columns.
numerical_cols = list(df.select_dtypes(include='number').columns)
categorical_cols = list(df.select_dtypes(include='object').columns)

# Each heading is followed by its list on the next line.
print("Numerical Columns:", numerical_cols, sep="\n")
print("\nCategorical Columns:", categorical_cols, sep="\n")

# Total Enrollment Over Time (for all schools)

In [None]:
# Enrollment summed over all rows of each year.
total_enrollment_by_year = df.groupby('Year')['TS-Enrollment'].sum()

# Single line chart of the yearly totals.
plt.figure(figsize=(10, 6))
total_enrollment_by_year.plot.line(marker='o', color='purple', linewidth=2)
plt.gca().set(
    title='Total Enrollment Trends for All Schools Over Time',
    xlabel='Academic Year',
    ylabel='Total Enrollment',
)
plt.xticks(rotation=45)
plt.grid(True)
plt.show()

In [None]:
# Total Fulcrum applications per year.
enrollment_by_year = df.groupby('Year')['fulcrum_total_app'].sum().reset_index()

# Plot against explicit 0..n-1 positions and label the ticks with the years.
# The value labels below are placed by position, so this keeps them on their
# markers whether 'Year' is stored as text or as numbers.
positions = np.arange(len(enrollment_by_year))

plt.figure(figsize=(10, 6))
plt.plot(positions, enrollment_by_year['fulcrum_total_app'], marker='o', color='skyblue')
for pos, value in zip(positions, enrollment_by_year['fulcrum_total_app']):
    # Annotate every point with its rounded total, just above the marker.
    plt.text(pos, value, f'{value:.0f}', ha='center', va='bottom')
plt.xlabel('Year')
plt.ylabel('Total Fulcrum Applications')
plt.title('Total Fulcrum Applications Across Years')
plt.xticks(positions, enrollment_by_year['Year'], rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Yearly totals of enrollment and Fulcrum applications.
data_by_year = df.groupby('Year')[['TS-Enrollment', 'fulcrum_total_app']].sum()

# Applications expressed as a percentage of enrollment.
data_by_year['Percentage'] = (data_by_year['fulcrum_total_app'] / data_by_year['TS-Enrollment']) * 100

# Plot against explicit 0..n-1 positions and label the ticks with the years.
# The percentage labels below are placed by position, so this keeps them on
# their markers whether 'Year' is stored as text or as numbers.
positions = np.arange(len(data_by_year))

plt.figure(figsize=(10, 6))
plt.plot(positions, data_by_year['Percentage'], marker='o', color='green', linewidth=2)
for pos, value in zip(positions, data_by_year['Percentage']):
    plt.text(pos, value, f'{value:.1f}%', ha='center', va='bottom')
plt.title('Percentage of Fulcrum Applications Out of Total Enrollment Over Time')
plt.xlabel('Academic Year')
plt.ylabel('Percentage (%)')
plt.xticks(positions, data_by_year.index, rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Yearly totals plus the share of enrollment that filed a Fulcrum application.
data_by_year = (
    df.groupby('Year')[['TS-Enrollment', 'fulcrum_total_app']]
    .sum()
    .assign(Percentage=lambda totals: (totals['fulcrum_total_app'] / totals['TS-Enrollment']) * 100)
)

# Print the resulting table under a heading.
print("Total Enrollment and Fulcrum Applications by Year:")
print(data_by_year)

In [None]:
# Numeric columns, leaving out the identifier-like 'school_id' and 'zip_code'.
numerical_cols = [col for col in df.select_dtypes(include='number').columns if col not in ['school_id', 'zip_code']]
corr_matrix = df[numerical_cols].corr()

# Every column correlates perfectly with itself, so the diagonal must be hidden
# before thresholding or ranking. Otherwise each column passes the 0.4 filter
# and ties at 1.0 for "strongest correlation", and the top-10 pick is arbitrary.
off_diagonal = corr_matrix.abs().mask(np.eye(len(corr_matrix), dtype=bool))
strongest_link = off_diagonal.max()

# Keep columns whose strongest relationship with ANOTHER column exceeds 0.4,
# then take the 10 with the strongest such relationship.
filtered_cols = strongest_link.index[strongest_link > 0.4]
filtered_corr = corr_matrix.loc[filtered_cols, filtered_cols]
top_cols = strongest_link[filtered_cols].sort_values(ascending=False).head(10).index
top_corr = filtered_corr.loc[top_cols, top_cols]

# Annotated heatmap of the selected sub-matrix.
plt.figure(figsize=(12, 10))
sns.heatmap(top_corr, cmap='coolwarm', center=0, annot=True, fmt='.2f', square=True)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.title('Top 10 Numerical Columns (Excluding school_id & zip_code) with |Correlation| > 0.4')
plt.tight_layout()
plt.show()

In [None]:
# Variables whose distributions are inspected.
key_vars = ['TS-Enrollment', 'fulcrum_total_app', 'tuition_fees_finaid_scholarships', 'Total Population']

# One histogram (with KDE overlay) per variable, laid out on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
for ax, var in zip(axes.flat, key_vars):
    sns.histplot(df[var], bins=20, kde=True, color='skyblue', ax=ax)
    ax.set_title(f'Distribution of {var}')
    ax.set_xlabel(var)
    ax.set_ylabel('Count')
plt.tight_layout()
plt.show()

In [None]:
# Box plots of TS-Enrollment for each categorical variable in cat_vars.
cat_vars = ['Region']
num_var = 'TS-Enrollment'

plt.figure(figsize=(16, 6))

for i, cat in enumerate(cat_vars, 1):
    # Size the grid from the list so there is exactly one panel per variable;
    # a fixed two-column grid left half the figure empty for a single variable.
    plt.subplot(1, len(cat_vars), i)
    # Order the categories by their median value of num_var.
    order = df.groupby(cat)[num_var].median().sort_values().index
    sns.boxplot(
        x=cat,
        y=num_var,
        data=df,
        hue=cat,
        palette='Set2',
        order=order,
        showfliers=False,
        legend=False  # hue duplicates the x variable, so a legend adds nothing
    )
    plt.title(f'{num_var} by {cat}', fontsize=12)
    plt.xticks(rotation=30, ha='right')
    plt.xlabel(cat, fontsize=10)
    plt.ylabel(num_var, fontsize=10)

plt.tight_layout()
plt.show()


In [None]:
# Bar chart of enrollment summed over all rows of each region.
region_enrollment_totals = df.groupby('Region')['TS-Enrollment'].sum()
region_enrollment_totals.plot.bar(color='skyblue')
plt.title('Total Enrollment by Region')
plt.ylabel('Enrollment')
plt.tight_layout()
plt.show()


In [None]:
# Enrollment over the years, one line per region (seaborn aggregates the rows
# that share a year and region).
plt.figure(figsize=(10, 6))
trend_ax = sns.lineplot(x='Year', y='TS-Enrollment', hue='Region', data=df, marker='o')
trend_ax.set_title('Enrollment Trend by Region')
plt.tight_layout()
plt.show()


In [None]:
# Yearly totals of Catholic and non-Catholic enrollment.
df_grouped = df.groupby('Year', as_index=False)[['TS-Catholic', 'TS-Non-Catholic']].sum()

# Draw both series on the same axes.
plt.figure(figsize=(10, 6))
for column, legend_label in (
    ('TS-Catholic', 'Catholic Enrollment'),
    ('TS-Non-Catholic', 'Non-Catholic Enrollment'),
):
    sns.lineplot(data=df_grouped, x='Year', y=column, label=legend_label, marker='o')
plt.title('Catholic vs Non-Catholic Enrollment Over Years')
plt.tight_layout()
plt.show()


In [None]:
# Total children under 15 = sum of the three age-band columns.
df['child_under_15'] = df['child_under_5'].add(df['child_5_9']).add(df['child_10_14'])

# Yearly totals of the child population and of enrollment.
child_enroll_trend = df.groupby('Year', as_index=False)[['child_under_15', 'TS-Enrollment']].sum()

# Rescale each series to a percentage of its own maximum so both fit one axis.
for column in ('child_under_15', 'TS-Enrollment'):
    child_enroll_trend[f'{column}_pct'] = (child_enroll_trend[column] / child_enroll_trend[column].max()) * 100

# Overlay the two normalised series.
plt.figure(figsize=(10, 6))
sns.lineplot(x='Year', y='child_under_15_pct', data=child_enroll_trend, label='Children Under 15 (%)', marker='o')
sns.lineplot(x='Year', y='TS-Enrollment_pct', data=child_enroll_trend, label='Total Enrollment (%)', linestyle='--', color='black', marker='o')
plt.title('Children Under 15 vs Enrollment (Normalized % Scale)')
plt.ylabel('Percentage of Max Value')
plt.tight_layout()
plt.show()




In [None]:
# Enrollment totals per (region, year), reshaped to one row per region
# and one column per year, and printed as a table.
region_enrollment_trends = df.groupby(['Region', 'Year'], as_index=False)['TS-Enrollment'].sum()
pivot_table = region_enrollment_trends.pivot(values='TS-Enrollment', index='Region', columns='Year')
print(pivot_table)



In [None]:
# Year-over-year change in each child age band, computed per school.
# diff() subtracts the previous ROW within each group, so the rows are put in
# chronological order per school first; without the sort the result depends on
# the file's row order. Assignment back to df aligns on the index.
rows_by_school_year = df.sort_values(['school_id', 'Year'])
df['child_under_5_change'] = rows_by_school_year.groupby('school_id')['child_under_5'].diff()
df['child_5_9_change'] = rows_by_school_year.groupby('school_id')['child_5_9'].diff()
df['child_10_14_change'] = rows_by_school_year.groupby('school_id')['child_10_14'].diff()

# Average change per region for each age band (each school's first year is NaN
# and is skipped by mean()).
age_group_columns = ['child_under_5_change', 'child_5_9_change', 'child_10_14_change']
demographic_change_by_region = df.groupby('Region')[age_group_columns].mean().reset_index()

# One line per age band across the regions.
plt.figure(figsize=(12, 6))

for col in age_group_columns:
    plt.plot(demographic_change_by_region['Region'], demographic_change_by_region[col], label=col)

# Titles, axis labels, grid and legend.
plt.title('Average Demographic Shifts by Region (Line Plot)')
plt.xlabel('Region')
plt.ylabel('Average Change in Population')
plt.xticks(rotation=45)
plt.grid(True)
plt.legend(title='Age Group')
plt.tight_layout()

plt.show()



In [None]:
# Totals of each child age band and of enrollment per (region, year).
trend_data = df.groupby(['Region', 'Year'])[['child_under_5', 'child_5_9', 'child_10_14', 'TS-Enrollment']].sum().reset_index()

# Regions to plot, one figure each.
regions = trend_data['Region'].unique()

for region in regions:
    # Rows for the current region only.
    region_data = trend_data[trend_data['Region'] == region]

    # plt.subplots creates the figure; a separate plt.figure() call here would
    # leave an extra empty figure behind for every region.
    fig, ax1 = plt.subplots(figsize=(12, 6))

    # Primary axis: the three child age bands.
    ax1.plot(region_data['Year'], region_data['child_under_5'], label='Children Under 5', marker='o')
    ax1.plot(region_data['Year'], region_data['child_5_9'], label='Children 5-9', marker='o')
    ax1.plot(region_data['Year'], region_data['child_10_14'], label='Children 10-14', marker='o')

    ax1.set_xlabel('Year')
    ax1.set_ylabel('Population', labelpad=15)
    ax1.set_title(f'Population Trends and Enrollment in {region} Over the Years')
    ax1.grid(True)
    # Rotate the tick labels on ax1 explicitly: after twinx() the current axes
    # is the twin, whose x tick labels are not drawn, so plt.xticks(rotation=...)
    # would have no visible effect.
    ax1.tick_params(axis='x', rotation=45)

    # Secondary axis: enrollment on its own scale.
    ax2 = ax1.twinx()
    ax2.plot(region_data['Year'], region_data['TS-Enrollment'], label='Enrollment', linestyle='--', color='black', marker='x')
    ax2.set_ylabel('Enrollment', labelpad=15)

    # Legends placed outside the plotting area.
    ax1.legend(title='Age Group', bbox_to_anchor=(1.05, 1), loc='upper left')
    ax2.legend(title='Enrollment', bbox_to_anchor=(1.05, 0.80), loc='upper left')

    plt.tight_layout()
    plt.show()


In [None]:
# Two palettes with one colour per school name: one used when the school's
# latest change is non-negative, one when it is negative.
school_names = df['Name'].unique()
upward_colors = sns.color_palette("tab20", len(school_names))
downward_colors = sns.color_palette("dark:red", len(school_names))

# Map school names to their colour in each palette.
school_colors_upward = {school_name: upward_colors[i] for i, school_name in enumerate(school_names)}
school_colors_downward = {school_name: downward_colors[i] for i, school_name in enumerate(school_names)}

# Change in enrollment versus the previous year, per school. diff() works on
# row order, so rows are sorted chronologically within each school first;
# assignment back to df aligns on the index.
df['enrollment_change'] = df.sort_values(['Name', 'Year']).groupby('Name')['TS-Enrollment'].diff()

# One figure per region, one line per school.
regions = df['Region'].unique()

for region in regions:
    # Sort by year so each line is drawn left-to-right in time and iloc[-1]
    # below really is the school's most recent observation.
    region_data = df[df['Region'] == region].sort_values('Year')

    plt.figure(figsize=(14, 8))

    for school_name, school_data in region_data.groupby('Name'):
        # A negative latest change is drawn dashed and labelled "Downward".
        # Anything else (increase, no change, or a school with a single year,
        # whose change is NaN) takes the "Upward" branch.
        if school_data['enrollment_change'].iloc[-1] < 0:
            color = school_colors_downward[school_name]
            plt.plot(school_data['Year'], school_data['TS-Enrollment'], label=f'{school_name} (Downward)', color=color, linestyle='--')
        else:
            color = school_colors_upward[school_name]
            plt.plot(school_data['Year'], school_data['TS-Enrollment'], label=f'{school_name} (Upward)', color=color)

    plt.title(f'Enrollment Trends for Schools in {region}')
    plt.xlabel('Year')
    plt.ylabel('Enrollment')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


In [None]:
# Treat the year as text so it is plotted as a category.
df['Year'] = df['Year'].astype(str)

# One stacked subplot per region on a single tall figure.
regions = df['Region'].unique()
plt.figure(figsize=(16, len(regions) * 6))

for i, region in enumerate(regions, 1):
    # Rows for the current region.
    region_data = df[df['Region'] == region]

    # Left axis: tuition line for this region.
    ax1 = plt.subplot(len(regions), 1, i)
    sns.lineplot(data=region_data, x='Year', y='tuition_fees_finaid_scholarships', ax=ax1, color='blue', label='Tuition Fees Trend', marker='o')

    # Right axis: grouped enrollment bars.
    ax2 = ax1.twinx()

    # Width of each bar; also used as the spacing between neighbouring bars.
    width = 0.25

    # Years present for this region, ascending; bar positions are 0..n-1.
    sorted_years = sorted(region_data['Year'].unique())
    base_positions = range(len(sorted_years))

    # Yearly totals of the three enrollment columns, computed once.
    yearly_totals = region_data.groupby('Year')[['TS-Catholic', 'TS-Non-Catholic', 'TS-Hispanic']].sum()

    # Three bars per year: Catholic to the left, non-Catholic centred, Hispanic to the right.
    ax2.bar([p - width for p in base_positions], yearly_totals['TS-Catholic'], width=width, label='Catholic Enrollment', color='green', align='center')
    ax2.bar(base_positions, yearly_totals['TS-Non-Catholic'], width=width, label='Non-Catholic Enrollment', color='orange', align='center')
    ax2.bar([p + width for p in base_positions], yearly_totals['TS-Hispanic'], width=width, label='Hispanic Enrollment', color='red', align='center')

    # Axis labels and per-region title.
    ax1.set_ylabel('Tuition Fees & Scholarships', color='blue')
    ax2.set_ylabel('Enrollment Numbers', color='black')
    ax1.set_xlabel('Year')
    ax1.set_title(f'Tuition & Demographics in {region}')

    # Tick every year position and label it with the year.
    ax1.set_xticks(base_positions)
    ax1.set_xticklabels(sorted_years, rotation=45)

    # Legends to the right of the axes.
    ax1.legend(loc='upper left', bbox_to_anchor=(1.05, 1), title='Tuition Fees', frameon=False)
    ax2.legend(loc='upper left', bbox_to_anchor=(1.05, 0.6), title='Enrollment', frameon=False)

# Fit all subplots without overlap, then render.
plt.tight_layout()

plt.show()


In [None]:
# Combined (male + female) count below the poverty level.
df['Income Below Poverty Level'] = df['Income Below Poverty Level (Male)'] + df['Income Below Poverty Level (Female)']

# Regional totals of the poverty count and of enrollment.
df_region = df.groupby('Region')[['Income Below Poverty Level', 'TS-Enrollment']].sum().reset_index()

fig, ax1 = plt.subplots(figsize=(12, 8))

# Left axis: poverty count as bars.
ax1.bar(df_region['Region'], df_region['Income Below Poverty Level'], color='coral', alpha=0.6, label='Income Below Poverty Level')

ax1.set_xlabel('Region')
ax1.set_ylabel('Income Below Poverty Level', color='coral')

# Right axis: enrollment as a dashed line.
ax2 = ax1.twinx()
ax2.plot(df_region['Region'], df_region['TS-Enrollment'], color='red', marker='o', linestyle='--', label='TS-Enrollments')

ax2.set_ylabel('TS-Enrollments', color='red')

ax1.set_title('Income Below Poverty Level vs TS-Enrollments by Region')

# The categorical axis already labels each bar with its region, so only the
# rotation is set. Calling set_xticklabels() without fixing the tick positions
# first triggers a matplotlib warning and detaches the labels from the data.
ax1.tick_params(axis='x', rotation=45)

ax1.legend(loc='upper left')
ax2.legend(loc='upper right')

plt.tight_layout()

plt.show()


In [None]:
# Combined (male + female) counts for each poverty status.
df['Income Below Poverty Level'] = df['Income Below Poverty Level (Male)'].add(df['Income Below Poverty Level (Female)'])
df['Income At or Above Poverty Level'] = df['Income At or Above Poverty Level (Male)'].add(df['Income At or Above Poverty Level (Female)'])

# Regional totals of both poverty counts and of enrollment.
df_region = df.groupby('Region', as_index=False)[['Income Below Poverty Level', 'Income At or Above Poverty Level', 'TS-Enrollment']].sum()
region_labels = df_region['Region']
below_poverty = df_region['Income Below Poverty Level']

fig, ax1 = plt.subplots(figsize=(12, 8))

# Left axis: stacked bars, "below" at the bottom and "at or above" on top of it.
ax1.bar(region_labels, below_poverty, color='blue', alpha=0.6, label='Income Below Poverty Level')
ax1.bar(region_labels, df_region['Income At or Above Poverty Level'], bottom=below_poverty, color='orange', alpha=0.6, label='Income At or Above Poverty Level')
ax1.set_xlabel('Region', labelpad=20)
ax1.set_ylabel('Population', color='blue', labelpad=20)

# Right axis: enrollment as a dashed line.
ax2 = ax1.twinx()
ax2.plot(region_labels, df_region['TS-Enrollment'], color='coral', marker='o', linestyle='--', label='TS-Enrollments')
ax2.set_ylabel('TS-Enrollments', color='red', labelpad=20)

ax1.set_title('Income Below and At or Above Poverty Level vs TS-Enrollments by Region')

# Fix the tick positions, then rotate and right-align the region labels.
ax1.set_xticks(region_labels)
ax1.set_xticklabels(region_labels, rotation=45, ha='right')

# Legends positioned to the right of the axes.
ax1.legend(loc='upper left', bbox_to_anchor=(1, 1), title='Income Levels')
ax2.legend(loc='upper right', bbox_to_anchor=(1.30, 0.85), title='TS-Enrollments')

plt.tight_layout()

plt.show()


In [None]:
# Combined (male + female) counts for each poverty status.
df['Income Below Poverty Level'] = df['Income Below Poverty Level (Male)'].add(df['Income Below Poverty Level (Female)'])
df['Income At or Above Poverty Level'] = df['Income At or Above Poverty Level (Male)'].add(df['Income At or Above Poverty Level (Female)'])

# Totals per (region, year).
df_region_year = df.groupby(['Region', 'Year'], as_index=False)[['Income Below Poverty Level', 'Income At or Above Poverty Level', 'TS-Enrollment']].sum()

# One figure per region, with years on the x axis.
for region in df_region_year['Region'].unique():
    region_data = df_region_year[df_region_year['Region'] == region]
    years = region_data['Year']
    below_poverty = region_data['Income Below Poverty Level']

    fig, ax1 = plt.subplots(figsize=(11, 7))

    # Left axis: stacked bars, "below" at the bottom and "at or above" on top.
    ax1.bar(years, below_poverty, color='blue', alpha=0.6, label='Income Below Poverty Level')
    ax1.bar(years, region_data['Income At or Above Poverty Level'], bottom=below_poverty, color='orange', alpha=0.6, label='Income At or Above Poverty Level')
    ax1.set_xlabel('Year', labelpad=20)
    ax1.set_ylabel('Population', color='blue', labelpad=20)

    # Right axis: enrollment as a dashed line.
    ax2 = ax1.twinx()
    ax2.plot(years, region_data['TS-Enrollment'], color='coral', marker='o', linestyle='--', label='TS-Enrollments')
    ax2.set_ylabel('TS-Enrollments', color='red', labelpad=20)

    ax1.set_title(f'Income Below and At or Above Poverty Level vs TS-Enrollments for {region} (Year-wise)')

    # Fix the tick positions, then rotate and right-align the year labels.
    ax1.set_xticks(years)
    ax1.set_xticklabels(years, rotation=45, ha='right')

    # Legends positioned to the right of the axes.
    ax1.legend(loc='upper left', bbox_to_anchor=(1.2, 1), title='Income Levels')
    ax2.legend(loc='upper right', bbox_to_anchor=(1.70, 0.85), title='TS-Enrollments')

    plt.tight_layout()

    # Render this region's figure before moving on to the next one.
    plt.show()


In [None]:
# Each value is the off-diagonal entry of a 2x2 correlation matrix built from
# TS-Enrollment and one other column.

# Enrollment vs tuition fees / financial aid.
tuition_pair = df[['TS-Enrollment', 'tuition_fees_finaid_scholarships']]
correlation_enrollment_tuition = tuition_pair.corr().iat[0, 1]

# Enrollment vs population below the poverty level.
below_pair = df[['TS-Enrollment', 'Income Below Poverty Level']]
correlation_enrollment_income_below = below_pair.corr().iat[0, 1]

# Enrollment vs population at or above the poverty level.
above_pair = df[['TS-Enrollment', 'Income At or Above Poverty Level']]
correlation_enrollment_income_above = above_pair.corr().iat[0, 1]

print(f"Correlation between TS-Enrollment and Tuition Fees & Financial Aid: {correlation_enrollment_tuition}")
print(f"Correlation between TS-Enrollment and Income Below Poverty Level: {correlation_enrollment_income_below}")
print(f"Correlation between TS-Enrollment and Income At or Above Poverty Level: {correlation_enrollment_income_above}")


In [None]:
# Regional means of family size and of tuition / financial aid.
df_region_family_size_tuition = df.groupby('Region', as_index=False)[['Avg Family Size', 'tuition_fees_finaid_scholarships']].mean()
region_labels = df_region_family_size_tuition['Region']

fig, ax1 = plt.subplots(figsize=(12, 8))

# Left axis: average family size as bars.
ax1.bar(region_labels, df_region_family_size_tuition['Avg Family Size'], color='skyblue', alpha=0.6, label='Avg Family Size')
ax1.set_xlabel('Region', labelpad=20)
ax1.set_ylabel('Average Family Size', color='skyblue', labelpad=20)

# Right axis: mean tuition / financial aid as a dashed line.
ax2 = ax1.twinx()
ax2.plot(region_labels, df_region_family_size_tuition['tuition_fees_finaid_scholarships'], color='black', marker='o', linestyle='--', label='Tuition Fees & Financial Aid')
ax2.set_ylabel('Tuition Fees & Financial Aid', color='black', labelpad=20)

ax1.set_title('Family Size vs Tuition Fees & Financial Aid by Region')

# Fix the tick positions, then rotate and right-align the region labels.
ax1.set_xticks(region_labels)
ax1.set_xticklabels(region_labels, rotation=45, ha='right')

# Legends positioned to the right of the axes.
ax1.legend(loc='upper left', bbox_to_anchor=(1, 1), title='Family Size')
ax2.legend(loc='upper right', bbox_to_anchor=(1.3, 0.85), title='Tuition & Financial Aid')

plt.tight_layout()
plt.show()


In [None]:
# Off-diagonal entry of the 2x2 correlation matrix of the two columns.
family_tuition_pair = df[['Avg Family Size', 'tuition_fees_finaid_scholarships']]
correlation = family_tuition_pair.corr().iat[0, 1]
print(f"Correlation between Family Size and Tuition Fees & Financial Aid: {correlation}")


In [None]:
# Correlation matrix over every numeric column.
numeric_df = df.select_dtypes(include=['number'])
correlation_matrix = numeric_df.corr()

# Column pairs to report on.
variable_pairs = [
    ('Income Below Poverty Level', 'TS-Enrollment'),
    ('Income At or Above Poverty Level', 'TS-Enrollment'),
    ('Income Below Poverty Level', 'tuition_fees_finaid_scholarships'),
    ('Income At or Above Poverty Level', 'tuition_fees_finaid_scholarships'),
]

# Look each pair up in the matrix; report pairs that are not both numeric columns.
for var1, var2 in variable_pairs:
    if not (var1 in numeric_df.columns and var2 in numeric_df.columns):
        print(f"One or both variables '{var1}' and '{var2}' are not numeric columns.")
        continue
    correlation = correlation_matrix.loc[var1, var2]
    print(f"Pearson correlation between '{var1}' and '{var2}': {correlation:.2f}")


In [None]:
# Yearly totals of enrollment, aid applications and grant counts.
year_data = df.groupby('Year')[['TS-Enrollment', 'fulcrum_total_app', 'fulcrum_grant_count']].sum()

# One slot per year; three bars of equal width share each slot.
x = np.arange(len(year_data))
bar_width = 0.25
fig, ax = plt.subplots(figsize=(10, 6))

# (offset from the slot centre, column, colour, legend label), drawn left to right.
bar_specs = [
    (-bar_width, 'TS-Enrollment', 'lightgreen', 'Enrollment'),
    (0, 'fulcrum_total_app', 'royalblue', 'Financial Aid Apps'),
    (bar_width, 'fulcrum_grant_count', 'lightcoral', 'Grant Count'),
]
for offset, column, bar_color, bar_label in bar_specs:
    ax.bar(x + offset, year_data[column], width=bar_width, color=bar_color, label=bar_label)

# Title, axis labels and year ticks.
ax.set(
    title='Enrollment, Financial Aid & Grants Over Time (All Schools)',
    xlabel='Year',
    ylabel='Total Amount',
)
ax.set_xticks(x)
ax.set_xticklabels(year_data.index)
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()

In [None]:
# Totals of enrollment, aid applications and grant counts per (region, year).
data = df.groupby(['Region', 'Year'], as_index=False)[['TS-Enrollment', 'fulcrum_total_app', 'fulcrum_grant_count']].sum()

# One grouped-bar figure per region.
for region in data['Region'].unique():
    region_data = data[data['Region'] == region]
    x = np.arange(len(region_data['Year']))
    bar_width = 0.25

    plt.figure(figsize=(10, 6))
    # (offset from the slot centre, column, colour, legend label), drawn left to right.
    for offset, column, bar_color, bar_label in (
        (-bar_width, 'TS-Enrollment', 'lightgreen', 'Enrollment'),
        (0, 'fulcrum_total_app', 'royalblue', 'Financial Aid Apps'),
        (bar_width, 'fulcrum_grant_count', 'lightcoral', 'Grant Count'),
    ):
        plt.bar(x + offset, region_data[column], width=bar_width, color=bar_color, label=bar_label)

    plt.title(f'{region} - Enrollment, Financial Aid & Grants Over Time')
    plt.xlabel('Year')
    plt.ylabel('Total Amount')
    plt.xticks(x, region_data['Year'], rotation=45)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()

In [None]:
# Yearly totals of tuition revenue and salaries.
year_data = df.groupby('Year')[['tuition_fees_finaid_scholarships', 'salaries_51xx']].sum()

# Salaries as a percentage of tuition revenue.
year_data['salary_percentage'] = (year_data['salaries_51xx'] / year_data['tuition_fees_finaid_scholarships']) * 100

# Plot against explicit 0..n-1 positions. The value labels and the ticks below
# are placed at those positions, so the line must use them too. Plotting
# against the raw index only lined up when 'Year' had already been cast to text.
positions = np.arange(len(year_data))
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(positions, year_data['salary_percentage'], marker='o', color='teal', label='Salaries % of Tuition Fees')

# Label each point slightly above its marker and leave headroom for the labels.
max_percentage = year_data['salary_percentage'].max()
for pos, v in zip(positions, year_data['salary_percentage']):
    ax.text(pos, v + 1, f'{v:.1f}%', ha='center', va='bottom')
ax.set_ylim(0, max_percentage + 10)  # padding above the highest value

# Title, axis labels and year ticks.
ax.set_title('Percentage of Tuition Fees Used for Salaries Over Time')
ax.set_xlabel('Year')
ax.set_ylabel('Percentage (%)')
ax.set_xticks(positions)
ax.set_xticklabels(year_data.index, rotation=45)
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Yearly totals of tuition revenue and of owner- / renter-occupied housing units.
year_data = df.groupby('Year')[['tuition_fees_finaid_scholarships', 'Owner-Occupied Units', 'Renter-Occupied Units']].sum()

# One slot per year; the two housing bars sit side by side in each slot.
x = np.arange(len(year_data))
bar_width = 0.35
fig, ax1 = plt.subplots(figsize=(10, 6))

# Left axis: (offset from the slot centre, column, colour).
# The legend label is the column name itself.
for offset, column, bar_color in (
    (-bar_width/2, 'Owner-Occupied Units', 'teal'),
    (bar_width/2, 'Renter-Occupied Units', 'lightcoral'),
):
    ax1.bar(x + offset, year_data[column], width=bar_width, color=bar_color, label=column)
ax1.set_xlabel('Year')
ax1.set_ylabel('Number of Units')
ax1.set_xticks(x)
ax1.set_xticklabels(year_data.index, rotation=45)

# Right axis: tuition revenue as a line.
ax2 = ax1.twinx()
ax2.plot(x, year_data['tuition_fees_finaid_scholarships'], marker='o', color='black', label='Tuition Fees')
ax2.set_ylabel('Tuition Fees, FinAid, Scholarships ($)')

# Legends to the right of the axes, stacked.
ax1.legend(loc='upper left', bbox_to_anchor=(1.15, 1))
ax2.legend(loc='upper left', bbox_to_anchor=(1.15, 0.85))

plt.title('Owner/Renter-Occupied Units with Tuition Fees Trend Over Time')
plt.tight_layout()
plt.show()

In [None]:
# Yearly totals of enrollment and of owner- / renter-occupied housing units.
year_data = df.groupby('Year')[['TS-Enrollment', 'Owner-Occupied Units', 'Renter-Occupied Units']].sum()

# One slot per year; the two housing bars sit side by side in each slot.
x = np.arange(len(year_data))
bar_width = 0.35
fig, ax1 = plt.subplots(figsize=(10, 6))

# Left axis: housing units as bars.
ax1.bar(x - bar_width/2, year_data['Owner-Occupied Units'], width=bar_width, color='teal', label='Owner-Occupied Units')
ax1.bar(x + bar_width/2, year_data['Renter-Occupied Units'], width=bar_width, color='lightcoral', label='Renter-Occupied Units')
ax1.set_xlabel('Year')
ax1.set_ylabel('Number of Units')
ax1.set_xticks(x)
ax1.set_xticklabels(year_data.index, rotation=45)

# Right axis: the plotted series is TS-Enrollment, so the legend entry, axis
# label and title say "Enrollment" (they previously said "Tuition Fees").
ax2 = ax1.twinx()
ax2.plot(x, year_data['TS-Enrollment'], marker='o', color='black', label='Enrollment')
ax2.set_ylabel('Total Enrollment')

# Legends to the right of the axes, stacked.
ax1.legend(loc='upper left', bbox_to_anchor=(1.15, 1))
ax2.legend(loc='upper left', bbox_to_anchor=(1.15, 0.85))

plt.title('Owner/Renter-Occupied Units with Enrollment Trend Over Time')
plt.tight_layout()
plt.show()

In [None]:
# Totals of tuition revenue and population per (year, region).
data = df.groupby(['Year', 'Region'], as_index=False)[['tuition_fees_finaid_scholarships', 'Total Population']].sum()

# One scatter figure per year, one point series per region.
for year in data['Year'].unique():
    year_data = data[data['Year'] == year]

    fig, ax = plt.subplots(figsize=(10, 6))
    # sort=False keeps the regions in their order of appearance.
    for region, region_data in year_data.groupby('Region', sort=False):
        ax.scatter(
            region_data['Total Population'],
            region_data['tuition_fees_finaid_scholarships'],
            label=region,
            s=100,
            alpha=0.7,
        )

    ax.set(
        title=f'Tuition Fees vs Total Population by Region ({year})',
        xlabel='Total Population',
        ylabel='Tuition Fees, FinAid, Scholarships ($)',
    )
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', title='Region')
    ax.grid(True)

    plt.tight_layout()
    plt.show()

In [None]:
# Totals of tuition revenue and population per region, over all years.
data = df.groupby('Region', as_index=False)[['tuition_fees_finaid_scholarships', 'Total Population']].sum()

# One point series per region so each region gets its own colour and legend entry.
fig, ax = plt.subplots(figsize=(10, 6))
for region, region_data in data.groupby('Region', sort=False):
    ax.scatter(
        region_data['Total Population'],
        region_data['tuition_fees_finaid_scholarships'],
        label=region,
        s=100,
        alpha=0.7,
    )

# Title, axis labels, legend and grid.
ax.set(
    title='Tuition Fees vs Total Population by Region (All Years)',
    xlabel='Total Population',
    ylabel='Tuition Fees, FinAid, Scholarships ($)',
)
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', title='Region')
ax.grid(True)

plt.tight_layout()
plt.show()