In [None]:
## Import typical data science packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re

In [None]:
## Import and concatenate the data files

# Directory containing the CSV files
data_dir = r"C:\Users\ceero\Downloads\Admissions\Data"
# Match files whose names end with "adm" + four digits + ".csv". The capture
# group holds ONLY the four digits so the year can be stored without the
# "adm" prefix (e.g. '2016', not 'adm2016').
adm_file_pattern = re.compile(r'adm(\d{4})\.csv$')
# os.listdir returns names in arbitrary order; sorting makes the row order of
# the combined frame reproducible from run to run.
all_files = sorted(f for f in os.listdir(data_dir) if adm_file_pattern.search(f))

# Read and combine all matched CSV files
dfs = []
for file in all_files:
    # Construct full file path
    file_path = os.path.join(data_dir, file)
    # Read the CSV file
    df = pd.read_csv(file_path)
    # Tag every row with the four-digit year from the file name (kept as a
    # string). Every name in all_files already matched the pattern, so the
    # search below cannot fail.
    match = adm_file_pattern.search(file)
    df['year'] = match.group(1)
    # Clean white space out of column names
    df.columns = df.columns.str.strip()
    dfs.append(df)

# Fail with a message that names the directory instead of pandas' generic
# "No objects to concatenate" (same exception type: ValueError).
if not dfs:
    raise ValueError(f"No files matching 'adm<YYYY>.csv' found in {data_dir}")

# One frame with a fresh 0..n-1 index; columns missing from a given year's
# file are filled with NaN for that year's rows.
combined_df = pd.concat(dfs, ignore_index=True)

In [None]:
# Summary statistics for the combined data; keeping the table in a variable
# and ending the cell with it renders the same rich output.
combined_summary = combined_df.describe()
combined_summary

Unnamed: 0,UNITID,ADMCON1,ADMCON2,ADMCON3,ADMCON4,ADMCON5,ADMCON6,ADMCON7,ADMCON8,ADMCON9,...,ENRLUN,ENRLFTAN,ENRLFTUN,ENRLPTAN,ENRLPTUN,SATVR50,SATMT50,ACTCM50,ACTEN50,ACTMT50
count,20571.0,20571.0,20571.0,20571.0,20571.0,20571.0,20571.0,20571.0,20571.0,20571.0,...,3524.0,295.0,3512.0,221.0,2672.0,2002.0,2002.0,1976.0,1861.0,1863.0
mean,225909.204122,1.521073,2.824121,1.235234,2.260026,2.322882,2.849594,2.506781,1.663993,2.808954,...,0.612089,13.80678,0.573178,0.425339,0.053892,589.937562,581.505994,24.57996,24.209027,23.072464
std,106304.530468,1.033696,1.116885,0.760622,1.15801,1.277672,0.869699,1.537061,1.06761,0.760446,...,7.155043,19.790053,6.932173,2.500925,0.571454,71.487757,79.271699,4.567877,5.198722,4.42913
min,100654.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,380.0,310.0,5.0,2.0,5.0
25%,159391.0,1.0,2.0,1.0,1.0,1.0,3.0,1.0,1.0,3.0,...,0.0,2.0,0.0,0.0,0.0,540.0,530.0,21.0,21.0,20.0
50%,197036.0,1.0,3.0,1.0,2.0,2.0,3.0,2.0,1.0,3.0,...,0.0,6.0,0.0,0.0,0.0,580.0,570.0,24.0,23.0,23.0
75%,231411.0,2.0,3.0,1.0,3.0,3.0,3.0,3.0,2.0,3.0,...,0.0,15.0,0.0,0.0,0.0,635.0,620.0,28.0,27.0,26.0
max,499635.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,9.0,...,358.0,158.0,347.0,35.0,18.0,760.0,800.0,35.0,35.0,35.0


In [None]:
## Look at which columns are not recorded at all in certain years

# Single vectorized pass instead of one groupby/apply per column:
# flag every null cell, then ask, per year, whether all flags in each column
# are True. Result: one row per year, one boolean column per data column.
all_null_by_year = combined_df.isna().groupby(combined_df['year']).all()

# For each column, list the years where all entries are null for that column
null_years = {}
for col in all_null_by_year.columns:
    years_all_null = all_null_by_year[col]
    # Boolean-mask the Series with itself to keep only the years flagged True
    null_years[col] = years_all_null[years_all_null].index.tolist()

# Display columns with at least one year where all entries are null
for col, years in null_years.items():
    if years:
        print(f"Column '{col}' is all null in years: {years}")

Column 'XSATWR25' is all null in years: ['2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023']
Column 'SATWR25' is all null in years: ['2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023']
Column 'XSATWR75' is all null in years: ['2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023']
Column 'SATWR75' is all null in years: ['2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023']
Column 'XACTWR25' is all null in years: ['2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023']
Column 'ACTWR25' is all null in years: ['2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023']
Column 'XACTWR75' is all null in years: ['2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023']
Column 'ACTWR75' is all null in years: ['2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023']
Column 'ADMCON10' is all null in years: ['2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021']
Column 'ADMCON11' is all null in years: ['2014', '2015', '2