In [None]:
'''
INFO_511_ Application Exercise 05: Wrangling College Majors
Author: Todd Adams
Date: 04/06/2024
Description: We are answering questions related to the Degrees dataset.
Note: I used VS Code and ChatGPT to help me write this code.
'''


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import FormatStrFormatter

# Load data: read the raw degrees table from the project's data folder.
# NOTE(review): later cells treat `degree` as the id column and every other
# column as a year — confirm against the CSV.
degrees = pd.read_csv("data/degrees.csv")

# Preview data.
# info() prints directly, so it can run anywhere in the cell. head() only
# renders when it is the cell's LAST expression — in the middle of a cell its
# result is silently discarded — so it goes last.
degrees.info()
degrees.head()

In [None]:
# Reshape wide -> long: keep `degree` as the identifier and stack every other
# column into (year, n) pairs, one row per degree/year combination.
degrees_long = pd.melt(
    degrees,
    id_vars=["degree"],
    var_name="year",
    value_name="n",
)

# Peek at the first rows of the reshaped frame
degrees_long.head()


**What is the type of year and what should it be?**  

The `year` column is currently a string (pandas `object` dtype) because `melt` fills it with the original column headers, which are text.  
It should be converted to an **integer** so that years sort, filter, and plot as numbers on a continuous axis rather than as category labels.


In [None]:
# Convert `year` to integer so it can be treated numerically when plotting
# and filtering (it starts out as text taken from the column headers).
degrees_long['year'] = degrees_long['year'].astype(int)

# Check data types: show the dtypes so the conversion is actually verified
# in the cell output.
degrees_long.dtypes


**What would an NA mean in this context?**  

An `NA` means no number of graduates was recorded for that degree in that year.  
Since this dataset comes from university records, an NA most likely means **zero** students graduated (for example, the program did not yet exist or awarded no degrees that year), not that the count is "unknown."  
Under that assumption, it makes sense to replace these NAs with 0.


In [None]:
# Replace missing values with 0 (a missing count is treated as "no graduates")
degrees_long['n'] = degrees_long['n'].fillna(0)

# NOTE(review): `re` is not used in this cell — pandas' .str.extract takes the
# pattern string directly. The import is kept in case a later cell relies on
# it; consider moving it to the imports cell at the top of the notebook.
import re

# Extract degree type (e.g., "BA", "BS", "MA", "MS", "PhD"): the text inside
# the first pair of parentheses in `degree`. expand=False returns a Series
# (one capture group), and rows with no parentheses become NaN.
degrees_long['degree_type'] = degrees_long['degree'].str.extract(r'\((.*?)\)', expand=False)

# First draft plot
sns.set_style("whitegrid")

fig, ax = plt.subplots(figsize=(10, 6))

sns.lineplot(
    data=degrees_long,
    x='year',
    y='n',
    hue='degree_type',
    # lineplot aggregates rows sharing the same x/hue with the MEAN by default.
    # The y-axis is a head count, so programs of the same degree type are
    # summed instead (identical to the mean when there is one row per type/year).
    estimator='sum',
    # `ci=None` is deprecated since seaborn 0.12; errorbar=None is the replacement.
    errorbar=None,
    ax=ax,
)

ax.set_title("First Draft: Number of Graduates by Degree Type")
ax.set_xlabel("Year")
ax.set_ylabel("Number of Students")
ax.legend(title="Degree Type")
plt.show()


**What needs to be updated to match the goal plot?**

The final plot should include:  
- A specific title matching the goal image  
- A custom color palette for degree types  
- X-axis should show ticks from 2019 to 2023 only  
- Degree type order should be enforced (BA, BS, MA, MS, PhD)  
- Clean legend labeling  


In [None]:
# Set correct order and color palette for the degree types
order = ["BA", "BS", "MA", "MS", "PhD"]
colors = {
    "BA": "#53868B",
    "BS": "#7AC5CD",
    "MA": "#89a285",
    "MS": "#8B814C",
    "PhD": "#CDBE70"
}

# Graduation years covered by the plot (matches the subtitle below)
years = [2019, 2020, 2021, 2022, 2023]

# Setting tick positions alone does not hide data outside the range, so
# restrict the rows themselves. This is a no-op if the data only has these years.
degrees_plot = degrees_long[degrees_long['year'].isin(years)]

sns.set_style("whitegrid")

fig, ax = plt.subplots(figsize=(10, 6))

sns.lineplot(
    data=degrees_plot,
    x='year',
    y='n',
    hue='degree_type',
    hue_order=order,
    palette=colors,
    # lineplot averages rows sharing the same x/hue by default; the y-axis is
    # a count of students, so sum across programs of the same degree type
    # (identical to the mean when there is one row per type/year).
    estimator='sum',
    # `ci=None` is deprecated since seaborn 0.12; errorbar=None is the replacement.
    errorbar=None,
    ax=ax,
)

ax.set_title("College of Information Science degrees over the years\nAcademic years 2019 - 2023", fontsize=14)
ax.set_xlabel("Graduation year")
ax.set_ylabel("Number of students graduating")
ax.set_xticks(years)  # One tick per graduation year, no fractional years
ax.legend(title="Degree type")
fig.tight_layout()
plt.show()
