## Lecture Notebook
### UCB Degrees data - Pandas demo, incl missing values 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Render floats with one decimal place in every DataFrame/Series shown below.
pd.set_option("display.float_format", "{:.1f}".format)

Data Source - UC Berkeley "Our Berkeley" Office of Planning and Analysis

[Degree Recipients by Major](https://calviz.berkeley.edu/t/OPAP/views/DegreeRecipientsByMajor/ExecutiveSummary?%3Aembed=y&%3AisGuestRedirectFromVizportal=y)


At the bottom of this page there is a link to [Download this Dashboard's Data](https://docs.google.com/spreadsheets/d/1J23dGyhxeQhwWMH3SJDOxtma-l5HfkCXNRDDapm_KFI/edit?gid=723050719#gid=723050719)

That link opens a Google Sheet, and Google Sheets can serve a tab as CSV through a URL, so pandas can read the data directly.


In [None]:
# Spreadsheet ID and tab name; the tab name is already URL-encoded
# (%20 stands for a space).
sheet_id = "1J23dGyhxeQhwWMH3SJDOxtma-l5HfkCXNRDDapm_KFI"
sheet_name = "Degree%20Recipients%20by%20Major"

# Assemble the request URL; tqx=out:csv asks for the tab in CSV form.
url = (
    "https://docs.google.com/spreadsheets/d/"
    + sheet_id
    + "/gviz/tq?tqx=out:csv&sheet="
    + sheet_name
)



In [None]:
# Fetch the CSV at `url` and parse it into a DataFrame (requires network access).
UCBDR = pd.read_csv(url)

In [None]:
# Offline alternative: uncomment to load a locally saved CSV export instead of
# the data fetched from the URL.
#UCBDR= pd.read_csv('UCB Students - Degree Recipients by Major - Degree Recipients by Major.csv')
# Display the loaded frame.
UCBDR

## Exploring Majors

In [None]:
# Keep only the rows whose academic year is 2023-24.
UCB2024 = UCBDR.loc[UCBDR['Academic Yr'].eq('2023-24')]
UCB2024

In [None]:
# Keep only the rows whose academic year is 2024-25.
UCB2025 = UCBDR.loc[UCBDR['Academic Yr'].eq('2024-25')]
UCB2025

In [None]:
# Narrow the 2024-25 rows to bachelor's degrees.
UCB2025UG = UCB2025.loc[UCB2025['Degree Level Desc'].eq('Bachelor')]
UCB2025UG

In [None]:
# Sum headcount per college/school and draw it as a bar chart, largest first.
(
    UCB2025UG
    .groupby("College/School")["Headcount"]
    .sum()
    .sort_values(ascending=False)
    .plot.bar(title="Total Degrees Awarded by College/School")
)

In [None]:
# The same per-college/school totals as a table, ordered by group key.
UCB2025UG.groupby("College/School")["Headcount"].agg("sum")

In [None]:
# Total headcount per major, ordered from largest to smallest; show the top 30.
HCXMajor = (
    UCB2025UG
    .groupby("Major")["Headcount"]
    .sum()
    .sort_values(ascending=False)
)
HCXMajor.head(30)

In [None]:
# Grand total of the Headcount column for the bachelor's subset.
TotalHC = UCB2025UG.loc[:, 'Headcount'].sum()

# Show it as a plain Python int.
int(TotalHC)

In [None]:
# Headcount for rows whose Major is exactly 'CDSS Computer Science'.
is_cs_major = UCB2025UG['Major'].eq('CDSS Computer Science')
CSHC = UCB2025UG.loc[is_cs_major, 'Headcount'].sum()
int(CSHC)

In [None]:
# Computer Science headcount as a fraction of the total headcount.
CSHC/TotalHC

In [None]:
# Look up the Data Science total in the per-major headcount Series.
DSHC = HCXMajor.loc['Data Science']
DSHC

In [None]:
# Look up the EECS total in the per-major headcount Series.
EECSHC = HCXMajor.loc['Electrical Eng & Comp Sci']
EECSHC


In [None]:
# Combined headcount of the three computing majors.
Computing = (
    CSHC
    + DSHC
    + EECSHC
)
Computing


In [None]:
# Combined computing headcount as a fraction of the total headcount.
Computing/TotalHC

In [None]:
# Data Science headcount as a fraction of the total headcount.
DSHC/TotalHC

In [None]:
# EECS headcount as a fraction of the total headcount.
EECSHC/TotalHC

In [None]:
# One row per major with Headcount summed, ordered from largest to smallest.
major_pivot = (
    UCB2025UG
    .pivot_table(index="Major", values="Headcount", aggfunc="sum")
    .sort_values(by="Headcount", ascending=False)
)

major_pivot.head(10)

In [None]:


# Express each major's headcount as a percentage of the summed Headcount column.
headcount_sum = major_pivot["Headcount"].sum()
major_pivot["Percent of Total"] = major_pivot["Headcount"].div(headcount_sum).mul(100)

major_pivot.head(10)

In [None]:
# Headcount per major split by entry status (one column per status), ordered
# by the "Transfer" column.
# Built from UCB2025UG, the bachelor's frame defined earlier in this notebook
# (the previously referenced UCB2024UG is never defined). Headcount is summed
# explicitly because pivot_table averages by default.
transfer_major = pd.pivot_table(
    UCB2025UG,
    index="Major",
    columns="Entry Status Desc",
    values="Headcount",
    aggfunc="sum",
    fill_value=0
).sort_values("Transfer", ascending=False)

# Row-wise percentages: each entry status as a share of the major's row total.
transfer_major_pct = transfer_major.div(
    transfer_major.sum(axis=1),
    axis=0
) * 100

transfer_major.head(20)

In [None]:
# Row total across the three entry-status columns.
transfer_major["Total"] = (
    transfer_major["Freshman"]
    .add(transfer_major["Other"])
    .add(transfer_major["Transfer"])
)
transfer_major

In [None]:
# Transfer column as a percentage of each row's Total.
transfer_share = transfer_major["Transfer"].div(transfer_major["Total"])
transfer_major["Percent Transfer"] = transfer_share.mul(100)
transfer_major

In [None]:
# The ten rows with the highest "Percent Transfer".
transfer_major.sort_values("Percent Transfer", ascending=False).head(10)

In [None]:
# List the column names of the bachelor's subset.
UCB2025UG.columns

In [None]:
# Cross-tabulate summed Headcount: entry status in rows, STEM flag in columns.
entry_stem_pivot = UCB2025UG.pivot_table(
    index="Entry Status Desc",
    columns="STEM Flag",
    values="Headcount",
    aggfunc="sum",
    fill_value=0,
)
entry_stem_pivot

In [None]:
# Same major-by-entry-status table, with an "All" margin row and column.
# Built from UCB2025UG, the bachelor's frame defined earlier in this notebook
# (the previously referenced UCB2024UG is never defined). Headcount is summed
# explicitly so the margins are totals; pivot_table averages by default.
transfer_major2 = pd.pivot_table(
    UCB2025UG,
    index="Major",
    columns="Entry Status Desc",
    values="Headcount",
    aggfunc="sum",
    fill_value=0,
    margins=True
)
transfer_major2

In [None]:
# Entry status by STEM flag with summed Headcount. The margin column is kept
# under the name "Total"; the margin row is dropped.
entry_stem = (
    UCB2025UG
    .pivot_table(
        index="Entry Status Desc",
        columns="STEM Flag",
        values="Headcount",
        aggfunc="sum",
        fill_value=0,
        margins=True,
    )
    .rename(columns={"All": "Total"})
    .drop(index="All")
)

entry_stem

In [None]:
# STEM column as a percentage of each row's Total, rounded to one decimal.
stem_share = entry_stem["STEM"].div(entry_stem["Total"]).mul(100)
entry_stem["Percent STEM"] = stem_share.round(1)
entry_stem

In [None]:
# Append a "Total" row holding the column sums.
# Summary rows are excluded from the sum, so re-running this cell does not
# fold an existing "Total" row back into the result.
status_rows = entry_stem.drop(index=["Total", "Percent Transfer"], errors="ignore")
total_row = status_rows.sum(axis=0)

# Percentages cannot be added together; recompute "Percent STEM" for the
# total row from the summed counts instead.
if "Percent STEM" in total_row.index:
    total_row["Percent STEM"] = round(total_row["STEM"] / total_row["Total"] * 100, 1)

entry_stem.loc["Total"] = total_row
entry_stem

In [None]:
# Append a "Percent Transfer" row: the "Transfer" row as a percentage of the
# "Total" row, column by column.
pct_transfer = (
    entry_stem.loc["Transfer"] / entry_stem.loc["Total"] * 100
).round(1)

# "Percent STEM" already holds percentages, so a ratio of two of its values is
# not a transfer share; leave that cell missing rather than show a bogus number.
if "Percent STEM" in pct_transfer.index:
    pct_transfer["Percent STEM"] = np.nan

entry_stem.loc["Percent Transfer"] = pct_transfer

entry_stem