In [1]:
# Install packages

# !pip install pandas
# !pip install numpy
# !pip install matplotlib

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from textwrap import wrap
from enum import Enum
import json
import os


In [2]:
data_dir = './data/fall-25'
results_dir = './results'

In [3]:
# Read in data

sos_fall_experiences_survey_df = pd.read_csv(f'{data_dir}/experiences-survey-results-fall-25.csv')

In [37]:
# Reports to generate
# [ ] Results per instructor x course
# [ ] General results broken down by demographics
#

class SosQuestions(Enum):
  EC1 = 'EC1'
  EC2 = 'EC2'
  EC3 = 'EC3'
  EC4 = 'EC4'
  ES1 = 'ES1'
  ES2 = 'ES2'
  ES3 = 'ES3'
  ES4 = 'ES4'
  ES5 = 'ES5'
  ES6 = 'ES6'
  EV1 = 'EV1'
  EV2 = 'EV2'
  EV3 = 'EV3'
  EV4 = 'EV4'
  EV5 = 'EV5'
  EV6 = 'EV6'

Q_ID_TO_SOS_LABEL = {
  'q_climate_1': SosQuestions.EC1.value,
  'q_climate_2': SosQuestions.EC2.value,
  'q_climate_3': SosQuestions.EC3.value,
  'q_climate_4': SosQuestions.EC4.value,
  'q_structure_1': SosQuestions.ES1.value,
  'q_structure_2': SosQuestions.ES2.value,
  'q_structure_3': SosQuestions.ES3.value,
  'q_structure_4': SosQuestions.ES4.value,
  'q_structure_5': SosQuestions.ES5.value,
  'q_structure_6': SosQuestions.ES6.value,
  'q_vibrancy_1': SosQuestions.EV1.value,
  'q_vibrancy_2': SosQuestions.EV2.value,
  'q_vibrancy_3': SosQuestions.EV3.value,
  'q_vibrancy_4': SosQuestions.EV4.value,
  'q_vibrancy_5': SosQuestions.EV5.value,
  'q_vibrancy_6': SosQuestions.EV6.value,
}

SOS_LABEL_TO_TEXT = {
  SosQuestions.EC1.value: 'I felt comfortable asking questions and making comments in class',
  SosQuestions.EC2.value: 'My professor believes in my ability to learn and succeed',
  SosQuestions.EC3.value: 'I felt like I was a part of the classroom community',
  SosQuestions.EC4.value: 'My professor valued what I brought to class through my own personal experiences, inside and outside of class',
  SosQuestions.ES1.value: 'I understand what I was expected to learn in the class',
  SosQuestions.ES2.value: 'Work to be done on all assignments/activities was clearly explained',
  SosQuestions.ES3.value: "The professor's teaching style supported my learning",
  SosQuestions.ES4.value: 'The professor used examples that were relevant to my own life and experiences',
  SosQuestions.ES5.value: 'The professor provides feedback on my work that helped my learning',
  SosQuestions.ES6.value: 'The class was challenging in a way that was just right',
  SosQuestions.EV1.value: 'I can tell my professor was excited about teaching the class',
  SosQuestions.EV2.value: 'In class, we used interactive approaches beyond just traditional lecture to learn the material',
  SosQuestions.EV3.value: 'The instructor created an environment where my classmates and I learned from each other',
  SosQuestions.EV4.value: 'I feel encouraged to think critically and question assumptions',
  SosQuestions.EV5.value: 'My professor connected what we do in class to current social and/or cultural events and issues',
  SosQuestions.EV6.value: 'My professor encouraged me to take time to reflect on what and how I was learning',
}

SOS_LABEL_CATEGORIES = {
    'CLIMATE': [
      SosQuestions.EC1.value,
      SosQuestions.EC2.value,
      SosQuestions.EC3.value,
      SosQuestions.EC4.value,
    ],
    'STRUCTURE': [
      SosQuestions.ES1.value,
      SosQuestions.ES2.value,
      SosQuestions.ES3.value,
      SosQuestions.ES4.value,
      SosQuestions.ES5.value,
      SosQuestions.ES6.value,
    ],
    'VIBRANCY': [
      SosQuestions.EV1.value,
      SosQuestions.EV2.value,
      SosQuestions.EV3.value,
      SosQuestions.EV4.value,
      SosQuestions.EV5.value,
      SosQuestions.EV6.value,
    ]
}

LIKERT_TEXT_TO_VAL = {
    'Strongly agree': 5,
    'Somewhat agree': 4,
    'Neutral': 3,
    'Somewhat disagree': 2,
    'Strongly disagree': 1,
}

class Departments(Enum):
  BME = 'BME'
  CE= 'CE'
  CS= 'CS'
  EE= 'EE'
  ENGR= 'ENGR'
  ETEC= 'ETEC'
  FPAT= 'FPAT'
  ME= 'ME'
  MSE= 'MSE'
  TECH= 'TECH'

class Demographics(Enum):
  UNDERGRAD_GRAD = 'Q1'
  YEARS_AT_CSULA = 'Q3'
  PARENTS_EDUCATION = 'Q4'
  CAMPUS_OFF_CAMPUS = 'Q5'
  DAYS_ON_CAMPUS = 'Q6'
  EMPLOYMENT = 'Q7'
  CHILDCARE = 'Q1.1'
  COMMUTE_LENGTH = 'Q1.2'
  MAJOR = 'Q2.2'
  RACE_ETHNICITY = 'Q1.5'
  HOUSEHOLD_INCOME = 'Q2.4'
  DISABILITY = 'Q3.2'
  LGBTQIA = 'Q6.1'
  TRANSGENDER = 'Q7.1'
  GENDER_IDENTITY = 'Q8.2'

Q_ID_TO_COURSE_LABELS = {
    'q_department': 'DEPARTMENT',
    'q_course_name_bme': 'COURSE_NAME'
}

Q_ID_TO_DEMOGRAPHIC_LABELS = {
    'Q1': 'UNDERGRAD-GRAD',
    'Q3': 'YEARS_AT_CSULA',
    'Q4': 'PARENTS_EDUCATION',
    'Q5': 'CAMPUS_OFF_CAMPUS',
    'Q6': 'DAYS_ON_CAMPUS',
    'Q7': 'EMPLOYMENT',
    'Q2.1': 'CHILDCARE',
    'Q1.2': 'COMMUTE_LENGTH',
    'Q2.2': 'MAJOR',
    'Q1.5': 'RACE_ETHNICITY',
    'Q2.4': 'HOUSEHOLD_INCOME',
    'Q3.2': 'DISABILITY',
    'Q6.1': 'LGBTQIA',
    'Q7.1': 'TRANSGENDER',
    'Q8.2': 'GENDER_IDENTITY'
}

HOUSEHOLD_INCOME_ORDER = [
    'Prefer not to say',
    'Unsure',
    '< $25,000 / year',
    '$25,000 - $50,000 / year',
    '$50,000 - $75,000 / year',
    '$75,000 - $100,000 / year',
    '$100,000 - $125,000 / year',
    '$125,000 - $150,000 / year',
    '> $150,000 / year'
]

COMMUTE_LENGTH_ORDER = [
    'Less than 30 mins',
    'Between 30 mins and 1 hr',
    'Between 1 hr and 2 hrs',
    'More than 2 hrs'
]

EMPLOYMENT_ORDER = [
    'Not employed',
    'Employed part-time: less than 10 hours per week',
    'Employed part-time: 10-20 hours per week',
    'Employed part-time: 20-30 hours per week',
    'Employed part-time: 30-40 hours per week',
    'Employed full-time: 40+ hours per week',
]

PARENTS_EDUCATION_ORDER = [
    'Prefer not to say',
    'Unsure',
    'I do not have a parent with any college experience',
    'I do not have a parent who has received at least a 4-year college degree, but I have a parent who attended some college',
    'I have a parent who has received at least a 4-year college degree'
]

In [42]:
def clean_df(survey_df):
  df_valid_response = survey_df[survey_df['DistributionChannel'] == 'anonymous'].copy()
  df_sos_responses = df_valid_response[list(Q_ID_TO_SOS_LABEL.keys())]
  df_valid_response.rename(columns=Q_ID_TO_SOS_LABEL, inplace=True)
  # Drop question text and qualtrics question id
  df_clean = df_valid_response[2:]
  df_clean.replace(LIKERT_TEXT_TO_VAL, inplace=True)
  rows_to_drop = []
  # Drop rows that have only NaN for SOS responses (no response)
  nan_df = df_sos_responses.isnull()
  all_nan_df = nan_df.sum(axis=1)
  all_nan_rows = all_nan_df[all_nan_df == nan_df.shape[1]].index
  rows_to_drop += list(all_nan_rows)
  # Drop rows with response time < 60 seconds
  df_clean['Duration (in seconds)'] = pd.to_numeric(df_clean['Duration (in seconds)'])
  low_response_drop_rows = df_clean[df_clean['Duration (in seconds)'] < 60].index
  rows_to_drop += list(low_response_drop_rows)
  return df_clean.drop(rows_to_drop)

def sos_responses_df(survey_df):
  return survey_df[list(map(lambda e: e.value, list(SosQuestions)))]

def results_by_course_instructor_df(survey_df):
  instructors_courses = {}
  for dept in list(Departments):
    dept_name = dept.value.lower()
    instructor_col = f'q_instructor_{dept_name}'
    course_col = f'q_course_name_{dept_name}'
    instructors = list(survey_df.dropna(subset = [instructor_col])[instructor_col].unique())
    for instructor in instructors:
      if instructor not in instructors_courses:
        instructors_courses[instructor] = {}
      courses = list(survey_df[survey_df[instructor_col] == instructor][course_col].unique())
      for course in courses:
        instructors_courses[instructor][course] = survey_df[
          (survey_df[instructor_col] == instructor) &
          (survey_df[course_col] == course)
        ]
  return instructors_courses

def demographics_all_majors_df(survey_df):
  df_valid_response = survey_df[survey_df['DistributionChannel'] == 'anonymous'].copy()
  df_valid_response = df_valid_response[2:]
  df_demo_responses = df_valid_response[list(Q_ID_TO_DEMOGRAPHIC_LABELS.keys())]
  df_demo_responses.rename(columns=Q_ID_TO_DEMOGRAPHIC_LABELS, inplace=True)
  return df_demo_responses

def demographics_by_major_dfs(demographics_all_majors_df):
  majors = demographics_all_majors_df.dropna(subset = ['MAJOR'])['MAJOR'].unique()
  demo_df_by_major = { major: demographics_all_majors_df[demographics_all_majors_df['MAJOR'] == major] for major in majors }
  return demo_df_by_major

def scores_all_majors_df(survey_df):
  '''
  Return a dataframe of only the SOS questions and values as numbers.

  :param survey_df: The entire dataframe of survey data from Qualtrics.
  '''
  df_valid_response = survey_df[survey_df['DistributionChannel'] == 'anonymous'].copy()
  df_sos_responses = survey_df[list(Q_ID_TO_SOS_LABEL.keys())]
  df_sos_responses.rename(columns=Q_ID_TO_SOS_LABEL, inplace=True)
  # Drop question text and qualtrics question id
  df_sos_responses = df_sos_responses[2:]
  df_sos_responses.replace(LIKERT_TEXT_TO_VAL, inplace=True)
  nan_df = df_sos_responses.isnull()
  all_nan_df = nan_df.sum(axis=1)
  all_nan_rows = all_nan_df[all_nan_df == df.shape[1]].index
  return df_sos_responses.drop(list(all_nan_rows))

def survey_by_major_dfs(survey_df):
  '''
  Split the entire survey dataframe into dataframes keyed on the student major.
  '''
  df_valid_response = survey_df[survey_df['DistributionChannel'] == 'anonymous'].copy()
  df_valid_response = df_valid_response[2:]
  majors = df_valid_response.dropna(subset = ['Q2.2'])['Q2.2'].unique()
  survey_df_by_major = { major: df_valid_response[df_valid_response['Q2.2'] == major] for major in majors }
  return survey_df_by_major

def survey_by_race_eth_dfs(survey_df):
  '''
  Split the entire survey dataframe into dataframes keyed on the student race/ethnicity.
  '''
  df_valid_response = survey_df[survey_df['DistributionChannel'] == 'anonymous'].copy()
  df_valid_response = df_valid_response[2:]
  race_eths = df_valid_response.dropna(subset = ['Q1.5'])['Q1.5'].unique()
  survey_df_by_race_eth = { race_eth: df_valid_response[df_valid_response['Q1.5'] == race_eth] for race_eth in race_eths }
  return survey_df_by_race_eth

def survey_by_gender_id_dfs(survey_df):
  '''
  Split the entire survey dataframe into dataframes keyed on the student gender identity.
  '''
  df_valid_response = survey_df[survey_df['DistributionChannel'] == 'anonymous'].copy()
  df_valid_response = df_valid_response[2:]
  gender_ids = df_valid_response.dropna(subset = ['Q8.2'])['Q8.2'].unique()
  survey_df_by_gender_id = { gender_id: df_valid_response[df_valid_response['Q8.2'] == gender_id] for gender_id in gender_ids }
  return survey_df_by_gender_id

def scores_by_major_dfs(survey_major_dfs):
  scores_by_major = { major: None for major in survey_major_dfs.keys() }
  for major, df in survey_major_dfs.items():
    df_sos_responses = df[list(Q_ID_TO_SOS_LABEL.keys())]
    df_sos_responses.rename(columns=Q_ID_TO_SOS_LABEL, inplace=True)
    # Drop question text and qualtrics question id
    df_sos_responses = df_sos_responses[2:]
    df_sos_responses.replace(LIKERT_TEXT_TO_VAL, inplace=True)
    scores_by_major[major] = df_sos_responses
  return scores_by_major

def scores_by_race_eth_dfs(survey_race_eth_dfs):
  scores_by_race_eth = { race_eth: None for race_eth in survey_race_eth_dfs.keys() }
  for race_eth, df in survey_race_eth_dfs.items():
    df_sos_responses = df[list(Q_ID_TO_SOS_LABEL.keys())]
    df_sos_responses.rename(columns=Q_ID_TO_SOS_LABEL, inplace=True)
    # Drop question text and qualtrics question id
    df_sos_responses = df_sos_responses[2:]
    df_sos_responses.replace(LIKERT_TEXT_TO_VAL, inplace=True)
    scores_by_race_eth[race_eth] = df_sos_responses
  return scores_by_race_eth

def scores_by_gender_id_dfs(survey_gender_id_dfs):
  scores_by_gender_id = { gender_id: None for gender_id in survey_gender_id_dfs.keys() }
  for gender_id, df in survey_gender_id_dfs.items():
    df_sos_responses = df[list(Q_ID_TO_SOS_LABEL.keys())]
    df_sos_responses.rename(columns=Q_ID_TO_SOS_LABEL, inplace=True)
    # Drop question text and qualtrics question id
    df_sos_responses = df_sos_responses[2:]
    df_sos_responses.replace(LIKERT_TEXT_TO_VAL, inplace=True)
    scores_by_gender_id[gender_id] = df_sos_responses
  return scores_by_gender_id

def sos_scores(scores_df):
  '''
  Calculate the SOS scores per-item for a dataframe.

  :param scores_df: The scores dataframe of survey data from Qualtrics (just
  the SOS questions and their numeric responses).
  '''
  score_vec_by_q = { col: np.array(scores_df[col]) for col in scores_df.columns }
  num_nan_per_q = { q_id: np.count_nonzero(np.isnan(score_vec)) for q_id, score_vec in score_vec_by_q.items() }
  score_vec_by_q_no_nan = { q_id: score_vec[~np.isnan(score_vec)] for q_id, score_vec in score_vec_by_q.items() }
  score_results_by_q = { q_id: {
      'n': len(score_vec),
      'mean': round(score_vec.mean(), 2),
      'std': round(score_vec.std(), 2)
  } for q_id, score_vec in score_vec_by_q_no_nan.items() }
  return score_results_by_q, num_nan_per_q

def pretty_print_scores(scores_data):
  print("SOS Values Scores:")
  for sos_id,data in scores_data[0].items():
    print(f'\t- {sos_id}: ({data["mean"]}, {data["std"]}) (n={data["n"]})')
  print(f"Nonanswers per question:")
  for sos_id,count in scores_data[1].items():
    print(f'\t- {sos_id}: {count}')

def write_scores_to_file_json(scores_data, out_file):
  with open(out_file, 'w') as f:
    json.dump(scores_data[0], f)

def write_scores_to_file_csv(scores_data, out_file):
  csv_data = {'item': [],'mean': [],'std': [],'n': []}
  for item, attrs in scores_data[0].items():
    csv_data['item'].append(item)
    csv_data['mean'].append(attrs['mean'])
    csv_data['std'].append(attrs['std'])
    csv_data['n'].append(attrs['n'])
  scores_df = pd.DataFrame(csv_data)
  scores_df.to_csv(out_file)

def print_scores_for_table(scores_data):
  for i in range(len(scores_data[0])):
    sos_id = list(scores_data[0].keys())[i]
    data = scores_data[0][sos_id]
    count = scores_data[1][sos_id]
    print(f'{sos_id}\t{data["mean"].round(2)}\t{data["std"].round(2)}\t{data["n"]}\t{count}')

def graph_counts_histogram(categories, values, xlabel, ylabel, title):
  fix,ax = plt.subplots(figsize=(6,4))
  p = ax.bar(categories, values)
  ax.bar_label(p, values, label_type='center', color='#fff')
  ax.set_xticklabels(categories, rotation=-45, ha='left', fontsize=10)
  ax.set_xlabel(xlabel)
  ax.set_ylabel(ylabel)
  ax.set_title(title)
  plt.show()

def graph_income(demographics_df):
  demo_df = demographics_df.copy()
  incomes = HOUSEHOLD_INCOME_ORDER
  values = [ len(demo_df[demo_df['HOUSEHOLD_INCOME'] == income]) for income in incomes ]
  graph_counts_histogram(
      incomes,
      values,
      'Income',
      'Number of students',
      'Student household income (all majors)'
  )

def graph_disability(demographics_df):
  demo_df = demographics_df.copy()
  disability_states = demo_df.dropna(subset = ['DISABILITY'])['DISABILITY'].unique()
  values = [ len(demo_df[demo_df['DISABILITY'] == state]) for state in disability_states ]
  graph_counts_histogram(
      disability_states,
      values,
      'Disability status',
      'Number of students',
      'Student disability status (all majors)'
  )

def graph_childcare(demographics_df):
  demo_df = demographics_df.copy()
  childcare_states = demo_df.dropna(subset = ['CHILDCARE'])['CHILDCARE'].unique()
  values = [ len(demo_df[demo_df['CHILDCARE'] == state]) for state in childcare_states ]
  graph_counts_histogram(
      childcare_states,
      values,
      'Childcare responsibilities',
      'Number of students',
      'Student childcare responsibilities (all majors)'
  )

def graph_lgbtqia(demographics_df):
  demo_df = demographics_df.copy()
  lgbtqia_states = demo_df.dropna(subset = ['LGBTQIA'])['LGBTQIA'].unique()
  values = [ len(demo_df[demo_df['LGBTQIA'] == state]) for state in lgbtqia_states ]
  graph_counts_histogram(
      lgbtqia_states,
      values,
      'LGBTQIA+ status',
      'Number of students',
      'Students who identify as LGBTQIA+ (all majors)'
  )

def graph_transgender(demographics_df):
  demo_df = demographics_df.copy()
  transgender_states = demo_df.dropna(subset = ['TRANSGENDER'])['TRANSGENDER'].unique()
  values = [ len(demo_df[demo_df['TRANSGENDER'] == state]) for state in transgender_states ]
  graph_counts_histogram(
      transgender_states,
      values,
      'Transgender status',
      'Number of students',
      'Students who are transgender (all majors)'
  )

def graph_parents_education(demographics_df):
  demo_df = demographics_df.copy()
  education_states = PARENTS_EDUCATION_ORDER
  values = [ len(demo_df[demo_df['PARENTS_EDUCATION'] == state]) for state in education_states ]
  education_labels = [ '\n'.join(wrap(state, 30)) for state in education_states ]
  graph_counts_histogram(
      education_labels,
      values,
      'Parents education level',
      'Number of students',
      'Students parents educational attainment (all majors)'
  )

def graph_commute(demographics_df):
  demo_df = demographics_df.copy()
  commute_lengths = COMMUTE_LENGTH_ORDER
  values = [ len(demo_df[demo_df['COMMUTE_LENGTH'] == length]) for length in commute_lengths ]
  graph_counts_histogram(
      commute_lengths,
      values,
      'Commute length',
      'Number of students',
      'Student commute time to campus (all majors)'
  )

def graph_employment(demographics_df):
  demo_df = demographics_df.copy()
  employment_states = EMPLOYMENT_ORDER
  values = [ len(demo_df[demo_df['EMPLOYMENT'] == state]) for state in employment_states ]
  employment_labels = [ '\n'.join(wrap(state, 30)) for state in employment_states ]
  graph_counts_histogram(
      employment_labels,
      values,
      'Employment',
      'Number of students',
      'Student employment status (all majors)'
  )

def graph_responses_per_major(scores_by_major_dfs):
  fix,ax = plt.subplots(figsize=(6,5))
  responses_per_major = {
      major: 0 for major in scores_by_major_dfs.keys()
  }
  threshold = 0.8
  for major, scores in scores_by_major_dfs.items():
    for row in scores.iterrows():
      vec = np.array(row[1].values)
      percent_answered = np.isnan(vec.sum()) / len(vec)
      if percent_answered <= 1 - threshold:
        responses_per_major[major] += 1

  majors = list(responses_per_major.keys())
  responses = [ responses_per_major[major] for major in majors ]

  graph_counts_histogram(
      majors,
      responses,
      'Major',
      'Number of responses',
      'Students who responded to\nat least 80% of questions by major'
  )

  # p = ax.bar(majors, responses)
  # ax.bar_label(p, responses, label_type='center', color='#fff')
  # ax.set_xticklabels(majors, rotation=-45, ha='left', fontsize=10)
  # ax.set_xlabel('Major')
  # ax.set_ylabel()
  # ax.set_title()

  # plt.show()

def graph_scores(scores_df, show=True, out_dir='', out_name=''):
  '''
  Create three violin plots (climate, structure, vibrancy) for the
  scores in a dataframe for all SOS questions.

  :param scores_df: A dataframe with columns as SOS questions and values
  as likert score numbers per respondent
  '''
  for i in range(len(SOS_LABEL_CATEGORIES)):
    fig, ax = plt.subplots(figsize=(6,5))
    ax.set_xticks(range(1,6))
    ax.set_xlim([1, 5])
    ax.set_xticklabels(range(1,6))
    category = list(SOS_LABEL_CATEGORIES.keys())[i]

    temp_subplot = SOS_LABEL_CATEGORIES[category].copy()
    temp_subplot.reverse()
    subplot_data = scores_df[temp_subplot]
    question_texts = [ '\n'.join(wrap(SOS_LABEL_TO_TEXT[label], 30)) for label in temp_subplot ]
    ax.set_yticks(range(len(temp_subplot)))
    ax.set_yticklabels(question_texts)
    ax.set_xlabel('Likert Score\n(1 = "Strongly disagree", 5 = "Strongly agree")')
    ax.set_ylabel('Question')
    ax.set_title(f'Responses for {category.lower()}\n"Indicate your agreement with each of the following statements about the class environment"')

    dataset = [ np.array(subplot_data[col]) for col in subplot_data.columns ]
    dataset = [ vec[~np.isnan(vec)] for vec in dataset ]

    ax.violinplot(dataset, orientation='horizontal', positions=range(len(temp_subplot)))

    if show:
      plt.show()
    if out_dir != '':
      # plt.tight_layout()
      plt.savefig(f'{out_dir}/{out_name}_{category}.png', bbox_inches='tight')
      plt.close()

def report_scores_for_all_majors():
  all_major_scores_df = scores_all_majors_df(sos_fall_values_survey_df)
  score_data = sos_scores(all_major_scores_df)
  # pretty_print_scores(score_data)
  print_scores_for_table(score_data)
  graph_scores(all_major_scores_df)

def report_scores_per_major():
  major_scores = scores_by_major_dfs(survey_by_major_dfs(sos_fall_values_survey_df))
  for major, df in major_scores.items():
    print(f'=========== MAJOR: {major} ===========')
    score_data = sos_scores(df)
    # pretty_print_scores(score_data)
    print_scores_for_table(score_data)
    graph_scores(df)

def report_scores_per_race_eth():
  race_eth_scores = scores_by_race_eth_dfs(survey_by_race_eth_dfs(sos_fall_values_survey_df))
  for race_eth, df in race_eth_scores.items():
    print(f'=========== RACE/ETHNICITY: {race_eth} ===========')
    score_data = sos_scores(df)
    # pretty_print_scores(score_data)
    print_scores_for_table(score_data)
    graph_scores(df)

def report_scores_per_gender_id():
  gender_id_scores = scores_by_gender_id_dfs(survey_by_gender_id_dfs(sos_fall_values_survey_df))
  for gender_id, df in gender_id_scores.items():
    print(f'=========== GENDER IDENTITY: {gender_id} ===========')
    score_data = sos_scores(df)
    # pretty_print_scores(score_data)
    print_scores_for_table(score_data)
    graph_scores(df)

# report_scores_for_all_majors()
# report_scores_per_major()
# graph_responses_per_major(scores_by_major_dfs(survey_by_major_dfs(sos_fall_values_survey_df)))

# demo_all_majors = demographics_all_majors_df(sos_fall_values_survey_df)
# demo_by_major = demographics_by_major_dfs(demo_all_majors)
# demo_by_major
# graph_income_all_majors(demo_all_majors)
# graph_disability_all_majors(demo_all_majors)
# graph_commute_all_majors(demo_all_majors)
# graph_employment_all_majors(demo_all_majors)
# graph_childcare_all_majors(demo_all_majors)
# graph_parents_education_all_majors(demo_all_majors)
# graph_transgender(demo_all_majors)
# graph_lgbtqia(demo_all_majors)

# for major,demo_data in demo_by_major.items():
#   print(f'======= MAJOR: {major} ========')
#   graph_income(demo_data)
#   graph_disability(demo_data)
#   graph_commute(demo_data)
#   graph_employment(demo_data)
#   graph_childcare(demo_data)
#   graph_parents_education(demo_data)
#   graph_transgender(demo_data)
#   graph_lgbtqia(demo_data)

# report_scores_per_race_eth()
# report_scores_per_gender_id()

In [43]:
df_clean = clean_df(sos_fall_experiences_survey_df)
df_sos = sos_responses_df(df_clean)
results_per_course = results_by_course_instructor_df(df_clean)
for instructor, course_results in results_per_course.items():
  all_instructor_responses = []
  for course, responses in course_results.items():
    if len(responses) == 0:
      print(f'EMPTY RESULTS FOR {instructor} -- {course} SKIPPING')
      continue
    print(f'RESULTS FOR {instructor} -- {course}')
    sos_responses = sos_responses_df(responses)
    results = sos_scores(sos_responses)
    instructor_dir =f'{results_dir}/fall-25/experiences/{instructor}'
    if not os.path.exists(instructor_dir):
      os.mkdir(instructor_dir)
    if not os.path.exists(f'{instructor_dir}/{course}'):
      os.mkdir(f'{instructor_dir}/{course}')
    write_scores_to_file_csv(results, f'{instructor_dir}/{course}/results.csv')
    if len(responses) > 1:
      graph_scores(responses, False, f'{instructor_dir}/{course}')
    all_instructor_responses.append(responses)

  # Write results for instructor across all courses
  if len(all_instructor_responses) > 0:
    all_responses = pd.concat(all_instructor_responses)
    sos_all_responses = sos_responses_df(all_responses)
    sos_all_results = sos_scores(sos_all_responses)
    write_scores_to_file_csv(sos_all_results, f'{instructor_dir}/results.csv')
    if len(sos_all_responses) > 1:
      graph_scores(sos_all_responses, False, instructor_dir)

  df_clean.replace(LIKERT_TEXT_TO_VAL, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean.replace(LIKERT_TEXT_TO_VAL, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['Duration (in seconds)'] = pd.to_numeric(df_clean['Duration (in seconds)'])


RESULTS FOR Lucey -- FLUID MECHANICS I
RESULTS FOR Lucey -- WATER RESOURCES ENG
RESULTS FOR Khodayari -- TECH ASPECTS URBAN ENVIRONMENT
RESULTS FOR Khodayari -- INTRO TO CAD CIVIL ENGINEERS
RESULTS FOR Khodayari -- ENVIRONMENTAL TRANSPORT
RESULTS FOR Bowen -- STRENGTH OF MATERIALS I
RESULTS FOR Bowen -- INTRO TO ENGINEERING & TECH
RESULTS FOR Bowen -- SPECIAL TOPICS IN ENGR
RESULTS FOR Rodriguez-Nikl -- SPEC TOPIC IN CIVIL ENGINEERIN
RESULTS FOR Rodriguez-Nikl -- STRUCTURAL MECHANICS II
RESULTS FOR Rodriguez-Nikl -- DYNAMICS FOR CIVIL ENGINEERS
RESULTS FOR Ibrahim -- FUNDMNTLS OF CONSTRUCTION MGMT
RESULTS FOR Ibrahim -- CONST PRJCT PLN SCHDL CNTRL
RESULTS FOR Ibrahim -- STATS & DATA ANALYSIS FOR ENGR
RESULTS FOR Ibrahim -- STATICS
RESULTS FOR De -- CONST PRJCT PLN SCHDL CNTRL
RESULTS FOR Sivathasan -- GEOTECHNICAL ENGR DESIGN II
RESULTS FOR Pourhomayoun -- INTRODUCTION TO DATA SCIENCE
RESULTS FOR Krum -- INTRO 3D COMPUTER GAME PROG
RESULTS FOR Krum -- HUMAN CENTERED COMPUTING
RESULTS F

  'mean': round(score_vec.mean(), 2),
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  'mean': round(score_vec.mean(), 2),
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


RESULTS FOR Griffis -- NUMERICAL METHODS II
RESULTS FOR Eshraghi -- MANUFACTURING PROCESS


In [21]:

df_clean = clean_df(sos_fall_experiences_survey_df)
df_sos = sos_responses_df(df_clean)
results_per_course = results_by_course_instructor_df(df_clean)
results_per_course['Kornblum']

  df_clean.replace(LIKERT_TEXT_TO_VAL, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean.replace(LIKERT_TEXT_TO_VAL, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['Duration (in seconds)'] = pd.to_numeric(df_clean['Duration (in seconds)'])


{'INTRO TO MECHANICS':                StartDate              EndDate      Status     IPAddress  \
 183  2025-12-05 01:03:18  2025-12-05 01:04:59  IP Address  45.48.39.250   
 
     Progress  Duration (in seconds) Finished         RecordedDate  \
 183      100                    101     True  2025-12-05 01:05:00   
 
             ResponseId  RecipientLastName  ...  Q2.3               Q1.5  \
 183  R_6PgFVjtGmCbdrd5                NaN  ...   NaN  Hispanic / Latinx   
 
                          Q2.4 Q3.1 Q4.1 Q5.1 Q6.1 Q7.1  Q8.1  \
 183  $25,000 - $50,000 / year  Yes   No   3+   No   No   Man   
 
                           Q9  
 183  Straight (heterosexual)  
 
 [1 rows x 84 columns]}