# Simpson's Paradox
Use `admission_data.csv` for this exercise.

In [95]:
# Load and view first few lines of dataset
import pandas as pd
import numpy as np

In [96]:
df =pd.read_csv('admission_data.csv')

In [97]:
df.head()

Unnamed: 0,student_id,gender,major,admitted
0,35377,female,Chemistry,False
1,56105,male,Physics,True
2,31441,female,Chemistry,False
3,51765,male,Physics,True
4,53714,female,Physics,True


In [98]:
df.shape

(500, 4)

In [99]:
df.dtypes

student_id     int64
gender        object
major         object
admitted        bool
dtype: object

In [100]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 4 columns):
student_id    500 non-null int64
gender        500 non-null object
major         500 non-null object
admitted      500 non-null bool
dtypes: bool(1), int64(1), object(2)
memory usage: 12.3+ KB


### Proportion and admission rate for each gender

In [101]:
# Proportion of students that are female
total_student = df.gender.count()

In [102]:
df_fm = df.query('gender == "female"')
total_fm = df_fm.gender.count()
fm_prop = round((total_fm/total_student)*100,2)
print(f'Proportion of female students: {fm_prop}%')

Proportion of female students: 51.4%


In [103]:
# Proportion of students that are male
df_m = df.query('gender== "male"')
total_m = df_m.gender.count()
m_prop =round((total_m/total_student)*100)
print(f'Proportion of male students: {m_prop}%')

Proportion of male students: 49.0%


In [104]:
# Admission rate for females
df_fm_adm = df_fm[df_fm.admitted==True]
total_fm_adm = df_fm_adm.gender.count()
fm_adm_rate = round((total_fm_adm/total_fm)*100,2)
print(f'Admission rate of female students: {fm_adm_rate}%')

Admission rate of female students: 28.79%


In [105]:
# Admission rate for males
df_m_adm = df_m[df_m.admitted==True]
total_m_adm = df_m_adm.gender.count()
m_adm_rate = round((total_m_adm/total_m)*100,2)
print(f'Admission rate of male students: {m_adm_rate}%')

Admission rate of male students: 48.56%


### Proportion and admission rate for physics majors of each gender

In [106]:
# What proportion of female students are majoring in physics?
fm_phy = df_fm.query('major=="Physics"')
total_fm_phy = fm_phy.major.count()
fm_phy_prop = round((total_fm_phy/total_fm)*100,2)
print(f'Proportion of female offering physics: {fm_phy_prop}%')

Proportion of female offering physics: 12.06%


In [107]:
# What proportion of male students are majoring in physics?
m_phy  = df_m.query('major == "Physics"')
total_m_phy = m_phy.major.count()
m_phy_prop = round((total_m_phy/total_m)*100,2)
print(f'Proportion of male offering physics: {m_phy_prop}%')

Proportion of male offering physics: 92.59%


In [108]:
# Admission rate for female physics majors
total_fm_phy_adm = fm_phy[fm_phy.admitted==True].count()[0]
fm_phy_adm_rate = round((total_fm_phy_adm/total_fm_phy)*100,2)
print(f'Admission rate of female offering physics: {fm_phy_adm_rate}%')

Admission rate of female offering physics: 74.19%


In [109]:
# Admission rate for male physics majors

        # wrong calculation
# m_phy_adm = df_m_adm.query('major=="Physics"')
# m_phy_adm_rate=(m_phy_adm.count()[0]/total_m_adm)*100
# m_phy_adm_rate

# correct calculation

total_m_phy_adm = m_phy[m_phy.admitted==True].count()[0]
m_phy_adm_rate = round((total_m_phy_adm/total_m_phy)*100,2)
print(f'Admission rate of female offering physics: {m_phy_adm_rate}%')

Admission rate of female offering physics: 51.56%


### Proportion and admission rate for chemistry majors of each gender

In [110]:
# What proportion of female students are majoring in chemistry?
fm_chm = df_fm.query('major=="Chemistry"')
total_fm_chm = fm_chm.major.count()
fm_chm_prop = round((total_fm_chm/total_fm)*100,2)
print(f'Proportion of female offering chemistry: {fm_chm_prop}%')

Proportion of female offering chemistry: 87.94%


In [111]:
# What proportion of male students are majoring in chemistry?
m_chm = df_m.query('major == "Chemistry"')
total_m_chm = m_chm.major.count()
m_chm_prop = round((total_m_chm/total_m)*100,2)
print(f'Proportion of male offering chemistry: {m_chm_prop}%')

Proportion of male offering chemistry: 7.41%


In [112]:
# Admission rate for female chemistry majors

# fm_chm_adm = df_fm_adm.query('major=="Chemistry"')
# fm_chm_adm_rate = (fm_chm_adm.major.count()/total_fm_adm)*100
# fm_chm_adm_rate


fm_chm_adm = fm_chm[fm_chm.admitted==True].count()[0]
fm_chm_adm_rate = round((fm_chm_adm/total_fm_chm)*100,2)
print(f'Admission rate of female offering chemistry: {fm_chm_adm_rate}%')

Admission rate of female offering chemistry: 22.57%


In [113]:
# Admission rate for male chemistry majors
m_chm_adm = m_chm[m_chm.admitted==True].count()[0]
m_chm_adm_rate = round((m_chm_adm/total_m_chm)*100,2)
print(f'Admission rate of female offering chemistry: {m_chm_adm_rate}%')

Admission rate of female offering chemistry: 11.11%


### Admission rate for each major

In [114]:
df_admitted = df[df.admitted==True]
total_df_admitted = df_admitted.count()[0]
total_df_admitted

192

In [115]:
# Admission rate for physics majors
adm_phy = df[df.admitted==True].query('major=="Physics"')
total_adm_phy = adm_phy.count()[0]
adm_phy_rate = round((total_adm_phy/total_df_admitted)*100,2)
print(f'Admission rate for physics majors: {adm_phy_rate}%')

Admission rate for physics majors: 72.4%


In [116]:
# Admission rate for chemistry majors
adm_chm = df[df.admitted == True].query('major=="Chemistry"')
total_adm_chm = adm_chm.count()[0]
adm_chm_rate = round((total_adm_chm/total_df_admitted)*100,2)
print(f'Admission rate for chemistry majors: {adm_chm_rate}%')

Admission rate for chemistry majors: 27.6%
