In [1]:
import pandas as pd
import numpy as np

from enum import Enum

In [2]:
class Label(Enum):
    """Readable names for the numeric metric ids found in the ``d_id`` column.

    Use ``Label.<name>.value`` to get the integer id to filter on, or
    ``Label(<id>)`` to turn an id back into its readable name.
    """

    gpa = 1   # grade point average
    actE = 2  # ACT English score
    actC = 3  # ACT composite score: average of English, math, reading, science
    satW = 4  # SAT writing
    satM = 5  # SAT math
    satE = 6  # SAT english
    agc = 7   # number of a-g courses completed
    hc = 8    # NOTE(review): not described in the original; presumably a course count like agc -- confirm

### About the data

### Column descriptions
#### u_id = university id
#### d_id = metric id (1 - 8) (gpa, act, sat, etc..)
#### d_v = metric value
#### per = fraction of applicants admitted (admit / total), on a 0 - 1 scale
#### admit = # of students admitted for a specific university and metric
#### total = # of students applied for a specific university and metric

### Data Stats
#### 333 total rows, 8 metrics, 45 rows per metric, except for SAT English, agc, and hc, which have 36 rows each
#### 9 unique universities
#### Average GPA = 2.8, min = 0, max = 4.0
#### Average ACT Composite = 18.8, min = 0, max = 31
#### Average SAT math = 440, max = 700, min = 0
#### Average SAT writing = 440, max = 700, min = 0
#### Average SAT English = 11.25, min = 0, max = 20
#### Average agc = 30, min = 0, max = 50
#### Average hc = 7.5, min = 0, max = 15

#### The national average ACT composite is 20.8, so the average in our dataset is slightly lower
#### The max ACT composite is 36, our dataset has a max of 31
#### The max SAT Math and Writing scores are 800, but our dataset shows 700 for both (SAT English only reaches 20 in our dataset)

### Questions
#### 1. Does it make sense to use BOTH the ACT english and composite columns as features?
#### 2. Our dataset shows zero for ACT composite, SAT writing, and SAT English scores, even though the minimum scores for these tests are 1 for the ACT and 200 for the SAT. Should these rows be considered invalid?
#### 3. Percents and admit numbers are different for each metric - is there any way to aggregate these numbers across universities?

In [3]:
# Load the admissions table from the CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer="data/UC_Dataset_for_code.csv")

In [4]:
# Number of rows whose metric id (d_id) is the GPA metric.
df.loc[df['d_id'] == Label.gpa.value].shape[0]

45

In [5]:
# Count the distinct university ids in the table.
# dropna=False keeps a missing u_id counted as its own value, like len(unique()) would.
df['u_id'].nunique(dropna=False)

9

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the GPA rows;
# the `d_v` column's mean answers "what is the average GPA?".
(
    df.loc[df['d_id'].eq(Label.gpa.value)]
    .describe()
)

Unnamed: 0,u_id,d_id,d_v,per
count,45.0,45.0,45.0,45.0
mean,5.0,1.0,2.8,0.256444
std,2.611165,0.0,1.457114,0.311347
min,1.0,1.0,0.0,0.01
25%,3.0,1.0,3.0,0.01
50%,5.0,1.0,3.3,0.1
75%,7.0,1.0,3.7,0.4
max,9.0,1.0,4.0,0.97


In [7]:
# Summary statistics for the ACT composite rows;
# the `d_v` column's mean is the average ACT composite score.
(
    df.loc[df['d_id'].eq(Label.actC.value)]
    .describe()
)

Unnamed: 0,u_id,d_id,d_v,per
count,45.0,45.0,45.0,45.0
mean,5.0,3.0,18.8,0.365778
std,2.611165,0.0,10.767375,0.296767
min,1.0,3.0,0.0,0.0
25%,3.0,3.0,16.0,0.13
50%,5.0,3.0,21.0,0.29
75%,7.0,3.0,26.0,0.56
max,9.0,3.0,31.0,0.94


In [8]:
# Summary statistics for the SAT english rows;
# the `d_v` column's mean is the average SAT english score.
(
    df.loc[df['d_id'].eq(Label.satE.value)]
    .describe()
)

Unnamed: 0,u_id,d_id,d_v,per
count,36.0,36.0,36.0,36.0
mean,5.0,6.0,11.25,0.348889
std,2.618615,0.0,7.5,0.255463
min,1.0,6.0,0.0,0.01
25%,3.0,6.0,7.5,0.1575
50%,5.0,6.0,12.5,0.315
75%,7.0,6.0,16.25,0.5125
max,9.0,6.0,20.0,0.92


In [9]:
# Summary statistics for the SAT math rows;
# the `d_v` column's mean is the average SAT math score.
(
    df.loc[df['d_id'].eq(Label.satM.value)]
    .describe()
)

Unnamed: 0,u_id,d_id,d_v,per
count,45.0,45.0,45.0,45.0
mean,5.0,5.0,440.0,0.275111
std,2.611165,0.0,244.391638,0.265507
min,1.0,5.0,0.0,0.0
25%,3.0,5.0,400.0,0.06
50%,5.0,5.0,500.0,0.18
75%,7.0,5.0,600.0,0.46
max,9.0,5.0,700.0,0.86


In [10]:
# Summary statistics for the SAT writing rows;
# the `d_v` column's mean is the average SAT writing score.
(
    df.loc[df['d_id'].eq(Label.satW.value)]
    .describe()
)

Unnamed: 0,u_id,d_id,d_v,per
count,45.0,45.0,45.0,45.0
mean,5.0,4.0,440.0,0.294667
std,2.611165,0.0,244.391638,0.292828
min,1.0,4.0,0.0,0.0
25%,3.0,4.0,400.0,0.04
50%,5.0,4.0,500.0,0.18
75%,7.0,4.0,600.0,0.48
max,9.0,4.0,700.0,0.93


In [11]:
# Summary statistics for the a-g course-count rows (metric id Label.agc).
(
    df.loc[df['d_id'].eq(Label.agc.value)]
    .describe()
)

Unnamed: 0,u_id,d_id,d_v,per
count,36.0,36.0,36.0,36.0
mean,5.0,7.0,30.0,0.278056
std,2.618615,0.0,18.973666,0.177632
min,1.0,7.0,0.0,0.06
25%,3.0,7.0,22.5,0.1575
50%,5.0,7.0,35.0,0.225
75%,7.0,7.0,42.5,0.3625
max,9.0,7.0,50.0,0.7


In [12]:
# Summary statistics for the rows whose metric id is Label.hc.
(
    df.loc[df['d_id'].eq(Label.hc.value)]
    .describe()
)

Unnamed: 0,u_id,d_id,d_v,per
count,36.0,36.0,36.0,36.0
mean,5.0,8.0,7.5,0.295556
std,2.618615,0.0,5.669467,0.245676
min,1.0,8.0,0.0,0.03
25%,3.0,8.0,3.75,0.105
50%,5.0,8.0,7.5,0.21
75%,7.0,8.0,11.25,0.45
max,9.0,8.0,15.0,0.89


In [21]:
# Re-read the same CSV and rebind `df`, so the cells below work from a freshly loaded table.
df = pd.read_csv(filepath_or_buffer="data/UC_Dataset_for_code.csv")

In [22]:
# Peek at the first five rows to check the column layout.
df.head(n=5)

Unnamed: 0,u_id,d_id,d_v,per,admit,total
0,1,1,4.0,0.25,11857,47070
1,1,1,3.7,0.04,676,18310
2,1,1,3.3,0.01,116,12006
3,1,1,3.0,0.01,29,3604
4,1,1,0.0,0.01,14,1947


In [29]:
# Every row (all metrics) recorded for the university with id 1.
df.loc[df['u_id'].eq(1)]

Unnamed: 0,u_id,d_id,d_v,per,admit,total
0,1,1,4.0,0.25,11857,47070
1,1,1,3.7,0.04,676,18310
2,1,1,3.3,0.01,116,12006
3,1,1,3.0,0.01,29,3604
4,1,1,0.0,0.01,14,1947
5,1,2,31.0,0.24,4771,19759
6,1,2,26.0,0.12,1072,8586
7,1,2,21.0,0.13,593,4692
8,1,2,16.0,0.07,147,2121
9,1,2,0.0,0.03,11,332


In [37]:
# First five rows of each (university, metric) group, returned in original row order.
# NOTE(review): this previews rows and does not aggregate; if per-group totals
# were the goal, an explicit aggregation is needed -- confirm intent.
df.groupby(by=['u_id', 'd_id']).head(n=5)

Unnamed: 0,u_id,d_id,d_v,per,admit,total
0,1,1,4.0,0.25,11857,47070
1,1,1,3.7,0.04,676,18310
2,1,1,3.3,0.01,116,12006
3,1,1,3.0,0.01,29,3604
4,1,1,0.0,0.01,14,1947
5,1,2,31.0,0.24,4771,19759
6,1,2,26.0,0.12,1072,8586
7,1,2,21.0,0.13,593,4692
8,1,2,16.0,0.07,147,2121
9,1,2,0.0,0.03,11,332


#### Applicant gives gpa, sat writing, essay, english, act composite score