In [None]:
import boto3
import numpy as np
import pandas as pd

#### Helper methods to filter and transform dataframes

In [None]:
def filter_dataframe(df, cols_to_keep, status='Adm', year=2018, remove_dups=True):
    """Select one status / fall term from `df` and keep only `cols_to_keep`.

    Parameters:
        df: pandas DataFrame that has 'Status' and 'Fall Term' columns.
        cols_to_keep: list of column names present in `df` to keep in the result.
        status: value the 'Status' column must equal (default 'Adm').
        year: value the 'Fall Term' column must equal (default 2018).
        remove_dups: when True (default), duplicate rows are dropped after the
            column projection.

    Returns:
        A new DataFrame with the matching rows and only `cols_to_keep`.

    Side effects:
        Prints the number of rows in the result and, when 'County' is one of
        the kept columns, the number of distinct counties.
    """
    # keep only the rows for the requested status and fall term
    wanted_rows = (df['Status'] == status) & (df['Fall Term'] == year)
    df = df[wanted_rows]
    # filter columns
    df = df[cols_to_keep]

    if remove_dups:
        # remove duplicates
        df = df.drop_duplicates()

    # len(df) is the row count; df.size would be rows * columns
    print("Number of rows in the set {0}".format(len(df)))
    # the county summary only makes sense when County survived the projection
    if 'County' in df.columns:
        print("Number of unique counties {0}".format(len(df['County'].unique())))

    return df

In [None]:
# transform gpas into discrete buckets
# round down to the lower bucket for each range
# min GPA = 3.24, max GPA = 4.35, stddev = 0.2134
def bucket_gpa(gpa):
    """Map a GPA onto the lower edge of its 0.2-wide bucket.

    Buckets are 3.0, 3.2, 3.4, 3.6, 3.8, 4.0 and 4.2. A value is labelled with
    the lower edge of the range it falls in, e.g. 4.1 -> 4.0 and 4.35 -> 4.2.
    Values below 3.0 are clamped into the lowest bucket (3.0) and values of
    4.2 or more go into the highest bucket (4.2).

    Parameters:
        gpa: numeric GPA value.

    Returns:
        The bucket label as a float.
    """
    # (exclusive upper bound, bucket label) pairs in ascending order; the first
    # bound the value is below decides the bucket
    ranges = (
        (3.2, 3.0),
        (3.4, 3.2),
        (3.6, 3.4),
        (3.8, 3.6),
        (4.0, 3.8),
        (4.2, 4.0),
    )
    for upper_bound, label in ranges:
        if gpa < upper_bound:
            return label

    # everything at or above the last bound rounds down to the top bucket
    return 4.2

In [None]:
# transform sat scores into discrete buckets
# min score = 440 and max score = 550
def bucket_sat_score(score):
    """Map an SAT section score onto the lower edge of its bucket.

    Buckets are 400 (covering everything below 440), then 440, 460, 480, 500,
    520 and 540 (covering 540 and above). Scores below 400 are clamped into
    the lowest bucket instead of falling through to the highest one.

    Parameters:
        score: numeric SAT section score.

    Returns:
        The bucket label as an int.
    """
    # (exclusive upper bound, bucket label) pairs in ascending order; the first
    # bound the score is below decides the bucket
    ranges = (
        (440, 400),
        (460, 440),
        (480, 460),
        (500, 480),
        (520, 500),
        (540, 520),
    )
    for upper_bound, label in ranges:
        if score < upper_bound:
            return label

    # 540 and above share the top bucket
    return 540

#### Ingest and Format Admit Ethnicity Data
#### Resulting DataFrame should have County and Ethnicity of all 2018 Admitted UC students

In [None]:
# Load the per-ethnicity export, give the columns short names, and discard
# rows that carry no county before handing the frame to filter_dataframe.
ethnicities = (
    pd.read_csv('../data/HS_ethnicity_by_year_data.csv')
    # the raw 'Count' column holds the status, while the actual count lives
    # in 'Pivot Field Values'
    .rename(columns={'County/State/ Territory': 'County', 'Count': 'Status', 'Uad Uc Ethn 6 Cat': 'Ethnicity', 'Pivot Field Values': 'Count'})
    # only keep rows that have a county
    .dropna(subset=['County'])
)
# default status / year filter, projected onto county and ethnicity
ethnicities = filter_dataframe(ethnicities, ['County', 'Ethnicity'])

In [None]:
# preview the first rows of the filtered ethnicity frame
ethnicities.head()

In [None]:
# Load the per-gender export and rename its columns to the shared names.
genders = (
    pd.read_csv('../data/FR_GENDER_data.csv')
    .rename(columns={'Count': 'Status', 'Pivot Field Values': 'Count', 'County/State/ Territory': 'County'})
    # every row is stamped with fall term 2018, which is the year
    # filter_dataframe selects by default
    .assign(**{'Fall Term': 2018})
)
# default status / year filter, projected onto county and gender
genders = filter_dataframe(genders, ['County', 'Gender'])

In [None]:
# preview the first rows of the filtered gender frame
genders.head()

#### Min GPA for 2018 admits is 3.24
#### Max GPA for 2018 admits is 4.35

In [None]:
# import data by gpa
gpas = pd.read_csv('../data/FR_GPA_by_Inst_data.csv')
# rename columns
gpas = gpas.rename(columns={'Measure Values': 'GPA', 'School Name': 'School', 'Measure Names': 'Status'})
# drop rows that have no GPA value or whose county is the 'Not Applicable' placeholder
gpas = gpas[pd.notnull(gpas['GPA']) & (gpas['County'] != 'Not Applicable')]
# duplicates are kept here on purpose: every row must contribute to the mean below
gpas = filter_dataframe(gpas, ['County', 'GPA', 'Campus'], status='Adm GPA', remove_dups=False)
# take the average admitted gpa per county and campus
gpas = gpas[['County', 'Campus', 'GPA']].groupby(['County', 'Campus']).mean().reset_index()
# round all the GPAs into discrete buckets
gpas['GPA'] = gpas['GPA'].map(bucket_gpa)
gpas = gpas.drop_duplicates()
# len(gpas) is the row count; gpas.size would be rows * columns
print("Number of rows in the set {0}".format(len(gpas)))

In [None]:
# preview the first rows of the bucketed per-county / per-campus GPA frame
gpas.head()

In [None]:
# the three SAT section averages that get cleaned, averaged and bucketed below
sat_score_cols = ['AvgScrRead', 'AvgScrMath', 'AvgScrWrit']

sat_scores = pd.read_csv('../data/sat-report-2015-2016.csv')
# columns to keep
sat_scores = sat_scores[sat_score_cols + ['cname']]
# rename for join
sat_scores = sat_scores.rename(columns={'cname': 'County'})
# drop any rows that do not have a county; copy so the column assignments
# below write to an independent frame rather than a slice of the raw data
sat_scores = sat_scores[pd.notnull(sat_scores['County'])].copy()
# convert score cols to numbers; errors='coerce' turns unparseable values into NaN
for score_col in sat_score_cols:
    sat_scores[score_col] = pd.to_numeric(sat_scores[score_col], errors='coerce')
# drop rows where any of the three scores is missing
sat_scores = sat_scores.dropna(subset=sat_score_cols)
# take the average sat score by county
sat_scores = sat_scores.groupby('County').mean().reset_index()
# bucket scores
for score_col in sat_score_cols:
    sat_scores[score_col] = [bucket_sat_score(score) for score in sat_scores[score_col]]
# remove duplicates
sat_scores = sat_scores.drop_duplicates()
sat_scores.head()

In [None]:
# len(sat_scores) is the row count; sat_scores.size would be rows * columns
print("Number of rows in the set {0}".format(len(sat_scores)))
print("Number of unique counties {0}".format(len(sat_scores['County'].unique())))

In [None]:
# key column(s) used to merge the ethnicity, gender, GPA and SAT frames
join_cols = ['County']

In [None]:
# join the datasets
result = pd.merge(ethnicities, genders, on=join_cols)
result = pd.merge(result, gpas, on=join_cols).drop_duplicates()
result = pd.merge(result, sat_scores, on=join_cols).drop_duplicates()
result.drop_duplicates(inplace=True)

In [None]:
# preview the first rows of the joined frame
result.head()

In [None]:
# total number of cells (rows x columns) in the joined frame, not the row count
result.size

In [None]:
# summary statistics for the joined frame
result.describe()

In [None]:
# Upload the local file '../data/sample-data.csv' to the 'gosat-data' bucket
# under the key 'sample-data.csv', using an S3 client for region eu-central-1.
# NOTE(review): this uploads sample-data.csv, not the
# applicant_admissions_data.csv that the following cell writes from `result`,
# and it runs before that export — confirm which file is meant to be published.
S3 = boto3.client('s3', region_name='eu-central-1')
S3.upload_file('../data/sample-data.csv', 'gosat-data', 'sample-data.csv')

In [None]:
# Write the joined frame to disk next to the input data.
# to_csv is called with its defaults, so the DataFrame index is written as an
# unnamed first column — NOTE(review): pass index=False if consumers of the
# file should not see it.
result.to_csv('../data/applicant_admissions_data.csv')