In [None]:
import boto3
import numpy as np
import pandas as pd

In [None]:
# Load applicant counts by ethnicity and reduce them to one summed
# count per (County, Ethnicity) pair for the 2016 fall term.
ethnicities = (
    pd.read_csv('../data/fr-applicants-by-ethnicity.csv')
    # the raw 'Count' column holds the status label, while the actual
    # numbers live in 'Pivot Field Values' -- rename accordingly
    .rename(columns={'County/State/ Territory': 'County', 'Count': 'Status', 'Uad Uc Ethn 6 Cat': 'Ethnicity', 'Pivot Field Values': 'Count'})
    .loc[:, ['Status', 'Fall Term', 'County', 'Ethnicity', 'Count']]
    # keep only admitted student data for fall 2016
    .loc[lambda df: (df['Status'] == 'Adm') & (df['Fall Term'] == 2016)]
    # the filter columns are constant now, so they can go
    .drop(columns=['Status', 'Fall Term'])
    .groupby(['County', 'Ethnicity'])
    .sum()
    .reset_index()
)

In [None]:
# Preview the first rows of the per-county, per-ethnicity counts.
ethnicities.head()

In [None]:
# NOTE: DataFrame.size is the number of cells (rows * columns), not the row count.
ethnicities.size

In [None]:
# Load applicant counts by gender and reduce them to one summed
# count per (County, Gender) pair for the 2016 fall term.
genders = (
    pd.read_csv('../data/fr-applicants-by-gender.csv')
    # the raw 'Count' column holds the status label, while the actual
    # numbers live in 'Pivot Field Values' -- rename accordingly
    .rename(columns={'Count': 'Status', 'Pivot Field Values': 'Count', 'County/State/ Territory': 'County'})
    .loc[:, ['Status', 'Fall Term', 'County', 'Gender', 'Count']]
    # keep only rows with status 'Adm' in fall 2016
    .loc[lambda df: (df['Status'] == 'Adm') & (df['Fall Term'] == 2016)]
    # the filter columns are constant now, so they can go
    .drop(columns=['Status', 'Fall Term'])
    .groupby(['County', 'Gender'])
    .sum()
    .reset_index()
)

In [None]:
# Preview the first rows of the per-county, per-gender counts.
genders.head()

In [None]:
# NOTE: DataFrame.size is the number of cells (rows * columns), not the row count.
genders.size

In [None]:
# Load applicant GPA records and keep the 2016 'Adm GPA' rows.
gpas = (
    pd.read_csv('../data/fr-applicants-by-gpa.csv')
    # discard rows that lack a school name or a measure value
    .dropna(subset=['School Name', 'Measure Values'])
    # only keep admitted applicant data for the fall 2016 term
    .loc[lambda df: (df['Measure Names'] == 'Adm GPA') & (df['Fall Term'] == 2016)]
    # rename columns
    .rename(columns={'Measure Values': 'GPA', 'School Name': 'School'})
    # keep only the County, GPA and Campus columns
    .loc[:, ['County', 'GPA', 'Campus']]
)

In [None]:
# Number of GPA rows left after filtering.
len(gpas)

In [None]:
# Preview the first rows of the filtered GPA table.
gpas.head()

In [None]:
# Load the 2015-2016 SAT report and reduce it to the mean section
# scores (reading, math, writing) per county.
sat_scores = pd.read_csv('../data/sat-report-2015-2016.csv')
score_cols = ['AvgScrRead', 'AvgScrMath', 'AvgScrWrit']
# columns to keep
sat_scores = sat_scores[score_cols + ['cname']]
# rename for join
sat_scores = sat_scores.rename(columns={'cname': 'County'})
# drop any rows that do not have a county name; take an explicit copy so
# the column assignment below does not write into a slice of the raw frame
sat_scores = sat_scores[pd.notnull(sat_scores['County'])].copy()
# convert score cols to numbers; `errors` must be the *string* 'coerce'
# (a bare `coerce` is an undefined name and raises NameError), which turns
# unparseable entries into NaN instead of raising
sat_scores[score_cols] = sat_scores[score_cols].apply(pd.to_numeric, errors='coerce')
# drop rows where any of the three scores is missing
sat_scores = sat_scores.dropna(subset=score_cols)
# average the scores within each county
sat_scores = sat_scores.groupby('County').mean().reset_index()
sat_scores.head()

In [None]:
# Number of rows in the aggregated SAT table (one per County group).
len(sat_scores)

In [None]:
# Key column(s) shared by every table; used as the `on=` argument for the merges.
join_cols = ['County']

In [None]:
# Join the four tables on the shared key column(s), de-duplicating
# after each of the later merges.
result = (
    ethnicities
    .merge(genders, on=join_cols)
    .merge(gpas, on=join_cols)
    .drop_duplicates()
    .merge(sat_scores, on=join_cols)
    .drop_duplicates()
    # 'Count_x' / 'Count_y' are the ethnicity and gender 'Count' columns,
    # suffixed by the first merge; add them into a single column
    .assign(NumStudentsAccepted=lambda df: df['Count_x'] + df['Count_y'])
    .drop(columns=['Count_x', 'Count_y'])
    # dropping the two count columns can expose new duplicate rows
    .drop_duplicates()
)

In [None]:
# Preview the first rows of the joined table.
result.head()

In [None]:
# NOTE: DataFrame.size is the number of cells (rows * columns), not the row count.
result.size

In [None]:
# Summary statistics for the joined table.
result.describe()

In [None]:
# Sum NumStudentsAccepted within each (Campus, County) group and expose
# the sum as TotalStudentsAcceptedInTerm.
# NOTE(review): County is dropped after grouping on it, so this frame can
# hold several rows per Campus. A merge keyed on 'Campus' alone would then
# pair each row with every county's total -- confirm whether the grouping
# was meant to be by Campus only.
counts = (
    result[['NumStudentsAccepted', 'Campus', 'County']]
    .groupby(['Campus', 'County'])
    .sum()
    .reset_index()
    .rename(columns={'NumStudentsAccepted': 'TotalStudentsAcceptedInTerm'})
    .drop(columns=['County'])
)

In [None]:
# Preview the first rows of the totals table.
counts.head()

In [None]:
# Attach the totals to the joined table, matching rows on Campus.
combined = result.merge(counts, on=['Campus'])
combined.head()

In [None]:
# NOTE: DataFrame.size is the number of cells (rows * columns), not the row count.
combined.size

In [None]:
# Summary statistics for the merged table.
combined.describe()

In [None]:
# Calculate the predicted column: NumStudentsAccepted as a percentage of
# TotalStudentsAcceptedInTerm, then discard the two input columns.
combined = (
    combined
    .assign(
        ProbabilityOfAcceptance=lambda df: (df['NumStudentsAccepted'] / df['TotalStudentsAcceptedInTerm']) * 100
    )
    .drop(columns=['NumStudentsAccepted', 'TotalStudentsAcceptedInTerm'])
)

In [None]:
# Preview the table with the new ProbabilityOfAcceptance column.
combined.head()

In [None]:
# Placeholder: ACT is set to a constant 0.0 for now and is to be
# filled in later.
combined = combined.assign(ACT=0.0)

In [None]:
# Write a small sample to disk: only the rows returned by head() are saved,
# and the DataFrame index is written as the first CSV column (to_csv default).
combined.head().to_csv('../data/sample-data.csv')

In [None]:
# Summary statistics for the final table, including the placeholder ACT column.
combined.describe()

In [None]:
# Upload the sample CSV to S3.
# NOTE(review): no credentials are passed here, so boto3 presumably resolves
# them from its default configuration -- confirm for the target environment.
S3 = boto3.client('s3', region_name='eu-central-1')
# upload_file(Filename, Bucket, Key): local file -> bucket 'gosat-data', key 'sample-data.csv'
S3.upload_file('../data/sample-data.csv', 'gosat-data', 'sample-data.csv')

In [None]:
# List every distinct campus present in the final table, one per line.
unique_campuses = combined['Campus'].unique()
for campus in unique_campuses:
    print(campus)