In [None]:
import boto3
import numpy as np
import pandas as pd

In [None]:
# Load the applicants-by-ethnicity export, remap its headers in a single pass
# (the raw 'Count' column becomes 'Status' while 'Pivot Field Values' takes
# over the 'Count' name), and keep only the five columns listed.
ethnicities = (
    pd.read_csv('../data/fr-applicants-by-ethnicity.csv')
    .rename(columns={
        'County/State/ Territory': 'County',
        'Count': 'Status',
        'Uad Uc Ethn 6 Cat': 'Ethnicity',
        'Pivot Field Values': 'Count',
    })
    .loc[:, ['Status', 'Fall Term', 'County', 'Ethnicity', 'Count']]
)

In [None]:
# Preview the first five rows to sanity-check the renamed/selected columns.
ethnicities.head()

In [None]:
# Note: DataFrame.size is the total number of cells (rows x columns), not the row count.
ethnicities.size

In [None]:
# Load the applicants-by-gender export, remap its headers in a single pass
# (the raw 'Count' column becomes 'Status' while 'Pivot Field Values' takes
# over the 'Count' name), and keep only the five columns listed.
genders = (
    pd.read_csv('../data/fr-applicants-by-gender.csv')
    .rename(columns={
        'Count': 'Status',
        'Pivot Field Values': 'Count',
        'County/State/ Territory': 'County',
    })
    .loc[:, ['Status', 'Fall Term', 'County', 'Gender', 'Count']]
)

In [None]:
# Preview the first five rows to sanity-check the renamed/selected columns.
genders.head()

In [None]:
# Note: DataFrame.size is the total number of cells (rows x columns), not the row count.
genders.size

In [None]:
# Load the applicants-by-GPA export and reduce it to (Campus, County) pairs.
gpas = (
    pd.read_csv('../data/fr-applicants-by-gpa.csv')
    # discard rows that are missing either a school name or a GPA value
    .dropna(subset=['School Name', 'Measure Values'])
    # give the raw headers friendlier names
    .rename(columns={'Measure Values': 'GPA', 'School Name': 'School'})
    # NOTE(review): the freshly renamed 'GPA' and 'School' columns are dropped by
    # this selection, so they only influence which rows survive the filter above.
    # Confirm that keeping just Campus and County is intended.
    .loc[:, ['Campus', 'County']]
)

In [None]:
# Preview the first five rows of the filtered Campus/County frame.
gpas.head()

In [None]:
# Load the 2015-2016 SAT report and reduce it to one row of mean section
# scores per county.
score_cols = ['AvgScrRead', 'AvgScrMath', 'AvgScrWrit']
sat_scores = pd.read_csv('../data/sat-report-2015-2016.csv')
# columns to keep
sat_scores = sat_scores[score_cols + ['cname']]
# rename for join
sat_scores = sat_scores.rename(columns={'cname': 'County'})
# drop any rows that do not have a county; take an explicit copy so the column
# assignments below write to an independent frame rather than a slice
sat_scores = sat_scores.dropna(subset=['County']).copy()
# convert score columns to numbers; `errors` must be the *string* 'coerce'
# (a bare `coerce` is an undefined name and raises NameError). Values that
# cannot be parsed become NaN.
for col in score_cols:
    sat_scores[col] = pd.to_numeric(sat_scores[col], errors='coerce')
# drop rows in which any of the three scores is missing
sat_scores = sat_scores.dropna(subset=score_cols)
# average the scores of all remaining rows within each county
sat_scores = sat_scores.groupby('County').mean().reset_index()
sat_scores.head()

In [None]:
# Number of rows in sat_scores (one per county after the group-by above).
len(sat_scores)

In [None]:
# Key column(s) passed as `on=` to every merge below.
join_cols = ['County']

In [None]:
# output just the categorical data
# Distinct category values per county from each source frame.
df1 = ethnicities.loc[:, ['County', 'Ethnicity']].drop_duplicates()
df2 = genders.loc[:, ['County', 'Gender']].drop_duplicates()
df3 = gpas.loc[:, ['County', 'Campus']].drop_duplicates()
# Combine them on the join key(s), giving every Ethnicity x Gender x Campus
# combination per county, then write the table to disk. The index is written
# too, since to_csv is called with its defaults.
result = (
    df1
    .merge(df2, on=join_cols)
    .merge(df3, on=join_cols)
    .drop_duplicates()
)
result.to_csv('../data/uc-campus-categorical-data.csv')

In [None]:
# Join all four datasets on the shared key column(s).
# NOTE(review): ethnicities and genders are matched on join_cols only, so their
# other same-named columns (Status, Fall Term, Count) come back with _x/_y
# suffixes and rows pair up many-to-many within a county. Confirm this is intended.
result = (
    ethnicities
    .merge(genders, on=join_cols)
    .merge(gpas, on=join_cols)
    .drop_duplicates()
    .merge(sat_scores, on=join_cols)
    # one de-duplication is enough here: repeating it on an already-unique
    # frame would not remove anything
    .drop_duplicates()
)

In [None]:
# Preview the first five rows of the joined frame.
result.head()

In [None]:
# Note: DataFrame.size is the total number of cells (rows x columns), not the row count.
result.size

In [None]:
# Summary statistics for the joined frame.
result.describe()

In [None]:
# Placeholder: add an 'ACT' column set to 0.0 on every row; real values are
# to be filled in later.
result['ACT'] = 0.0

In [None]:
# Write the first five rows as a small sample file. With to_csv's defaults the
# index is also written, as an unnamed first column.
result.head().to_csv('../data/sample-data.csv')

In [None]:
# Summary statistics again, after the 'ACT' placeholder column was added.
result.describe()

In [None]:
# Upload the sample file to the 'gosat-data' bucket under the key
# 'sample-data.csv'. No credentials are passed here, so boto3 has to resolve
# them on its own.
S3 = boto3.client('s3', region_name='eu-central-1')
S3.upload_file(
    Filename='../data/sample-data.csv',
    Bucket='gosat-data',
    Key='sample-data.csv',
)