# This notebook does additional processing on our cleaned data set for our regression models

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn import linear_model
import statsmodels.api as sm
from statsmodels.graphics.gofplots import ProbPlot
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

import seaborn as sns

  from pandas import Int64Index as NumericIndex


## Load cleaned data

In [2]:
# Read the cleaned dataset from the project's clean-data folder (path is relative to this notebook).
data_raw = pd.read_csv(filepath_or_buffer="../data/clean/full_dataset.csv")

In [3]:
# Preview the first five rows to sanity-check the load.
data_raw.head(n=5)

Unnamed: 0,ID Number,Preferred Pronouns,Current Grade Level,Program,First Paying Job,Know employee,Applied Before,Comfortable speaking to crowd,Uncomfortable with,Extra activities,...,Application Year,Current Age,Rejected,EECode,EarnHours,EarnRate,EarnAmount,Dist Position Desc,Year,Month
0,1489.0,masculine,9.0,zoocamp,1,0,0,1.0,invertebrates,0,...,2022,,0,1489.0,5.5,13.0,71.5,WILD Steward,2022.0,4.0
1,1489.0,masculine,9.0,zoocamp,1,0,0,1.0,invertebrates,0,...,2022,,0,1489.0,14.47,13.0,188.11,WILD Steward,2022.0,5.0
2,1489.0,masculine,9.0,zoocamp,1,0,0,1.0,invertebrates,0,...,2022,,0,1489.0,7.13,13.0,92.69,WILD Steward,2022.0,5.0
3,1489.0,masculine,9.0,zoocamp,1,0,0,1.0,invertebrates,0,...,2022,,0,1489.0,130.38,13.0,1694.94,WILD Steward,2022.0,6.0
4,1489.0,masculine,9.0,zoocamp,1,0,0,1.0,invertebrates,0,...,2022,,0,1489.0,123.8,13.0,1609.4,WILD Steward,2022.0,7.0


In [4]:
# List every column name so the selections in later cells can be checked against it.
data_raw.columns

Index(['ID Number', 'Preferred Pronouns', 'Current Grade Level', 'Program',
       'First Paying Job', 'Know employee', 'Applied Before',
       'Comfortable speaking to crowd', 'Uncomfortable with',
       'Extra activities', 'Commit Summer', 'Commit Weekday',
       'Hear about source', 'Application Year', 'Current Age', 'Rejected',
       'EECode', 'EarnHours', 'EarnRate', 'EarnAmount', 'Dist Position Desc',
       'Year', 'Month'],
      dtype='object')

## Calculate monthly hours

In [5]:
# Pull out just the fields needed to total hours per person per month.
# Rows missing any of these four values are discarded, and the index is
# renumbered from zero afterwards.
hours = (
    data_raw.loc[:, ['ID Number', 'EarnHours', 'Year', 'Month']]
    .dropna()
    .reset_index(drop=True)
)

In [6]:
# Step 1: total the hours each person logged within each (Year, Month).
monthly_totals = (
    hours
    .groupby(['ID Number', 'Year', 'Month'])
    .agg(EarnHours_Sum=('EarnHours', 'sum'))
    .reset_index()
)

# Step 2: average those monthly totals per person, giving one row per ID Number.
# Only months that have at least one row in `hours` contribute to the average.
hours_sum = (
    monthly_totals
    .groupby(['ID Number'])
    .agg(EarnHours_Sum_AvgPerMonth=('EarnHours_Sum', 'mean'))
    .reset_index()
)
hours_sum

Unnamed: 0,ID Number,EarnHours_Sum_AvgPerMonth
0,1154.0,24.831667
1,1155.0,25.07
2,1156.0,27.921429
3,1158.0,37.4092
4,1160.0,40.462222
5,1161.0,36.945556
6,1162.0,33.465
7,1163.0,44.67
8,1164.0,76.846364
9,1165.0,20.95375


## Condense dataframe

In [None]:
# Columns that are one-hot encoded at the end of this cell. Defined once so the
# 'unknown' fill and the encoding step always operate on the same set.
categorical_cols = ['Preferred Pronouns', 'Program',
       'First Paying Job', 'Know employee', 'Applied Before',
       'Comfortable speaking to crowd', 'Uncomfortable with',
       'Extra activities', 'Commit Summer', 'Commit Weekday',
       'Hear about source']

# Numeric columns whose missing entries are replaced by the column mean.
mean_imputed_cols = ['Current Grade Level', 'Current Age', 'Application Year']

# remove info on hours and dedupe: keep only the applicant-level columns, so the
# repeated per-pay-period rows collapse to one row per distinct combination.
data = data_raw[['ID Number', 'Preferred Pronouns', 'Current Grade Level', 'Program',
       'First Paying Job', 'Know employee', 'Applied Before',
       'Comfortable speaking to crowd', 'Uncomfortable with',
       'Extra activities', 'Commit Summer', 'Commit Weekday',
       'Hear about source', 'Application Year', 'Current Age', 'Rejected',
       'EECode']].drop_duplicates().reset_index(drop=True)

# bring in summarized hours. This is a left join, so IDs with no row in
# hours_sum get NaN in EarnHours_Sum_AvgPerMonth.
data = data.merge(hours_sum, on=['ID Number'], how="left")

# fill unknown values
for col in mean_imputed_cols:
    data[col] = data[col].fillna(data[col].mean())

# Only the categorical columns receive the 'unknown' placeholder. A frame-wide
# fillna('unknown') would also write that string into numeric columns that still
# have gaps (e.g. EarnHours_Sum_AvgPerMonth after the left join), turning them
# into mixed-type object columns. Missing numeric values are left as NaN instead.
data[categorical_cols] = data[categorical_cols].fillna('unknown')

# one-hot encode values; missing categories surface as a '<column>_unknown' indicator
data = pd.get_dummies(data, columns=categorical_cols)

In [None]:
# Persist the model-ready frame next to the other cleaned data; the row index is not written.
data.to_csv(path_or_buf="../data/clean/model_data.csv", index=False)