In [1]:
import os

import pandas as pd
# Raise pandas' display limits so wide/long ELPAC frames are shown in full
# instead of being truncated when a cell renders them.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)
import numpy as np

In [5]:
# Location of the ELPAC workbook.
# On both machines used so far (Mac and the Windows work machine) the file sits
# in the current user's Downloads folder, so the path is built from the home
# directory instead of hard-coding one user's absolute path per machine.
# Set the ELPAC_FILE_PATH environment variable to point somewhere else.
ELPAC_file_path = os.environ.get(
    'ELPAC_FILE_PATH',
    os.path.join(os.path.expanduser('~'), 'Downloads', 'ELPAC.xlsx'),
)

#### Import the data from the Excel file provided to i-TAPP; all of the information was contained in a single Excel spreadsheet

In [6]:
# Import the 2017-2018 ELPAC sheet into a dataframe, reading every column as text.
# The path is handed to pandas directly so pandas opens AND closes the workbook;
# wrapping it in a bare open() left the file handle open for the rest of the session.
elpac1718 = pd.read_excel(ELPAC_file_path, sheet_name='2017-2018', dtype=str)

# Drop the identified columns, selected by their position in the sheet.
# NOTE(review): these positions depend on the workbook's column order - confirm
# them if the sheet layout ever changes.
elpac1718 = elpac1718.drop(columns=elpac1718.columns[[0, 1, 5, 7, 8, 9, 10]])

In [7]:
# Import the 2018-2019 ELPAC sheet into a dataframe, reading every column as text.
# The path is handed to pandas directly so pandas opens AND closes the workbook;
# wrapping it in a bare open() left the file handle open for the rest of the session.
elpac1819 = pd.read_excel(ELPAC_file_path, sheet_name='2018-2019', dtype=str)

# Drop the identified columns, selected by their position in the sheet.
# NOTE(review): these positions depend on the workbook's column order - confirm
# them if the sheet layout ever changes.
elpac1819 = elpac1819.drop(columns=elpac1819.columns[[0, 1, 5, 7, 8, 9, 10]])

In [10]:
# Import the 2019-2020 ELPAC sheet into a dataframe, reading every column as text.
# The path is handed to pandas directly so pandas opens AND closes the workbook;
# wrapping it in a bare open() left the file handle open for the rest of the session.
elpac1920 = pd.read_excel(ELPAC_file_path, sheet_name='2019-2020', dtype=str)

# Drop the identified columns, selected by their position in the sheet.
# NOTE(review): these positions depend on the workbook's column order - confirm
# them if the sheet layout ever changes.
elpac1920 = elpac1920.drop(columns=elpac1920.columns[[0, 1, 5, 7, 8, 9, 10]])

In [11]:
# Import the 2020-2021 ELPAC sheet into a dataframe, reading every column as text.
# The path is handed to pandas directly so pandas opens AND closes the workbook;
# wrapping it in a bare open() left the file handle open for the rest of the session.
elpac2021 = pd.read_excel(ELPAC_file_path, sheet_name='2020-2021', dtype=str)

# Drop the identified columns, selected by their position in the sheet.
# The positions differ from those used for the earlier years' sheets.
# NOTE(review): these positions depend on the workbook's column order - confirm
# them if the sheet layout ever changes.
elpac2021 = elpac2021.drop(columns=elpac2021.columns[[0, 1, 6, 8, 9, 10, 11]])

In [12]:
# Import the 2021-2022 ELPAC sheet into a dataframe, reading every column as text.
# The path is handed to pandas directly so pandas opens AND closes the workbook;
# wrapping it in a bare open() left the file handle open for the rest of the session.
elpac2122 = pd.read_excel(ELPAC_file_path, sheet_name='2021-2022', dtype=str)

# Drop the identified columns, selected by their position in the sheet.
# NOTE(review): these positions depend on the workbook's column order - confirm
# them if the sheet layout ever changes.
identified_positions_2122 = [
    1, 5, 7, 8, 9, 17, 19, 21,
    54, 55, 56, 57, 58,
    61, 62, 63, 64,
    70, 71, 72, 73,
    79, 80, 81, 82,
    88, 89, 90, 91,
    97, 98, 99, 100,
]
elpac2122 = elpac2122.drop(columns=elpac2122.columns[identified_positions_2122])

#### Create a unique list of all SSIDs and assign each one its pandas DataFrame index number as its de-identified ID

In [13]:
# Stack the SSID column of every school year into one single-column data frame.
# The year columns carry different names, so the stacked column is unnamed and
# ends up labelled 0.
ssids = pd.DataFrame(
    pd.concat(
        [
            elpac1718.elpac_2018_ssid,
            elpac1819.elpac_2019_ssid,
            elpac1920.elpac_2020_ssid,
            elpac2021.elpac_2021_ssid,
            elpac2122.SSID,
        ],
        axis=0,
    )
)

# One row per distinct SSID, in order of first appearance
ssid_deidentify = pd.DataFrame({'SSID': ssids[0].unique()})

# The row index doubles as the de-identified student ID
ssid_deidentify['Stu_deID'] = ssid_deidentify.index

In [14]:
# Spot-check five random rows of the SSID -> Stu_deID lookup table.
# NOTE(review): the rendered output lists real SSIDs beside their de-identified
# IDs - clear this cell's output before the notebook is shared.
ssid_deidentify.sample(n=5)

Unnamed: 0,SSID,Stu_deID
2623,9200928442,2623
3473,6753024707,3473
3294,7468770475,3294
1769,6527232178,1769
4595,6345966150,4595


In [15]:
# This is the first write into the output folder, so make sure the folder
# exists; to_csv fails if the target directory is missing.
os.makedirs('Deidentified', exist_ok=True)

# Create local file to have as reference to Stu_deID.
# NOTE(review): this crosswalk holds the real SSIDs and is written to the same
# folder as the de-identified exports - confirm it is kept out of anything shared.
ssid_deidentify.to_csv('Deidentified/ssid_deidentify.csv', encoding='utf-8', index=False)

#### Merge the de-identified ID into the ELPAC dataframes, then drop the SSID fields containing the real values

In [16]:
# Merge in the de-identifier: attach Stu_deID by matching on the real SSID
elpac1718 = elpac1718.merge(ssid_deidentify, how="inner", left_on='elpac_2018_ssid', right_on='SSID')

# Drop both real-SSID columns (the sheet's own column and the merge key).
# They are removed by name rather than by position so that a change in column
# layout can never drop the wrong columns and leave a real SSID in the export;
# a missing column now raises a KeyError instead.
elpac1718 = elpac1718.drop(columns=['elpac_2018_ssid', 'SSID'])

In [17]:
# Merge in the de-identifier: attach Stu_deID by matching on the real SSID
elpac1819 = elpac1819.merge(ssid_deidentify, how="inner", left_on='elpac_2019_ssid', right_on='SSID')

# Drop both real-SSID columns (the sheet's own column and the merge key).
# They are removed by name rather than by position so that a change in column
# layout can never drop the wrong columns and leave a real SSID in the export;
# a missing column now raises a KeyError instead.
elpac1819 = elpac1819.drop(columns=['elpac_2019_ssid', 'SSID'])

In [18]:
# Merge in the de-identifier: attach Stu_deID by matching on the real SSID
elpac1920 = elpac1920.merge(ssid_deidentify, how="inner", left_on='elpac_2020_ssid', right_on='SSID')

# Drop both real-SSID columns (the sheet's own column and the merge key).
# They are removed by name rather than by position so that a change in column
# layout can never drop the wrong columns and leave a real SSID in the export;
# a missing column now raises a KeyError instead.
elpac1920 = elpac1920.drop(columns=['elpac_2020_ssid', 'SSID'])

In [19]:
# Merge in the de-identifier: attach Stu_deID by matching on the real SSID
elpac2021 = elpac2021.merge(ssid_deidentify, how="inner", left_on='elpac_2021_ssid', right_on='SSID')

# Drop both real-SSID columns (the sheet's own column and the merge key).
# They are removed by name rather than by position so that a change in column
# layout can never drop the wrong columns and leave a real SSID in the export;
# a missing column now raises a KeyError instead.
elpac2021 = elpac2021.drop(columns=['elpac_2021_ssid', 'SSID'])

In [20]:
# Merge in the de-identifier: attach Stu_deID by matching on the real SSID.
# Both frames name the key 'SSID', so the merged frame holds a single SSID column.
elpac2122 = elpac2122.merge(ssid_deidentify, how="inner", on='SSID')

# Drop the real-SSID column by name rather than by position, so that a change
# in column layout can never drop the wrong column and leave a real SSID in
# the export; a missing column now raises a KeyError instead.
elpac2122 = elpac2122.drop(columns=['SSID'])

### De-identify school and district name

#### School

In [21]:
# One row per distinct school name - school names are only present in the 2021-2022 file
schools_deidentify = pd.DataFrame({'SchoolName': elpac2122.CALPADSSchoolName.unique()})

# The row index doubles as the de-identified school ID
schools_deidentify['School_deID'] = schools_deidentify.index

In [22]:
# Merge in the de-identifier: attach School_deID by matching on the real school name
elpac2122 = elpac2122.merge(schools_deidentify, how="inner", left_on='CALPADSSchoolName', right_on='SchoolName')

# Drop both real school-name columns (the sheet's own column and the merge key).
# They are removed by name rather than by position so that a change in column
# layout can never drop the wrong columns and leave a real school name in the
# export; a missing column now raises a KeyError instead.
elpac2122 = elpac2122.drop(columns=['CALPADSSchoolName', 'SchoolName'])

In [23]:
# Save the school name -> School_deID lookup table locally for reference
schools_deidentify.to_csv(
    'Deidentified/schools_deidentify.csv',
    encoding='utf-8',
    index=False,
)

#### District

In [24]:
# One row per distinct district name - district names are only present in the 2021-2022 file
district_deidentify = pd.DataFrame({'DistrictName': elpac2122.CALPADSDistrictName.unique()})

# The row index doubles as the de-identified district ID
district_deidentify['District_deID'] = district_deidentify.index

In [25]:
# Merge in the de-identifier: attach District_deID by matching on the real district name
elpac2122 = elpac2122.merge(district_deidentify, how="inner", left_on='CALPADSDistrictName', right_on='DistrictName')

# Drop both real district-name columns (the sheet's own column and the merge key).
# They are removed by name rather than by position so that a change in column
# layout can never drop the wrong columns and leave a real district name in the
# export; a missing column now raises a KeyError instead.
elpac2122 = elpac2122.drop(columns=['CALPADSDistrictName', 'DistrictName'])

In [26]:
# Save the district name -> District_deID lookup table locally for reference
district_deidentify.to_csv(
    'Deidentified/district_deidentify.csv',
    encoding='utf-8',
    index=False,
)

## Export ELPAC files to CSV

In [27]:
# Write each de-identified ELPAC frame to its own CSV, in school-year order
elpac_exports = [
    (elpac1718, 'Deidentified/elpac1718.csv'),
    (elpac1819, 'Deidentified/elpac1819.csv'),
    (elpac1920, 'Deidentified/elpac1920.csv'),
    (elpac2021, 'Deidentified/elpac2021.csv'),
    (elpac2122, 'Deidentified/elpac2122.csv'),
]
for elpac_frame, csv_target in elpac_exports:
    elpac_frame.to_csv(csv_target, encoding='utf-8', index=False)

### Attendance Files