In [17]:
##########################################################################################
# FSM data processing notebook 
# done in the past, putting here to make more general and re-usable
##########################################################################################


import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sys
import matplotlib
import os

# Root of the client project tree. The leading '~' is expanded here, once, because
# Python's file APIs do not expand it themselves; every path below is derived from
# this root, so they all become usable absolute paths for the current user.
fsmDir = os.path.expanduser('~/Documents/JobsAndClients/Nuzusys/Clients/FederatedStatesOfMicronesiaPublicSchoolSystem/')

# Naming convention: raw* = input workbooks, out* = workbooks produced by this notebook.
rawPopProjData = os.path.join(fsmDir,'background/PopulationProjection2010CensusProcessed.xlsx')
outPopProjData = os.path.join(fsmDir,'project-life-cycle/development-phase/fsm-population-10-year-projection-initial-data.xlsx')
rawCombinedData = os.path.join(fsmDir,'background/CombinedData.xlsx')
outEnrolmentTransitData = os.path.join(fsmDir,'project-life-cycle/development-phase/fsm-enrolment-initial-transit-data.xlsx')
outEnrolmentData = os.path.join(fsmDir,'project-life-cycle/development-phase/fsm-enrolment-initial-data.xlsx')
rawSchoolsExtraData = os.path.join(fsmDir,'background/All_FSM_Schools.xlsx')
outSchoolsFinalData = os.path.join(fsmDir,'project-life-cycle/development-phase/fsm-schools-initial-data.xlsx')
rawInitialSchoolsData= os.path.join(fsmDir,'background/fsm-schools-with-raw-from-andrew-and-weison.xlsx')
outInitialSchoolsTyposManualData = os.path.join(fsmDir,'project-life-cycle/development-phase/fsm-schools-initial-lookups-anomalies-manual.xlsx')
outInitialSchoolsTyposData = os.path.join(fsmDir,'project-life-cycle/development-phase/fsm-schools-initial-lookups-anomalies.xlsx')
outSchoolsLookup = os.path.join(fsmDir,'project-life-cycle/development-phase/fsm-schools-lookups.xlsx')
outInitialSchoolSurveysData = os.path.join(fsmDir,'project-life-cycle/development-phase/fsm-schoolsurveys-initial-data.xlsx')
rawFSMLookups = os.path.join(fsmDir,'project-life-cycle/design-phase/PineapplesLookups-FSM.xlsx')
#rawTeachersData = os.path.join(fsmDir,'background/SY2015-2016CombinedTeachersending.xlsx')
rawTeachersData = os.path.join(fsmDir,'background/CombinedDataStaff.xlsx')
outTeachersData = os.path.join(fsmDir,'project-life-cycle/development-phase/fsm-teachers-initial-data.xlsx')
rawAccreditationData = os.path.join(fsmDir,'background/AccreditationData_2017.xlsx')
outAccreditationData = os.path.join(fsmDir,'project-life-cycle/development-phase/fsm-schools-accreditation-initial-data.xlsx')

In [None]:
# Process population projections
#
# Every sheet of the projection workbook holds one state; each projected year
# occupies its own three-column range (age, males, females) on that sheet.

# Excel column range to read for each projected year
parse_cols = {
    2010: "A:C",
    2011: "F:H",
    2012: "K:M",
    2013: "P:R",
    2014: "U:W",
    2015: "Z:AB",
    2016: "AE:AG",
    2017: "AJ:AL",
    2018: "AO:AQ"
}

# Sheet (state) name -> [dID, elN] values stamped on every row of that state
sheets = {
    "Chuuk": [1,"CHU"],
    "Pohnpei": [3, "PNI"],
    "Yap": [4, "YAP"],
    "Kosrae": [2, "KOS"]
}

# data[year][state] -> DataFrame (sheetname=None makes read_excel return a dict of all sheets)
data = {}

for key, value in parse_cols.items():
    data[key] = pd.read_excel(rawPopProjData, sheetname=None, header=2, index_col=None,
                              names=["popAge","popM","popF"], parse_cols=value)

# Collect the per-year/per-state frames and concatenate once at the end,
# rather than growing a DataFrame with append() on every iteration.
stateYearFrames = []

# for each year
for year in data:
    # for each state
    for state in data[year]:
        df = data[year][state]
        # Scalars broadcast to however many rows the sheet has, so the sheet
        # length no longer has to be exactly 76 rows.
        df['popmodCode'] = "FSMNSO"
        df['dID'] = sheets[state][0]
        df['elN'] = sheets[state][1]
        df['popYear'] = year
        stateYearFrames.append(df)

popProjDF = pd.concat(stateYearFrames)

popProjDF = popProjDF[["popmodCode","popYear","popAge","popM","popF","dID","elN"]]
# The open-ended top age band is stored as text ("75+" with padding); record it as 75.
# Whitespace is stripped before comparing so the match does not depend on the padding width.
isTopAgeBand = popProjDF['popAge'].astype(str).str.strip() == '75+'
popProjDF.loc[isTopAgeBand, 'popAge'] = 75
popProjDF

# Experiment with population projection data for quick and dirty
# quality test
#data[2010]["Pohnpei"]
#df2013 = popProjDF[popProjDF['popYear'] == 2013]
#df2013.sum()

# Write population data
#popProjDF.to_excel(outPopProjData, index=False)

In [None]:
# Prepare schools helper lookups. This is meant to assign the correct school to badly
# entered school names found in various spreadsheets, etc.
#schoolsLookupDF = pd.read_excel(outEnrolmentTransitData, sheetname="cleanMergedSchoolsLeftJoinDF", header=0, parse_cols="A,B")
schoolsLookupDF = pd.read_excel(rawInitialSchoolsData, sheetname="Schools", header=0, parse_cols="A,B")
schoolsLookupTyposFromEnrolmentsDF = pd.read_excel(outInitialSchoolsTyposManualData, sheetname="FromEnrolments", header=0, parse_cols="A,B")

# The manually completed typo sheets use their own headers; align them with the
# schName/schNo columns of the lookup and drop exact duplicate pairs.
typoSheetColumns = {'RawSchools': 'schName','SchNoMappingCompleteMe': 'schNo'}
schoolsLookupTyposFromTeachersDF = (
    pd.read_excel(outInitialSchoolsTyposManualData, sheetname="FromTeachers", header=0, parse_cols="A,B")
    .rename(columns = typoSheetColumns)
    .drop_duplicates(['schName','schNo'])
)
schoolsLookupTyposFromAccreditationsDF = (
    pd.read_excel(outInitialSchoolsTyposManualData, sheetname="FromAccreditations", header=0, parse_cols="A,B")
    .rename(columns = typoSheetColumns)
    .drop_duplicates(['schName','schNo'])
)

# Stack the schools list and the manually fixed typo lookups in a single concat
# (rows keep the same order as the former chained append calls).
schoolsLookupDF = pd.concat([
    schoolsLookupDF,
    schoolsLookupTyposFromEnrolmentsDF,
    schoolsLookupTyposFromTeachersDF,
    schoolsLookupTyposFromAccreditationsDF,
])
# One row per school name; the first occurrence wins
schoolsLookupDF = schoolsLookupDF.drop_duplicates(['schName'])
# remove the ones without a key (NaN means they have a typo of some sort)
schoolsLookupDF = schoolsLookupDF.dropna()
print('schoolsLookupDF:')
print(schoolsLookupDF)

# Final dataset containing a correct school ID for each school name in various
# spreadsheets including all the ones with typos and differently spelled
schoolsLookup = schoolsLookupDF.set_index('schName').to_dict()['schNo']
# Inverse mapping: schNo -> school name.
# NOTE(review): several names (including typo variants) can share one schNo and the
# last one seen wins here, so this may return a typo spelling - confirm that is acceptable.
schoolsLookupByName = {y:x for x,y in schoolsLookup.items()}
print('schoolsLookup:')
print(schoolsLookup)

# Writing data to sheets; the context manager saves and closes the workbook even if to_excel fails
with pd.ExcelWriter(outSchoolsLookup) as writer:
    schoolsLookupDF.to_excel(writer, sheet_name='SchoolsLookups', index=False)

In [22]:
# Load the official lookup tables used by the data processing cells below.
# Each table is read from its own sheet of the FSM lookups workbook; the number of
# rows skipped differs per sheet.

def _readFsmLookup(sheet, skip, cols="A,B"):
    """Read the given columns of one sheet of the FSM lookups workbook, skipping `skip` rows."""
    return pd.read_excel(rawFSMLookups, sheetname=sheet, skiprows=skip, parse_cols=cols)

def _showHead(label, frame):
    """Print a label followed by the first five rows of a lookup frame."""
    print(label)
    print(frame.head(5))

gradeLevelsLookupsDF = _readFsmLookup("lkpLevels", 44)
_showHead('gradeLevelsLookupsDF:', gradeLevelsLookupsDF)

electLLookupsDF = _readFsmLookup("lkpElectorateL", 85)
_showHead('electLLookupsDF:', electLLookupsDF)

electNLookupsDF = _readFsmLookup("lkpElectorateN", 36)
_showHead('electNLookupsDF:', electNLookupsDF)

# islands and municipalities
islandsLookupsDF = _readFsmLookup("Islands", 94)
_showHead('islandsLookupsDF:', islandsLookupsDF)
# iName -> iCode, and the reverse iCode -> iName
islandsLookup = islandsLookupsDF.set_index('iName')['iCode'].to_dict()
islandsLookupByName = {code: name for name, code in islandsLookup.items()}
print('islandsLookup:')
print(islandsLookup)


teacherRoleLookupsDF = _readFsmLookup("lkpTeacherRole", 116)
# codeDescription -> codeCode, and codeCode -> codeDescription
teacherRoleLookups = teacherRoleLookupsDF.set_index('codeDescription')['codeCode'].to_dict()
teacherRoleByNameLookups = teacherRoleLookupsDF.set_index('codeCode')['codeDescription'].to_dict()
_showHead('teacherRoleLookupsDF:', teacherRoleLookupsDF)

teacherQualLookupsDF = _readFsmLookup("lkpTeacherQual", 55)
_showHead('teacherQualLookupsDF:', teacherQualLookupsDF)

# RoleGrades reads columns A and C rather than A and B
roleGradesLookupsDF = _readFsmLookup("RoleGrades", 82, cols="A,C")
roleGradesLookups = roleGradesLookupsDF.set_index('roleCode')['rgCode'].to_dict()
_showHead('roleGradesLookupsDF:', roleGradesLookupsDF)

gradeLevelsLookupsDF:
  codeCode   codeDescription
0    GPRES        Pre-school
1       GK  ECE/Kindergarten
2       G1           Grade 1
3       G2           Grade 2
4       G3           Grade 3
electLLookupsDF:
  codeCode           codeDescription
0      MOR          Mortlocks Region
1       NN  Northern Namoneas Region
2       SN  Southern Namoneas Region
3      FAI                   Faichuk
4       NW                 Northwest
electNLookupsDF:
  codeCode codeDescription
0      CHU           Chuuk
1      PNI         Pohnpei
2      YAP             Yap
3      KOS          Kosrae
islandsLookupsDF:
   iCode      iName
0      1      Ettal
1      2      Kuttu
2      3  Lekinioch
3      4      Losap
4      5       Moch
islandsLookup:
{'Ettal': 1, 'Kuttu': 2, 'Lekinioch': 3, 'Losap': 4, 'Moch': 5, 'Namoluk': 6, 'Nema': 7, 'Oneop': 8, 'Piisemwar': 9, 'Satowan': 10, 'Ta': 11, 'Fonoton': 12, 'Piis Paneu': 13, 'Weno': 14, 'Etten': 15, 'Fefen': 16, 'Parem': 17, 'Siis': 18, 'Tonoas': 19, 'Uman': 

In [20]:
# Display the islands and municipalities lookup table (iCode, iName) for a visual check
islandsLookupsDF

    iCode        iName
0       1        Ettal
1       2        Kuttu
2       3    Lekinioch
3       4        Losap
4       5         Moch
5       6      Namoluk
6       7         Nema
7       8        Oneop
8       9    Piisemwar
9      10      Satowan
10     11           Ta
11     12      Fonoton
12     13   Piis Paneu
13     14         Weno
14     15        Etten
15     16        Fefen
16     17        Parem
17     18         Siis
18     19       Tonoas
19     20         Uman
20     21          Eot
21     22   Fanapanges
22     23         Onei
23     24        Paata
24     25        Polle
25     26      Romanum
26     27     Tolensom
27     28         Udot
28     29       Fananu
29     30         Houk
..    ...          ...
57     58         Maap
58     59        Gagil
59     60        Tomil
60     61        Fanif
61     62        Weloy
62     63  Delipebinaw
63     64         Rull
64     65      Kanifay
65     66       Gilman
66     67         Asor
67     68         Fais
68     69  

In [30]:
# Process schools data

# Master list of schools. Its schElectL column is dropped (better data in other source)
# and the island/municipality names held in 'iCode' are replaced by their codes.
schoolsDF = (
    pd.read_excel(rawInitialSchoolsData, sheetname="Schools", header=0)
    .drop(['schElectL'], axis=1)
    .replace(to_replace={'iCode':islandsLookup})
)

# Extra per-school data. The columns dropped here are better in schoolsDF; the school
# names are replaced by school numbers so the two frames can later be joined on schNo.
redundantExtraColumns = ['State','Location','Region/Zone/Municipality','School Type','School Level','Enrollment']
schoolsExtraDF = (
    pd.read_excel(rawSchoolsExtraData, sheetname="List final FSMED", header=0)
    .drop(redundantExtraColumns, axis=1)
    .replace(to_replace={'School Name':schoolsLookup})
    .rename(columns = {'School Name': 'schNo','Lat': 'schLat','Long': 'schLong'})
)

def assignElectLOrNull(x):
    """Translate a local electorate description into its code.

    Looks `x` up in the 'codeDescription' column of electLLookupsDF and returns the
    'codeCode' of the first matching row, or the string 'NULL' when nothing matches.
    (Island codes are obtained another way, so islands are not looked up here.)
    """
    isMatch = electLLookupsDF['codeDescription'] == x
    matchingCodes = electLLookupsDF.loc[isMatch, 'codeCode']
    if len(matchingCodes) == 0:
        return 'NULL'
    return matchingCodes.values[0]

# Derive the local electorate (schElectL) from 'IslandsOrElectorate', then discard the source column
schoolsExtraDF = (
    schoolsExtraDF
    .assign(schElectL = schoolsExtraDF['IslandsOrElectorate'].apply(assignElectLOrNull))
    .drop(['IslandsOrElectorate'], axis=1)
)

# Keep every school of schoolsDF, attach the extra data where a schNo matches,
# and put NULL for all missing values
schoolsFinalDF = pd.merge(schoolsDF, schoolsExtraDF, on='schNo', how='left').fillna('NULL')

for frameLabel, frame, rowCount in (('schoolsDF:', schoolsDF, 2),
                                    ('schoolsExtraDF:', schoolsExtraDF, 2),
                                    ('schoolsFinalDF:', schoolsFinalDF, 5)):
    print(frameLabel)
    print(frame.head(rowCount))

# Writing data to sheets
# writer = pd.ExcelWriter(outSchoolsFinalData)
# schoolsFinalDF.to_excel(writer, sheet_name='Schools', index=False)
# writer.save()

schoolsDF:
    schNo                     schName schVillage schType schLandOwner schPh1  \
0  CHU100           Chuuk High School    Nantaku    STSS       Public    NaN   
1  CHU101  Iras Demo Elementay School       Iras   STEPS       Public    NaN   

  schAuth schLang schRegStatus    schRegStatusDate  schClosed schCloseReason  \
0     NDE      EN            A 1978-01-01 00:00:00          0            NaN   
1     NDE      EN            A 1978-01-01 01:00:00          0            NaN   

  schElectN iCode  
0       CHU    14  
1       CHU    14  
schoolsExtraDF:
    schNo    schLat     schLong schElectL
0  CHU175  7.453103  151.879960        NN
1  CHU133  7.361557  151.632213       FAI
schoolsFinalDF:
    schNo                     schName schVillage schType schLandOwner schPh1  \
0  CHU100           Chuuk High School    Nantaku    STSS       Public   NULL   
1  CHU101  Iras Demo Elementay School       Iras   STEPS       Public   NULL   
2  CHU102  Mechitiw Elementary School   Mechitiw 

In [38]:
# Reduce the final schools frame to its location-related columns by dropping everything else
nonLocationColumns = ['schName','schType','schLandOwner','schPh1','schAuth','schLang','schRegStatus',
                      'schRegStatusDate','schClosed','schCloseReason','schLat','schLong']
schoolsFinalLocationDF = schoolsFinalDF.drop(nonLocationColumns, axis=1)

# Build the output path from the shared project root instead of a hard-coded absolute
# user directory. expanduser() resolves a leading '~' (a no-op if already expanded).
outSchoolsFinalLocationData = os.path.join(os.path.expanduser(fsmDir),
                                           'project-life-cycle/development-phase/fsm-schools-location-data.xlsx')

# Writing data to sheets; the context manager saves and closes the workbook even if to_excel fails
with pd.ExcelWriter(outSchoolsFinalLocationData) as writer:
    schoolsFinalLocationDF.to_excel(writer, sheet_name='SchoolsLocation', index=False)

In [43]:
# Prepare student enrolments auxiliary raw data
# Also get all schools to identify typos and associate with correct school (in other cell)
# And some grade level data processing too

# Read the whole Student Roster in
rawRosterDF = pd.read_excel(rawCombinedData, sheetname="CombinedStudents", header=0, index_col=None, parse_cols="A,C,I,J,M,N,P,S,T,U,X")
print("rawRosterDF: ")
print(rawRosterDF.head(1))

# Prepare grade levels helper lookup (from student roster)
uniqueLevelsInStudentRoster = rawRosterDF['Grade Level'].unique()

uniqueLevelsInStudentRosterDF = pd.DataFrame(uniqueLevelsInStudentRoster)
uniqueLevelsInStudentRosterDF = uniqueLevelsInStudentRosterDF.rename(columns = {0: 'codeDescription'})
# The codes are paired with the raw grade level values purely by position, i.e. in the
# order unique() returns them (order of first appearance in the roster).
# NOTE(review): this hard-coded list has to be re-checked whenever the roster data
# changes - a different order or number of distinct values would mis-assign or fail.
uniqueLevelsInStudentRosterDF = uniqueLevelsInStudentRosterDF.assign(codeCode=pd.Series(['GK','G1','G2','G3','G4','G5','G6',
                                                                                         'G7','G8','G9','G10','G11','G12',
                                                                                         'G4','GK']).values)
# Printed only now that the frame exists (printing it before it was built raised a
# NameError on a fresh kernel).
print('uniqueLevelsInStudentRosterDF: ')
print(uniqueLevelsInStudentRosterDF)

# raw grade level text -> grade level code
levelsLookup = uniqueLevelsInStudentRosterDF.set_index('codeDescription').to_dict()['codeCode']
print('levelsLookup:')
print(levelsLookup)

# Collect the distinct raw values of several roster columns, for observation only
uniqueSurveyYears = rawRosterDF['SchoolYear'].unique()
uniqueGender = rawRosterDF['Gender'].unique()
uniqueRepeat = rawRosterDF['Repeat Previous Year Grade'].unique()
uniqueTrin = rawRosterDF['Transferred From which school'].unique()
uniqueTrout = rawRosterDF['Transferred TO which school'].unique()
uniqueDropout = rawRosterDF['Drop-Out'].unique()

for valuesLabel, distinctValues in (("uniqueSurveyYears: ", uniqueSurveyYears),
                                    ("uniqueGender: ", uniqueGender),
                                    ("uniqueRepeat: ", uniqueRepeat),
                                    ("uniqueTrin: ", uniqueTrin),
                                    ("uniqueTrout: ", uniqueTrout),
                                    ("uniqueDropout: ", uniqueDropout)):
    print(valuesLabel)
    print(distinctValues)


# Distinct school names as they appear in the roster, as a one-column frame
uniqueSchoolsInStudentRoster = rawRosterDF['School Name'].unique()
uniqueSchoolsInStudentRosterDF = pd.DataFrame({'schName': uniqueSchoolsInStudentRoster})

# Right join keeps every roster school name (matched to a known school or not);
# left join keeps every known school (present in the roster or not)
cleanMergedSchoolsRightJoinDF = pd.merge(schoolsDF, uniqueSchoolsInStudentRosterDF, on='schName', how='right')
cleanMergedSchoolsLeftJoinDF = pd.merge(schoolsDF, uniqueSchoolsInStudentRosterDF, on='schName', how='left')


# Writing data to sheets
# writer = pd.ExcelWriter(outEnrolmentTransitData)
# schoolsDF.to_excel(writer, sheet_name='Schools Lookups', index=False)
# cleanMergedSchoolsRightJoinDF.to_excel(writer, sheet_name='cleanMergedSchoolsLeftJoinDF', index=False)
# #cleanMergedSchoolsLeftJoinDF.to_excel(writer, sheet_name='cleanMergedSchoolsRightJoinDF', index=False)
# writer.save()

# Some pre-processing data cleanup

# Normalise the repeater flag into a new 'repeat' column; raw values absent from
# this lookup become NaN
repeatLookup = {
    'No': 'No',
    'Yes': 'Yes',
    'NO': 'No',
    'YES': 'Yes',
    'yes': 'Yes',
    'Missing': 'No',
    ' ': 'No'
}
rawRosterDF = rawRosterDF.assign(repeat=rawRosterDF['Repeat Previous Year Grade'].map(repeatLookup))
# Transfers in/out and dropouts are not cleaned yet (no trinLookup/troutLookup/dropoutLookup here)

# Grade levels become level codes; the level and age columns get their short names
rawRosterDF = (
    rawRosterDF
    .replace(to_replace={'Grade Level':levelsLookup})
    .rename(columns = {'Grade Level': 'enLevel',
                       'Age as of September 30 of that School Year': 'enAge'})
)

# School names become school numbers, then rows of closed schools are removed.
# NOTE(review): the closed-school filter compares against names AFTER the name -> schNo
# replacement, so a closed school that has an entry in schoolsLookup would not be
# removed here - confirm these names are absent from the lookup.
rawRosterDF = rawRosterDF.replace(to_replace={'School Name':schoolsLookup})
closedSchools = ['Kanifay ECE Center', 'Colonia ECE Center', 'Mizpah Christian High School', 'Mizpah High', 'Rumung Elementary School',
                 'Nukaf Elem/Sapota Paata Elem']
inClosedSchool = rawRosterDF['School Name'].isin(closedSchools)
rawRosterDF = rawRosterDF[~inClosedSchool].rename(columns = {'School Name': 'schNo',
                                                             'Full Name': 'Name',
                                                             'Date of Birth': 'DoB'})

# Remove students whose date of birth is recorded as 'Unknown'
rawRosterDF = rawRosterDF[rawRosterDF['DoB'] != 'Unknown']

# Survey years become the starting calendar year; genders become M/F
surveyYearsLookup = {
    'SY2016-2017': 2016,
    'SY2015-2016': 2015,
    'SY2014-2015': 2014,
    'SY2013-2014': 2013,
    'SY2012-2013': 2012
}
gendersLookup = {
    'male': 'M',
    'female': 'F',
    'MAle': 'M',
    'MALE': 'M',
    'Female': 'F',
    'Male': 'M',
}
rawRosterDF = (
    rawRosterDF
    .replace(to_replace={'SchoolYear':surveyYearsLookup, 'Gender':gendersLookup})
    .rename(columns = {'SchoolYear': 'svyYear'})
)
print('rawRosterDF:')
print(rawRosterDF.head(10))

rawRosterDF:
   svyYear   schNo                 Name Gender                  DoB  enAge  \
0     2016  CHU175        Aorda Killion      F  2010-12-22 00:00:00    6.0   
1     2016  CHU175         Conrat Kukku      M  2011-04-29 00:00:00    5.0   
2     2016  CHU175          Envin Efson      M  2010-03-27 00:00:00    6.0   
3     2016  CHU175            IS Saimon      M  2009-05-09 00:00:00    7.0   
4     2016  CHU175      Jennifer Billuk      F  2010-11-22 00:00:00    6.0   
5     2016  CHU175      Jilloria Anitok      F  2010-10-27 00:00:00    6.0   
6     2016  CHU175            JO Joseph      M  2011-03-31 00:00:00    5.0   
7     2016  CHU175   Juliann  Tipingeni      F  2010-04-25 00:00:00    6.0   
8     2016  CHU175  Jurshiann Tipingeni      F  2010-04-25 00:00:00    6.0   
9     2016  CHU175           KR  Nowell      M  2010-09-02 00:00:00    6.0   

  enLevel Repeat Previous Year Grade Transferred From which school  \
0      GK                         No                      

In [46]:
# Process repeaters: keep only the roster rows whose cleaned 'repeat' flag is 'Yes'
isRepeater = rawRosterDF['repeat'] == 'Yes'
repeatDF = rawRosterDF[isRepeater]
# Column names of the pupil table (not used further in this cell)
pupilTableColumns = ['ssID','ptCode','ptAge','ptLevel','ptPage','ptRow','ptCol','ptM','ptF','ptSum','ptTableDef','ptTable'] 
print("repeatDF: ")
print(repeatDF.head(3))




repeatDF: 
    svyYear   schNo          Name Gender                  DoB  enAge enLevel  \
87     2016  CHU133  Eighty Takky      F  2009-06-15 00:00:00    7.0      G1   
88     2016  CHU133     Fifa Koky      F  2010-05-12 00:00:00    6.0      G1   
89     2016  CHU133   Inoti Chiro      M  2010-06-06 00:00:00    6.0      G1   

   Repeat Previous Year Grade Transferred From which school  \
87                        Yes                       Missing   
88                        Yes                       Missing   
89                        Yes                       Missing   

   Transferred TO which school Drop-Out repeat  
87                     Missing  Missing    Yes  
88                     Missing  Missing    Yes  
89                     Missing  Missing    Yes  
svyYear                          1875
schNo                            1875
Name                             1875
Gender                           1875
DoB                              1875
enAge                        

In [None]:
# Process enrolment data
#
# Aggregates the cleaned student roster into one row per
# (svyYear, schNo, enAge, enLevel) with the number of male (enM) and female (enF)
# students. The counting is a single vectorised groupby instead of a Python loop
# that rescanned the whole enrolment frame for every student.

# svyYear, schNo, Name, Gender, DoB, enAge, enLevel
enrolmentColumns = ['svyYear','schNo','enAge','enLevel','enM','enF']
enrolmentKeys = enrolmentColumns[:4]

# Slice here (e.g. [:1000]) to try the processing on a small sample first
rawRosterCleanedSampleDF = rawRosterDF #[:1000]

# One row per key combination, in order of first appearance. The remaining roster
# columns of that first row are carried along, as before.
rawEnrolmentDF = (
    rawRosterCleanedSampleDF
    .drop_duplicates(subset=enrolmentKeys)
    .drop(['Name','Gender','DoB'], axis=1)
    .reset_index(drop=True)
)

# Count M and F per key combination. Students whose Gender is neither 'M' nor 'F'
# contribute to neither count, and groupby leaves out rows with a missing key value -
# both match the former loop, which never found a record for such students.
genderCountsDF = (
    rawRosterCleanedSampleDF
    .assign(enM=(rawRosterCleanedSampleDF['Gender'] == 'M').astype(int),
            enF=(rawRosterCleanedSampleDF['Gender'] == 'F').astype(int))
    .groupby(enrolmentKeys, sort=False)[['enM','enF']]
    .sum()
    .reset_index()
)

# Attach the counts; key combinations without any counted student get 0
rawEnrolmentDF = rawEnrolmentDF.merge(genderCountsDF, on=enrolmentKeys, how='left')
rawEnrolmentDF[['enM','enF']] = rawEnrolmentDF[['enM','enF']].fillna(0).astype(int)

print("rawRosterCleanedSampleDF: ")
print(rawRosterCleanedSampleDF.head(10))
print("rawEnrolmentDF computed: ")
print(rawEnrolmentDF.head(10))

# From here on the enrolments are taken from the previously saved transit workbook,
# which REPLACES the frame computed above. Uncomment the writer at the end of this
# cell and re-run to refresh that workbook from the computed frame.
rawEnrolmentDF = pd.read_excel(outEnrolmentTransitData, sheetname="EnrolmentsRaw", header=0, parse_cols="A:F")
# Discard records with the known bad age values (-195, -1, 0) or no age at all
hasUsableAge = ~rawEnrolmentDF['enAge'].isin([-195, -1, 0]) & rawEnrolmentDF['enAge'].notnull()
rawEnrolmentDF = rawEnrolmentDF[hasUsableAge]
print("Uniques age values: ", rawEnrolmentDF['enAge'].unique())
print("rawEnrolmentDF: ")
print(rawEnrolmentDF)

# Writing enrolment data to sheets
# writer = pd.ExcelWriter(outEnrolmentTransitData)
# rawEnrolmentDF.to_excel(writer, sheet_name='EnrolmentsRaw', index=False)
# writer.save()

In [None]:
# Process final Enrolments ['ssID', 'enAge', 'enLevel', 'enM', 'enF']
#
# NOTE: schoolSurveyDF is built by the "Process School Surveys" cell, which appears
# AFTER this one in the notebook - that cell has to be run first.

# Temporary key "<svyYear><schNo>" used to find the school survey ID (ssID) of each enrolment row
schoolSurveyDF['ssIDTemp'] = schoolSurveyDF['svyYear'].map(str) + schoolSurveyDF['schNo']
schoolSurveyLookup = schoolSurveyDF.set_index(['ssIDTemp']).to_dict()['ssID']

# Work on a copy so the helper columns are not written into rawEnrolmentDF
# (plain assignment only created a second name for the same frame)
enrolmentsDF = rawEnrolmentDF.copy()
enrolmentsDF['ssIDTemp'] = enrolmentsDF['svyYear'].map(str) + enrolmentsDF['schNo']
enrolmentsDF['ssID'] = enrolmentsDF['ssIDTemp'].map(schoolSurveyLookup)

# Keep only the final columns, in their final order (svyYear, schNo and the temporary key are left out)
order_cols = ['ssID', 'enAge', 'enLevel', 'enM', 'enF']
enrolmentsDF = enrolmentsDF[order_cols]

print("rawEnrolment: ")
print(rawEnrolmentDF.head(3))
print("schoolSurveyDF: ")
print(schoolSurveyDF.head(3))
print("enrolments: ")
print(enrolmentsDF)

# Writing Enrolments data to sheets
# writer = pd.ExcelWriter(outEnrolmentData)
# enrolmentsDF.to_excel(writer, sheet_name='Enrolments', index=False)
# writer.save()

In [None]:
# Process School Surveys
# schNo -> school type, taken from the schools frame
schoolTypesLookup = schoolsDF.set_index('schNo')['schType'].to_dict()

rawEnrolmentDF = pd.read_excel(outEnrolmentTransitData, sheetname="EnrolmentsRaw", header=0, parse_cols="A:F")

# One survey row per (survey year, school): enrolment counts summed over ages and
# levels, plus the combined total
schoolSurveyDF = (
    rawEnrolmentDF
    .drop(['enAge','enLevel'], axis=1)
    .groupby(['svyYear','schNo'], as_index=False)
    .sum()
    .rename(columns = {'enF': 'ssEnrolF', 'enM': 'ssEnrolM'})
    .assign(ssEnrol = lambda x: x.ssEnrolF + x.ssEnrolM)
)
# Sequential survey IDs starting at 1, placed as the first column
surveyCount = len(schoolSurveyDF.index)
schoolSurveyDF.insert(0,'ssID',range(1,surveyCount+1))

schoolSurveyDF['ssSchType'] = schoolSurveyDF['schNo'].map(schoolTypesLookup)
print("schoolSurveyDF: ")
print(schoolSurveyDF.head(3))

# Writing SchoolSurveys data to sheets
# writer = pd.ExcelWriter(outInitialSchoolSurveysData)
# schoolSurveyDF.to_excel(writer, sheet_name='SchoolSurveys', index=False)
# writer.save()

In [None]:
# Process accreditation data

rawAccreditationDataDF = pd.read_excel(rawAccreditationData, sheetname="UpdatedSheet", header=1)

# Remove all records where no school inspection exists for now
rawAccreditationDataDF = rawAccreditationDataDF[~rawAccreditationDataDF['L1'].isnull()]

# Records without a year are taken to be 2016. The column is replaced through assign()
# rather than filled in place on a column pulled out of the filtered frame, which is a
# chained assignment that pandas does not guarantee to write back.
rawAccreditationDataDF = rawAccreditationDataDF.assign(Year = rawAccreditationDataDF['Year'].fillna(value=2016))

schoolWithExistingLookup = list(schoolsLookup.keys())
schoolWithExistingLookup.sort()
rawUniqueSchoolNamesFromAccreditation = list(rawAccreditationDataDF['School Name'].unique())
# NOTE(review): this discards whatever distinct value happens to come last, without
# checking what it is - presumably a blank/junk entry; confirm against the sheet.
rawUniqueSchoolNamesFromAccreditation.pop()
rawUniqueSchoolNamesFromAccreditation.sort()
print('schoolWithExistingLookup: ', schoolWithExistingLookup[:3])
print('rawUniqueSchoolNamesFromAccreditation: ', rawUniqueSchoolNamesFromAccreditation[:3])
# set(rawUniqueSchoolNames).difference(schoolWithExistingLookup)

# First cleanup schools from raw accreditation data (to be fixed manually and re-entered into the schoolsLookup in other cell)
uniqueSchoolsFromRawAccreditationDF = pd.DataFrame({'RawSchools': rawUniqueSchoolNamesFromAccreditation})
# create a mapping for those schools that actually match a school from other records;
# names without a match get NaN and are the ones to complete by hand
SchNoMapping = uniqueSchoolsFromRawAccreditationDF['RawSchools'].map(schoolsLookup)
uniqueSchoolsFromRawAccreditationDF = uniqueSchoolsFromRawAccreditationDF.assign(SchNoMappingCompleteMe = SchNoMapping)
# Name recorded in the lookup for the matched school number, shown next to the raw name
SchNameMapping = uniqueSchoolsFromRawAccreditationDF['SchNoMappingCompleteMe'].map(schoolsLookupByName)
uniqueSchoolsFromRawAccreditationDF = uniqueSchoolsFromRawAccreditationDF.assign(SchoolName = SchNameMapping)
print('uniqueSchoolsFromRawAccreditationDF: ')
print(uniqueSchoolsFromRawAccreditationDF.head(5))
# Writing all schools anomalies for hand fixing
# uniqueSchoolsFromRawAccreditationDF.to_excel(schoolTypoWriter, sheet_name='SchoolsFromAccreditation', index=False)
# schoolTypoWriter.save()

# Prepare data for InspectionSet (inspsetID,inspsetName,inspsetType,inspsetYear)
# One tuple per inspection set, in the field order listed above.
inspectionSetFields = ['inspsetID', 'inspsetName', 'inspsetType', 'inspsetYear']
inspectionSetRows = [
    (1, '2016', 'SCHACCR', 2016),
    (2, '2017', 'SCHACCR', 2017),
]

# Column-oriented form of the rows: field name -> list of values
InspectionSets = {
    field: [row[position] for row in inspectionSetRows]
    for position, field in enumerate(inspectionSetFields)
}

# inspection set year -> inspection set ID
inspectionSetsLookup = {row[3]: row[0] for row in inspectionSetRows}

inspectionSetDF = pd.DataFrame(data=InspectionSets)
print('inspectionSetDF:')
print(inspectionSetDF.head(5))
      
# Process SchoolAccreditation starting from the raw data

# Raw sheet headers -> final column names
accreditationColumnNames = {
    'Date Visited': 'inspStart',
    'L1':'saL1', 'L2':'saL2', 'L3':'saL3', 'L4':'saL4',
    'T1':'saT1', 'T2':'saT2', 'T3':'saT3', 'T4':'saT4',
    'D1':'saD1', 'D2':'saD2', 'D3':'saD3', 'D4':'saD4',
    'N1':'saN1', 'N2':'saN2', 'N3':'saN3', 'N4':'saN4',
    'F1':'saF1', 'F2':'saF2', 'F3':'saF3', 'F4':'saF4',
    'S1':'saS1', 'S2':'saS2', 'S3':'saS3', 'S4':'saS4',
    'CO1':'saCO1', 'CO2':'saCO2',
    'Tally 1':'saLT1','Tally 2':'saLT2','Tally 3':'saLT3','Tally 4':'saLT4',
    'Total':'saT','Level':'saSchLevel'
}

# School number from the school name, inspection set ID from the year
schNo = rawAccreditationDataDF['School Name'].map(schoolsLookup)
inspsetID = rawAccreditationDataDF['Year'].map(inspectionSetsLookup)
rawAccreditationDataDF = (
    rawAccreditationDataDF
    .assign(schNo = schNo, inspsetID = inspsetID)
    .drop(['State','School Name','Column22','Column3','Column1','Year'], axis=1)
    .rename(columns = accreditationColumnNames)
)

# Sequential inspection IDs starting at 1; the accreditation ID (saID) mirrors them
# and ends up as the first column
recordCount = len(rawAccreditationDataDF.index)
rawAccreditationDataDF.insert(0,'inspID',range(1,recordCount+1))
rawAccreditationDataDF.insert(0,'saID',rawAccreditationDataDF['inspID'])
# TODO remove records with no school ID for now until the schools are all sorted out
# (the IDs were numbered before this filter, so removed records leave gaps)
rawAccreditationDataDF = rawAccreditationDataDF[rawAccreditationDataDF['schNo'].notnull()]

# Accreditation data: everything except the inspection columns
inspectionOnlyColumns = ['inspID','inspStart','inspsetID','schNo']
schoolAccreditationDF = rawAccreditationDataDF.drop(inspectionOnlyColumns, axis=1).fillna('NULL')
print('schoolAccreditationDF:')
print(schoolAccreditationDF.head(5))

# Process SchoolInspection data (inspID,schNo,inspPlanned,inspStart,inspEnd,inspNote,inspBy,inspsetID)
# i.e. whatever is left once all the accreditation columns are removed
accreditationColumns = list(schoolAccreditationDF.columns)
schoolInspectionDF = rawAccreditationDataDF.drop(accreditationColumns, axis=1).fillna('NULL')
print('schoolInspectionDF:')
print(schoolInspectionDF.head(5))
      
# Writing all school accreditation and related data to sheets
# writer = pd.ExcelWriter(outAccreditationData)
# inspectionSetDF.to_excel(writer, sheet_name='InspectionSet', index=False)
# schoolAccreditationDF.to_excel(writer, sheet_name='SchoolAccreditation', index=False)
# schoolInspectionDF.to_excel(writer, sheet_name='SchoolInspection', index=False)
# writer.save()

In [9]:
# Process teachers data
#
# This part of the cell loads the combined school staff sheet and cleans it: date
# columns are parsed, unused columns are dropped, person columns are renamed and
# missing values are replaced by the string 'NULL'.

# Prepare writer for all teacher related data
# (opened here; nothing is written to it in this part of the cell)
writer = pd.ExcelWriter(outTeachersData)

# All raw fields 'State','Municipality/Zone/Region','Island Name','First Name','Last Name','Job Title','Ethnicity','Citizenship','Staff Type','Teacher-Type','Organization','Gender','Highest Degree Achieved','Copy of Degree/Certificate Available','Field of Study','Certified','Expiration','Date of Hire ','Date of Birth','Annual Salary','Funding Source','School Name','School Type','School Level','Grade Taught','Employment Status','Reason','Date of Exit','Total # of days absent'
rawTeachersAppointmentsDF = pd.read_excel(rawTeachersData, sheetname="CombinedSchoolStaff", header=0)
# Change every 'Missing' string to the literal string 'NULL' (a plain string, not a
# pandas missing value) since this is what we'll be inserting in DB
rawTeachersAppointmentsDF.replace('Missing', 'NULL', inplace=True)
# Handle as many dates as possible: errors="coerce" turns every unparseable value
# (including the 'NULL' strings set above) into a missing date instead of raising
tDatePSAppointed = pd.to_datetime(rawTeachersAppointmentsDF['Date of Hire '], errors="coerce").dt.date #, format="%m/%d/%Y"
tDOB = pd.to_datetime(rawTeachersAppointmentsDF['Date of Birth'], errors="coerce").dt.date #, format="%m/%d/%Y"
tDatePSClosed = pd.to_datetime(rawTeachersAppointmentsDF['Date of Exit'], errors="coerce").dt.date #, format="%m/%d/%Y"

# Only work on the first 500 rows until this is cleaned
#rawTeachersAppointmentsDF = rawTeachersAppointmentsDF #[0:499]
# Cleanup
# Drop what we will not need at all here onwards, rename, clean dates...
# The parsed dates are attached under their final names before the raw date columns are dropped
rawTeachersAppointmentsDF = rawTeachersAppointmentsDF.assign(tDatePSAppointed = tDatePSAppointed, tDOB = tDOB, tDatePSClosed = tDatePSClosed)
rawTeachersAppointmentsDF = rawTeachersAppointmentsDF.drop(['State','Municipality/Zone/Region','Island Name','School Type','School Level',
                                                            'Date of Hire ','Date of Birth','Date of Exit'], 1)
rawTeachersAppointmentsDF = rawTeachersAppointmentsDF.rename(columns = {
    'First Name': 'tGiven', 'Last Name': 'tSurname', 'Gender': 'tSex', 'Reason': 'tCloseReason'
})
# Every remaining missing value (including the dates that failed to parse) becomes the string 'NULL'
rawTeachersAppointmentsDF = rawTeachersAppointmentsDF.fillna('NULL')

# First step in cleaning the lookup-like columns: collect the distinct raw
# values of each one, then print them so the manual lookups below can be
# (re)built by hand.
uniqueStaffTypeFromRawTeacher = rawTeachersAppointmentsDF['Staff Type'].unique()
uniqueTeacherTypeFromRawTeacher = rawTeachersAppointmentsDF['Teacher-Type'].unique()
uniqueOrganizationFromRawTeacher = rawTeachersAppointmentsDF['Organization'].unique()
uniqueCertifiedFromRawTeacher = rawTeachersAppointmentsDF['Certified'].unique()
uniqueEthnicitiesFromRawTeacher = rawTeachersAppointmentsDF['Ethnicity'].unique()
uniqueJobTitlesFromRawTeacher = rawTeachersAppointmentsDF['Job Title'].unique() # role in Pineapple
uniqueHighestDegreesFromRawTeacher = rawTeachersAppointmentsDF['Highest Degree Achieved'].unique()
uniqueSchoolsFromRawTeacher = rawTeachersAppointmentsDF['School Name'].unique()
uniqueTeacherAppointmentSchoolYearFromRawTeacher = rawTeachersAppointmentsDF['SchoolYear'].unique()
uniqueGenderFromRawTeacher = rawTeachersAppointmentsDF['tSex'].unique()

# Print each array under its variable name, in the order computed above
for uniqueLabel, uniqueValues in (
    ('uniqueStaffTypeFromRawTeacher', uniqueStaffTypeFromRawTeacher),
    ('uniqueTeacherTypeFromRawTeacher', uniqueTeacherTypeFromRawTeacher),
    ('uniqueOrganizationFromRawTeacher', uniqueOrganizationFromRawTeacher),
    ('uniqueCertifiedFromRawTeacher', uniqueCertifiedFromRawTeacher),
    ('uniqueEthnicitiesFromRawTeacher', uniqueEthnicitiesFromRawTeacher),
    ('uniqueJobTitlesFromRawTeacher', uniqueJobTitlesFromRawTeacher),
    ('uniqueHighestDegreesFromRawTeacher', uniqueHighestDegreesFromRawTeacher),
    ('uniqueSchoolsFromRawTeacher', uniqueSchoolsFromRawTeacher),
    ('uniqueTeacherAppointmentSchoolYearFromRawTeacher', uniqueTeacherAppointmentSchoolYearFromRawTeacher),
    ('uniqueGenderFromRawTeacher', uniqueGenderFromRawTeacher),
):
    print(uniqueLabel + ':')
    print(uniqueValues)

# Hand-built lookups derived from the unique values printed above.
# They have to be rebuilt whenever new input data introduces new raw values.

# Raw nationality / ethnicity spelling -> code (entries grouped by target code)
rawNationalitiesLookup = {
    # FSM states
    'Chuukese': 'CHU', 'Chuuk': 'CHU',
    'Pohnpeian': 'PNI',
    'Yapese': 'YAP', 'Yapese ': 'YAP', 'Yap': 'YAP',
    # United States (several raw spellings)
    'American': 'USA', 'N. American': 'USA', 'Caucasian': 'USA',
    # Everything else, alphabetical
    'Australian': 'AUS',
    'Belgian': 'BEL',
    'Brazilian': 'BRA',
    'Filipino': 'PHL',
    'Finnish': 'FIN',
    'Indonesian': 'IDN',
    'Japanese': 'JPN',
    'Norwegian': 'NOR',
    'Pakistani': 'PAK',
    'Palauan': 'PLW',
    'Romanian': 'ROU',
    'Russian': 'RUS',
    'Vietnamese': 'VNM',
    # Catch-all
    'Other': 'O',
}

# Raw 'Job Title' spelling (typos and stray whitespace included, exactly as they
# appear in the input) -> role code. Each key appears exactly once: the original
# literal repeated several keys, and with repeated keys Python silently keeps
# only the last value.
rawRolesLookup = {
    'Classroom Teacher I': 'CTI',
    'Classroom Teacher II': 'CTII',
    'Classroom Teacher IV': 'CTIV',
    'Classroom Teacher III': 'CTIII',
    'Classroom Teacher V': 'CTV',
    'Classroom Mentor': 'CM',
    'Vocational Coordinator': 'VC',
    'Classroom Teacher': 'CT',
    'School Principal II': 'SPII',
    'Vocational Teacher II': 'VTII',
    'Assistant Principal III': 'ASPIII',
    'School Principal I': 'SPI',
    'Teacher Assistant': 'TA',
    'Houseparent I': 'HI',
    'Cook III': 'CIII',
    'Head Teacher': 'HT',
    # Was listed twice ('CPIII' then 'SPIII'); 'SPIII' is the value that took effect
    'School Principal III': 'SPIII',
    'Peace Corp Volunteer': 'PCV',
    'Acting Principal I': 'API',
    'School Principal (Middle School)': 'SP',
    'School Principal(Primary Grade)': 'SP',
    'School Principal': 'SP',
    'Classroom Teacher_Regular': 'CT',
    'Classroom Teacher_ECE': 'CT',
    'Classroom Teacher_Special Ed.': 'CT',
    'Teacher Aide_WD&ST': 'TA',
    'Principal/Classroom Teacher_Regular': 'SP',
    'Teacher Aide_Special Ed': 'TA',
    'Teacher Aide_Special Ed.': 'TA',
    'Classroom Teacher_VocEd': 'VTI',
    'Teacher Aide_ECE': 'TA',
    'Related Services Assistant': 'TA',
    'Clasroom Teacher_Regular': 'CT',
    'Principal': 'SP',
    'Teacger Aide_WD&ST': 'TA',
    'Principal III': 'SPIII',
    'Vice Principal I': 'SVPI',
    'Classroom Teacher I (SpEd)': 'CTI',
    'Classroom Teacher II (SpEd)': 'CTII',
    'Classroom Teacher (Contract)': 'CT',
    # Was 'AP' (the code used for "Acting Principal"); aligned with 'Assistant Principal': 'ASP'
    'Assistant Principal ': 'ASP',
    'Vocational Teacher': 'VT',
    'Vocational Teacher I': 'VTI',
    'Classroom Teacher III (SpEd)': 'CTIII',
    'Classroom Teacher  II': 'CTII',
    'Classroom Teacher  I': 'CTI',
    'Classroom Teache II': 'CTII',
    'Vocational Teacher III': 'VTIII',
    'Classroom Teacher IV (SpEd)': 'CTIV',
    'Classroom Teacher (SpEd)': 'CT',
    'Principal I': 'SPI',
    'Calssroom Teacher II': 'CTII',
    'Acting School Principal': 'AP',
    'Calssroom Teacher I': 'CTI',
    'Classroom Teacher Ii': 'CTII',
    'Classroom Teacher  (SpEd)': 'CT',
    'Classroom Teacher II ': 'CTII',
    'Classroom Teacher (Contract)1yr': 'CT',
    'Classroom Teacher (Contract 1YR)': 'CT',
    'Classroom Teacher I (Contract-NTE 1YR)': 'CTI',
    'Vocational Instructor': 'VTI',
    'Classroom Teacher i': 'CTI',
    'School Principal II (Contract)': 'SPII',
    'Acting Head Teacher': 'AHT',
    'Classroom Teacher l': 'CTI',
    'School Prncipal': 'SP',
    'School Princiapl II': 'SPII',
    'School Principal (Primary Grade)': 'SP',
    'Teacher Aide': 'TA',
    'Vice Principal': 'SVP',
    'Classroom Teacher (PCV)': 'CT',
    'Classroom Teacher (Bible)': 'CT',
    'Culture Teacher': 'CT',
    'Vocational Education Teacher': 'VT',
    'Culture Resource Teacher': 'CT',
    'Teacher ': 'CT',
    'Teacher': 'CT',
    'Librarian': 'L',
    'Cook': 'C',
    'Bus Driver': 'BD',
    'Clerk': 'CL',
    'Couselor': 'SC',
    'Assistant Marine Instructor': 'VT',
    'Principla': 'SP',
    'Counselor': 'SC',
    'Kitchen Helper': 'KH',
    'Classroom Teacher-Regular': 'CT',
    'Prinipal': 'SP',
    'Boat Operator': 'BO',
    'ClassroomTeacher_Regular': 'CT',
    'Houseparent': 'H',
    'Driver': 'D',
    'Supply Technician': 'ST',
    'Print Disability Specialist': 'PDS',
    'Secretary': 'SE',
    'House Parent': 'HP',
    'Education Specialis': 'ES',
    'Pricipal': 'SP',
    'School PrincipalIII': 'SPIII',
    'Consultant (Reform Plan)': 'CO',
    'Counselor IV': 'SCIV',
    'Secretary I': 'SEI',
    'Maintenance': 'MA',
    'Security Guard I': 'SGI',
    'Security Guard Supervisor': 'SGS',
    'Campus Maintenance': 'MA',
    'Registrar': 'RE',
    # Was 'SPIII'; this is the grade II title (compare 'School Principal II': 'SPII')
    'School PrincipalII': 'SPII',
    'Registrar I': 'REI',
    'Security Guard': 'SG',
    'Trademan': 'TR',
    'Clerk Typist III': 'CLIII',
    'Custodian': 'CU',
    'Clerk Typist': 'CL',
    'Building Maitenance I': 'MAI',
    'Security Guard II': 'SGII',
    'Cook I': 'CI',
    'School PrincipalI': 'SPI',
    'Assistant School PrincipalIII': 'ASPIII',
    # Was 'CI', which is the code of 'Cook I'; clerks use the 'CL' prefix (see 'Data Clerk IV')
    'Data Clerk I': 'CLI',
    'School Principal(Middle School)': 'SP',
    # NOTE(review): 'C' is also the code used for 'Cook' - confirm the intended code
    'Primary Consulting Resource Teacher': 'C',
    'Substitute Teacher': 'SUB',
    'School Counselor': 'SC',
    # NOTE(review): 'C' is also the code used for 'Cook' - confirm the intended code
    'Secondary Consulting Resource Teacher': 'C',
    'Administrative Officer': 'AO',
    'Maintenance Specialist': 'MA',
    'Secondary Transition Specialist': 'ES',
    'Secondary Transition Supervisor': 'TS',
    'PE Instuctor': 'VT',
    'School Accountant/Administrative Assistant': 'AO',
    'Supervisor': 'SU',
    'Moonitor': 'MO',
    'Principal/Administrator': 'SP',
    'Monitor': 'MO',
    'Canteen supervisor': 'CS',
    'Admin. Assistant': 'AA',
    'Voc Ed. Coordinator': 'VC',
    'Vice Princiapl': 'SVP',
    'School Counselor V': 'SCV',
    'Maintenance Worker III': 'MAIII',
    'Security Guard III': 'SGIII',
    'Bus Driver II': 'BDII',
    'Bus Driver I': 'BDI',
    'Data Clerk IV': 'CLIV',
    # This entry had no ': value,' so it was silently concatenated with the next key
    # into 'Custodial Worker IIRegistrar II'. The code is chosen by analogy with
    # 'Custodian': 'CU' - TODO confirm against the role lookups.
    'Custodial Worker II': 'CUII',
    'Registrar II': 'REII',
    'Maintenance Worker I': 'MAI',
    'House Parent II': 'HII',
    'Librarian I': 'LI',
    'School Counselor I': 'SCI',
    'Cook II': 'CII',
    'Assistant Principal': 'ASP',
    'Resource Teacher': 'CT',
    'Teacher Aide (WD&ST)': 'TA',
    'Cook ': 'C',
    'Teaching Principal': 'SP',
    'Bus Driver ': 'BD',
    'Contract Instructor': 'VT',
    'Home Arts Instructor': 'VT',
    'cook': 'C',
    'Ground Keeper': 'GK',
    'Assistant Librarian': 'AL',
    'Job Career Counselor': 'SC',
    # NOTE(review): the correctly spelled 'Related Services Assistant' maps to 'TA' - confirm which code is intended
    'Related Services Assstant': 'RSA',
    'ECE Supervisor': 'SU',
    'Resource Teacher ': 'CT',
    'Teacher-WDNST': 'CT',
    'Teacher/PE Instuctor': 'CT',
    'Asst Director': 'ADI',
    'Director': 'DI',
    'Accountant': 'ACC',
    # Was 'C' (the code of 'Cook'); aligned with 'Consultant (Reform Plan)': 'CO'
    'Consultant  (R/Plan)': 'CO',
    'Classroom Teacer IV': 'CTIV',
    'Acting Principal': 'AP',
    'Classroom Teache I': 'CTI',
    'Acting School Principal I': 'API',
    'World Teach': 'U'
}

# Raw 'Highest Degree Achieved' spelling -> degree code (entries grouped by target code).
# Values that do not correspond to a degree map to the 'NULL' marker.
rawDegreesLookup = {
    # Associate degrees
    'AA': 'AA', 'AA Degree': 'AA', 'AA\\': 'AA',
    'AAA': 'AAA',
    'AAS': 'AAS',
    'AS': 'AS', 'As': 'AS', 'AS ': 'AS', 'AA/AS': 'AS', 'AS Degree': 'AS',
    # Bachelor degrees
    'BA': 'BA', 'BA ': 'BA', 'BBA': 'BA',
    'BS': 'BS', 'BS ': 'BS', 'BA/BS': 'BS', 'BS, Diploma': 'BS', 'BS/B.Sc.': 'BS',
    # Master degrees and above
    'MA': 'MA', 'BS/MA': 'MA',
    'MS': 'MS', 'MA/MS': 'MS',
    'M.Ed.': 'MED',
    'PHD': 'PHD',
    # High school
    'HS Graduate': 'HS', 'High School Diploma': 'HS', 'High School': 'HS',
    # Certificates
    'CA': 'C', 'Certificate': 'C',
    # No recognised qualification
    'none': 'NULL',
    'None': 'NULL',
    'NULL': 'NULL',
    'No Degree': 'NULL',
    'Certificate of Achievement': 'NULL',
    'Other': 'NULL',
    'Finished 8th Gr': 'NULL',
    'Some High Schoo': 'NULL',
    'Some Elementary': 'NULL',
    'Some High School': 'NULL',
    '3RD YR.': 'NULL',
    'Elementary Graduate': 'NULL',
    'Third Year': 'NULL',
    'Certificate of Completion': 'NULL',
}

# In the EMIS, there is no specific place to store the equivalent of the columns "Staff Type", "Teacher-Type", "Organization"
      
# Built based on Staff Type, Teacher-Type, Organization
#       Local Regular Teaching Staff
#       Local Special Education Teaching Staff
#       Local Early Childhood Teaching Staff
#       Local Volunteer Teaching Staff
#       DOE Regular Teaching Staff
#       DOE Special Education Teaching Staff
#       DOE Early Childhood Teaching Staff
#       DOE Volunteer Teaching Staff
#       World Teacher Regular Teaching Staff
#       World Teacher Special Education Teaching Staff
#       World Teacher Early Childhood Teaching Staff
#       World Teacher Volunteer Teaching Staff
#       Peace Corp Regular Teaching Staff
#       Peace Corp Special Education Teaching Staff
#       Peace Corp Early Childhood Teaching Staff
#       Peace Corp Volunteer Teaching Staff
      
# Raw 'Staff Type' spelling -> 'TS' (teaching staff) or 'NTS' (non-teaching staff).
# Built from one list of raw spellings per target code.
staffTypeFromRawTeacherLookup = dict.fromkeys([
    'Teaching Staff',
    'Teaching Staff-TS',
    'TS',
    'T',
    'Principal',
    'Vice Principal',
    'Voc Ed. Coordinator',
    'Volunteer',
], 'TS')
staffTypeFromRawTeacherLookup.update(dict.fromkeys([
    'None Teaching Staff',
    'NTS',
    'NT',
    'NST',
    'Counselor',
    'Admin. Assistant',
    'Clerk',
    'Clerk ',
    'Clerk Staff',
    'Maintenance',
    'Librarian',
    'Librarian Staff',
    'Library',
    'Supervisor',
], 'NTS'))

# Raw 'Certified' spelling -> 'NSTT' when certified, otherwise the 'NULL' marker
certifiedFromRawTeacherLookup = {
    # certified
    'Certified': 'NSTT',
    'Yes': 'NSTT',
    'yes': 'NSTT',
    ' Yes': 'NSTT',
    # not certified / unknown / still in progress
    'NULL': 'NULL',
    'No': 'NULL',
    'Processing': 'NULL',
}

# Raw gender spelling -> single-letter code; the 'NULL' marker is passed through unchanged
genderFromRawTeacherLookup = {'Male': 'M', 'Female': 'F', 'NULL': 'NULL'}

# Helper column identifying a teacher as "<given name>-<surname>"
rawTeachersAppointmentsDF = rawTeachersAppointmentsDF.assign(
    theTeacher = rawTeachersAppointmentsDF['tGiven'] + '-' + rawTeachersAppointmentsDF['tSurname']
)
# Cleaned staff type goes into its own column (staffTypeTemp)
staffTypeTemp = rawTeachersAppointmentsDF['Staff Type'].map(staffTypeFromRawTeacherLookup)
rawTeachersAppointmentsDF = rawTeachersAppointmentsDF.assign(staffTypeTemp = staffTypeTemp)
# Cleaned certification goes into its own column (certifiedTemp); it is added in a
# separate assign so that it always lands after staffTypeTemp
certifiedTemp = rawTeachersAppointmentsDF['Certified'].map(certifiedFromRawTeacherLookup)
rawTeachersAppointmentsDF = rawTeachersAppointmentsDF.assign(certifiedTemp = certifiedTemp)
# The full staff list (teaching and non-teaching) is written out before filtering
rawTeachersAppointmentsDF.to_excel(writer, sheet_name='RawTeacherAppointments', index=False)
# From here on keep teaching staff only, as the raw data contains all staff
isTeachingStaff = rawTeachersAppointmentsDF['staffTypeTemp'].eq('TS')
rawTeachersAppointmentsDF = rawTeachersAppointmentsDF[isTeachingStaff]
# print('rawTeachersAppointmentsDF:')
# print(rawTeachersAppointmentsDF.head(3))
      
# Teacher DB (tID,tDOB,tSex,tGiven,tSurname)
# The degree, certification and appointment columns are carried along for now;
# later steps of this cell read them from teachersDF.
teachersDF = rawTeachersAppointmentsDF.loc[:, [
    'theTeacher', 'tDOB', 'tSex', 'tGiven', 'tSurname',
    'Highest Degree Achieved', 'Field of Study', 'certifiedTemp',
    'tDatePSAppointed', 'tDatePSClosed', 'tCloseReason',
]]
teachersDF = teachersDF.replace({'tSex': {'Male': 'M', 'Female': 'F'}})
# One row per teacher: drop_duplicates keeps only the first occurrence, so there is a
# small possibility of losing data here (e.g. a later duplicate row may hold the
# certification while the first one does not)
teachersDF = teachersDF.drop_duplicates(subset=['theTeacher'])
# Sequential teacher IDs starting at 1, as the first column
teachersDF.insert(0, 'tID', range(1, len(teachersDF.index) + 1))
# Lookup from the helper key (theTeacher) to tID
teachersLookup = teachersDF.set_index('theTeacher')['tID'].to_dict()
print('teachersDF:')
print(teachersDF.head(1))

# teacherTypeFromRawTeacherLookup = {
#     'Missing': Nan,
#     'Regular-R': 'R',
#     'Special Ed.': 'SE',
#     'ECE': 'ECE',
#     'R': 'R',
#     'SE': 'SE',
#     'Volunteer': 'V',
#     'RSA': ?,
#     'Regular': 'R',
#     'SPED': 'SE',
#     'Early Childhood Education': 'ECE',
#     'SDA': ?,
#     'World Teacher': ?,
#     'AVS II', ?,
#     'SM': ?,
#     'RT': 'R',
#     'SET': 'SE',
#     'Resident': ?,
#     'Local': ?,
#     'WT': ?
# }
      
# organizationFromRawTeacherLookup = {
#     'Missing': NaN,
#     'Local': '',
#     'CDOE': 'DOE',
#     'Special Education': 'DOE',
#     'World Teacher': 'WT',
#     'Peace Corp': 'PC',
#     'Private School': ?,
#     'JIV': ?,
#     'DOE': 'DOE',
#     'DOE ': 'DOE',
#     'KDOE': ?,
#     'World Teach': 'WT',
#     'KMG': ?,
#     'YCHS': ?
#     'Private': ?
# }

# First cleanup schools (to be written to file for manual association to correct school)
uniqueSchoolsFromRawTeacherDF = pd.DataFrame({'RawSchools': uniqueSchoolsFromRawTeacher})
# Create a mapping for those schools that actually match a school from other records;
# raw names without a match are left empty (NaN) in SchNoMappingCompleteMe
SchNoMapping = uniqueSchoolsFromRawTeacherDF['RawSchools'].map(schoolsLookup)
uniqueSchoolsFromRawTeacherDF = uniqueSchoolsFromRawTeacherDF.assign(SchNoMappingCompleteMe = SchNoMapping)
SchNameMapping = uniqueSchoolsFromRawTeacherDF['SchNoMappingCompleteMe'].map(schoolsLookupByName)
uniqueSchoolsFromRawTeacherDF = uniqueSchoolsFromRawTeacherDF.assign(SchoolName = SchNameMapping)
# Writing all schools anomalies for hand fixing.
# The writer is used as a context manager: the workbook is saved and closed on exit
# (also when to_excel raises), and this does not rely on ExcelWriter.save(), which
# current pandas releases no longer provide.
with pd.ExcelWriter(outInitialSchoolsTyposData) as schoolTypoWriter:
    uniqueSchoolsFromRawTeacherDF.to_excel(schoolTypoWriter, sheet_name='SchoolsFromTeachers', index=False)
      
# Process teacher training (tID,trInstitution,trQual,trMajor) or (tID,trQual,trMajor)
# this is the academic degrees
teachersTrainingDF = teachersDF[['tID','Highest Degree Achieved','Field of Study']]
trQual= teachersTrainingDF['Highest Degree Achieved'].map(rawDegreesLookup) #teachersTrainingDF.loc['trQual']
teachersTrainingDF = teachersTrainingDF.rename(columns = {'Field of Study': 'trMajor'})
teachersTrainingDF = teachersTrainingDF.assign(trQual = trQual)
# `axis` is passed by keyword: the positional form is rejected by current pandas releases
teachersTrainingDF = teachersTrainingDF.drop(['Highest Degree Achieved'], axis=1)
# and now the NSTT certification: one extra training row per certified teacher
teachersCertifiedDF = teachersDF[['tID','certifiedTemp']]
teachersCertifiedDF = teachersCertifiedDF[teachersCertifiedDF['certifiedTemp'] == 'NSTT']
teachersCertifiedDF = teachersCertifiedDF.rename(columns = {'certifiedTemp': 'trQual'})
#teachersCertifiedDF.insert(0,'trInstitution','FSM National Standard Teacher Certification') # range(1,len(teachersCertifiedDF.index)+1)
# pd.concat replaces DataFrame.append, which current pandas releases no longer provide;
# the certification rows have no trMajor, so it is filled with 'NULL' just below
teachersTrainingDF = pd.concat([teachersTrainingDF, teachersCertifiedDF])
teachersTrainingDF = teachersTrainingDF.fillna('NULL')
# Rows without a usable qualification (unmapped or mapped to 'NULL') are discarded
teachersTrainingDF = teachersTrainingDF[~(teachersTrainingDF['trQual'] == 'NULL')]
print(teachersTrainingDF.head(1))

# Process teacher appointments. (tID,taDate,SchNo,taRole,estpNo,taEndDate)
teachersAppointmentsDF = rawTeachersAppointmentsDF[['theTeacher','SchoolYear','School Name','Job Title']]
# Set taDate and taEndDate based on SchoolYear ??
# A school year 'SY<y>-<y+1>' starts on <y>-08-01 and ends on <y+1>-07-31;
# the lookups cover SY2013-2014 through SY2016-2017.
teachersAppointmentStartDatesLookup = {
    'SY%d-%d' % (startYear, startYear + 1): '%d-08-01 00:00:00.000' % startYear
    for startYear in range(2013, 2017)
}
teachersAppointmentEndDatesLookup = {
    'SY%d-%d' % (startYear, startYear + 1): '%d-07-31 00:00:00.000' % (startYear + 1)
    for startYear in range(2013, 2017)
}
taDate = teachersAppointmentsDF['SchoolYear'].map(teachersAppointmentStartDatesLookup)
taEndDate = teachersAppointmentsDF['SchoolYear'].map(teachersAppointmentEndDatesLookup)
teachersAppointmentsDF = teachersAppointmentsDF.assign(taDate = taDate, taEndDate = taEndDate)
# Link every appointment to its teacher, school and role code
tID = teachersAppointmentsDF['theTeacher'].map(teachersLookup)
SchNo = teachersAppointmentsDF['School Name'].map(schoolsLookup)
taRole = teachersAppointmentsDF['Job Title'].map(rawRolesLookup)
teachersAppointmentsDF = teachersAppointmentsDF.assign(tID = tID, SchNo = SchNo, taRole = taRole)
# An appointment without a known school or role is invalid by nature
# (how can this be an appointment without knowing the school/role), so drop those rows
teachersAppointmentsDF = teachersAppointmentsDF.dropna(subset=['SchNo', 'taRole'])
print('teachersAppointmentsDF:')
print(teachersAppointmentsDF.head(1))

# Process establishments (estpNo,schNo,estpRoleGrade,estpActiveDate,estpTitle)
# Position numbers handed out so far, keyed by '<schNo>-<taRole>'
estpPositions = {}

def getEstpNo(row):
    """Return the next establishment position number for the row's school and role.

    The result has the form '<schNo>-<taRole>-<n>', where n starts at 1 for the
    first row seen with a given school/role pair and grows by one on every
    further call for that pair. Issued numbers are recorded in the module-level
    estpPositions dict.

    row -- any object exposing `schNo` and `taRole` attributes.
    """
    estpKey = str(row.schNo) + '-' + str(row.taRole)
    issuedPositions = estpPositions.setdefault(estpKey, [])
    # Continue after the last number issued for this key, or start at 1
    nextPosition = issuedPositions[-1] + 1 if issuedPositions else 1
    issuedPositions.append(nextPosition)
    return estpKey + '-' + str(nextPosition)
    
# To get off the ground faster, every teacher appointment gets its own brand new
# establishment (school position); I think (hope) this is the simpler way.
# Alternative kept for reference, restricting to a single school year:
#teachersAppointmentsWithEstpSetDF = teachersAppointmentsDF[teachersAppointmentsDF['taDate'] == '2016-08-01 00:00:00.000']
#establishmentsDF = teachersAppointmentsDF[teachersAppointmentsDF['taDate'] == '2016-08-01 00:00:00.000']
establishmentsDF = teachersAppointmentsDF.rename(columns = {'taDate': 'estpActiveDate', 'SchNo': 'schNo'})
# Title and role grade are both looked up from the role code
estpTitle = establishmentsDF['taRole'].map(teacherRoleByNameLookups)
estpRoleGrade = establishmentsDF['taRole'].map(roleGradesLookups)
establishmentsDF = establishmentsDF.assign(estpTitle = estpTitle, estpRoleGrade = estpRoleGrade)
# One position number per row; the same numbers are attached to the appointments
estpNo = establishmentsDF.apply(getEstpNo, axis=1)
establishmentsDF.insert(0, 'estpNo', estpNo)
teachersAppointmentsDF = teachersAppointmentsDF.assign(estpNo = estpNo)
print('establishmentsDF:')
print(establishmentsDF.head(1))

# Final cleanup of unnecessary helper columns before writing.
# `axis` is passed by keyword: the positional form is rejected by current pandas releases.
teachersDF = teachersDF.drop(['theTeacher','Highest Degree Achieved','Field of Study','certifiedTemp'], axis=1)
teachersAppointmentsDF = teachersAppointmentsDF.drop(['SchoolYear','School Name','Job Title','theTeacher'], axis=1)
establishmentsDF = establishmentsDF.drop(['tID','taRole','taEndDate','theTeacher','SchoolYear','School Name','Job Title'], axis=1)

# Write sheets
teachersDF.to_excel(writer, sheet_name='Teacher', index=False)
teachersTrainingDF.to_excel(writer, sheet_name='TeacherTraining', index=False)
teachersAppointmentsDF.to_excel(writer, sheet_name='TeacherAppointment', index=False)
establishmentsDF.to_excel(writer, sheet_name='Establishment', index=False)
# close() saves the workbook and releases the file; ExcelWriter.save() is no longer
# provided by current pandas releases
writer.close()

establishmentsDF:
        estpNo     theTeacher   SchoolYear             School Name  \
0  PNI308-CT-1  Adriano-Donre  SY2014-2015  Awak Elementary School   

                   Job Title           estpActiveDate  \
0  Classroom Teacher_Regular  2014-08-01 00:00:00.000   

                 taEndDate   schNo  tID taRole estpRoleGrade  \
0  2015-07-31 00:00:00.000  PNI308    1     CT         CT.RG   

           estpTitle  
0  Classroom Teacher  
   tID trMajor trQual
0    1    NULL     AS
teachersAppointmentsDF:
      theTeacher   SchoolYear             School Name  \
0  Adriano-Donre  SY2014-2015  Awak Elementary School   

                   Job Title                   taDate  \
0  Classroom Teacher_Regular  2014-08-01 00:00:00.000   

                 taEndDate   SchNo  tID taRole  
0  2015-07-31 00:00:00.000  PNI308    1     CT  
teachersDF:
   tID     theTeacher        tDOB tSex   tGiven tSurname  \
0    1  Adriano-Donre  1960-08-09    M  Adriano    Donre   

  Highest Degree Achie