In [3]:
%%time
###############################################################################
# This notebook focuses on processing data from excel spreadsheet directly    #
# into another format ready to load into OnlineSBA                            #
###############################################################################

# import everything we need throughout the notebook
# core stuff
import itertools
import os
from pathlib import Path
import json

# Data stuff
import pandas as pd # Data analysis
import xlrd # excel 
import pyodbc # SQL DB
import numpy as np

# Fuzzy searching stuff
from fuzzywuzzy import fuzz
# process is used to compare a string to MULTIPLE other strings
from fuzzywuzzy import process

# Pretty printing stuff
from IPython.display import display, HTML
import pprint
pp = pprint.PrettyPrinter(indent=4)

# Generate unique identifiers stuff
import uuid
import random

# Dedicated pseudo-random generator with a fixed seed (0) so that anything
# drawn from it is reproducible from one run of the notebook to the next
rd = random.Random(0)

# Initial setup: the exam and the country processed by this run
# (the author's notes list 'NMCT' and 'FSM' as the alternative values)
test, country = 'MISAT', 'RMI'
cwd = os.getcwd()

# Configuration: parse config.json found in the current working directory
with open('config.json', mode='r') as file:
    config = json.load(file)

# It is important to keep the order of the cells since there are inplace 
# operations on DataFrames

CPU times: user 393 µs, sys: 720 µs, total: 1.11 ms
Wall time: 3.06 ms


In [4]:
# Load the schools and the student enrolments from the database (teachers are
# planned but not loaded yet, see the commented-out query further down).
# The intent is to compare the exams data of a school year with the students
# enrolled in that same year (e.g. 2018-19 exams against 2018-19 enrolments).
# NOTE(review): the queries below have no WHERE clause, so the enrolments of
# every year are loaded (stueYear is only returned as a column) -- confirm
# whether a filter on the year is still to be added.

# Establish a database server connection. The connection string is a template
# filled in with values from `config`; the doubled braces around the driver
# name are str.format escapes that produce literal { and }.
# NOTE(review): autocommit=True is written inside the connection string here,
# whereas pyodbc documents autocommit as a keyword argument of
# pyodbc.connect() -- confirm it has the intended effect.
conn = """
    Driver={{ODBC Driver 17 for SQL Server}};
    Server={},{};
    Database={};
    authentication=SqlPassword;UID={};PWD={};
    TrustServerCertificate=yes;
    autocommit=True
    """.format(config['server_ip'], config['server_port'], config['database'], config['uid'], config['pwd'])

sql_conn = pyodbc.connect(conn)

# One row per student enrolment: card ID, full name built from the given and
# family names (middle names are left out), gender, date of birth, school
# number and enrolment year
query_student_enrol = """
SELECT
	stuCardID
	, CONCAT(stuGiven,' ',stuFamilyName) AS Student -- stuMiddleNames,' ',
	, stuGender
	, stuDoB
	, schNo
	, stueYear
	FROM Student_ S
	INNER JOIN StudentEnrolment_ SE ON S.stuID = SE.stuID
"""

# School number and school name of every row in Schools
query_schools = """
SELECT
	schNo
	, schName
	FROM Schools
"""

# Not used yet
#query_teachers = """
#"""

# Run each query into a DataFrame and show its first 3 rows as a sanity check
df_student_enrol = pd.read_sql(query_student_enrol, sql_conn)
print('df_student_enrol')
display(df_student_enrol.head(3))

df_schools = pd.read_sql(query_schools, sql_conn)
print('df_schools')
display(df_schools.head(3))

df_student_enrol


Unnamed: 0,stuCardID,Student,stuGender,stuDoB,schNo,stueYear
0,MH010866,Alienson Sauel,M,2010-02-08,AIL100,2018.0
1,MH010867,Billy James,M,2011-03-19,AIL100,2018.0
2,MH010868,Hope Kelen,F,2012-01-19,AIL100,2018.0


df_schools


Unnamed: 0,schNo,schName
0,MAL101,Aerok Elementary School
1,ALU101,Ailuk Elementary School
2,AIL100,Airok Elementary School


In [5]:
def load_excel_to_df(filename):
    """Loads an Excel (or CSV) file into a Pandas DataFrame.

    The reader is picked from the file extension, compared case-insensitively:
    ``.xlsx`` files are read with the openpyxl engine, ``.xls`` files with the
    default Excel engine of pandas and ``.csv`` files with ``pd.read_csv``.
    In every case the first row is used as the header and no column is used
    as the index.

    Parameters
    ----------
    filename : str or path-like, required
        The filename of the file to load

    Raises
    ------
    ValueError
        If the extension is not one of xlsx, xls or csv. The message names
        the offending file so it can be found when loading a whole directory.
    Exception
        Whatever the underlying pandas reader raises (e.g. missing or
        corrupt file) is propagated unchanged.

    Returns
    -------
    DataFrame
    """
    # suffix includes the leading dot ('.xls'); drop it before comparing
    file_extension = Path(filename).suffix.lower()[1:]

    if file_extension == 'xlsx':
        return pd.read_excel(filename, index_col=None, header=0, engine='openpyxl')
    if file_extension == 'xls':
        return pd.read_excel(filename, index_col=None, header=0)
    if file_extension == 'csv':
        return pd.read_csv(filename, index_col=None, header=0)

    # ValueError is a subclass of Exception, so callers catching the
    # previously raised generic Exception keep working
    raise ValueError(
        "File not supported: '{}' (expected a .xlsx, .xls or .csv file)".format(filename))

In [6]:
# Load a single SOE Assessment workbook (handy for testing the following
# steps on one file before processing the whole directory).
# load_excel_to_df does not name a sheet, so pandas reads its default sheet
# -- presumably the one holding the raw data; confirm for new workbooks.
cwd = os.getcwd()
# Alternative sample workbooks: uncomment one (and comment out the active
# line) to test against another year / grade / subject
#filename = os.path.join(cwd, 'data/RMI/MISAT/MISAT 2019/3GrEng2019/AllSchools_A03_2018-19_Results.xls')
#filename = os.path.join(cwd, 'data/RMI/MISAT/MISAT 2012/6grEng12/AllSchools_A06_2011-12_Results.xls')
#filename = os.path.join(cwd, 'data/RMI/MISAT/MISAT 2009/3GrEng09/AllSchools_A03_2008-09_Results.xls')
filename = os.path.join(cwd, 'data/RMI/MISAT/MISAT 2019/Gr6Math2019/AllSchools_M06_2018-19_Results.xls')

# Read the workbook and show it (the first row of the sheet is the header)
df_student_results = load_excel_to_df(filename)
print('df_student_results')
display(df_student_results)

df_student_results


Unnamed: 0,RecordNo,SchoolYear,TestID,TestName,IslandName,SchoolID,SchoolName,StudentID,StudentName,Gender,...,Item_031_MS0603030103m_ccc,Item_032_MS0603030104h_ddd,Item_033_MS0603050101e_bbb,Item_034_MS0603050102m_ddd,Item_035_MS0603050103m_aaa,Item_036_MS0603050104h_bbb,Item_037_MS0604020101m_ccc,Item_038_MS0604020102e_ddd,Item_039_MS0604020103m_ccc,Item_040_MS0604020104h_ccc
0,1,2018-19,M06,M06 - Grade 6 - Math Form A,Private,MAJ132,Ajeltake Christian Academy,33,Bin Langmour,m,...,D,D,A,B,A,A,C,D,A,C
1,2,2018-19,M06,M06 - Grade 6 - Math Form A,Private,MAJ103,Assumption,34,Alfonso Agustin,m,...,D,D,B,C,C,D,B,C,A,C
2,3,2018-19,M06,M06 - Grade 6 - Math Form A,Private,MAJ103,Assumption,35,Chorister Deke,m,...,C,D,B,B,C,A,D,B,C,D
3,4,2018-19,M06,M06 - Grade 6 - Math Form A,Private,MAJ103,Assumption,36,Chris Paul,m,...,D,D,B,D,C,B,C,D,A,C
4,5,2018-19,M06,M06 - Grade 6 - Math Form A,Private,MAJ103,Assumption,37,Delaney Lorennij,m,...,C,D,A,D,C,A,C,D,C,D
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1031,1032,2018-19,M06,M06 - Grade 6 - Math Form A,Private,Majuro Coop,Majuro Coop,166,Kirby Alik SPED,m,...,D,D,B,C,A,B,D,A,B,D
1032,1033,2018-19,M06,M06 - Grade 6 - Math Form A,Private,Majuro Coop,Majuro Coop,177,Yanzhe Hrang SPED,f,...,D,D,B,D,A,B,C,D,D,C
1033,1034,2018-19,M06,M06 - Grade 6 - Math Form A,Public,MAJ122,RES,758,Rema Moja SPED,f,...,A,C,"(A,B)",A,C,"(A,D)",A,"(B,D)",B,C
1034,1035,2018-19,M06,M06 - Grade 6 - Math Form A,Public,MAJ122,RES,763,ROdney Anni SPED,m,...,B,C,A,D,C,B,A,D,C,B


In [7]:
%%time
# Load all SOE Assessment workbook inside a directory
# (~50 seconds on iMac with i9 CPU and 32GB RAM)
cwd = os.getcwd()
data_dir = 'data/'+country+'/'+test
path = os.path.join(cwd, data_dir)

df_student_results_list = []

for root, directories, files in os.walk(path, topdown=False):
    for name in files:
        filename = os.path.join(root, name)
        print('Loading into DataFrame file:', filename)
        try:
            df_student_results_list.append(load_excel_to_df(filename))
        except:
            print('Problem loading file:', filename)
            #print('Error was:', )            

print('Completed loading excel files')

Loading into DataFrame file: /mnt/c/Users/Ghislain Hachey/Google Drive (ghachey@nuzusys.com)/Development/Pacific EMIS/repositories/pacific-emis-exams-data-jupyter-python/data/RMI/MISAT/MISAT 2009/3GrEng2009/AllSchools_A03_2008-09_Results.xls
Loading into DataFrame file: /mnt/c/Users/Ghislain Hachey/Google Drive (ghachey@nuzusys.com)/Development/Pacific EMIS/repositories/pacific-emis-exams-data-jupyter-python/data/RMI/MISAT/MISAT 2009/3GrKM2009/AllSchools_B03_2008-09_Results.xls
Loading into DataFrame file: /mnt/c/Users/Ghislain Hachey/Google Drive (ghachey@nuzusys.com)/Development/Pacific EMIS/repositories/pacific-emis-exams-data-jupyter-python/data/RMI/MISAT/MISAT 2009/3GrMath2009/AllSchools_M03_2008-09_Results.xls
Loading into DataFrame file: /mnt/c/Users/Ghislain Hachey/Google Drive (ghachey@nuzusys.com)/Development/Pacific EMIS/repositories/pacific-emis-exams-data-jupyter-python/data/RMI/MISAT/MISAT 2009/6GrEng2009/AllSchools_A06_2008-09_Results.xls
Loading into DataFrame file: /mn

In [8]:
def merge_exams_data_with_student_enrol_df(df_student_results, df_student_enrol, testing=False):
    """ Merge both the dirty exams data with the clean student enrollments dataset

    Students are matched on their full name, compared in lower case and with
    leading/trailing spaces removed. Only enrolment records whose normalized
    name is unique are used for the match: names shared by several enrolment
    records are ambiguous and left out. The merge is a left merge, so every
    exam row is kept and rows without a match get empty enrolment columns.

    Both input DataFrames are modified in place: a ``StudentName2`` column is
    added to ``df_student_results``, a ``Student2`` column is added to
    ``df_student_enrol`` and fully duplicated rows are dropped from
    ``df_student_enrol``.

    Parameters
    ----------
    df_student_results : DataFrame, required
        The student results DataFrame (from SOE Assessment response sheet).
        Must have a ``StudentName`` column.
    df_student_enrol : DataFrame, required
        The student enrolment DataFrame (from EMIS).
        Must have a ``Student`` column.
    testing : bool, optional
        When True, print record counts and display previews of the
        intermediate and final DataFrames (default False)

    Returns
    -------
    DataFrame or None
        The merged DataFrame, or None when the name columns could not be
        prepared (a message is printed in that case).
    """

    # lower case to make join case insensitive (like SQL Server, the default collation of Pacific EMIS anyway)
    try:
        df_student_results['StudentName2'] = df_student_results['StudentName'].str.lower()
        df_student_enrol['Student2'] = df_student_enrol['Student'].str.lower()
    except KeyError:
        print('StudentName column is not present or misspelled (hint from data): ', df_student_results[:1].iloc[:, : 5].to_csv(index=False, header=False))
        return None
    except Exception as error:
        # Best effort: report the problem and let the caller skip this file
        print('Unknown error:', repr(error))
        return None

    # Also need to trim spaces to make it exactly like the SQL Server join
    df_student_results['StudentName2'] = df_student_results['StudentName2'].str.strip()
    df_student_enrol['Student2'] = df_student_enrol['Student2'].str.strip()

    # Before we attempt to merge
    # Only keep one of the duplicates from the EMIS
    df_student_enrol.drop_duplicates(keep='last', inplace=True)
    if testing: print('Total student enrol: ', len(df_student_enrol.index))

    # isolate into a separate DataFrame students with
    # same name but different DoB, school, etc. (i.e. different students of same name)
    # keep=False flags every record of a shared name, not just the repeats
    is_ambiguous = df_student_enrol.duplicated(subset=['Student2'], keep=False)
    df_student_enrol_nonambiguous = df_student_enrol[~is_ambiguous]
    df_student_enrol_ambiguous = df_student_enrol[is_ambiguous]
    if testing:
        print('Total student enrol that are not ambiguous: ', len(df_student_enrol_nonambiguous.index))
        print('Total student enrol that are ambiguous: ', len(df_student_enrol_ambiguous.index))
        print('Check ambiguous + not ambiguous equals all enrolled (minus duplicates): ', len(df_student_enrol_nonambiguous.index) + len(df_student_enrol_ambiguous.index))
        print('df_student_enrol_nonambiguous')
        display(df_student_enrol_nonambiguous.head(2))

    # For now, process using only non-ambiguous student enrolment records
    # It would only be possible to use ambiguous student enrolment records
    # if the exams data would contain the correct school, DoB or other data
    # that could disambiguate students with same name

    # Merge student exams data with student enrolments (left merge: all exam
    # rows are kept, enrolment columns stay empty when no name matches)
    df_students_results_and_enrol = df_student_results.merge(df_student_enrol_nonambiguous, how='left', left_on='StudentName2', right_on='Student2', suffixes=('_from_exams', '_from_db'), indicator=False)
    if testing:
        print('df_students_results_and_enrol')
        display(df_students_results_and_enrol.head(2))

    return df_students_results_and_enrol

In [9]:
# Merge student exams data with student enrollments
# Working with the single student exams file (for testing)
# The third argument (testing=True) makes the function print record counts
# and display previews of its intermediate DataFrames.
# Note: the function modifies both DataFrames passed in (it adds the
# StudentName2 / Student2 columns and drops duplicated enrolment rows).

df_students_results_and_enrol = merge_exams_data_with_student_enrol_df(df_student_results, df_student_enrol, True)
print('df_students_results_and_enrol')
df_students_results_and_enrol

Total student enrol:  85257
Total student enrol that are not ambiguous:  9332
Total student enrol that are ambiguous:  75925
Check ambiguous + not ambiguous equals all enrolled (minus duplicates):  85257
df_student_enrol_nonambiguous


Unnamed: 0,stuCardID,Student,stuGender,stuDoB,schNo,stueYear,Student2
2,MH010868,Hope Kelen,F,2012-01-19,AIL100,2018.0,hope kelen
3,MH010869,James Lakjohn,M,2012-01-21,AIL100,2018.0,james lakjohn


df_students_results_and_enrol


Unnamed: 0,RecordNo,SchoolYear,TestID,TestName,IslandName,SchoolID,SchoolName,StudentID,StudentName,Gender,...,Item_039_MS0604020103m_ccc,Item_040_MS0604020104h_ccc,StudentName2,stuCardID,Student,stuGender,stuDoB,schNo,stueYear,Student2
0,1,2018-19,M06,M06 - Grade 6 - Math Form A,Private,MAJ132,Ajeltake Christian Academy,33,Bin Langmour,m,...,A,C,bin langmour,MH007442,Bin Langmour,F,2006-02-24,MAJ101,2016.0,bin langmour
1,2,2018-19,M06,M06 - Grade 6 - Math Form A,Private,MAJ103,Assumption,34,Alfonso Agustin,m,...,A,C,alfonso agustin,,,,,,,


df_students_results_and_enrol


Unnamed: 0,RecordNo,SchoolYear,TestID,TestName,IslandName,SchoolID,SchoolName,StudentID,StudentName,Gender,...,Item_039_MS0604020103m_ccc,Item_040_MS0604020104h_ccc,StudentName2,stuCardID,Student,stuGender,stuDoB,schNo,stueYear,Student2
0,1,2018-19,M06,M06 - Grade 6 - Math Form A,Private,MAJ132,Ajeltake Christian Academy,33,Bin Langmour,m,...,A,C,bin langmour,MH007442,Bin Langmour,F,2006-02-24,MAJ101,2016.0,bin langmour
1,2,2018-19,M06,M06 - Grade 6 - Math Form A,Private,MAJ103,Assumption,34,Alfonso Agustin,m,...,A,C,alfonso agustin,,,,,,,
2,3,2018-19,M06,M06 - Grade 6 - Math Form A,Private,MAJ103,Assumption,35,Chorister Deke,m,...,C,D,chorister deke,,,,,,,
3,4,2018-19,M06,M06 - Grade 6 - Math Form A,Private,MAJ103,Assumption,36,Chris Paul,m,...,A,C,chris paul,,,,,,,
4,5,2018-19,M06,M06 - Grade 6 - Math Form A,Private,MAJ103,Assumption,37,Delaney Lorennij,m,...,C,D,delaney lorennij,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1031,1032,2018-19,M06,M06 - Grade 6 - Math Form A,Private,Majuro Coop,Majuro Coop,166,Kirby Alik SPED,m,...,B,D,kirby alik sped,,,,,,,
1032,1033,2018-19,M06,M06 - Grade 6 - Math Form A,Private,Majuro Coop,Majuro Coop,177,Yanzhe Hrang SPED,f,...,D,C,yanzhe hrang sped,,,,,,,
1033,1034,2018-19,M06,M06 - Grade 6 - Math Form A,Public,MAJ122,RES,758,Rema Moja SPED,f,...,B,C,rema moja sped,,,,,,,
1034,1035,2018-19,M06,M06 - Grade 6 - Math Form A,Public,MAJ122,RES,763,ROdney Anni SPED,m,...,C,B,rodney anni sped,,,,,,,


In [10]:
%%time
# Merge student exams data with student enrollments
# Working with all student exams files (~23 seconds on iMac with i9 CPU and 32GB RAM)
df_students_results_and_enrol_list = []

for df in df_student_results_list:
    merged = merge_exams_data_with_student_enrol_df(df, df_student_enrol, testing=False)
    # Leave out any None result (those DataFrames could not be merged)
    if merged is not None:
        df_students_results_and_enrol_list.append(merged)

CPU times: user 19.5 s, sys: 95.7 ms, total: 19.6 s
Wall time: 19.6 s


In [12]:
def clean_schools(df, df_schools, testing=False):
    """ Does any cleanup/validation needed with SchoolIDs.

    Parameters
    ----------
    df: DataFrame, required
        The student results and enrol DataFrame
    df_schools : DataFrame, required
        The schools DataFrame (from EMIS)
        
    Raises
    ------
    NotImplementedError
        Could raise unknown error. Implement if it happens
    
    Returns
    -------
    DataFrame
    """        

    # From EMIS, get school ID to name official mapping
    schools_lookup = df_schools.set_index('schNo').to_dict()['schName']
    schools_lookup_byname = df_schools.set_index('schName').to_dict()['schNo']
    if testing: 
        print('schools_lookup_byname')
        #pp.pprint(dict(schools_lookup_byname))
        pp.pprint(dict(itertools.islice(schools_lookup_byname.items(), 3)))

    # This list is to be confirmed and updated as necessary
    # If a school name is in an exam file but not in here we need to generate an error message
    # and update this list with the correct mapping to the canonical school ID
    schools_lookup_from_exams_byname = {
        'Aerok A-Aelonlaplap': 'AIL100',        
        'Buoj-Aelonlaplap': 'AIL101',
        'Enewa-Aelonlaplap': 'AIL102',
        'Jah-Aelonlaplap': 'AIL103',
        'Jebwan-Aelonlaplap': 'AIL105',
        'Jeh-Aelonlaplap': 'AIL104',
        'Kattiej-Aelonlaplap': 'AIL106',
        'Mejel-Aelonlaplap': 'AIL107',
        'Woja A-Aelonlaplap': 'AIL108',
        'Ailuk-Ailuk': 'ALU101',
        'Enejelaar-Ailuk': 'ALU102',
        'Arno-Arno': 'ARN101',
        'Bikarej-Arno': 'ARN102',
        'Ine-Arno': 'ARN103',
        'Japo-Arno': 'ARN104',
        'Kilange-Arno': 'ARN105',
        'Longar-Arno': 'ARN106',
        'Lukoj-Arno': 'ARN107',
        'Matolen-Arno': 'ARN108',
        'Tinak-Arno': 'ARN109',
        'Tutu-Arno': 'ARN110',
        'Ulien-Arno': 'ARN111',
        'Aur-Aur': 'AUR101',
        'Tobal-Aur': 'AUR102',
        'Ebon-Ebon': 'EBO101',
        'Enekoion-Ebon': 'EBO102',
        'Toka-Ebon': 'EBO103',
        'Enewetak-Enewetak': 'ENE101',
        'Imiej-Jaluit': 'JAL101',
        'Imroj-Jaluit': 'JAL102',
        'Jabnoden-Jaluit': 'JAL103',
        'Jabor-Jaluit': 'JAL104',
        'Jaluit-Jaluit': 'JAL105',
        'Mejurirok-Jaluit': 'JAL107',
        'Narmej-Jaluit': 'JAL108',
        'Ejit-Kili': 'KIL101',
        'Kili-Kili': 'KIL102',
        'Carlos-Kwajalein': 'KWA101',
        'Ebadon-Kwajalein': 'KWA102',
        'Ebeye Middle School-Kwajalein': 'KWA107',
        'Enniburr-Kwajalein': 'KWA111',
        'Lae-Lae': 'LAE101',
        'Lib-Lib': 'LIB101',
        'Jepal-Likiep': 'LIK101',
        'Likiep-Likiep': 'LIK102',
        'Melan-Likiep': 'LIK103',
        'Ajeltake-Majuro': 'MAJ101',
        'DES-Majuro': 'MAJ105',
        'Laura Public-Majuro': 'MAJ109',
        'Rairok-Majuro': 'MAJ121',
        'RES-Majuro': 'MAJ122',
        'UES-Majuro': 'MAJ122', #?
        'Woja M-Majuro': 'MAJ127',
        'Airok M-Maloelap': 'MAL101',
        'Jang-Maloelap': 'MAL102',
        'Kaven-Maloelap': 'MAL103',
        'Ollet-Maloelap': 'MAL104',
        'Tarawa-Maloelap': 'MAL105',
        'Mejatto-Mejatto': 'MAL105', #?
        'Mejit-Mejit': 'MAJ101',
        'Enejet-Mili': 'MIL101',
        'Lukonwod-Mili': 'MIL102',
        'Mili-Mili': 'MIL103',
        'Nallo-Mili': 'MIL104',
        'Namdrik-Namdrik': 'NAM101',
        'Loen-Namu': 'NAU101',
        'Mae-Namu': 'NAU102',
        'Majken-Namu': 'NAU103',
        'Namu-Namu': 'NAU104',
        'Ajeltake Christian Academy-Private': 'MAJ102',
        'Assumption-Private': 'MAJ103',
        'Delap SDA-Private': 'MAJ107',
        'Ebeye Calvary-Private': 'KWA103',
        'Ebeye Christian-Private': 'KWA105',
        'Ebeye SDA-Private': 'KWA109',
        'Gem Christian School-Private': 'KWA112',
        'Jebro-Private': 'KWA114',
        'Laura SDA-Private': 'MAJ110',
        'Majuro Baptist Christian Academy-Private': 'MAJ114',
        'Majuro Coop-Private': 'MAJ116',
        'Queen of Peace-Private': 'KWA117',
        'Rita Christian-Private': 'MAJ123',
        'St. Joseph-Private': 'JAL109',
        'Ujae-Ujae': 'UJA101',
        'Utrik-Utrik': 'UTR101',
        'Wodmeej-Wotje': 'WTH102',
        'Wotje-Wotje': 'WTH103',
        'Wotto-Wotto': 'WOT101',
        'Airok A-Public': 'AIL100',
        'Bouj-Public': 'AIL101',
        'Enewa-Public': 'AIL102',
        'Jah-Public': 'AIL103',
        'Jebwan-Public': 'AIL105',
        'Jeh-Public': 'AIL104',
        'Mejel-Public': 'AIL107',
        'Woja A-Public': 'AIL108',
        'Ailuk-Public': 'ALU101',
        'Enejelaar-Public': 'ALU102',
        'Arno-Public': 'ARN101',
        'Bikarej-Public': 'ARN102',
        'Ine-Public': 'ARN103',
        'Japo-Public': 'ARN104',
        'Kilange-Public': 'ARN105',
        'Longar-Public': 'ARN106',
        'Lukoj-Public': 'ARN107',
        'Matolen-Public': 'ARN108',
        'Ulien-Public': 'ARN111',
        'Aur-Public': 'AUR101',
        'Tobal-Public': 'AUR102',
        'Ebon-Public': 'EBO101',
        'Enekoion-Public': 'EBO102',
        'Toka-Public': 'EBO103',
        'Jabat-Public': 'JAB101',
        'Imiej-Public': 'JAL101',
        'Imroj-Public': 'JAL102',
        'Jabnoden-Public': 'JAL103',
        'Jabor-Public': 'JAL104',
        'Jaluit-Public': 'JAL105',
        'Mejrirok-Public': 'JAL107',
        'Narmej-Public': 'JAL108',
        'Ejit-Public': 'KIL101',
        'Kili-Public': 'KIL102',
        'Carlos-Public': 'KWA101',
        'Ebadon-Public': 'KWA102',
        'Ebeye Public-Public': 'KWA108',
        'Enniburr-Public': 'KWA111',
        'Lae-Public': 'LAE101',
        'Lib-Public': 'LIB101',
        'Jepal-Public': 'LIK101',
        'Likiep-Public': 'LIK102',
        'Melang-Public': 'LIK103',
        'Ajeltake-Public': 'MAJ101',
        'DES-Public': 'MAJ105',
        'Laura Public-Public': 'MAJ109',
        'Rairok-Public': 'MAJ121',
        'RES-Public': 'MAJ122',
        'UES-Public': 'ARN111', #?
        'Aerok M-Public': 'MAL101',
        'Jang-Public': 'MAL102',
        'Kaven-Public': 'MAL103',
        'Ollet-Public': 'MAL104',
        'Tarawa-Public': 'MAL105',
        'Mejatto-Public': 'RON101',
        'Mejit-Public': 'MAJ101',
        'Enejet-Public': 'MIL101',
        'Lukonwod-Public': 'MIL102',
        'Mili-Public': 'MIL103',
        'Nallo-Public': 'MIL104',
        'Tokewa-Public': 'MIL105',
        'Namdrik-Public': 'NAM101',
        'Loen-Public': 'NAU101',
        'Mae-Public': 'NAU102',
        'Majkin-Public': 'NAU103',
        'Ajeltake Christian Acedemy-Private': 'MAJ102',
        'Assumption-Private': 'MAJ103',
        'Delap SDA-Private': 'MAJ107',
        'Ebeye Calvary-Private': 'KWA103',
        'Ebeye Christian-Private': 'KWA105',
        'Gem Christian School-Private': 'KWA112',
        'Imroj Protestant-Private': 'JAL102',
        'Jebro-Private': 'KWA115',
        'Jeh SDA-Private': 'AIL104',
        'Laura Christian Academy-Private': 'MAJ110', #?
        'Majuro Baptist Christian Academy-Private': 'MAJ114',
        'Majuro Coop-Private': 'MAJ116',
        'Queen of Peace-Private': 'KWA117',
        'Rita Christian-Private': 'MAJ123',
        'St. Joseph-Private': 'JAL109',
        'St. Paul-Private': 'JAL109', #?
        'St. Thomas-Private': 'JAL109', #?
        'Ujae-Public': 'UJA101',
        'Utrik-Public': 'UTR101',
        'Wodmeej-Public': 'WTH102',
        'Wotje-Public': 'WTH103',
        'Wotto-Public': 'WOT101',
        'Airok A-Ailinglaplap': 'AIL100',
        'Bouj-Ailinglaplap': 'AIL101',
        'Enewa-Ailinglaplap': 'AIL102',
        'Jebwan-Ailinglaplap': 'AIL105',
        'Jeh-Ailinglaplap': 'AIL104',
        'Mejel-Ailinglaplap': 'AIL107',
        'Woja A-Ailinglaplap': 'AIL108',
        'Ailuk-Ailuk': 'ALU101',
        'Enejelaar-Alluk': 'ALU102',
        'Arno-Arno': 'ARN101',
        'Bikarej-Arno': 'ARN102',
        'Ine-Arno': 'ARN103',
        'Japo-Arno': 'ARN104',
        'Kilange-Arno': 'ARN105',
        'Longar-Arno': 'ARN106',
        'Matolen-Arno': 'ARN108',
        'Ulien-Arno': 'ARN111',
        'Aur-Aur': 'AUR101',
        'Tobal-Aur': 'AUR102',
        'Ebon-Ebon': 'EBO101',
        'Enekoion-Ebon': 'EBO102',
        'Toka-Ebon': 'EBO103',
        'Enewetak-Enewetak': 'ENE101',
        'Imiej-Jaluit': 'JAL101',
        'Imroj-Jaluit': 'JAL102',
        'Jabnoden-Jaluit': 'JAL103',
        'Jabor-Jaluit': 'JAL104',
        'Jaluit-Jaluit': 'JAL105',
        'Mejrirok-Jaluit': 'JAL107',
        'Narmej-Jaluit': 'JAL108',
        'Ejit-Kili/Bikini': 'KIL101',
        'Kili-Kili/Bikini': 'KIL102',
        'Carlos-Kwajalein': 'KWA101',
        'Ebadon-Kwajalein': 'KWA102',
        'Ebeye Public-Kwajalein': 'KWA108',
        'Enniburr-Kwajalein': 'KWA111',
        'Jepal-Likiep': 'LIK101',
        'Likiep-Likiep': 'LIK102',
        'Melang-Likiep': 'LIK103',
        'Ajeltake-Majuro': 'MAJ101',
        'DES-Majuro': 'MAJ105',
        'Laura Public-Majuro': 'MAJ109',
        'Rairok-Majuro': 'MAJ121',
        'RES-Majuro': 'MAJ122', #?,
        'UES-Majuro': 'ARN111', #?,
        'Woja M-Majuro': 'MAJ127',
        'Aerok M-Maloelap': 'MAL101',
        'Jang-Maloelap': 'MAL102',
        'Kaven-Maleolap': 'MAL103',
        'Ollet-Maloelap': 'MAL104',
        'Tarawa-Maloelap': 'MAL105',
        'Mejit-Mejit': 'MEJ101',
        'Lukonwod-Mili': 'MIL102',
        'Mili-Mili': 'MIL103',
        'Nallo-Mili': 'MIL104',
        'Namdrik-Namdrik': 'NAM101',
        'Loen-Namu': 'NAU101',
        'Mae-Namu': 'NAU102',
        'Majkin-Namu': 'NAU103',
        'Ajeltake Christian Acedemy-Majuro': 'MAJ102',
        'Assumption-Majuro': 'MAJ103',
        'Delap SDA-Majuro': 'MAJ107',
        'Ebeye Calvary-Private': 'KWA103',
        'Ebeye Christian-Private': 'KWA105',
        'Ebeye SDA-Private': 'KWA109',
        'Gem Christian School-Private': 'KWA112',
        'Jebro-Private': 'KWA115',
        'Laura Christian Academy-Majuro': 'MAJ110', #?,
        'Majuro Baptist Christian Academy-Majuro': 'MAJ114',
        'Majuro Coop-Majuro': 'MAJ116',
        'Queen of Peace-Private': 'KWA117',
        'Rita Christian-Majuro': 'MAJ123',
        'St. Joseph-Private': 'JAL109',
        'St. Paul-Private': 'JAL109', #?,
        'Utrik-Utrik': 'UTR101',
        'Wodmeej-Wotje': 'WTH102',
        'Wotje-Wotje': 'WTH103',
        'Tokewa-Mili': 'MIL105',
        'Ailuk Protestant-Private': 'ALU101', #?
        'Middle School-Public': 'MAJ120',
        'Laura Protestant-Private': 'MAJ110', #?
        'UPCS-Private': 'ARN111', #?
        'Woja SDA-Private': 'AIL108', #?
        'Namu-Public': 'NAU104',
        'Enewetak-Public': 'ENE101',
        'Tutu-Public': 'ARN110',
        'Woja M-Public': 'MAJ127',
        'Buoj-Ailinglaplap': 'AIL101',
        'Ailuk-Enejelaar': 'ALU102',
        'Enburr-Kwajalein': 'KWA111',
        'Jebro Kabua-Private': 'KWA115',
        'Jabat-Jabat': 'JAB101',
        'Jah-Ailinglaplap': 'AIL103',
        'NVTI-Majuro': 'JAL108', #?
        'Uliga-Majuro': 'MAJ116', #?
        'Majuro Middle School-Majuro': 'MAJ120',
        'Delap -Majuro': 'MAJ105',
        'St. Thomas-Wotje': 'JAL109', #?
        'St.Paul-Arno': 'JAL109', #?
        'Delap-Majuro': 'MAJ105',
        ' Ine-Arno': 'ARN103',
        'Buoj -Ailinglaplap': 'AIL101',
        'Aur -Aur': 'AUR101',
        'Lukunwod-Mili': 'MIL102',
        'Tobal -Aur': 'AUR102',
        'Uliga Protestant-Private': 'ARN111', #?
        'Rita-Majuro': 'MAJ122',
        'St. Paul-Arno': 'JAL109', #?
        'Rita -Majuro': 'MAJ122',   
        'St. Joseph-Jaluit': 'JAL109',
        'Airok Protestant-Private': 'AIL100', #? 
        'Toka -Ebon': 'EBO103',
        'Carlos -Kwajalein': 'KWA101',
        'Jah -Ailinglaplap': 'AIL103',
        'Jebat-Jebat': 'JAB101',
        'Jeh -Ailinglaplap': 'AIL103',
        'Ajeltake -Majuro': 'MAJ101',
        'Ebon -Ebon': 'EBO101',
        'Rairok -Majuro': 'MAJ121',
        'Rita Chrisitan-Private': 'MAJ123',
        'Kili -Kili': 'KIL102',
        'Longar -Arno': 'ARN106',
        'Uliga -Majuro': 'ARN111', #?
        'Jabonden-Jaluit': 'JAL103', #?   
        'St.Joseph-Jaluit': 'JAL109',
        'Bouj-Aelonlaplap': 'AIL101',
        'Rongrong-Private': 'MAJ125',
        'Ebeye Public-Kwajelein': 'KWA108',
        'Aerok Protestant-Private': 'AIL100', #?   
        'Ebeye Elementary-Kwajelein': 'KWA108', #?
        'LHS-Majuro': 'MAJ111',
        'Ebeye Calvary-Private Secondary': 'KWA104',
        'LHS-Public Secondary': 'MAJ111',
        'Jebro-Private Secondary': 'KWA114',
        'Father Hacker High School-Private Secondary': 'KWA118',
        'Ebeye SDA-Private Secondary': 'KWA110',
        'NIHS-Public Secondary': 'WTH101',
        'Assumption-Private Secondary': 'MAJ104',
        'Rita Christian-Private Secondary': 'MAJ124',
        'Majuro Coop-Private Secondary': 'MAJ117',
        'Majuro Baptist Christian Academy-Private Secondary': 'MAJ115',
        'KAHS-Public Secondary': 'KWA116',
        'JHS-Public Secondary': 'JAL106',
        'Delap SDA-Private Secondary': 'MAJ108',
        'MIHS-Public Secondary': 'MAJ119', 
        'Kattiej-Public': 'AIL106',
        'Melan-Public': 'LIK103',
        'Ebeye Middle School-Public': 'KWA107',
        'Airok M-Public': 'MAL101',
        'Buoj-Public': 'AIL101',
        'Majuro Middle School-Public': 'MAJ120',
        'LHS-Public': 'MAJ111',
        'Majken-Public': 'NAU103',
        'Aerok A-Public': 'AIL100',
        'Tinak-Public': 'ARN109',
        'Aerok A-Ailinglaplap': 'AIL100',
        'Kattiej-Ailinglaplap': 'AIL106',
        'NIHS-Wotje': 'WTH101',
        'NIHS-Private Secondary': 'WTH101',
        'Rita Christian-nan': 'MAJ124',
        'MCHS-Public Secondary': 'MAJ118',
        'MCHS-Private Secondary': 'MAJ118', #?
        'Rongrong Christian Elementary-Private': 'MAJ125', #?
        'Rongrong Elementary-Private': 'MAJ125', #?
        'Carlos-Small': 'KWA101',
        'Enewa-Small': 'AIL102',
        'Wotje-Large': 'WTH103',
        'Arno-Medium': 'ARN101',
        'Japo-Medium': 'ARN104',
        'Rairok-Large': 'MAJ121',
        'Airok M-Small': 'MAL101',
        'Lukonwod-Small': 'MIL102',
        'Jaluit-Medium': 'JAL105',
        'Namdrik-Large': 'NAM101',
        'Ebeye Public-Large': 'KWA108',
        'Tutu-Small': 'ARN110',
        'Jabor-Medium': 'JAL104',
        'Utrik-Medium': 'UTR101',
        'Woja M-Large': 'MAJ127',
        'Ulien-Medium': 'ARN111',
        'Aerok A-Medium': 'AIL100',
        'Enejet-Medium': 'MIL101',
        'Mejurirok-Medium': 'JAL107',
        'Toka-Medium': 'EBO103',
        'Lae-Medium': 'LAE101',
        'Tokewa-Small': 'MIL105',
        'Aur-Medium': 'AUR101',
        'Jeh-Medium': 'AIL104',
        'Wotto-Small': 'WOT101',
        'Jepal-Small': 'LIK101',
        'Jang-Small': 'MAL102',
        'Ebon-Medium': 'EBO101',
        'Likiep-Medium': 'LIK102',
        'Imiej-Medium': 'JAL101',
        'RES-Large': 'MAJ122',
        'Loen-Medium': 'NAU101',
        'Ejit-Medium': 'KIL101',
        'Enejelaar-Small': 'ALU102',
        'Kattiej-Small': 'AIL106',
        'Jebwan-Small': 'AIL105',
        'Mejit-Medium': 'MEJ101',
        'Laura Public-Large': 'MAJ109',
        'Ollet-Small': 'MAL104',
        'Enniburr-Medium': 'KWA111',
        'Mejatto-Medium': 'RON101',
        'Ine-Medium': 'ARN103',
        'Kili-Medium': 'KIL102',
        'Wodmeej-Small': 'WTH102',
        'Nallo-Medium': 'MIL104',
        'Buoj-Medium': 'AIL101',
        'Kaven-Small': 'MAL103',
        'Matolen-Medium': 'ARN108',
        'Lukoj-Small': 'ARN107',
        'Tinak-Medium': 'ARN109',
        'Mejel-Small': 'AIL107',
        'Ailuk-Medium': 'ALU101',
        'Lib-Medium': 'LIB101',
        'Jah-Small': 'AIL103',
        'Namu-Small': 'NAU104',
        'Ebadon-Small': 'KWA102',
        'Ajeltake-Large': 'MAJ101',
        'Kilange-Medium': 'ARN105',
        'Bikarej-Medium': 'ARN102',
        'Majken-Medium': 'NAU103',
        'Imroj-Medium': 'JAL102',
        'UES-Large': 'ARN111', #?
        'Enekoion-Small': 'EBO102',
        'DES-Large': 'MAJ105',
        'Longar-Medium': 'ARN106',
        'Melang-Small': 'LIK103',
        'Tobal-Medium': 'AUR102',
        'Woja A-Medium': 'AIL108',
        'Ujae-Medium': 'UJA101',
        'Tarawa-Medium': 'MAJ105',
        'Narmej-Medium': 'JAL108',
        'Rita Christian-Private Seconday': 'MAJ124',
        'Rongrong Christian High-Private Secondary': 'MAJ125', #?
        'Marshall Christian-Private Secondary': 'MAJ118',
        'Long Island-Majuro': 'MAJ112',
        'Laura High School-Majuro': 'MAJ111',
        'Rongrong Christian School-Private': 'MAJ125', #?  
        'Aerok M-Maleolap': 'MAL101',
        'Marshall Islands High School-Majuro': 'MAJ119',
        'Majuro Deaf School-Majuro': 'MAJ118', #?
        'Deaf Center-Majuro': 'MAJ118', #?
        'Majuro Deaf Center-Majuro': 'MAJ118', #?
        'Life Skills Academy-Majuro': 'MAJ113',
        'Marshall Christian High-Private Secondary': 'MAJ118',
        'Rita Christian High-Private Secondary': 'MAJ124',
        'Gem High School-Private Secondary': 'KWA113',
        'Rongrong-Rongrong': 'MAJ125', #?
        'Delap Calvary-Private': 'MAJ107', #?
        'Longar-Arno ': 'ARN106',
        'Ine-Arno ': 'ARN103',
        'Japo-Arno ': 'ARN104',
        'Lib -Lib ': 'LIB101',
        'Loen -Namu': 'NAU101',
        'Majken -Namu': 'NAU103',
        'Laura-Majuro': 'MAJ109',
        'Wodmeej-Wotje ': 'WTH102',
        'Wotje -Wotje ': 'WTH103',
        'Bikarej-Arno ': 'ARN102',
        'Nallo -Mili': 'MIL104',
        'Matolen-Arno ': 'ARN108',
        'Tutu-Arno ': 'ARN110',
        'Mejirirok -Jaluit': 'JAL107',
        'Assumption -Private': 'MAJ103',
        'Tinak-Arno ': 'ARN109',
        'Kilange-Arno ': 'ARN105',
        'Arno -Arno ': 'ARN101',
        'Airok A-Aelonlaplap': 'AIL100',
        'Ulien-Arno ': 'ARN111',
        'Kattiej -Aelonlaplap': 'AIL106',
        'Majuro Deaf Center-Private Secondary': 'MAJ118', #?
        'Ebeye Deaf Edu. -Private Secondary': 'MAJ118', #?
        'Ebeye Deaf Center -Private Secondary': 'MAJ118', #?
        'Rairok-Public ': 'MAJ121',
        'Ujae-Public ': 'UJA101',
        'Jeh-Public ': 'AIL104',
        'Enekoion-Public ': 'EBO102',
        'Mejel-Public ': 'AIL107',
        'Loen-Public ': 'NAU101',
        'Toka-Public ': 'EBO103',
        'Ajeltake-Public ': 'MAJ101',
        'Laura-Public ': 'MAJ109',
        'Kaven-Public ': 'MAL103',
        'Aerok A-Public ': 'AIL100',
        'Tokewa-Public ': 'MIL105',
        'Narmej-Public ': 'JAL108',
        'Tarawa-Public ': 'MAL105',
        'Lukonwod-Public ': 'MIL102',
        'Woja M-Private': 'MAJ127',
        'Majkin-Public ': 'NAU103',
        'Mili-Public ': 'MIL103',
        'Jabnoden-Public ': 'JAL103',
        'Ailuk-Public ': 'ALU101',
        'Nallo-Public ': 'MIL104',
        'Ebadon-Public ': 'KWA102',
        'Tutu-Public ': 'ARN110',
        'Ollet-Public ': 'MAL104',
        'Jebwan-Public ': 'AIL105',
        'Ulien-Public ': 'ARN111',
        'Kili-Public ': 'KIL102',
        'Longar-Public ': 'ARN106',
        'Wotje-Public ': 'WTH103',
        'Ebeye Public-Public ': 'KWA108',
        'Aur-Public ': 'AUR101',
        'Namu-Public ': 'NAU104',
        'Likiep-Public ': 'LIK102',
        'Jah-Public ': 'AIL103',
        'Carlos-Public ': 'KWA101',
        'Wotto-Public ': 'WOT101',
        'Mejit-Public ': 'MEJ101',
        'Bikarej-Public ': 'ARN102',
        'Majuro Baptist-Private': 'MAJ114',
        'Arno-Public ': 'ARN101',
        'Mejurirok-Public': 'JAL107',
        'Wodmej-Public': 'WTH102',
        'Majuro Coop-Private ': 'MAJ116',
        'Delap-Public': 'MAJ105',
        'Ebeye Calvary-Public': 'KWA103',
        'Ebeye Christian-Private ': 'KWA105',
        'Assumption-Private ': 'MAJ103',
        'Long Island-Public': 'MAJ112',
        'Jebro-Private ': 'KWA115',
        'Woja M-Public ': 'MAJ127',
        'Rita Christian-Private ': 'MAJ123',
        'Delap SDA-Private ': 'MAJ107',
        'Ebeye SDA-Private ': 'KWA109',
        'Laura Public-Public ': 'MAJ109',
        'Queen of Peace-Private ': 'KWA117',
        'St. Joseph-Private ': 'JAL109',
        'Laura SDA-Private ': 'MAJ110',
        'Majuro Baptist Christian Academy-Private ': 'MAJ114', #?
        'Gem Christian -Private ': 'KWA112', #?
        'Ajeltake Christian Academy-Private ': 'MAJ102',
        'Jabnodren-Southern': 'JAL103',
        'Imroj-Southern': 'JAL102',
        'Wodmeej-Northern': 'WTH102',
        'Jabat-Central': 'JAB101',
        'Tarawa-Northern': 'MAL105',
        'Wotje-Northern': 'WTH103',
        'Kilange-Eastern': 'ARN105',
        'Ejit-Southern': 'KIL101',
        'Kili -Southern': 'KIL102',
        'Ebon-Southern': 'EBO101',
        'Jebwan-Central': 'AIL105',
        'Imiej-Southern': 'JAL101',
        'Ulien-Eastern': 'ARN111',
        'Namdrik-Southern': 'NAM101',
        'Lukoj-Eastern': 'ARN107',
        'Tobal-Northern': 'AUR102',
        'Arno-Eastern': 'ARN101',
        'Loen-Central': 'NAU101',
        'Ollet-Northern': 'MAL104',
        'Majken-Central': 'NAU103',
        'Mae-Central': 'NAU102',
        'Bikarej-Eastern': 'ARN102',
        'Narmij-Southern': 'JAL108',
        'Jaluit-Southern': 'JAL105',
        'Tokewa-Eastern': 'MIL105',
        'Mejrirok-Southern': 'JAL107',
        'Lae-Western': 'LAE101',
        'Enejet-Eastern': 'MIL101',
        'Ailuk-Northern': 'ALU101',
        'Ine-Eastern': 'ARN103',
        'Jepal-Northern': 'LIK101',
        'Jang-Northern': 'MAL102',
        'Nallo-Eastern': 'MIL104',
        'Matolen-Eastern': 'ARN108',
        'Enekoion-Southern': 'EBO102',
        'Enewa-Central': 'AIL102',
        'Wotto-Western': 'WOT101',
        'Aerok M-Northern': 'MAL101',
        'Aur-Northern': 'AUR101',
        'Airok A-Central': 'AIL100',
        'Imroj -Southern': 'JAL102',
        'Enewetak-Eastern': 'ENE101',
        'Enejelaar-Northern': 'ALU102',
        'Jah-Central': 'AIL103',
        'Buoj-Central': 'AIL101',
        'Mejit-Northern': 'MEJ101',
        'Likiep-Northern': 'LIK102',
        'Mejatto-Kwajalein': 'RON101',
        'Tutu-Eastern': 'ARN110',
        'Longar-Eastern': 'ARN106',
        'Jeh-Central': 'AIL104',
        'Toka-Southern': 'EBO103',
        'Ujae-Western': 'UJA101',
        'Namu-Central': 'NAU104',
        'Woja A-Central': 'AIL108',
        'Jabor-Southern': 'JAL104',
        'Kili-Southern': 'KIL102',
        'Mili-Eastern': 'MIL103',
        'Utrik-Northern': 'UTR101',
        'Tinak-Eastern': 'ARN109',
        'Lib-Western': 'LIB101',
        'Mejel-Central': 'AIL107',
        'Japo-Eastern': 'ARN104',
        'Kattiej-Central': 'AIL106',
        'North Delap-Majuro': 'MAJ126',
        'Kaven-Northern': 'MAL103',
        'St.Joseph-Private': 'JAL109',
        'Lukonwod-Eastern': 'MIL102',
        'Majkin-Central': 'NAU103',
        'Mejatto-Western': 'RON101',
        'Rita Christian -Private': 'MAJ123',
        'Rairok-Majuro ': 'MAJ121',
        'Mejurirok-Southern': 'JAL107',
        'Narmej-Southern': 'JAL108',
        'Melang-Northern': 'LIK103',
        'Assumption HS-Ailinglaplap': 'MAJ104',
        'Jebro HS-Ailinglaplap': 'KWA114',
        'Majuro Baptist HS-Ailinglaplap': 'MAJ115',
        'Ebeye Calvary HS-Ailinglaplap': 'KWA104',
        'LHS -Ailinglaplap': 'MAJ111',
        'NIHS -Ailinglaplap': 'WTH101',
        'KAHS-Ailinglaplap': 'KWA116',
        'Gem HS-Ailinglaplap': 'KWA113',
        'Majuro Coop HS-Ailinglaplap': 'MAJ117',
        'Rita Christian HS-Ailinglaplap': 'MAJ124',
        'Father Hacker HS-Ailinglaplap': 'KWA118',
        'MCHS -Ailinglaplap': 'MAJ117',
        'Delap SDA HS-Ailinglaplap': 'MAJ108',
        'LHS-Ailinglaplap': 'MAJ111',
        'JHS-Ailinglaplap': 'KWA114',
        'Ebeye SDA HS-Ailinglaplap': 'KWA110',
        'MIHS -Ailinglaplap': 'MAJ119',
        'Enekoion -Public': 'EBO102',
        'Enewa -Public': 'AIL102',
        'Jabor -Public': 'JAL104',
        'Jabnoden -Public': 'JAL103',
        'Toka -Public': 'EBO103',
        'Lukonwod -Public': 'MIL102',
        'Mejit -Public': 'MEJ101',
        'Delap  -Public': 'MAJ105',
        'North Delap-Public': 'MAJ126',
        'Ujae -Public': 'UJA101',
        'Mili -Public': 'MIL103',
        'Loen -Public': 'NAU101',
        'Bouj -Public': 'AIL101',
        'Lae -Public': 'LAE101',
        'Airok  M-Public': 'MAL101',
        'Imroj -Public': 'JAL102',
        'Jabot -Public': 'JAB101',
        'Matolen -Public': 'ARN108',
        'Gem -Private': 'KWA112',
        'Ebeye SDA -Private': 'KWA109',
        'Mejel -Public': 'AIL107',
        'Tarawa -Public': 'MAL105',
        'Ulien -Public': 'ARN111',
        'Namdrik -Public': 'NAM101',
        'Jeh -Public': 'AIL104',
        'Enejet -Public': 'MIL101',
        'Rairok -Public': 'MAJ121',
        'Mejatto -Public': 'RON101',
        'Rita-Public': 'MAJ122',
        'Melang -Public': 'LIK103',
        'Ine -Public': 'ARN103',
        'MDEC-Public': 'MAJ120', #?
        'Majkin -Public': 'NAU103',
        'Ebon -Public': 'EBO101',
        'Imiej -Public': 'JAL101',
        'Aur -Public': 'AUR101',
        'Ebadon -Public': 'KWA102',
        'Jaluit -Public': 'JAL105',
        'Ollet -Public': 'MAL104',
        'Ailuk -Public': 'ALU101',
        'St. Joseph -Private': 'JAL109',
        'Arno -Public': 'ARN101',
        'Enniburr -Public': 'KWA111',
        'Tokewa -Public': 'MIL105',
        'Kilange -Public': 'ARN105',
        'Nallo -Public': 'MIL104',
        'Laura SDA -Private': 'MAJ110',
        'Likiep -Public': 'LIK102',
        'Delap SDA -Private': 'MAJ107',
        'Tinak -Public': 'ARN109',
        'Jah -Public': 'AIL103',
        'Queen of Peace-Kwajalein': 'KWA117',
        'Laura -Majuro': 'MAJ109',
        'Gem Christian School-Kwajalein': 'KWA113',
        'Enewa -Ailinglaplap': 'AIL102',
        'Narmej -Jaluit': 'JAL108',
        'Ollet -Ollet ': 'MAL104',
        'Jaluit -Jaluit ': 'JAL105',
        'Woja M -Majuro': 'MAJ127',
        'Mejit -Mejit ': 'MEJ101',
        'Bikarej -Arno': 'ARN102',
        'Kilange -Arno': 'ARN105',
        'LIB-LIB': 'LIB101',
        'Bouj -Ailinglaplap': 'AIL101',
        'Enejet -Mili': 'MIL101',
        'Tinak -Arno': 'ARN109',
        'Wodmej -Wotje': 'WTH102',
        'Imiej -Jaluit': 'JAL101',
        'Melang -Likiep': 'LIK103',
        'Ebon -Ebon ': 'EBO101',
        'Jabnoden -Jaluit': 'JAL103',
        'Ebeye Calvary -Kwajalein': 'KWA103',
        'Imroj -Jaluit': 'JAL102',
        'Jebal -Likiep': 'LIK101',
        'Woja A -Ailinglaplap': 'AIL108',
        'Ajeltake Chistian Academy-Majuro': 'MAJ102',
        'Lae -Lae ': 'LAE101',
        'Delap SDA -Majuro': 'MAJ107',
        'Jabor -Jaluit': 'JAL104',
        'Ailuk -Ailuk': 'ALU101',
        'MDED-Majuro': 'MAJ120', #?
        'Enejelaar -Ailuk': 'ALU102',
        'Tutu -Arno': 'ARN110',
        'Jebro-Kwajalein': 'KWA115',
        'Kaven -Maloelap': 'MAL103',
        'Tarawa -Maloelap': 'MAL105',
        'NDES -Majuro': 'MAJ126',
        'Tokewa -Mili': 'MIL105',
        'Enewetak -Enewetak ': 'ENE101',
        'St. Joseph -Jaluit': 'JAL109',
        'Aur -Aur ': 'AUR101',
        'Namdrik -Namdrik ': 'NAM101',
        'Ujae -Ujae ': 'UJA101',
        'Ine -Arno': 'ARN103',
        'Kattiej -Ailinglaplap': 'AIL106',
        'Likiep -Likiep ': 'LIK102',
        'Ebeye Christian-Kwajalein': 'KWA105',
        'Rongrong-Majuro': 'MAJ125', #?
        'Ebeye SDA -Kwajalein': 'KWA109',
        'Jobwon -Ailinglaplap': 'AIL105',
        'Utrik -Utrik ': 'UTR101',
        'Laura SDA -Majuro': 'MAJ110',
        'NDES-Majuro': 'MAJ126',
        'Ulien -Arno': 'ARN111',
        'Mejel -Ailinglaplap': 'AIL107',
        'Japo -Arno': 'ARN104',
        'RongRong-Majuro': 'MAJ125', #?
        'Mejrirok -Jaluit': 'JAL107',
        'RongROng -Majuro': 'MAJ125', #?        
        'Ejit-Majuro': 'MAJ101', #?
        'Jaluit-Jaljuit': 'JAL105',
        'Aerok A-Maloelap': 'MAL101',
        'Jaluit-???': 'JAL105', #?
        'Rongrong Christian-Private': 'MAJ125',
        'Ujae -Ujae': 'UJA101',
        'Majkin -Namu': 'NAU103',
        'Likiep -Likiep': 'LIK102',
        'Namdrik -Namdrik': 'NAM101',
        'Enniburr -Kwajalein': 'KWA111',
        'Delap  -Majuro': 'MAJ105',
        'Jaluit -Jaluit': 'JAL105',
        'Arno -Arno': 'ARN101',
        'Mili -Mili': 'MIL103',
        'Matolen -Arno': 'ARN108',
        'Lukonwod -Mili': 'MIL102',
        'Mejatto -Rongelap': 'RON101',
        'Airok  M-Maloelap': 'MAL101',
        'MDEC-Majuro': 'MAJ105',
        'Lae -Lae': 'LAE101',
        'Ollet -Maloelap': 'MAL104',
        'Enekoion -Ebon': 'EBO102',
        'Mejit -Mejit': 'MEJ101',
        'Ebadon -Kwajalein': 'KWA102',
        'Enniburr-Public ': 'KWA111',
        'Assumption-Public ': 'MAJ103',
        'Enewetak-Public ': 'ENE101',
        'Enejet-Public ': 'MIL101',
        'Mejurirok-Public ': 'JAL107',
        'St.Joseph-Public ': 'JAL109',
        'Ebon-Public ': 'EBO101',
        'Laura SDA-Public ': 'MAJ110',
        'Enejelaar-Public ': 'ALU102',
        'Japo-Public ': 'ARN104',
        'Ebeye Calvary-Public ': 'KWA103',
        'Utrik-Public ': 'UTR101',
        'Majuro Baptist Christian Academy-Public ': 'MAJ114', #?
        'Airok A-Public ': 'AIL100',
        'Tinak-Public ': 'ARN109',
        'Jang-Public ': 'MAL102',
        'Jebro-Public ': 'KWA115', #?
        'Ejit-Public ': 'KIL101',
        'Ajeltake Christian Academy-Public ': 'MAJ102',
        'Namdrik-Public ': 'NAM101',
        'Gem Christian School-Public ': 'KWA112', #?
        'Kilange-Public ': 'ARN105',
        'DES-Public ': 'MAJ105',
        'Enewa-Public ': 'AIL102',
        'Ine-Public ': 'ARN103',
        'Jabat-Public ': 'JAB101',
        'Jaluit-Public ': 'JAL105',
        'North Delap-Public ': 'MAJ126',
        'Kattiej-Public ': 'AIL106',
        'Matolen-Public ': 'ARN108',
        'Lib-Public ': 'LIB101',
        'Melang-Public ': 'LIK103',
        'Wodmeej-Public ': 'WTH102',
        'Rita Christian-Public ': 'MAJ123',
        'Queen of Peace-Public ': 'KWA117',
        'Imiej-Public ': 'JAL101',
        'Imroj-Public ': 'JAL102',
        'Jabor-Public ': 'JAL104',
        'Ebeye Christian-Public ': 'KWA105',
        'Ebeye SDA-Public ': 'KWA109',
        'Rongrong-Public ': 'MAJ125',
        'Lae-Public ': 'LAE101',
        'Jepal-Public ': 'LIK101',
        'Majken-Public ': 'NAU103',
        'RES-Public ': 'MAJ121', #?
        'Jabnodren-Public ': 'JAL103',
        'Buoj-Public ': 'AIL101',
        'Majuro Coop-Public ': 'MAJ116',
        'Long Island-Public ': 'MAJ112',
        'Mae-Public ': 'NAU102',
        'Delap SDA-Public ': 'MAJ107',
        'Woja A-Public ': 'AIL108',
        'Tobal-Public ': 'AUR102',
        'Ebon -Public ': 'EBO101',
        'Kili -Public ': 'KIL102',
        'Japo -Public ': 'ARN104',
        'Wodmej -Public ': 'WTH102',
        'Jabor -Public ': 'JAL104',
        'Utrik -Public ': 'UTR101',
        'Tutu -Public ': 'ARN110',
        'Jeh -Public ': 'AIL104',
        'Mae -Public ': 'NAU102',
        'Kilange -Public ': 'ARN105',
        'Tarawa -Public ': 'MAL105',
        'Toka -Public ': 'EBO103',
        'Ebeye Calvary -Private': 'KWA103',
        'Tobal -Public ': 'AUR102',
        'Ollet -Public ': 'MAL104',
        'Kattiej -Public ': 'AIL106',
        'Enewetak -Public ': 'ENE101',
        'Likiep -Public ': 'LIK102',
        'Namdrik -Public ': 'NAM101',
        'Narmej -Public ': 'JAL108',
        'Ine -Public ': 'ARN103',
        'Kaven -Public ': 'MAL103',
        'Enewa -Public ': 'AIL102',
        'Woja M -Public ': 'MAJ127',
        'Jah -Public ': 'AIL103',
        'Tinak -Public ': 'ARN109',
        'Enejet -Public ': 'MIL101',
        'Ailuk -Public ': 'ALU101',
        'Nallo -Public ': 'MIL104',
        'Ebeye Deaf Center -Kwajalein': 'KWA107', #?
        'Aur -Public ': 'AUR101',
        'RongRong-Private': 'MAJ125',
        'Imroj -Public ': 'JAL102',
        'Rairok -Public ': 'MAJ121',
        'Lae -Public ': 'LAE101',
        'Woja A -Public ': 'AIL108',
        'Loen -Public ': 'NAU101',
        'Ulien -Public ': 'ARN111',
        'Carlos -Public ': 'KWA101',
        'Jabnoden -Public ': 'JAL103',
        'Ujae -Public ': 'UJA101',
        'Jaluit -Public ': 'JAL105',
        'Tokewa -Public ': 'MIL105',
        'Mejit -Public ': 'MEJ101',
        'Laura Public -Public ': 'MAJ109',
        'Enniburr -Public ': 'KWA111',
        'Jepal -Public ': 'LIK101',
        'Imiej -Public ': 'JAL101',
        'Majuro Deaf Center -Public ': 'MAJ118', #?
        'Ebeye Public -Public ': 'KWA108',
        'Arno -Public ': 'ARN101',
        'Jang -Public ': 'MAL102',
        'Rongrong-Private ': 'MAJ125',
        'UES-Public ': 'ARN111', #?
        'Bikarej -Public ': 'ARN102',
        'Ebadon -Public ': 'KWA102',
        'Gem Christian School-Private ': 'KWA112',
        'Melan-Public ': 'LIK103',
        'Longar -Public ': 'ARN106',
        'Ebeye Cavalry-Private Primary': 'KWA103',
        'Laura SDA-Private Primary': 'MAJ110',
        'Majuro Baptist Christian Academy-Private Primary': 'MAJ114',
        'RongRong Christian-Private Primary': 'MAJ125',
        'Majuro Coop-Private Primary': 'MAJ116',
        'Assumption-Private Primary': 'MAJ103',
        'Ebeye SDA-Private Primary': 'KWA109',
        'St. Joseph-Private Primary': 'JAL109',
        'Ebeye Christian-Private Primary': 'KWA105',
        'Ajeltake Chistian Academy-Private Primary': 'MAJ102',
        'Jebro-Private Primary': 'KWA115',
        'Delap SDA-Private Primary': 'MAJ107',
        'Gem Christian School-Private Primary': 'KWA112',
        'Queen of Peace-Private Primary': 'KWA117',
        'Rita Christian-Private Primary': 'MAJ123',
        'x-x': 'AIL100', #?
        'xx-xx': 'AIL100', #?
        'xxx-xxx': 'AIL100', #?        
        'xxxx-xxxx': 'AIL100', #?
        'X-X': 'AIL100', #?
        'XX-XX': 'AIL100', #?
        'XXX-XXX': 'AIL100', #?        
        'XXXX-XXXX': 'AIL100', #?
        '?-?': 'AIL100', #?
        '??-??': 'AIL100', #?
        '???-???': 'AIL100', #?        
        '????-????': 'AIL100', #?
    }
    schools_lookup_from_exams = {y:x for x,y in schools_lookup_from_exams_byname.items()}
    
    if testing:
        print('schools_lookup_from_exams_byname')
        #pp.pprint(dict(schools_lookup_from_exams_byname))
        pp.pprint(dict(itertools.islice(schools_lookup_from_exams_byname.items(), 3)))

    # ??? Check if this is primary or elementary, some have same school names so use
    # grade of test to define the school

    # Create a temporary SchoolName and SchoolIsland joined
    df['SchoolTemp'] = df.agg('{0[SchoolName]}-{0[IslandName]}'.format, axis=1)
    if testing:
        print('Cleaning schools SchoolTemp')
        display(df['SchoolTemp'])
    
    # Upper case all school ID
    # Not needed perhaps?
    #df['SchoolID'] = df['SchoolID'].astype(str)
    #df['SchoolID'] = df['SchoolID'].str.upper()
    
    # Check if the school ID in the exams data file exists in the EMIS
    # and create temporary school name for those
    #s_school_ids1 = df['SchoolID'].map(schools_lookup_byname)
    #df = df.assign(SchoolIDTemp1 = s_school_ids1)
    mask = df['SchoolID'].isin(df_schools['schNo'].values)
    df['SchoolIDTemp1'] = df['SchoolID'].where(mask)
    # Check if the school name in the exams data file has a mapping hard coded (old/incorrect schoolIDs)
    # and create temporary school name for those
    s_school_ids2 = df['SchoolTemp'].map(schools_lookup_from_exams_byname)
    df = df.assign(SchoolIDTemp2 = s_school_ids2)

    # Coalesce to get the school ID
    # Use bfill if I end up using more then two columns to coalesce
    # https://stackoverflow.com/questions/38152389/coalesce-values-from-2-columns-into-a-single-column-in-a-pandas-dataframe
    df['SchoolIDFinal'] = df.SchoolIDTemp1.combine_first(df.SchoolIDTemp2)
    df['SchoolNameFinal'] = df['SchoolIDFinal'].map(schools_lookup)

    # Check if there is a school that does not have a known
    # mapping either from the EMIS' df_schools or the manually
    # maintained above mapping (old ID, incorrect ones, etc.)
    # If True look at the source file
    if df['SchoolNameFinal'].isnull().values.any():
        print('SchoolID still unknown/check source (hint from data): ', df[:1].iloc[:, : 10].to_csv(index=False, header=False))
        print('All school name and island name combination not yet part of hard coded mapping:')
        unique_combination = set(df['SchoolTemp'].unique())
        unique_combination_mapped = set(schools_lookup_from_exams_byname.keys())
        unique_combination_not_mapped = unique_combination.difference(unique_combination_mapped)
        for i in unique_combination_not_mapped:
            print("'" + i + "': '',")
    if testing:
        print('DataFrame with records with schoolID still unknown.')
        display(df[df['SchoolNameFinal'].isnull()])
    
    df = df.drop(['SchoolID','SchoolName','SchoolTemp','SchoolIDTemp1','SchoolIDTemp2'], 1)
    df = df.rename(columns = {'SchoolIDFinal': 'SchoolID','SchoolNameFinal': 'SchoolName'})

    if testing:
        print('Cleaned schools DataFrame')
        display(df)
        
    return df
        
def clean_items(df, testing=False):
    """ Does any cleanup/validation needed with Items (test responses.)

    Columns whose name starts with ``Item_`` are renamed to the upper-cased
    first two underscore-separated parts of their name (``Item_001_xyz``
    becomes ``ITEM_001``). Within those columns:

    * answers matching one of the multiple-answer patterns, like (A,C) or
      A&B, are replaced with ``'MULT'``
    * missing values are replaced with ``'BLANK'``
    * every response is converted to an upper-case string

    Parameters
    ----------
    df : DataFrame, required
        The student results and enrol DataFrame with items to clean
    testing : bool, optional
        When True print/display intermediate results (default False)

    Returns
    -------
    DataFrame
        The renamed DataFrame with cleaned item columns. When the cleaning
        raises a ValueError (e.g. two columns ending up with the same
        ``ITEM_`` name) a hint is printed and the remaining cleaning steps
        are skipped.
    """

    def simplify_items(x):
        """ If column an item make it uppercase and strip the redundant string
        """
        if x.startswith('Item_'):
            # Keep only the 'Item' prefix and the item number
            return '_'.join(x.split('_', 2)[:2]).upper()
        return x

    # Re-arrange and rename item columns
    df = df.rename(columns = simplify_items)

    # Get list of items columns
    cols = df.columns[df.columns.str.startswith('ITEM_')].tolist()
    if testing:
        print('Cleaned items columns:', cols)
        print('Cleaned items columns length:', len(cols))
        print('Cleaned items Item-only DataFrame.columns length:', len(df[cols].columns))
        print('Cleaned items Item-only DataFrame.columns')
        display(df[cols].columns)

    try:
        # Remove all multiple answers (all the time?). Answers like (A,C), A&B, etc.
        # Raw strings: '\(' and '\D' are not valid Python string escapes
        df[cols] = df[cols].replace(to_replace=r'\(\D+?\)', value='MULT', regex=True)
        df[cols] = df[cols].replace(to_replace=r'[A-D].+', value='MULT', regex=True)

        # Insert string 'BLANK' where na
        df[cols] = df[cols].fillna('BLANK')

        # Upper case all responses. The result must be assigned back,
        # otherwise the conversion is computed and thrown away. It is kept
        # inside the try since the assignment fails the same way as the
        # ones above when item column names are duplicated.
        df[cols] = df[cols].apply(lambda x: x.astype(str).str.upper())
    except ValueError as e:
        cols1 = len(cols)
        cols2 = len(df[cols].columns)
        print('Cleaned items possible duplicate item. Columns starting with ITEM_ is {} while DataFrame columsn is {} (hint from data): '.format(cols1, cols2),
              df[:1].iloc[:, : 10].to_csv(index=False, header=False))
        print('Error was: ', e)

    if testing:
        print('Cleaned items DataFrame.')
        display(df)

    return df

def clean_students(df, testing=False):
    """ Cleanup students data here. There is stuff from other functions like in the merge
    above that could be put more cleanly here.

    Steps, in order:

    * rows with a missing ``stuCardID`` get a generated UUID, drawn from the
      module-level seeded random generator ``rd``
    * ``stuGender`` is coalesced with ``Gender`` (``stuGender`` wins), stripped
      of surrounding spaces and mapped to ``'M'``/``'F'``; values without a
      mapping become NaN and are reported
    * ``SpEdCode`` and ``Accommodation`` are mapped to ``'Yes'``/``'No'``;
      values without a mapping become NaN
    * runs of ``?`` in ``StudentName`` (including adjacent spaces) are
      replaced with ``'Unknown'``

    The columns of the passed DataFrame are modified in place before the
    final drop/rename produces the returned DataFrame.

    Parameters
    ----------
    df : DataFrame, required
        The student results and enrol DataFrame
    testing : bool, optional
        When True print/display intermediate results (default False)

    Returns
    -------
    DataFrame
        The DataFrame where the original ``Gender`` column is replaced by
        the cleaned one.
    """

    genders = {
        'Male': 'M',
        'MALE': 'M',
        'm': 'M',
        'M': 'M',
        'M,': 'M',
        'Boy': 'M',
        'BOY': 'M',
        'b': 'M',
        'B': 'M',
        'n': 'M',
        'mm': 'M',
        'MM': 'M',
        'FM': 'M',
        'm  m': 'M',
        'Bm': 'M',
        'Female': 'F',
        'FEMALE': 'F',
        'f': 'F',
        'F': 'F',
        'Girl': 'F',
        'GIRL': 'F',
        'g': 'F',
        'G': 'F',
        'I': 'F',
        '`f': 'F',
        'FF': 'F',
        'ff': 'F',
        '?': 'M',
        '??': 'M',
        '???': 'M',
        '????': 'M',
        'BLANK': 'M',
        'a': 'F',
        'A': 'F',
        'BLANKm': 'M',
        ',': 'M',
        'ERROR #3100': 'M',
        '**': 'M',
    }

    yesno = {
        'Yes': 'Yes',
        'YEs': 'Yes',
        'YES': 'Yes',
        'yes': 'Yes',
        'y': 'Yes',  # short form of yes (mirrors 'n' below)
        'No': 'No',
        'NO': 'No',
        'no': 'No',
        'n': 'No',
    }

    # Adjust the correct student ID where possible and generate UUID for all others

    # At this point student ID already there come from the EMIS
    # Perhaps it might be useful to build a list of automatically assigned
    # UUIDs as they get processed and look there as well?!
    missing_id_mask = df['stuCardID'].isna()
    if missing_id_mask.any():
        # One generated UUID per row without a student ID, in row order
        df.loc[missing_id_mask, 'stuCardID'] = [
            uuid.UUID(int=rd.getrandbits(128), version=4)
            for _ in range(int(missing_id_mask.sum()))
        ]

    # Coalesce student genders and strip out leading/trailing spaces
    gender_raw = df.stuGender.combine_first(df.Gender).str.strip()

    # Clean genders
    df['GenderFinal'] = gender_raw.map(genders)
    # Tell me if the DataFrame has any unkown gender
    if df['GenderFinal'].hasnans:
        # Report the raw values that have no mapping. The mapped column can
        # only contain 'M', 'F' or NaN at this point so it would not tell
        # which values need to be added to the mapping.
        unmapped = set(gender_raw[df['GenderFinal'].isna()].unique())
        print('Some unknown gender detected. Here are the remaining ones:', unmapped)
        if testing:
            print('These records have no gender')
            display(df[df['GenderFinal'].isna()])

    # Clean some boolean
    df['SpEdCode'] = df['SpEdCode'].map(yesno)
    df['Accommodation'] = df['Accommodation'].map(yesno)

    # Student names with *

    # Student names with ???
    # Raw string: '\?' is not a valid Python string escape
    df['StudentName'] = df['StudentName'].replace(to_replace=r' *\?+ *', value='Unknown', regex=True)

    # Student names repeating

    if testing:
        print('Cleaned students DataFrame')
        display(df)

    # Keyword form: drop() no longer accepts axis as a positional argument
    # in recent pandas versions
    df = df.drop(columns=['Gender'])
    df = df.rename(columns = {'GenderFinal': 'Gender'})

    return df

def clean_teachers(df, testing=False):
    """ Mostly a placeholder at the moment. But might be desirable to cleanup
    teachers, add teacher ID, etc.

    Currently the only cleanup is replacing runs of ``?`` (including
    adjacent spaces) in the ``Teacher`` column with ``'Unknown'``. The
    column is modified on the passed DataFrame.

    Parameters
    ----------
    df : DataFrame, required
        The student results (contains teachers) and enrol DataFrame
    testing : bool, optional
        When True display the cleaned DataFrame (default False)

    Returns
    -------
    DataFrame
        The same DataFrame with the ``Teacher`` column cleaned
    """

    # Teachers names with ???
    # Raw string: '\?' is not a valid Python string escape
    df['Teacher'] = df['Teacher'].replace(to_replace=r' *\?+ *', value='Unknown', regex=True)

    if testing:
        print('Cleaned teachers DataFrame')
        display(df)

    return df

def convert_to_onlinesba(df, testing=False):
    """ Convert a cleaned results DataFrame to the OnlineSBA column layout:
    rename the fixed columns to their upper-case OnlineSBA names, drop the
    working columns that are not part of the load format and re-order so the
    fixed columns come first followed by the exam item columns.
    Function can be split if needed as this tool gets refined through
    practicalities of real life usage

    Parameters
    ----------
    df : DataFrame, required
        The cleaned student results and enrol DataFrame. It is not modified;
        a new DataFrame is returned.
    testing : bool, optional
        When True, print a heading and display the final DataFrame

    Raises
    ------
    KeyError
        If one of the nine fixed OnlineSBA columns is missing after renaming

    Returns
    -------
    DataFrame
        Columns SCHOOLYEAR, STUDENTID, SPED, ACCOM, STUDENTNAME, SCHOOLID,
        GENDER, TESTID, TEACHERNAME followed by the ITEM_* columns sorted
        by name
    """

    df = df.rename(columns = {
        'stuCardID': 'STUDENTID',
        'SpEdCode': 'SPED',
        'Accommodation': 'ACCOM',
        'StudentName': 'STUDENTNAME',
        'SchoolID': 'SCHOOLID',
        'Gender': 'GENDER',
        'TestID': 'TESTID',
        'Teacher': 'TEACHERNAME',
        'SchoolYear': 'SCHOOLYEAR'
        })

    # Use the columns= keyword: passing the axis positionally (drop([...], 1))
    # is rejected by pandas 2.x. errors='ignore' keeps this tolerant of
    # inputs where some of the working columns are already gone.
    df = df.drop(columns=[
        'RecordNo', 'TestName', 'IslandName', # 'SchoolYear',
        'SchoolName', 'StudentID', 'Ethnicity', 'Disability',
        'ELL', 'Migrant', 'FRLunch', 'StudentName2', 'Student',
        'stuGender', 'stuDoB', 'schNo', 'stueYear', 'Student2'], errors='ignore')

    # The number of exam items varies from one exam to the next
    cols_items = sorted(col for col in df.columns if 'ITEM_' in col)

    # Re-order. First set of columns will likely always be there and the same
    # followed by a varying number of exam items
    df = df[[
        'SCHOOLYEAR','STUDENTID','SPED','ACCOM','STUDENTNAME','SCHOOLID','GENDER','TESTID',
        'TEACHERNAME'] + cols_items]

    if testing:
        print('Final OnlineSBA DataFrame')
        display(df)

    return df

In [13]:
# Run the whole cleanup pipeline on the single student exams file (for testing)
# to produce a DataFrame in the OnlineSBA input format.
# Schools are cleaned first because that step also needs the schools lookup;
# every following step only takes the DataFrame produced by the previous one.
df_onlinesba = clean_schools(df_students_results_and_enrol, df_schools, testing=True)
for step in (clean_items, clean_students, clean_teachers, convert_to_onlinesba):
    df_onlinesba = step(df_onlinesba, testing=True)

schools_lookup_byname
{   'Aerok Elementary School': 'MAL101',
    'Ailuk Elementary School': 'ALU101',
    'Airok Elementary School': 'AIL100'}
schools_lookup_from_exams_byname
{   'Aerok A-Aelonlaplap': 'AIL100',
    'Buoj-Aelonlaplap': 'AIL101',
    'Enewa-Aelonlaplap': 'AIL102'}
Cleaning schools SchoolTemp


0       Ajeltake Christian Academy-Private 
1                       Assumption-Private 
2                       Assumption-Private 
3                       Assumption-Private 
4                       Assumption-Private 
                       ...                 
1031                   Majuro Coop-Private 
1032                   Majuro Coop-Private 
1033                            RES-Public 
1034                            RES-Public 
1035                            RES-Public 
Name: SchoolTemp, Length: 1036, dtype: object

DataFrame with records with schoolID still unknown.


Unnamed: 0,RecordNo,SchoolYear,TestID,TestName,IslandName,SchoolID,SchoolName,StudentID,StudentName,Gender,...,stuGender,stuDoB,schNo,stueYear,Student2,SchoolTemp,SchoolIDTemp1,SchoolIDTemp2,SchoolIDFinal,SchoolNameFinal


Cleaned schools DataFrame


Unnamed: 0,RecordNo,SchoolYear,TestID,TestName,IslandName,StudentID,StudentName,Gender,Ethnicity,Disability,...,StudentName2,stuCardID,Student,stuGender,stuDoB,schNo,stueYear,Student2,SchoolID,SchoolName
0,1,2018-19,M06,M06 - Grade 6 - Math Form A,Private,33,Bin Langmour,m,,,...,bin langmour,MH007442,Bin Langmour,F,2006-02-24,MAJ101,2016.0,bin langmour,MAJ102,Ajeltake Christian Academy
1,2,2018-19,M06,M06 - Grade 6 - Math Form A,Private,34,Alfonso Agustin,m,,,...,alfonso agustin,,,,,,,,MAJ103,Assumption Elementary School
2,3,2018-19,M06,M06 - Grade 6 - Math Form A,Private,35,Chorister Deke,m,,,...,chorister deke,,,,,,,,MAJ103,Assumption Elementary School
3,4,2018-19,M06,M06 - Grade 6 - Math Form A,Private,36,Chris Paul,m,,,...,chris paul,,,,,,,,MAJ103,Assumption Elementary School
4,5,2018-19,M06,M06 - Grade 6 - Math Form A,Private,37,Delaney Lorennij,m,,,...,delaney lorennij,,,,,,,,MAJ103,Assumption Elementary School
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1031,1032,2018-19,M06,M06 - Grade 6 - Math Form A,Private,166,Kirby Alik SPED,m,,,...,kirby alik sped,,,,,,,,MAJ116,Majuro Coop Elementary School
1032,1033,2018-19,M06,M06 - Grade 6 - Math Form A,Private,177,Yanzhe Hrang SPED,f,,,...,yanzhe hrang sped,,,,,,,,MAJ116,Majuro Coop Elementary School
1033,1034,2018-19,M06,M06 - Grade 6 - Math Form A,Public,758,Rema Moja SPED,f,,,...,rema moja sped,,,,,,,,MAJ122,Rita Elementary School
1034,1035,2018-19,M06,M06 - Grade 6 - Math Form A,Public,763,ROdney Anni SPED,m,,,...,rodney anni sped,,,,,,,,MAJ122,Rita Elementary School


Cleaned items columns: ['ITEM_001', 'ITEM_002', 'ITEM_003', 'ITEM_004', 'ITEM_005', 'ITEM_006', 'ITEM_007', 'ITEM_008', 'ITEM_009', 'ITEM_010', 'ITEM_011', 'ITEM_012', 'ITEM_013', 'ITEM_014', 'ITEM_015', 'ITEM_016', 'ITEM_017', 'ITEM_018', 'ITEM_019', 'ITEM_020', 'ITEM_021', 'ITEM_022', 'ITEM_023', 'ITEM_024', 'ITEM_025', 'ITEM_026', 'ITEM_027', 'ITEM_028', 'ITEM_029', 'ITEM_030', 'ITEM_031', 'ITEM_032', 'ITEM_033', 'ITEM_034', 'ITEM_035', 'ITEM_036', 'ITEM_037', 'ITEM_038', 'ITEM_039', 'ITEM_040']
Cleaned items columns length: 40
Cleaned items Item-only DataFrame.columns length: 40
Cleaned items Item-only DataFrame.columns


Index(['ITEM_001', 'ITEM_002', 'ITEM_003', 'ITEM_004', 'ITEM_005', 'ITEM_006',
       'ITEM_007', 'ITEM_008', 'ITEM_009', 'ITEM_010', 'ITEM_011', 'ITEM_012',
       'ITEM_013', 'ITEM_014', 'ITEM_015', 'ITEM_016', 'ITEM_017', 'ITEM_018',
       'ITEM_019', 'ITEM_020', 'ITEM_021', 'ITEM_022', 'ITEM_023', 'ITEM_024',
       'ITEM_025', 'ITEM_026', 'ITEM_027', 'ITEM_028', 'ITEM_029', 'ITEM_030',
       'ITEM_031', 'ITEM_032', 'ITEM_033', 'ITEM_034', 'ITEM_035', 'ITEM_036',
       'ITEM_037', 'ITEM_038', 'ITEM_039', 'ITEM_040'],
      dtype='object')

Cleaned items DataFrame.


Unnamed: 0,RecordNo,SchoolYear,TestID,TestName,IslandName,StudentID,StudentName,Gender,Ethnicity,Disability,...,StudentName2,stuCardID,Student,stuGender,stuDoB,schNo,stueYear,Student2,SchoolID,SchoolName
0,1,2018-19,M06,M06 - Grade 6 - Math Form A,Private,33,Bin Langmour,m,,,...,bin langmour,MH007442,Bin Langmour,F,2006-02-24,MAJ101,2016.0,bin langmour,MAJ102,Ajeltake Christian Academy
1,2,2018-19,M06,M06 - Grade 6 - Math Form A,Private,34,Alfonso Agustin,m,,,...,alfonso agustin,,,,,,,,MAJ103,Assumption Elementary School
2,3,2018-19,M06,M06 - Grade 6 - Math Form A,Private,35,Chorister Deke,m,,,...,chorister deke,,,,,,,,MAJ103,Assumption Elementary School
3,4,2018-19,M06,M06 - Grade 6 - Math Form A,Private,36,Chris Paul,m,,,...,chris paul,,,,,,,,MAJ103,Assumption Elementary School
4,5,2018-19,M06,M06 - Grade 6 - Math Form A,Private,37,Delaney Lorennij,m,,,...,delaney lorennij,,,,,,,,MAJ103,Assumption Elementary School
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1031,1032,2018-19,M06,M06 - Grade 6 - Math Form A,Private,166,Kirby Alik SPED,m,,,...,kirby alik sped,,,,,,,,MAJ116,Majuro Coop Elementary School
1032,1033,2018-19,M06,M06 - Grade 6 - Math Form A,Private,177,Yanzhe Hrang SPED,f,,,...,yanzhe hrang sped,,,,,,,,MAJ116,Majuro Coop Elementary School
1033,1034,2018-19,M06,M06 - Grade 6 - Math Form A,Public,758,Rema Moja SPED,f,,,...,rema moja sped,,,,,,,,MAJ122,Rita Elementary School
1034,1035,2018-19,M06,M06 - Grade 6 - Math Form A,Public,763,ROdney Anni SPED,m,,,...,rodney anni sped,,,,,,,,MAJ122,Rita Elementary School


Cleaned students DataFrame


Unnamed: 0,RecordNo,SchoolYear,TestID,TestName,IslandName,StudentID,StudentName,Gender,Ethnicity,Disability,...,stuCardID,Student,stuGender,stuDoB,schNo,stueYear,Student2,SchoolID,SchoolName,GenderFinal
0,1,2018-19,M06,M06 - Grade 6 - Math Form A,Private,33,Bin Langmour,m,,,...,MH007442,Bin Langmour,F,2006-02-24,MAJ101,2016.0,bin langmour,MAJ102,Ajeltake Christian Academy,F
1,2,2018-19,M06,M06 - Grade 6 - Math Form A,Private,34,Alfonso Agustin,m,,,...,e3e70682-c209-4cac-a29f-6fbed82c07cd,,,,,,,MAJ103,Assumption Elementary School,M
2,3,2018-19,M06,M06 - Grade 6 - Math Form A,Private,35,Chorister Deke,m,,,...,f728b4fa-4248-4e3a-8a5d-2f346baa9455,,,,,,,MAJ103,Assumption Elementary School,M
3,4,2018-19,M06,M06 - Grade 6 - Math Form A,Private,36,Chris Paul,m,,,...,eb1167b3-67a9-4378-bc65-c1e582e2e662,,,,,,,MAJ103,Assumption Elementary School,M
4,5,2018-19,M06,M06 - Grade 6 - Math Form A,Private,37,Delaney Lorennij,m,,,...,f7c1bd87-4da5-4709-9471-3d60c8a70639,,,,,,,MAJ103,Assumption Elementary School,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1031,1032,2018-19,M06,M06 - Grade 6 - Math Form A,Private,166,Kirby Alik SPED,m,,,...,d35169bc-0044-4690-b17a-7c84cb3d7ee7,,,,,,,MAJ116,Majuro Coop Elementary School,M
1032,1033,2018-19,M06,M06 - Grade 6 - Math Form A,Private,177,Yanzhe Hrang SPED,f,,,...,e55b88ff-f086-4962-b771-f867dfa2e52b,,,,,,,MAJ116,Majuro Coop Elementary School,F
1033,1034,2018-19,M06,M06 - Grade 6 - Math Form A,Public,758,Rema Moja SPED,f,,,...,e83f3278-2c70-4901-9e3f-4f152f5c5cbe,,,,,,,MAJ122,Rita Elementary School,F
1034,1035,2018-19,M06,M06 - Grade 6 - Math Form A,Public,763,ROdney Anni SPED,m,,,...,9c017e5d-5c41-4fb5-a94a-29d3f73c3dce,,,,,,,MAJ122,Rita Elementary School,M


Cleaned teachers DataFrame


Unnamed: 0,RecordNo,SchoolYear,TestID,TestName,IslandName,StudentID,StudentName,Ethnicity,Disability,SpEdCode,...,stuCardID,Student,stuGender,stuDoB,schNo,stueYear,Student2,SchoolID,SchoolName,Gender
0,1,2018-19,M06,M06 - Grade 6 - Math Form A,Private,33,Bin Langmour,,,No,...,MH007442,Bin Langmour,F,2006-02-24,MAJ101,2016.0,bin langmour,MAJ102,Ajeltake Christian Academy,F
1,2,2018-19,M06,M06 - Grade 6 - Math Form A,Private,34,Alfonso Agustin,,,No,...,e3e70682-c209-4cac-a29f-6fbed82c07cd,,,,,,,MAJ103,Assumption Elementary School,M
2,3,2018-19,M06,M06 - Grade 6 - Math Form A,Private,35,Chorister Deke,,,No,...,f728b4fa-4248-4e3a-8a5d-2f346baa9455,,,,,,,MAJ103,Assumption Elementary School,M
3,4,2018-19,M06,M06 - Grade 6 - Math Form A,Private,36,Chris Paul,,,No,...,eb1167b3-67a9-4378-bc65-c1e582e2e662,,,,,,,MAJ103,Assumption Elementary School,M
4,5,2018-19,M06,M06 - Grade 6 - Math Form A,Private,37,Delaney Lorennij,,,No,...,f7c1bd87-4da5-4709-9471-3d60c8a70639,,,,,,,MAJ103,Assumption Elementary School,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1031,1032,2018-19,M06,M06 - Grade 6 - Math Form A,Private,166,Kirby Alik SPED,,,No,...,d35169bc-0044-4690-b17a-7c84cb3d7ee7,,,,,,,MAJ116,Majuro Coop Elementary School,M
1032,1033,2018-19,M06,M06 - Grade 6 - Math Form A,Private,177,Yanzhe Hrang SPED,,,No,...,e55b88ff-f086-4962-b771-f867dfa2e52b,,,,,,,MAJ116,Majuro Coop Elementary School,F
1033,1034,2018-19,M06,M06 - Grade 6 - Math Form A,Public,758,Rema Moja SPED,,,No,...,e83f3278-2c70-4901-9e3f-4f152f5c5cbe,,,,,,,MAJ122,Rita Elementary School,F
1034,1035,2018-19,M06,M06 - Grade 6 - Math Form A,Public,763,ROdney Anni SPED,,,No,...,9c017e5d-5c41-4fb5-a94a-29d3f73c3dce,,,,,,,MAJ122,Rita Elementary School,M


Final OnlineSBA DataFrame


Unnamed: 0,SCHOOLYEAR,STUDENTID,SPED,ACCOM,STUDENTNAME,SCHOOLID,GENDER,TESTID,TEACHERNAME,ITEM_001,...,ITEM_031,ITEM_032,ITEM_033,ITEM_034,ITEM_035,ITEM_036,ITEM_037,ITEM_038,ITEM_039,ITEM_040
0,2018-19,MH007442,No,No,Bin Langmour,MAJ102,F,M06,Kamo Benait,B,...,D,D,A,B,A,A,C,D,A,C
1,2018-19,e3e70682-c209-4cac-a29f-6fbed82c07cd,No,No,Alfonso Agustin,MAJ103,M,M06,Almira Alanzo,D,...,D,D,B,C,C,D,B,C,A,C
2,2018-19,f728b4fa-4248-4e3a-8a5d-2f346baa9455,No,No,Chorister Deke,MAJ103,M,M06,Almira Alanzo,B,...,C,D,B,B,C,A,D,B,C,D
3,2018-19,eb1167b3-67a9-4378-bc65-c1e582e2e662,No,No,Chris Paul,MAJ103,M,M06,Almira Alanzo,C,...,D,D,B,D,C,B,C,D,A,C
4,2018-19,f7c1bd87-4da5-4709-9471-3d60c8a70639,No,No,Delaney Lorennij,MAJ103,M,M06,Almira Alanzo,B,...,C,D,A,D,C,A,C,D,C,D
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1031,2018-19,d35169bc-0044-4690-b17a-7c84cb3d7ee7,No,No,Kirby Alik SPED,MAJ116,M,M06,Kalesi Raratabu,C,...,D,D,B,C,A,B,D,A,B,D
1032,2018-19,e55b88ff-f086-4962-b771-f867dfa2e52b,No,No,Yanzhe Hrang SPED,MAJ116,F,M06,Kalesi Raratabu,B,...,D,D,B,D,A,B,C,D,D,C
1033,2018-19,e83f3278-2c70-4901-9e3f-4f152f5c5cbe,No,No,Rema Moja SPED,MAJ122,F,M06,Meria Ralpho,A,...,A,C,MULT,A,C,MULT,A,MULT,B,C
1034,2018-19,9c017e5d-5c41-4fb5-a94a-29d3f73c3dce,No,No,ROdney Anni SPED,MAJ122,M,M06,Meria Ralpho,D,...,B,C,A,D,C,B,A,D,C,B


In [14]:
%%time
# Cleanup DataFrame as OnlineSBA input format
# Working with all student exams files (~17 seconds on iMac with i9 CPU and 32GB RAM)
df_onlinesba_list = []

for df in df_students_results_and_enrol_list:
    print('Processing exam ID {} for year {}'.format(df['TestID'].values[0],df['SchoolYear'].values[0]))
    df_onlinesba = clean_schools(df, df_schools, testing=False)
    df_onlinesba = clean_items(df_onlinesba, testing=False)
    df_onlinesba = clean_students(df_onlinesba, testing=False)
    df_onlinesba = clean_teachers(df_onlinesba, testing=False)
    df_onlinesba = convert_to_onlinesba(df_onlinesba, testing=False)
    df_onlinesba_list.append(df_onlinesba)

len(df_onlinesba_list)

Processing exam ID A03 for year 2008-09
Processing exam ID B03 for year 2008-09
Processing exam ID M03 for year 2008-09
Processing exam ID A06 for year 2008-09
Processing exam ID B06 for year 2008-09
Processing exam ID M06 for year 2008-09
Processing exam ID H08 for year 2008-09
Processing exam ID A03 for year 2009-10
Processing exam ID B03 for year 2009-10
Processing exam ID M03 for year 2009-10
Processing exam ID A06 for year 2009-10
Processing exam ID B06 for year 2009-10
Processing exam ID M06 for year 2009-10
Processing exam ID H08 for year 2009-10
Processing exam ID A03 for year 2010-11
Processing exam ID B03 for year 2010-11
Processing exam ID M03 for year 2010-11
Processing exam ID A06 for year 2010-11
Processing exam ID B06 for year 2010-11
Processing exam ID M06 for year 2010-11
Processing exam ID H08 for year 2010-11
Processing exam ID A03 for year 2011-12
Processing exam ID B03 for year 2011-12
Processing exam ID M03 for year 2011-12
Processing exam ID A06 for year 2011-12


105

In [15]:
# Write processed data back into excel (or CSV directly)
# Working with the single student exams file (for testing)
data_xls = 'data/'+country+'/onlinesba-test.xlsx'
data_csv = 'data/'+country+'/onlinesba-test.csv'
filename_xls = os.path.join(cwd, data_xls)
filename_csv = os.path.join(cwd, data_csv)

# The engine has to be selected on the ExcelWriter itself: to_excel() does not
# use its own engine argument when it is given an already-open writer
with pd.ExcelWriter(filename_xls, engine='openpyxl') as writer:
    # add DataFrames you want to write to Excel here
    df_onlinesba.to_excel(writer, index=False, sheet_name='Sheet1')

df_onlinesba.to_csv(filename_csv, index=False)

In [16]:
# Spot-check one converted exam from the batch run. Index 12 is simply the
# 13th DataFrame in df_onlinesba_list (exam M06 for 2009-10 in the recorded
# output); change the index to inspect another exam.
df_onlinesba_list[12]

Unnamed: 0,SCHOOLYEAR,STUDENTID,SPED,ACCOM,STUDENTNAME,SCHOOLID,GENDER,TESTID,TEACHERNAME,ITEM_001,...,ITEM_039,ITEM_040,ITEM_041,ITEM_042,ITEM_043,ITEM_044,ITEM_045,ITEM_046,ITEM_047,ITEM_048
0,2009-10,14ce67b2-d75d-40c0-9680-41900ab2ff81,No,No,Jobina Jibon,AIL100,F,M06,Donald Johnny,D,...,C,B,C,C,C,B,B,B,D,B
1,2009-10,MH031933,No,No,Jokon Langidrik,AIL100,M,M06,Donald Johnny,C,...,C,B,B,D,C,B,MULT,C,B,B
2,2009-10,MH032300,No,No,Keyrose Tommy,AIL100,F,M06,Donald Johnny,B,...,C,D,A,D,B,A,C,D,B,C
3,2009-10,54ff0336-ea20-4095-805b-004ca36be292,No,No,Kyle Kabwa,AIL100,M,M06,Donald Johnny,D,...,C,D,A,A,A,C,D,D,B,C
4,2009-10,b77f8769-8e43-4b50-8aa2-46cf11033a68,No,No,Kyle Katzang,AIL100,M,M06,Donald Johnny,MULT,...,C,D,A,D,A,B,D,A,D,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1099,2009-10,38c04c66-9fb6-40b0-b539-88685ab39e08,No,No,Tarwos Ben,WTH103,M,M06,Carlina Melson,A,...,A,C,A,A,B,C,A,C,A,B
1100,2009-10,dfe0dba0-c2ff-4dfc-8749-a0599892bbd1,No,No,Compy Kattil,WOT101,M,M06,Melinmar Anjarok,D,...,C,D,C,B,A,B,B,D,C,B
1101,2009-10,b4bd80e3-8655-40c2-936a-85d0b01748b7,No,No,Dusty Jebde,WOT101,M,M06,Melinmar Anjarok,C,...,B,A,D,C,B,C,A,C,B,B
1102,2009-10,8e70c321-12d7-4317-8345-6203a9b70714,No,No,Johnny Briand,WOT101,M,M06,Melinmar Anjarok,D,...,B,C,B,C,D,B,C,A,B,C


In [19]:
%%time
# Write processed data back into excel (or CSV directly much faster)
# Working with all student exams files (~1min 52sec on iMac with i9 CPU and 32GB RAM for Excel, 2sec for CSV)

for df in df_onlinesba_list:
    try: 
        #exam = 'data/'+country+'/onlinesba-load-files-xls/' + df['SCHOOLYEAR'].values[0] + '-' + df['TESTID'].values[0] + '.xlsx'
        exam = 'data/'+country+'/onlinesba-load-files-csv/' + df['SCHOOLYEAR'].values[0] + '-' + df['TESTID'].values[0] + '.csv'
        # Could remove the SCHOOLYEAR if onlinesba really requires it
        filename = os.path.join(cwd, exam)
        print('Writing', filename)
        #with pd.ExcelWriter(filename) as writer:
        #    # add DataFrames you want to write to Excel here
        #    df.to_excel(writer, index=False, sheet_name='Sheet1', engine='openpyxl')
        df.to_csv(exam, index=False)
    except TypeError:
        print('Problem with a type, cannot generate filename')
    except:
        print('Unknown error')  

Writing /mnt/c/Users/Ghislain Hachey/Google Drive (ghachey@nuzusys.com)/Development/Pacific EMIS/repositories/pacific-emis-exams-data-jupyter-python/data/RMI/onlinesba-load-files-csv/2008-09-A03.csv
Writing /mnt/c/Users/Ghislain Hachey/Google Drive (ghachey@nuzusys.com)/Development/Pacific EMIS/repositories/pacific-emis-exams-data-jupyter-python/data/RMI/onlinesba-load-files-csv/2008-09-B03.csv
Writing /mnt/c/Users/Ghislain Hachey/Google Drive (ghachey@nuzusys.com)/Development/Pacific EMIS/repositories/pacific-emis-exams-data-jupyter-python/data/RMI/onlinesba-load-files-csv/2008-09-M03.csv
Writing /mnt/c/Users/Ghislain Hachey/Google Drive (ghachey@nuzusys.com)/Development/Pacific EMIS/repositories/pacific-emis-exams-data-jupyter-python/data/RMI/onlinesba-load-files-csv/2008-09-A06.csv
Writing /mnt/c/Users/Ghislain Hachey/Google Drive (ghachey@nuzusys.com)/Development/Pacific EMIS/repositories/pacific-emis-exams-data-jupyter-python/data/RMI/onlinesba-load-files-csv/2008-09-B06.csv
Writi

In [17]:
# Exact matches are the exam records that picked up an EMIS student ID
# (stuCardID), i.e. the student name in the exams data matched the EMIS exactly.
# Working with the single student exams file (for testing)
# The other EMIS columns (stuGender, stuDoB, schNo, stueYear) could be added to
# the subset, but the ID alone is what is checked here.
emis_id_cols = ['stuCardID']
df_exact_matches = df_students_results_and_enrol.dropna(subset=emis_id_cols, how='all')
display(df_exact_matches)

Unnamed: 0,RecordNo,SchoolYear,TestID,TestName,IslandName,SchoolID,SchoolName,StudentID,StudentName,Gender,...,StudentName2,stuCardID,Student,stuGender,stuDoB,schNo,stueYear,Student2,SchoolTemp,SchoolIDTemp1
0,1,2018-19,M06,M06 - Grade 6 - Math Form A,Private,MAJ132,Ajeltake Christian Academy,33,Bin Langmour,m,...,bin langmour,MH007442,Bin Langmour,F,2006-02-24,MAJ101,2016.0,bin langmour,Ajeltake Christian Academy-Private,
10,11,2018-19,M06,M06 - Grade 6 - Math Form A,Private,MAJ103,Assumption,43,Lanz Paraan,m,...,lanz paraan,MH035646,Lanz Paraan,M,2007-05-02,MAJ103,2019.0,lanz paraan,Assumption-Private,MAJ103
26,27,2018-19,M06,M06 - Grade 6 - Math Form A,Private,MAJ107,Delap SDA,59,Cathlynn Glory,f,...,cathlynn glory,MH029634,Cathlynn Glory,F,2006-07-01,MAJ107,2019.0,cathlynn glory,Delap SDA-Private,MAJ107
30,31,2018-19,M06,M06 - Grade 6 - Math Form A,Private,MAJ107,Delap SDA,63,Jabwie Maika,m,...,jabwie maika,MH029638,Jabwie Maika,M,2006-06-26,MAJ107,2019.0,jabwie maika,Delap SDA-Private,MAJ107
161,162,2018-19,M06,M06 - Grade 6 - Math Form A,Private,MAJ116,Majuro Coop,175,TM Sorimle,f,...,tm sorimle,MH009724,TM Sorimle,M,2006-11-24,MAJ116,2018.0,tm sorimle,Majuro Coop-Private,MAJ116
189,190,2018-19,M06,M06 - Grade 6 - Math Form A,Private,JAL109,St. Joseph,27,Maryn Reimers,m,...,maryn reimers,MH023884,MaryN Reimers,F,2006-12-27,JAL109,2017.0,maryn reimers,St. Joseph-Private,JAL109
190,191,2018-19,M06,M06 - Grade 6 - Math Form A,Private,JAL109,St. Joseph,28,Neiwojan Binton,m,...,neiwojan binton,MH023886,Neiwojan Binton,F,2006-04-04,JAL109,2017.0,neiwojan binton,St. Joseph-Private,JAL109
195,196,2018-19,M06,M06 - Grade 6 - Math Form A,Public,ALU101,Ailuk,206,Anastesia Jack,m,...,anastesia jack,MH030019,Anastesia Jack,F,2009-04-28,ALU101,2016.0,anastesia jack,Ailuk-Public,ALU101
196,197,2018-19,M06,M06 - Grade 6 - Math Form A,Public,ALU101,Ailuk,207,Badiko Winta,m,...,badiko winta,MH030208,Badiko Winta,F,2006-09-10,ALU101,2016.0,badiko winta,Ailuk-Public,ALU101
207,208,2018-19,M06,M06 - Grade 6 - Math Form A,Public,AIL100,Airok A,200,Garnneth Horiuchi,f,...,garnneth horiuchi,MH010910,Garnneth Horiuchi,M,2007-01-28,AIL100,2018.0,garnneth horiuchi,Airok A-Public,AIL100


In [18]:
%%time
# Get the exact matches (i.e. exact name in exams data and the EMIS)
# Working with all student exams files (~23 seconds on iMac with i9 CPU and 32GB RAM)
df_exact_matches_list = []
for df in df_students_results_and_enrol_list:
    df_exact_matches_list.append(df.dropna(how='all', subset=['stuCardID']))

CPU times: user 147 ms, sys: 20.2 ms, total: 167 ms
Wall time: 166 ms


In [None]:
%%time
# WARNING: This cell does not currently run because df_student_enrol_nonambiguous
# is no longer globally defined

# Just included for playing around. Not currently being used, the notebook only
# works with exact matches for now

# Here we get a bit more sophisticated in trying to match students to get their EMIS
# canonical data (DoB, ID, etc.)
# Instead of a simple exact name match we do a fuzzy search using the Levenshtein algorithm
# That way we will capture students with slightly different name spellings

# Is this time consuming search worth it?!

# Imported here rather than in the top imports cell; this is the only cell that uses it
import fuzzy_pandas as fpd

# De-duplicated column names of both inputs (not used further in this cell)
exams_cols = list(set(df_student_results.columns))
stuen_cols = list(set(df_student_enrol_nonambiguous.columns))

# The threshold is set high so we may not capture students with terribly
# bad spellings but will capture things with only small mis-spellings
# and reduce the chances of false positive matching
df_fuzzy_matches = fpd.fuzzy_merge(
    df_student_results, df_student_enrol_nonambiguous,
    left_on=['StudentName2'], right_on=['Student2'],
    #keep='all',
    method='levenshtein',
    threshold=0.94, #0.9
    ignore_case=True,
    ignore_nonalpha=False,
    ignore_nonlatin=False,
    ignore_order_words=False,
    ignore_order_letters=False,
    ignore_titles=False,
    join='left-outer' # { 'inner', 'left-outer', 'right-outer', 'full-outer' }
)

df_fuzzy_matches

# Scratch check kept from experimentation: count rows with an empty stuCardID
#s = df_fuzzy_matches['stuCardID'] == ''
#s.sum()

In [26]:
# Write various DataFrames into one Excel workbook to examine (testing)
# The path follows the configured country instead of a hard-coded 'RMI',
# like the other export cells
filename = os.path.join(cwd, 'data/'+country+'/soe-to-online-test.xlsx')

# The engine has to be selected on the ExcelWriter itself: to_excel() does not
# use its own engine argument when it is given an already-open writer
with pd.ExcelWriter(filename, engine='openpyxl') as writer:
    # add DataFrames you want to write to Excel here
    df_student_results.to_excel(writer, index=False, sheet_name='Sheet1')
    df_students_results_and_enrol.to_excel(writer, index=False, sheet_name='Sheet2')
    #df_fuzzy_matches.to_excel(writer, index=False, sheet_name='Sheet3')
    df_onlinesba.to_excel(writer, index=False, sheet_name='Sheet4')