In [108]:
###############################################################################
# This notebook focuses on processing data from excel spreadsheet directly    #
# into another format ready to load into OnlineSBA. It's focused on producing #
# the items meta file                                                         #
# This notebook should work on the same set of SOE assessment files as        #
# the notebook soe-to-onlinesba for best results                              #
###############################################################################
# Core stuff
import os
from pathlib import Path
import re

# Data stuff
import pandas as pd # Data analysis

# Initial setup
# Notebook-wide settings: `country` and `test` are read by later cells to
# build the input data path, and `test` is also written into the exam meta data.
# NOTE(review): the trailing comments presumably name the alternative
# values (another country / test) that can be swapped in -- confirm.
country = 'RMI' # FSM
test = 'MISAT' # NMCT

In [109]:
def load_excel_to_df(filename):
    """Read a spreadsheet-like file into a Pandas DataFrame.

    The reader is chosen from the file extension (case-insensitive):
    ``.xlsx`` is read with the ``openpyxl`` engine, ``.xls`` with the
    default Excel engine and ``.csv`` with the CSV reader. In every case
    the first row is used as the header and no column is used as index.

    Parameters
    ----------
    filename : str, required
        The filename of the file to load

    Raises
    ------
    Exception
        If the file extension is not one of xlsx, xls or csv

    Returns
    -------
    DataFrame
    """
    # Extension without the leading dot, lower-cased (e.g. 'XLS' -> 'xls')
    suffix = Path(filename).suffix.lower()[1:]

    if suffix == 'csv':
        return pd.read_csv(filename, index_col=None, header=0)
    if suffix == 'xls':
        return pd.read_excel(filename, index_col=None, header=0)
    if suffix == 'xlsx':
        return pd.read_excel(filename, index_col=None, header=0, engine='openpyxl')

    # Anything else is rejected rather than guessed at
    raise Exception("File not supported")

In [110]:
# Load a single SOE Assessment workbook (for testing),
# in particular the sheet with the raw data.
# Exactly one `filename = ...` line should be active; the commented-out
# lines are other workbooks that can be swapped in.
# This cell binds the notebook-level names `cwd`, `filename` and
# `df_student_results`.
cwd = os.getcwd()
#filename = os.path.join(cwd, 'data/RMI/MISAT/MISAT 2019/3GrEng2019/AllSchools_A03_2018-19_Results.xls')
#filename = os.path.join(cwd, 'data/RMI/MISAT/MISAT 2012/6grEng12/AllSchools_A06_2011-12_Results.xls')
#filename = os.path.join(cwd, 'data/RMI/MISAT/MISAT 2010/3GrMath/AllSchools_M03_2009-10_Results.xls')
filename = os.path.join(cwd, 'data/RMI/MISAT/MISAT 2012/6GrEng2012/AllSchools_A06_2011-12_Results.xls')
#filename = os.path.join(cwd, 'data/RMI/MISAT/MISAT 2009/3GrKM2009/AllSchools_B03_2008-09_Results.xls')
#filename = os.path.join(cwd, 'data/RMI/MISAT/MISAT 2014/Gr6Eng2014/AllSchools_A06_2013-14_Results.xls')

df_student_results = load_excel_to_df(filename)
# Label the output, then show the loaded frame with the rich notebook display
print('df_student_results')
display(df_student_results)

df_student_results


Unnamed: 0,RecordNo,SchoolYear,TestID,TestName,IslandName,SchoolID,SchoolName,StudentID,StudentName,Gender,...,Item_031_AS0602020603m_ccc,Item_032_AS0602020604h_ccc,Item_033_AS0602020101e_ddd,Item_034_AS0602020102m_ddd,Item_035_AS0602020103m_aaa,Item_036_AS0602020104h_bbb,Item_037_AS0602020401e_ccc,Item_038_AS0602020402m_ddd,Item_039_AS0602020403m_bbb,Item_040_AS0602020404h_ccc
0,1,2011-12,A06,A06 - Reading Grade 6 - English,Aelonlaplap,101,Airok A,419,Jally Kedibad,M,...,A,D,A,B,,,,,,
1,2,2011-12,A06,A06 - Reading Grade 6 - English,Aelonlaplap,101,Airok A,420,Macklynn Kedibad,F,...,D,D,B,A,B,D,B,C,,
2,3,2011-12,A06,A06 - Reading Grade 6 - English,Aelonlaplap,101,Airok A,421,Sign Henson,F,...,A,C,D,B,B,A,C,C,B,C
3,4,2011-12,A06,A06 - Reading Grade 6 - English,Aelonlaplap,101,Airok A,422,Wadik Samuel,F,...,B,A,C,B,D,A,C,B,D,C
4,5,2011-12,A06,A06 - Reading Grade 6 - English,Aelonlaplap,102,Buoj,423,Jimor Samule,M,...,B,A,D,C,D,B,B,D,B,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1162,1163,2011-12,A06,A06 - Reading Grade 6 - English,Wotje,194,Wotje,496,Kano Lanwi,M,...,D,B,D,C,A,B,B,A,D,C
1163,1164,2011-12,A06,A06 - Reading Grade 6 - English,Wotto,195,Wotto,99,Monica Ainrik,F,...,,,,,,,,,,
1164,1165,2011-12,A06,A06 - Reading Grade 6 - English,Wotto,195,Wotto,100,Junior Botlok,M,...,A,C,D,C,B,A,D,A,C,D
1165,1166,2011-12,A06,A06 - Reading Grade 6 - English,Wotto,195,Wotto,101,Aaron Elanzo,M,...,A,C,C,B,C,A,B,A,C,B


In [99]:
%%time
# Load all SOE Assessment workbook inside a directory
# (~50 seconds on iMac with i9 CPU and 32GB RAM)
cwd = os.getcwd()
path = os.path.join(cwd, 'data/'+country+'/'+test+'/')

df_student_results_list = []

for root, directories, files in os.walk(path, topdown=False):
    for name in files:
        filename = os.path.join(root, name)
        print('Loading into DataFrame:', filename)
        try:
            df_student_results_list.append(load_excel_to_df(filename))
            #df_student_results_list[name] = load_excel_to_df(filename)
        except:
            print('Problem loading:', filename)
            #print('Error was:', )            

print('Completed loading excel files')

Loading into DataFrame: /mnt/c/Users/Ghislain Hachey/Google Drive (ghachey@nuzusys.com)/Development/Pacific EMIS/repositories/pacific-emis-exams-data-jupyter-python/data/RMI/MISAT/MISAT 2009/3GrEng2009/AllSchools_A03_2008-09_Results.xls
Loading into DataFrame: /mnt/c/Users/Ghislain Hachey/Google Drive (ghachey@nuzusys.com)/Development/Pacific EMIS/repositories/pacific-emis-exams-data-jupyter-python/data/RMI/MISAT/MISAT 2009/3GrKM2009/AllSchools_B03_2008-09_Results.xls
Loading into DataFrame: /mnt/c/Users/Ghislain Hachey/Google Drive (ghachey@nuzusys.com)/Development/Pacific EMIS/repositories/pacific-emis-exams-data-jupyter-python/data/RMI/MISAT/MISAT 2009/3GrMath2009/AllSchools_M03_2008-09_Results.xls
Loading into DataFrame: /mnt/c/Users/Ghislain Hachey/Google Drive (ghachey@nuzusys.com)/Development/Pacific EMIS/repositories/pacific-emis-exams-data-jupyter-python/data/RMI/MISAT/MISAT 2009/6GrEng2009/AllSchools_A06_2008-09_Results.xls
Loading into DataFrame: /mnt/c/Users/Ghislain Hachey

In [133]:
def checkConsecutive(l):
    """Check that the items are numbered consecutively and in order
    (e.g. Item_001..., Item_002..., Item_003...).

    The number is taken from the second underscore-separated field of each
    item name and converted to an integer.

    Parameters
    ----------
    l : list of str
        Item names such as 'Item_001_AS0602010401e_ddd'

    Raises
    ------
    IndexError
        If an item name contains no underscore
    ValueError
        If the second field of an item name is not an integer

    Returns
    -------
    bool
        True if the item numbers form an unbroken ascending run, or if the
        list is empty (nothing to be out of sequence); False otherwise
    """
    numbers = [int(i.split('_')[1]) for i in l]
    # min()/max() raise ValueError on an empty sequence, so handle it up front
    if not numbers:
        return True
    return numbers == list(range(min(numbers), max(numbers)+1))

def create_series(df, accept_testid_alt: bool = False, testing: bool = False):
    """Create a pandas series containing meta data from a SOE Assessment responses raw DataFrame.

    The school year, test ID and test name are taken from the first row of
    the DataFrame. Item names are the column names starting with 'Item_',
    excluding any that contain '_zzz'. Two consistency checks are run on the
    items and any problem is printed (the Series is still returned):

    * every item name must contain the test ID (or, optionally, the
      alternate test ID built by inserting 'S' after its first character,
      e.g. 'M10' -> 'MS10');
    * item numbers must be consecutive (delegated to ``checkConsecutive``).

    Parameters
    ----------
    df : pandas.core.frame.DataFrame, required
        The DataFrame to produce the Series. Must have the columns
        'SchoolYear', 'TestID' and 'TestName' and at least one row
    accept_testid_alt : bool, optional
        When True an item is also accepted if it contains the alternate
        test ID. Defaults to False
    testing : bool, optional
        Currently unused. Defaults to False

    Raises
    ------
    KeyError
        If one of the required columns is missing
    IndexError
        If the DataFrame has no rows

    Returns
    -------
    pandas.core.series.Series
        Values in order: school year, test (read from the notebook-level
        variable ``test``), test name, number of items, test ID, followed
        by the item names
    """
    # Create the Series for a particular exams
    sy = df['SchoolYear'].iloc[0]
    # Raw string so that \d reaches the regex engine as written
    if not re.match(r'20\d{2}-\d{2}$', sy):
        print('Year format incorrect')
    testid = df['TestID'].iloc[0]
    # Alternate form of the test ID: 'S' inserted after the first character
    testid_alt = testid[:1] + 'S' + testid[1:]
    testname = df['TestName'].iloc[0]

    # this also excludes items with _zzz
    items = df.columns[df.columns.str.startswith('Item_') & ~df.columns.str.contains('_zzz')].tolist()

    # Check for inconsistencies in Test Items
    # TestID must be the same as found in the Items (e.g. MS03 is in Item_055_MS0304010103h_ddd)
    accepted_ids = (testid, testid_alt) if accept_testid_alt else (testid,)
    test_item_not_matching = any(
        not any(accepted_id in item for accepted_id in accepted_ids)
        for item in items
    )
    test_inconsistencies = test_item_not_matching

    if test_item_not_matching:
        print("Inconsistency detected in the test {} for year {}: Items test ID not matching test ID (e.g. TestID M03 should have items like Item_055_M0304010103h_ddd)".format(testname, sy))
    if not checkConsecutive(items):
        test_inconsistencies = True
        print("Inconsistency detected in the test {} for year {}: Items not correctly ordered (e.g. Item_001_MS0301010101e_aaa, Item_002_MS0301010101e_aaa, Item_004_MS0301010101e_aaa missing Item_003)".format(testname, sy))
    if test_inconsistencies:
        # Blank line to separate the reports of different workbooks
        print("")

    # `test` is the notebook-level test name set in the initial setup cell
    test_meta = [sy, test, testname, len(items), testid]
    test_meta = test_meta + items

    return pd.Series(test_meta)

In [127]:
# Scratch check of the alternate test ID that create_series builds:
# inserting 'S' after the first character turns 'M10' into 'MS10'.
# Note: this rebinds the notebook-level names `s` and `l`.
s = 'M10'
l = list(s)
l.insert(1,'S')
"".join(l)

'MS10'

In [134]:
# Create a single Series from one SOE Assessment workbook (for testing),
# using the DataFrame loaded by the single-workbook cell.
# The two commented-out lines below set a school year of '2009-2010', which
# does not match the year pattern checked in create_series
# (presumably kept to exercise the 'Year format incorrect' message -- confirm).
#t = df_student_results
#t.at[0,'SchoolYear'] = '2009-2010'
s_exam_meta_data = create_series(df_student_results, accept_testid_alt=True, testing=True)
# Label the output, then show the Series with the rich notebook display
print('s_exam_meta_data')
display(s_exam_meta_data)

s_exam_meta_data


0                             2011-12
1                               MISAT
2     A06 - Reading Grade 6 - English
3                                  40
4                                 A06
5          Item_001_AS0602010401e_ddd
6          Item_002_AS0602010402m_aaa
7          Item_003_AS0602010403m_ccc
8          Item_004_AS0602010404h_aaa
9          Item_005_AS0602010401e_ccc
10         Item_006_AS0602010402m_aaa
11         Item_007_AS0602010403m_ddd
12         Item_008_AS0602010404h_bbb
13         Item_009_AS0602020201e_ccc
14         Item_010_AS0602020202m_bbb
15         Item_011_AS0602020203h_aaa
16         Item_012_AS0602020204m_bbb
17         Item_013_AS0602030201e_ccc
18         Item_014_AS0602030202m_ddd
19         Item_015_AS0602030203m_bbb
20         Item_016_AS0602030204h_ddd
21         Item_017_AS0602010301e_ccc
22         Item_018_AS0602010302m_aaa
23         Item_019_AS0602010303m_bbb
24         Item_020_AS0602010304h_aaa
25         Item_021_AS0602020201e_ddd
26         I

In [135]:
%%time
# Create a list of meta data Series, one per loaded SOE Assessment workbook.
# Working with all student exams files (~28 seconds on iMac with i9 CPU and 32GB RAM)
# Any inconsistencies found by create_series are printed as the loop runs.
s_exam_meta_data_list = []

for df in df_student_results_list:
    # Items carrying the alternate test ID (with the extra 'S') are accepted
    s_exam_meta_data_list.append(create_series(df, accept_testid_alt=True, testing=False))

Inconsistency detected in the test B06 - Reading Grade 6 - Marshallese for year 2008-09: Items test ID not matching test ID (e.g. TestID M03 should have items like Item_055_M0304010103h_ddd)

Inconsistency detected in the test H08 - Grade 8 - High School Entrance Test for year 2008-09: Items test ID not matching test ID (e.g. TestID M03 should have items like Item_055_M0304010103h_ddd)

Inconsistency detected in the test M03 - Grade 3 Math for year 2009-10: Items not correctly ordered (e.g. Item_001_MS0301010101e_aaa, Item_002_MS0301010101e_aaa, Item_004_MS0301010101e_aaa missing Item_003)

Inconsistency detected in the test H08 - Grade 8 - High School Entrance Test for year 2009-10: Items test ID not matching test ID (e.g. TestID M03 should have items like Item_055_M0304010103h_ddd)

Inconsistency detected in the test H08 - Grade 8 - High School Entrance Test for year 2010-11: Items test ID not matching test ID (e.g. TestID M03 should have items like Item_055_M0304010103h_ddd)

Incons

In [136]:
# Re-assemble list of series into DataFrames based on the school year.
# Each Series holds the school year at position 0, the test name at
# position 2 and the test ID at position 4 (see create_series).
# Result: `exam_meta_data_dict` maps a school year to a DataFrame with one
# column per test, named '<school year>-<test id>'.
years = []
df_exam_meta_data_list = []

# First create a unique list of all the years for which we have exams data
for s in s_exam_meta_data_list:
    # Get the year
    years.append(s[0])
# dict.fromkeys removes duplicates while keeping first-seen order
years = list(dict.fromkeys(years))
print(years)

# Create a dictionary of year to exams meta data DataFrame starting with empty DataFrames
exam_meta_data_dict = {}
for year in years:
    exam_meta_data_dict[year] = pd.DataFrame()

# Go through the list of series and populate their respective DataFrames
for s in s_exam_meta_data_list:
    try:
        print('Processing exam meta data for test id {} and year {}'.format(s[4], s[0]))
        # e.g. exam_meta_data_dict['2019-20']
        df1 = exam_meta_data_dict[s[0]]
        df2 = pd.DataFrame()
        df2[s[0]+'-'+s[4]] = s.reset_index(drop=True)
        # Outer join on the row index so tests with different item counts fit
        df3 = df1.join(df2, how='outer')
        exam_meta_data_dict[s[0]] = df3
    except ValueError as e:
        # Presumably raised by the join when a column with the same
        # '<school year>-<test id>' name is already present -- confirm
        print('File contains the wrong TestID. Fix file with TestID of {} to match Test Name of {} in year {}'.format(s[4], s[2], s[0]))
        print('Error was', e)
    except Exception as e:
        # Keep processing the remaining series, but report what went wrong.
        # Catching Exception (not a bare except) leaves KeyboardInterrupt alone.
        print('Unknown error', e)

#exam_meta_data_dict['2011-12']

['2008-09', '2009-10', '2010-11', '2011-12', '2012-13', '2013-14', '2014-15', '2015-16', '2016-17', '2017-18', '2018-19', '2019-20']
Processing exam meta data for test id A03 and year 2008-09
Processing exam meta data for test id B03 and year 2008-09
Processing exam meta data for test id M03 and year 2008-09
Processing exam meta data for test id A06 and year 2008-09
Processing exam meta data for test id B06 and year 2008-09
Processing exam meta data for test id M06 and year 2008-09
Processing exam meta data for test id H08 and year 2008-09
Processing exam meta data for test id A03 and year 2009-10
Processing exam meta data for test id B03 and year 2009-10
Processing exam meta data for test id M03 and year 2009-10
Processing exam meta data for test id A06 and year 2009-10
Processing exam meta data for test id B06 and year 2009-10
Processing exam meta data for test id M06 and year 2009-10
Processing exam meta data for test id H08 and year 2009-10
Processing exam meta data for test id A03

In [20]:
# Write processed data back into excel (or CSV directly)
# Working with all student exams files

for year, df in exam_meta_data_dict.items():
    # Remove the year row? They don't seem to need it
    df = df.drop([0])
    try: 
        #exam_year_meta = 'data/RMI/onlinesba-load-files-xls/' + test + '-' + year + '.xlsx'        
        exam_year_meta = 'data/RMI/onlinesba-load-files-csv/' + test + '-' + year + '.csv'        
        filename = os.path.join(cwd, exam_year_meta)
        print('Writing', filename)
        #with pd.ExcelWriter(filename) as writer:
        #    # add DataFrames you want to write to Excel here
        #    df.to_excel(writer, index=False, sheet_name='Sheet1', engine='openpyxl', header=False)
        df.to_csv(filename, index=False)        
    except TypeError as e:
        print('Problem with a type, cannot generate filename')
        print('Unknown error', e) 
    except:
        print('Unknown error') 

Writing /mnt/c/Users/Ghislain Hachey/Google Drive (ghachey@nuzusys.com)/Development/Pacific EMIS/repositories/pacific-emis-exams-data-jupyter-python/data/RMI/onlinesba-load-files-csv/MISAT-2008-09.csv
Writing /mnt/c/Users/Ghislain Hachey/Google Drive (ghachey@nuzusys.com)/Development/Pacific EMIS/repositories/pacific-emis-exams-data-jupyter-python/data/RMI/onlinesba-load-files-csv/MISAT-2009-10.csv
Writing /mnt/c/Users/Ghislain Hachey/Google Drive (ghachey@nuzusys.com)/Development/Pacific EMIS/repositories/pacific-emis-exams-data-jupyter-python/data/RMI/onlinesba-load-files-csv/MISAT-2010-11.csv
Writing /mnt/c/Users/Ghislain Hachey/Google Drive (ghachey@nuzusys.com)/Development/Pacific EMIS/repositories/pacific-emis-exams-data-jupyter-python/data/RMI/onlinesba-load-files-csv/MISAT-2011-12.csv
Writing /mnt/c/Users/Ghislain Hachey/Google Drive (ghachey@nuzusys.com)/Development/Pacific EMIS/repositories/pacific-emis-exams-data-jupyter-python/data/RMI/onlinesba-load-files-csv/MISAT-2012-13