In [None]:
###############################################################################
# This notebook provides some cleanup tools                                   #
# Tools include the following                                                 #
#   - Remove all files in a directory tree that are not AllSchools files      #
#   - Combine all schools in a directory into an AllSchools (when missing)    #
###############################################################################

# Core stuff
import os
from pathlib import Path
import re
import json

# Data stuff
import pandas as pd # Data analysis
import xlrd # excel 
import pyodbc # SQL DB

# Initial setup: exam and country used to build the data directory paths
test = 'MISAT' # other value noted by the author: NMCT
country = 'RMI' # other value noted by the author: FSM
cwd = os.getcwd()

# Configuration: settings are read from config.json in the working directory
with open('config.json', 'r') as config_file:
    config = json.load(config_file)

# Either the string 'all' or a year sub-directory name (see the cleanup cell)
year_to_load = config['load_year']

In [None]:
def load_excel_to_df(filename):
    """Loads an Excel or CSV file into a Pandas DataFrame.

    The reader is picked from the file extension (compared
    case-insensitively): ``xlsx`` is read with the openpyxl engine, ``xls``
    with the pandas default engine and ``csv`` with ``pd.read_csv``. In all
    cases the first row is used as the header and no column is used as the
    index.

    Parameters
    ----------
    filename : str, required
        The filename of the excel (or csv) file to load

    Raises
    ------
    ValueError
        If the file extension is not one of xlsx, xls or csv. Errors raised
        by the pandas readers themselves are propagated unchanged.

    Returns
    -------
    DataFrame
    """
    # Drop the leading dot and normalise case so '.XLSX' and '.xlsx' match
    file_extension = Path(filename).suffix.lower()[1:]

    if file_extension == 'xlsx':
        df = pd.read_excel(filename, index_col=None, header=0, engine='openpyxl')
    elif file_extension == 'xls':
        df = pd.read_excel(filename, index_col=None, header=0)
    elif file_extension == 'csv':
        df = pd.read_csv(filename, index_col=None, header=0)
    else:
        # ValueError is still an Exception, so existing broad handlers keep
        # working; the filename is included so the offending file is obvious.
        raise ValueError("File not supported: {}".format(filename))

    return df

In [None]:
%%time
# Cleanup and only keep the AllSchools exams file
data_dir = 'data/'+country+'/'+test
path = os.path.join(cwd, data_dir)

# Narrow the walk to a single year sub-directory unless 'all' was requested
if year_to_load != 'all':
    path = os.path.join(path, year_to_load)

# match() anchors at the start of the name, so only files whose name begins
# with "AllSchool" (any letter case) are kept
p = re.compile('AllSchool', re.IGNORECASE)

# WARNING: files are removed from disk permanently; directories are untouched
for root, directories, files in os.walk(path, topdown=False):
    for name in files:
        if p.match(name):
            continue
        target = os.path.join(root, name)
        print('Deleting file: ', target)
        os.remove(target)

In [None]:
# After cleanup verify each directory has the AllSchools exams file
# (this only prints the contents; the check itself is done by eye)
for root, directories, files in os.walk(path, topdown=False):
    # Only roots whose path relative to `path` contains exactly one
    # separator are reported; every other level of the tree is skipped
    if root[len(path):].count(os.sep) != 1:
        continue
    for d in directories:
        d_abs = os.path.join(root, d)
        print("Directory {}".format(d_abs))
        # Check all files
        for filename in os.listdir(d_abs):
            print("Contain file {}".format(filename))


In [None]:
# Combine all schools in a directory into a single AllSchools file
data_dir = 'data/'+country+'/combine-from-schools'
path = os.path.join(cwd, data_dir)

# The combined workbook is written into the same tree that is walked below.
# Remember its normalised absolute path so a re-run of this cell does not
# load the previous output and duplicate every row.
output_name = 'AllSchools_M06_2013-14_Results.xlsx'
output_abs = os.path.normcase(os.path.abspath(os.path.join(data_dir, output_name)))

df_student_results_list = []

for root, directories, files in os.walk(path, topdown=False):
    for name in files:
        filename = os.path.join(root, name)
        if os.path.normcase(os.path.abspath(filename)) == output_abs:
            print('Skipping previously combined file:', filename)
            continue
        print('Loading into DataFrame:', filename)
        try:
            df_student_results_list.append(load_excel_to_df(filename))
        except Exception as e:
            # Best effort: report the file that failed and keep going.
            # (The original caught the undefined name `Error`, which turned
            # any load failure into a NameError.)
            print('Problem loading:', filename)
            print('Error was', e)

print('Completed loading excel files')

# pd.concat raises an opaque ValueError on an empty list; fail with a message
# that says where the files were expected instead
if not df_student_results_list:
    raise ValueError('No school files could be loaded from {}'.format(path))

df_all_schools_student_resuls = pd.concat(df_student_results_list)

# Write resulting AllSchool DataFrame into Excel
filename = os.path.join(data_dir, output_name)
# The engine must be given to ExcelWriter: to_excel() ignores its own
# `engine` argument when it is handed an existing writer
with pd.ExcelWriter(filename, engine='openpyxl') as writer:
    df_all_schools_student_resuls.to_excel(writer, index=False, sheet_name='Responses')

print('Completed writing resulting AllSchool DataFrame to file')