In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
import pandas as pd
import numpy as np
from scipy import stats
import xlrd

## Loading Data

#### To use this method, two files have to be saved in this direction: 

##### 1) Question Raw Data, named YYYYData.xls.  The "Report" sheet at the end MUST BE DELETED
##### 2) All Students, named YYYYStudents.xlsx.

In [3]:
def load_data(year):
    directory = './'+str(year)+'Data.xls'
    
    
    overallDF = pd.DataFrame()
    book = xlrd.open_workbook(directory) 
    num_sheets = len(book.sheets())

    sheet_names = book.sheet_names()
    sheet_names

    #GET ALL STUDENTS TO USE AS THE INDEX
    book_students = xlrd.open_workbook('./'+ str(year) + 'Students.xlsx')
    all_students = pd.read_excel('./'+ str(year) + 'Students.xlsx', 'Sheet1', header=None).iloc[:,0].tolist()
    
    possiblePoints = pd.Series() #creating a separate DF for possible points
    
    for i in range(num_sheets):
        this_sheet = sheet_names[i]
        df = pd.read_excel(directory, this_sheet)
        df.set_index(df.columns[0], inplace=True)
        
        possible_points_case = df.iloc[0]
        possiblePoints = possiblePoints.append(possible_points_case)
        
        df = df.reindex(all_students) #somewhere around here is where you decide what to do with NaN values
        df = df.reset_index()
        overallDF = pd.concat([overallDF, df], axis=1)
        
        
    

    return overallDF, possiblePoints

In [4]:
name_ID = pd.read_excel('./AllStudentsWithNumbers.xlsx', 'Sheet1')

ID_Score = pd.read_excel('./CSResults.xlsx', '2008-2014')

name_ID_score = pd.merge(name_ID, ID_Score, on=['Identifier'])

## Get Mean and Variance

#### Normalize values by dividing by possible points for that question, then compute row-wise mean and variance for those selected columns and add to end of DF

In [47]:
def get_mean_var_for_year(yearDF, year_dict, possible_points):
    def construct_full_q(q, form_name, case_name):
        return q + "_" + form_name + "_SP_" + case_name
    
    new_year_df = pd.DataFrame()
    new_year_df["StudentName"] = yearDF.iloc[:,0]
    
    for form_name, sub_cat_dict in year_dict.iteritems():
        for sub_cat, case_dict in sub_cat_dict.iteritems():
#             print(sub_cat)
            all_questions_for_sub_cat = []
            all_poss_points_for_sub_cat = pd.Series()
            for case, questions in case_dict.iteritems():
#                 print("......................." + case)
                col_names = [construct_full_q(q, form_name, case) for q in questions]
                selected_cols = list(yearDF.loc[:, yearDF.columns.str.contains(('|'.join(col_names)))].columns)
#                 if (case == 'AMBN'):
#                     print("Questions: ")
#                     print(questions)
#                     print("Column Names For Query:")
#                     print(col_names)
#                     print("Columns Selected:")
#                     print(selected_cols)
#                 print(all_questions_for_sub_cat)
                all_questions_for_sub_cat = all_questions_for_sub_cat + selected_cols
#                 print(all_questions_for_sub_cat)
                selected_points = possible_points[col_names]
#                 print(type(selected_points))
                all_poss_points_for_sub_cat = all_poss_points_for_sub_cat.append(selected_points) 
#             print("Qs: ")
#             print(len(all_questions_for_sub_cat))
#             print("Poss Points")
#             print(all_poss_points_for_sub_cat.shape)
#             print("mean_" + form_name + "_" + sub_cat)

#             if(len(all_questions_for_sub_cat) != len(set(all_questions_for_sub_cat))):
#                 seen = {}
#                 dupes = []

#                 for x in all_questions_for_sub_cat:
#                     if x not in seen:
#                         seen[x] = 1
#                     else:
#                         if seen[x] == 1:
#                             dupes.append(x)
#                         seen[x] += 1
#                 print(dupes)
#             yearDF[all_questions_for_sub_cat].div(all_poss_points_for_sub_cat).mean(axis=1)
#             print("Made it")
            new_year_df["mean_" + form_name + "_" + sub_cat] = yearDF[all_questions_for_sub_cat].div(all_poss_points_for_sub_cat).mean(axis=1)
            new_year_df["var_" + form_name + "_" + sub_cat] = yearDF[all_questions_for_sub_cat].div(all_poss_points_for_sub_cat).var(axis=1)
    
    return new_year_df

In [6]:
# helper method to generate list of strings based on start and end question numbers
def generate_question_strings(first_q, last_q):
    def question_string(num):
        return "Q" + str(num)
    int_list = list(range(first_q,last_q+1))
    return [question_string(num) for num in int_list]

In [7]:
def deidentify_and_add_scores(year_DF, year):
    named_ID_score_year = name_ID_score[name_ID_score.Year_x == year]

    with_scores = pd.merge(year_DF, named_ID_score_year, left_on = 'StudentName', right_on = 'Student Name')

    final_df = with_scores.drop(['StudentName', 'Student Name', 'Year_y'], axis=1)
    
    ## Move Identifier Column and year to the front of the DF, then rename
    cols = list(final_df)
    cols.insert(0, cols.pop(cols.index('Identifier')))
    cols.insert(1, cols.pop(cols.index('Year_x')))
    final_df = final_df.ix[:, cols]
    
    #Rename some columns
    final_df = final_df.rename(index=str, columns={"Identifier": "ID", "Year_x": "Year"})
    
    #Transform scores
    final_df['P/F'] = final_df['P/F'].map({'P':1,'F':0})

    return final_df

In [8]:
def load_and_clean_year_data(year, year_dict):
    print("loading " + str(year) + " excel sheet....")
    overalldf, possible_points = load_data(year)
    
    seen = {}
    dupes = []

    for x in overalldf.columns:
        if x not in seen:
            seen[x] = 1
        else:
            if seen[x] == 1:
                dupes.append(x)
            seen[x] += 1
    print(dupes)
    
    print("Calculating Mean and Variance of Question Groups...")
    modified = get_mean_var_for_year(overalldf, year_dict, possible_points)
    
    print("Deidentifying and adding scores...")
    final_year_df = deidentify_and_add_scores(modified, year)
    print("Done!")
    
    return final_year_df

### 2009

In [43]:
dict_2009 = {
    'PPI': {
        'init': {
            'AP': generate_question_strings(1,4), #Marty Elliot
            'TW': generate_question_strings(1,4), #Alex Miller
            'AMBN': generate_question_strings(1,4), #Carol Whitman
            'WL': generate_question_strings(1,4), #Corey Wolfe
            'SOB': generate_question_strings(1,4), #Dana Mitchell
            'H': generate_question_strings(1,3),  #Jamie Browning
            'CP': generate_question_strings(1,4), #Leslie Keats
            'CS': generate_question_strings(1,4), #Sam Swift/Grandparent
            'KP': generate_question_strings(1,4), #Shawn Clancy
            'LAP': generate_question_strings(1,4) #Tessa Frost
        },
        'info_gather': {
            'AP': generate_question_strings(5,12),
            'TW': generate_question_strings(5,12),
            'AMBN': generate_question_strings(5,12),
            'WL': generate_question_strings(5,12),
            'SOB': generate_question_strings(5,12),
            'H': generate_question_strings(4,11),
            'CP': generate_question_strings(5,12),
            'CS': generate_question_strings(5,12),
            'KP': generate_question_strings(5,12),
            'LAP': generate_question_strings(5,12) 
        },
        'closing': {
            'AP': generate_question_strings(13,16),
            'TW': generate_question_strings(13,16),
            'AMBN': generate_question_strings(13,16),
            'WL': generate_question_strings(13,16),
            'SOB': generate_question_strings(13,16),
            'H': generate_question_strings(12,15),
            'CP': generate_question_strings(13,16),
            'CS': generate_question_strings(13,16),
            'KP': generate_question_strings(13,16),
            'LAP': generate_question_strings(13,16)
        }
    },
    'Hx': {
        'history': {
            'AP': generate_question_strings(1,9),
            'TW': generate_question_strings(1,13),
            'AMBN': generate_question_strings(1,6),
            'WL': generate_question_strings(1,9),
            'SOB': generate_question_strings(1,14),
            'H': generate_question_strings(1,11),
            'CP': generate_question_strings(1,14),
            'CS': generate_question_strings(1,11),
            'KP': generate_question_strings(1,9),
            'LAP': generate_question_strings(1,15)
        }
    },
    'PE': { #Carol Whitman did not do PE
        'handwash': {
            'AP': generate_question_strings(1,1),
            'TW': generate_question_strings(1,1),
            'WL': generate_question_strings(1,1),
            'SOB': generate_question_strings(1,1),
            'H': generate_question_strings(1,1),
            'CP': generate_question_strings(1,1),
            'CS': generate_question_strings(1,1),
            'KP': generate_question_strings(1,1),
            'LAP': generate_question_strings(1,1)
        },
        'phys_check': {
            'AP': generate_question_strings(2,2) + generate_question_strings(4,9),
            'TW': generate_question_strings(2,2) + generate_question_strings(4,20),
            'WL': generate_question_strings(2,2) + generate_question_strings(4,10),
            'SOB': generate_question_strings(2,2) + generate_question_strings(4,6),
            'H': generate_question_strings(2,2) + generate_question_strings(4,21),
            'CP': generate_question_strings(2,2) + generate_question_strings(4,10),
            'CS': generate_question_strings(2,2) + generate_question_strings(4,13),
            'KP': generate_question_strings(2,2) + generate_question_strings(4,8),
            'LAP': generate_question_strings(2,2) + generate_question_strings(3,7)
        },
        'modesty': {
            'AP': generate_question_strings(3,3),
            'TW': generate_question_strings(3,3),
            'WL': generate_question_strings(3,3),
            'SOB': generate_question_strings(3,3),
            'H': generate_question_strings(3,3),
            'CP': generate_question_strings(3,3),
            'CS': generate_question_strings(3,3),
            'KP': generate_question_strings(3,3),
            'LAP': generate_question_strings(3,3)
        }
    },
    'PS': {
        'personal': {
            'AP': generate_question_strings(1,1),
            'TW': generate_question_strings(1,1),
            'AMBN': generate_question_strings(1,1),
            'WL': generate_question_strings(1,1),
            'SOB': generate_question_strings(1,1),
            'H': generate_question_strings(1,1),
            'CP': generate_question_strings(1,1),
            'CS': generate_question_strings(1,1),
            'KP': generate_question_strings(1,1),
            'LAP': generate_question_strings(1,1)
        },
        'rec': {
            'AP': generate_question_strings(2,2),
            'TW': generate_question_strings(2,2),
            'AMBN': generate_question_strings(2,2),
            'WL': generate_question_strings(2,2),
            'SOB': generate_question_strings(2,2),
            'H': generate_question_strings(2,2),
            'CP': generate_question_strings(2,2),
            'CS': generate_question_strings(2,2),
            'KP': generate_question_strings(2,2),
            'LAP': generate_question_strings(1,1)
        }
    }
}

In [48]:
clean_dataset_2009 = load_and_clean_year_data(2009, dict_2009)

print(clean_dataset_2009.shape)
clean_dataset_2009.head()

loading 2009 excel sheet....
Calculating Mean and Variance of Question Groups...
Questions: 
['Q1']
Column Names For Query:
['Q1_PS_SP_AMBN']
Columns Selected:
[u'Q1_PS_SP_AMBN']
mean_PS_personal
Made it
Questions: 
['Q2']
Column Names For Query:
['Q2_PS_SP_AMBN']
Columns Selected:
[u'Q2_PS_SP_AMBN']
mean_PS_rec
Made it
Questions: 
['Q5', 'Q6', 'Q7', 'Q8', 'Q9', 'Q10', 'Q11', 'Q12']
Column Names For Query:
['Q5_PPI_SP_AMBN', 'Q6_PPI_SP_AMBN', 'Q7_PPI_SP_AMBN', 'Q8_PPI_SP_AMBN', 'Q9_PPI_SP_AMBN', 'Q10_PPI_SP_AMBN', 'Q11_PPI_SP_AMBN', 'Q12_PPI_SP_AMBN']
Columns Selected:
[u'Q5_PPI_SP_AMBN', u'Q6_PPI_SP_AMBN', u'Q7_PPI_SP_AMBN', u'Q8_PPI_SP_AMBN', u'Q9_PPI_SP_AMBN', u'Q10_PPI_SP_AMBN', u'Q11_PPI_SP_AMBN', u'Q12_PPI_SP_AMBN', u'Q5_PPI_SP_AMBN', u'Q6_PPI_SP_AMBN', u'Q7_PPI_SP_AMBN', u'Q8_PPI_SP_AMBN', u'Q9_PPI_SP_AMBN', u'Q10_PPI_SP_AMBN', u'Q11_PPI_SP_AMBN', u'Q12_PPI_SP_AMBN']
mean_PPI_info_gather
[u'Q5_PPI_SP_AMBN', u'Q6_PPI_SP_AMBN', u'Q7_PPI_SP_AMBN', u'Q8_PPI_SP_AMBN', u'Q9_PPI_SP_AMB

ValueError: cannot reindex from a duplicate axis

# 2008

In [None]:
# dict_2008 = {
#     'ppi_initiation': ["Q1_PPI", "Q2_PPI", "Q3_PPI", "Q4_PPI"], 
#     'ppi_info_gather': ["Q5_PPI", "Q6_PPI", "Q7_PPI", "Q8_PPI", "Q9_PPI", "Q10_PPI", "Q11_PPI"], 
#     'ppi_closing': ["Q12_PPI", "Q13_PPI", "Q14_PPI"], 
#     'hx_physical': ["Q1_Hx", "Q2_Hx", "Q3_Hx", "Q4_Hx", "Q5_Hx", "Q6_Hx", "Q7_Hx", "Q8_Hx", "Q9_Hx", "Q10_Hx", "Q11_Hx", "Q12_Hx"], 
#     'hx_social': ["Q13_Hx"], 
#     'pe_handwash': ["Q1_PE"], 
#     'pe_phys_check': ["Q2_PE", "Q4_PE", "Q5_PE", "Q6_PE", "Q7_PE", "Q8_PE", "Q9_PE", "Q10_PE", "Q11_PE", "Q12_PE", "Q13_PE", "Q14_PE", "Q15_PE", "Q16_PE", "Q17_PE", "Q18_PE", "Q19_PE", "Q20_PE"], 
#     'pe_modesty': ["Q3_PE"], 
#     'ps_personal': ['Q1_PS'], 
#     'ps_rec': ['Q2_PS']
# }