In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
import pandas as pd
import numpy as np
from scipy import stats
import xlrd

## Loading Data

#### To use this method, two files have to be saved in this directory:

##### 1) Question Raw Data, named YYYYData.xls.  The "Report" sheet at the end MUST BE DELETED
##### 2) All Students, named YYYYStudents.xlsx.

In [3]:
def load_data(year):
    """Load one year's multi-sheet score workbook into a single wide DataFrame.

    Two files are read from the current directory:

    * ``<year>Data.xls`` -- every sheet is parsed, except the hard-coded
      duplicate sheet skipped below.
    * ``<year>Students.xlsx`` -- sheet ``Sheet1``, read without a header row;
      its first column gives the labels (and order) every sheet is reindexed to.

    Parameters
    ----------
    year : int or str
        Prefix of the two file names; it is passed through ``str()``.

    Returns
    -------
    overallDF : pandas.DataFrame
        The sheets merged column-wise with ``combine_first``, with the sheets'
        first column (the student-name column) moved to the front.
    possiblePoints : pandas.Series
        The first data row of each sheet, merged with ``combine_first``.
    """
    directory = './' + str(year) + 'Data.xls'
    students_path = './' + str(year) + 'Students.xlsx'

    # GET ALL STUDENTS TO USE AS THE INDEX
    all_students = pd.read_excel(students_path, 'Sheet1', header=None).iloc[:, 0].tolist()

    overallDF = pd.DataFrame()
    # Explicit dtype keeps the historical float64 default of an empty Series.
    possiblePoints = pd.Series(dtype=float)

    # Open the workbook once and parse each sheet from the same handle instead
    # of re-opening the file for every sheet; the handle is closed on exit.
    with pd.ExcelFile(directory) as workbook:
        for this_sheet in workbook.sheet_names:
            if this_sheet == "Carol Whitman - 03-SPIKES Proto":
                continue  ## HACK TO SKIP A SHEET THAT HAS IDENTICAL COLUMNS TO ANOTHER SHEET
            df = workbook.parse(this_sheet)

            ## GRAB NAME OF FIRST COLUMN AND MAKE IT THE INDEX OF NEW DF
            student_name_column = df.columns[0]
            df = df.set_index(student_name_column)

            # The first data row of the sheet is taken as its possible-points row.
            possible_points_case = df.iloc[0]
            possiblePoints = possiblePoints.combine_first(possible_points_case)

            # Align the rows to the student list; students absent from this
            # sheet become all-NaN rows (no NaN handling is applied here).
            df = df.reindex(all_students)
            df = df.reset_index()

            # combine_first (rather than a plain column concat) merges sheets
            # that share column names; per the original note this is meant to
            # resolve the Sam Swift case.
            overallDF = overallDF.combine_first(df)

            ## MAKE THE COLUMN WITH THE STUDENT NAMES THE FIRST COLUMN AGAIN, BECAUSE IT IS
            ## EXPECTED IN FUTURE METHODS
            cols = list(overallDF)
            cols.insert(0, cols.pop(cols.index(student_name_column)))

            # .loc replaces the removed .ix indexer; cols holds column labels.
            overallDF = overallDF.loc[:, cols]

    return overallDF, possiblePoints

In [4]:
# Roster workbook; it is joined to the results below on its 'Identifier' column.
name_ID = pd.read_excel('./AllStudentsWithNumbers.xlsx', 'Sheet1')

# Results workbook, sheet '2008-2014'; it must also carry an 'Identifier' column.
ID_Score = pd.read_excel('./CSResults.xlsx', '2008-2014')

# Default (inner) join: only identifiers present in both tables are kept.
name_ID_score = name_ID.merge(ID_Score, on=['Identifier'])

## Get Mean and Variance

#### Normalize values by dividing by possible points for that question, then compute row-wise mean and variance for those selected columns and add to end of DF

In [5]:
def get_mean_var_for_year(yearDF, year_dict, possible_points):
    """Compute per-student mean and variance of normalised question scores.

    For every ``form -> sub-category`` pair in ``year_dict`` the matching
    question columns of ``yearDF`` are divided by their possible points, and
    the row-wise mean and variance are stored as ``mean_<form>_<sub_cat>`` and
    ``var_<form>_<sub_cat>``.

    Parameters
    ----------
    yearDF : pandas.DataFrame
        Score table; its first column is copied to the result as
        ``StudentName``.
    year_dict : dict
        ``{form_name: {sub_cat: {case: [question labels]}}}``. Each label is
        expanded to ``<label>_<form_name>_SP_<case>``.
    possible_points : pandas.Series
        Maximum score per question, indexed by the expanded column name.

    Returns
    -------
    pandas.DataFrame
        ``StudentName`` plus one mean and one variance column per sub-category.

    Notes
    -----
    Columns of ``yearDF`` are selected by regex substring match
    (``str.contains``), so any column whose name contains an expanded name is
    picked up, while ``possible_points`` is indexed by the exact expanded
    names. A case with an empty question list is skipped: its empty pattern
    would otherwise match every column of ``yearDF``.
    """
    def construct_full_q(q, form_name, case_name):
        return q + "_" + form_name + "_SP_" + case_name

    new_year_df = pd.DataFrame()
    new_year_df["StudentName"] = yearDF.iloc[:, 0]

    # .items() works on both Python 2 and Python 3 dictionaries.
    for form_name, sub_cat_dict in year_dict.items():
        for sub_cat, case_dict in sub_cat_dict.items():
            all_questions_for_sub_cat = []
            points_per_case = []
            for case, questions in case_dict.items():
                col_names = [construct_full_q(q, form_name, case) for q in questions]
                if not col_names:
                    # '|'.join([]) == '' matches every column; skip instead.
                    continue
                matches = yearDF.columns.str.contains('|'.join(col_names))
                all_questions_for_sub_cat.extend(yearDF.columns[matches])
                points_per_case.append(possible_points[col_names])

            # Concatenate once; Series.append was removed in pandas 2.0.
            if points_per_case:
                all_poss_points_for_sub_cat = pd.concat(points_per_case)
            else:
                all_poss_points_for_sub_cat = pd.Series(dtype=float)

            # Normalise once and reuse for both statistics; div aligns the
            # possible points with the columns by label.
            normalized = yearDF[all_questions_for_sub_cat].div(all_poss_points_for_sub_cat)
            new_year_df["mean_" + form_name + "_" + sub_cat] = normalized.mean(axis=1)
            new_year_df["var_" + form_name + "_" + sub_cat] = normalized.var(axis=1)

    return new_year_df

In [6]:
# Helper that expands an inclusive question-number range into "Q<n>" labels.
def generate_question_strings(first_q, last_q):
    """Return ["Q<first_q>", ..., "Q<last_q>"], both ends inclusive.

    An empty list is returned when last_q is smaller than first_q.
    """
    labels = []
    for number in range(first_q, last_q + 1):
        labels.append("Q" + str(number))
    return labels

In [7]:
def deidentify_and_add_scores(year_DF, year):
    """Replace student names with identifiers and attach the pass/fail result.

    Parameters
    ----------
    year_DF : pandas.DataFrame
        Per-student table containing a ``StudentName`` column.
    year : int
        Value compared against ``Year_x`` in the module-level
        ``name_ID_score`` frame to pick that year's rows.

    Returns
    -------
    pandas.DataFrame
        ``year_DF`` joined (inner join) to the selected rows, without the
        name columns and ``Year_y``, with ``ID`` and ``Year`` as the first two
        columns and ``P/F`` mapped to 1 ('P') / 0 ('F').
    """
    named_ID_score_year = name_ID_score[name_ID_score.Year_x == year]

    # pd.merge defaults to an inner join, so students without a match in the
    # selected rows are dropped from the result.
    with_scores = pd.merge(year_DF, named_ID_score_year,
                           left_on='StudentName', right_on='Student Name')

    final_df = with_scores.drop(['StudentName', 'Student Name', 'Year_y'], axis=1)

    ## Move Identifier Column and year to the front of the DF, then rename
    cols = list(final_df)
    cols.insert(0, cols.pop(cols.index('Identifier')))
    cols.insert(1, cols.pop(cols.index('Year_x')))
    # .loc replaces the removed .ix indexer; cols holds column labels.
    final_df = final_df.loc[:, cols]

    # Rename some columns (index=str also converts the row labels to strings)
    final_df = final_df.rename(index=str, columns={"Identifier": "ID", "Year_x": "Year"})

    # Transform scores; any value other than 'P' or 'F' becomes NaN.
    final_df['P/F'] = final_df['P/F'].map({'P': 1, 'F': 0})

    return final_df

In [8]:
def load_and_clean_year_data(year, year_dict):
    """Run the three pipeline stages for one year and return the final table.

    The stages are load_data, get_mean_var_for_year and
    deidentify_and_add_scores; a progress line is printed before each one and
    "Done!" after the last.
    """
    year_label = str(year)
    print("loading " + year_label + " excel sheet....")
    raw_scores, max_points = load_data(year)

    print("Calculating Mean and Variance of Question Groups...")
    aggregated = get_mean_var_for_year(raw_scores, year_dict, max_points)

    print("Deidentifying and adding scores...")
    deidentified = deidentify_and_add_scores(aggregated, year)
    print("Done!")

    return deidentified

In [20]:
def transform_and_output(df):
    """Placeholder: prints a greeting; ``df`` is not used yet."""
    greeting = "hello"
    print(greeting)

## 2008

In [9]:
# Question layout for 2008: form -> sub-category -> case code -> question labels.
# Case codes in table order; the trailing comment is the name attached to each.
_cases_2008 = [
    'AP',    # Marty Elliot
    'TW',    # Alex Miller
    'AMBN',  # Carol Whitman
    'WL',    # Corey Wolfe
    'SOB',   # Dana Mitchell
    'H',     # Jamie Browning
    'CP',    # Leslie Keats
    'CS',    # Sam Swift/Grandparent or Sam Swift/Parent
    'KP',    # Shawn Clancy
    'LAP',   # Tessa Frost
]
# Carol Whitman did not do PE
_pe_cases_2008 = ['AP', 'TW', 'WL', 'SOB', 'H', 'CP', 'CS', 'KP', 'LAP']

dict_2008 = {
    'PPI': {
        'init': {
            case: generate_question_strings(1, 4) for case in _cases_2008
        },
        # 'H' has one extra info-gathering question, which shifts its closing block.
        'info_gather': {
            case: generate_question_strings(5, 12 if case == 'H' else 11)
            for case in _cases_2008
        },
        'closing': {
            case: (generate_question_strings(13, 16) if case == 'H'
                   else generate_question_strings(12, 14))
            for case in _cases_2008
        },
    },
    'Hx': {
        # Questions run from Q1 to a case-specific last question.
        'history': {
            case: generate_question_strings(1, last_q)
            for case, last_q in [
                ('AP', 9), ('TW', 13), ('AMBN', 6), ('WL', 9), ('SOB', 14),
                ('H', 11), ('CP', 14), ('CS', 11), ('KP', 9), ('LAP', 14),
            ]
        },
    },
    'PE': {
        'handwash': {
            case: generate_question_strings(1, 1) for case in _pe_cases_2008
        },
        # Q2 plus Q4 up to a case-specific last question (Q3 is 'modesty').
        'phys_check': {
            case: generate_question_strings(2, 2) + generate_question_strings(4, last_q)
            for case, last_q in [
                ('AP', 9), ('TW', 20), ('WL', 10), ('SOB', 6), ('H', 23),
                ('CP', 10), ('CS', 13), ('KP', 8), ('LAP', 7),
            ]
        },
        'modesty': {
            case: generate_question_strings(3, 3) for case in _pe_cases_2008
        },
    },
    'PS': {
        'personal': {
            case: generate_question_strings(1, 1) for case in _cases_2008
        },
        'rec': {
            case: generate_question_strings(2, 2) for case in _cases_2008
        },
    },
}

In [10]:
# Run the full pipeline for 2008 with the question grouping in dict_2008.
clean_dataset_2008 = load_and_clean_year_data(2008, dict_2008)

# Print the (rows, columns) shape, then display the first five rows.
print(clean_dataset_2008.shape)
clean_dataset_2008.head()

loading 2008 excel sheet....
Calculating Mean and Variance of Question Groups...
Deidentifying and adding scores...
Done!
(103, 21)


Unnamed: 0,ID,Year,mean_PS_personal,var_PS_personal,mean_PS_rec,var_PS_rec,mean_PPI_info_gather,var_PPI_info_gather,mean_PPI_init,var_PPI_init,...,var_PPI_closing,mean_Hx_history,var_Hx_history,mean_PE_modesty,var_PE_modesty,mean_PE_handwash,var_PE_handwash,mean_PE_phys_check,var_PE_phys_check,P/F
0,865413,2008,0.725,0.061806,0.672222,0.058951,0.915493,0.078471,0.925,0.071154,...,0.16129,0.790909,0.166889,0.777778,0.194444,0.777778,0.194444,0.715909,0.205721,1
1,367062,2008,0.6,0.058333,0.572222,0.058333,0.830986,0.142455,0.875,0.112179,...,0.225806,0.8,0.161468,0.888889,0.111111,0.555556,0.277778,0.522727,0.252351,1
2,245385,2008,0.9,0.016667,0.875,0.03125,0.985915,0.014085,0.975,0.025,...,0.062366,0.818182,0.150125,1.0,0.0,0.555556,0.277778,0.625,0.237069,1
3,286071,2008,0.8,0.038889,0.744444,0.045062,0.957746,0.041046,1.0,0.0,...,0.225806,0.754545,0.186906,1.0,0.0,0.777778,0.194444,0.625,0.237069,1
4,536349,2008,0.175,0.028472,0.175,0.028472,0.704225,0.211268,0.7,0.215385,...,0.251613,0.663636,0.225271,0.777778,0.194444,0.111111,0.111111,0.522727,0.252351,0


## 2009

In [11]:
# Question layout for 2009: form -> sub-category -> case code -> question labels.
# Case codes in table order; the trailing comment is the name attached to each.
_cases_2009 = [
    'AP',    # Marty Elliot
    'TW',    # Alex Miller
    'AMBN',  # Carol Whitman
    'WL',    # Corey Wolfe
    'SOB',   # Dana Mitchell
    'H',     # Jamie Browning
    'CP',    # Leslie Keats
    'CS',    # Sam Swift/Grandparent
    'KP',    # Shawn Clancy
    'LAP',   # Tessa Frost
]
# Carol Whitman did not do PE
_pe_cases_2009 = ['AP', 'TW', 'WL', 'SOB', 'H', 'CP', 'CS', 'KP', 'LAP']

dict_2009 = {
    'PPI': {
        # 'H' has one fewer opening question, so each of its PPI ranges
        # starts and ends one question earlier than the other cases.
        'init': {
            case: generate_question_strings(1, 3 if case == 'H' else 4)
            for case in _cases_2009
        },
        'info_gather': {
            case: (generate_question_strings(4, 11) if case == 'H'
                   else generate_question_strings(5, 12))
            for case in _cases_2009
        },
        'closing': {
            case: (generate_question_strings(12, 15) if case == 'H'
                   else generate_question_strings(13, 16))
            for case in _cases_2009
        },
    },
    'Hx': {
        # Questions run from Q1 to a case-specific last question.
        'history': {
            case: generate_question_strings(1, last_q)
            for case, last_q in [
                ('AP', 9), ('TW', 13), ('AMBN', 6), ('WL', 9), ('SOB', 14),
                ('H', 11), ('CP', 14), ('CS', 11), ('KP', 9), ('LAP', 15),
            ]
        },
    },
    'PE': {
        'handwash': {
            case: generate_question_strings(1, 1) for case in _pe_cases_2009
        },
        # Q2 plus Q4 up to a case-specific last question (Q3 is 'modesty').
        'phys_check': {
            case: generate_question_strings(2, 2) + generate_question_strings(4, last_q)
            for case, last_q in [
                ('AP', 9), ('TW', 20), ('WL', 10), ('SOB', 6), ('H', 21),
                ('CP', 10), ('CS', 13), ('KP', 8), ('LAP', 7),
            ]
        },
        'modesty': {
            case: generate_question_strings(3, 3) for case in _pe_cases_2009
        },
    },
    'PS': {
        'personal': {
            case: generate_question_strings(1, 1) for case in _cases_2009
        },
        'rec': {
            case: generate_question_strings(2, 2) for case in _cases_2009
        },
    },
}

In [12]:
# Run the full pipeline for 2009 with the question grouping in dict_2009.
clean_dataset_2009 = load_and_clean_year_data(2009, dict_2009)

# Print the (rows, columns) shape, then display the first five rows.
print(clean_dataset_2009.shape)
clean_dataset_2009.head()

loading 2009 excel sheet....
Calculating Mean and Variance of Question Groups...
Deidentifying and adding scores...
Done!
(109, 21)


Unnamed: 0,ID,Year,mean_PS_personal,var_PS_personal,mean_PS_rec,var_PS_rec,mean_PPI_info_gather,var_PPI_info_gather,mean_PPI_init,var_PPI_init,...,var_PPI_closing,mean_Hx_history,var_Hx_history,mean_PE_modesty,var_PE_modesty,mean_PE_handwash,var_PE_handwash,mean_PE_phys_check,var_PE_phys_check,P/F
0,815893,2009,,,,,,,,,...,,,,,,,,,,1
1,792928,2009,0.25,0.069444,0.15,0.030556,0.6625,0.226424,0.74359,0.195682,...,0.250641,0.774775,0.176085,0.666667,0.25,0.333333,0.25,0.406977,0.244186,1
2,662719,2009,,,,,,,,,...,,,,,,,,,,0
3,610461,2009,0.725,0.047917,0.775,0.061806,0.9125,0.080854,0.974359,0.025641,...,0.130769,0.675676,0.22113,0.888889,0.111111,0.666667,0.25,0.488372,0.252804,1
4,624003,2009,0.575,0.084028,0.5,0.097222,0.9125,0.080854,0.948718,0.049933,...,0.215385,0.693694,0.214414,0.777778,0.194444,0.888889,0.111111,0.627907,0.236389,1


## 2010

In [13]:
# Question layout for 2010: form -> sub-category -> case code -> question labels.
# Case codes in table order; the trailing comment is the name attached to each.
_cases_2010 = [
    'AP',    # Marty Elliot
    'TW',    # Alex Miller
    'AMBN',  # Carol Whitman
    'WL',    # Corey Wolfe
    'SOB',   # Dana Mitchell
    'PCMS',  # Margaret Lockhart
    'CP',    # Leslie Keats
    'CS',    # Sam Swift/Grandparent or Sam Swift/Parent
    'KP',    # Shawn Clancy
    'LAP',   # Tessa Frost
]
# Carol Whitman & Margaret Lockhart did not do PE
_pe_cases_2010 = ['AP', 'TW', 'WL', 'SOB', 'CP', 'CS', 'KP', 'LAP']

dict_2010 = {
    'PPI': {
        # 'PCMS' has a shorter opening block, so each of its PPI ranges
        # sits four questions earlier than the other cases.
        'init': {
            case: generate_question_strings(1, 5 if case == 'PCMS' else 9)
            for case in _cases_2010
        },
        'info_gather': {
            case: (generate_question_strings(6, 12) if case == 'PCMS'
                   else generate_question_strings(10, 16))
            for case in _cases_2010
        },
        'closing': {
            case: (generate_question_strings(13, 18) if case == 'PCMS'
                   else generate_question_strings(17, 22))
            for case in _cases_2010
        },
    },
    'Hx': {
        # Questions run from Q1 to a case-specific last question.
        'history': {
            case: generate_question_strings(1, last_q)
            for case, last_q in [
                ('AP', 10), ('TW', 13), ('AMBN', 6), ('WL', 9), ('SOB', 14),
                ('PCMS', 18), ('CP', 14), ('CS', 11), ('KP', 9), ('LAP', 15),
            ]
        },
    },
    'PE': {
        'handwash': {
            case: generate_question_strings(1, 1) for case in _pe_cases_2010
        },
        # Q3 up to a case-specific last question (Q2 is 'modesty').
        'phys_check': {
            case: generate_question_strings(3, last_q)
            for case, last_q in [
                ('AP', 8), ('TW', 20), ('WL', 10), ('SOB', 5),
                ('CP', 10), ('CS', 12), ('KP', 7), ('LAP', 6),
            ]
        },
        'modesty': {
            case: generate_question_strings(2, 2) for case in _pe_cases_2010
        },
    },
    'PS': {
        # Q1 and Q3 are grouped together; Q2 is 'rec'.
        'personal': {
            case: generate_question_strings(1, 1) + generate_question_strings(3, 3)
            for case in _cases_2010
        },
        'rec': {
            case: generate_question_strings(2, 2) for case in _cases_2010
        },
    },
}

In [14]:
# Run the full pipeline for 2010 with the question grouping in dict_2010.
clean_dataset_2010 = load_and_clean_year_data(2010, dict_2010)

# Print the (rows, columns) shape, then display the first five rows.
print(clean_dataset_2010.shape)
clean_dataset_2010.head()

loading 2010 excel sheet....
Calculating Mean and Variance of Question Groups...
Deidentifying and adding scores...
Done!
(112, 21)


Unnamed: 0,ID,Year,mean_PS_personal,var_PS_personal,mean_PS_rec,var_PS_rec,mean_PPI_info_gather,var_PPI_info_gather,mean_PPI_init,var_PPI_init,...,var_PPI_closing,mean_Hx_history,var_Hx_history,mean_PE_modesty,var_PE_modesty,mean_PE_handwash,var_PE_handwash,mean_PE_phys_check,var_PE_phys_check,P/F
0,521771,2010,,,,,,,,,...,,,,,,,,,,1
1,139291,2010,0.8625,0.022862,0.85,0.016667,0.724,0.100181,0.790581,0.076469,...,0.102543,0.798319,0.16237,0.75,0.214286,0.875,0.125,0.442623,0.25082,1
2,813543,2010,0.7875,0.014967,0.775,0.020139,0.72,0.06353,0.776279,0.050857,...,0.08524,0.789916,0.167355,1.0,0.0,0.75,0.214286,0.622951,0.238798,1
3,816724,2010,0.5125,0.068914,0.5,0.055556,0.557,0.11254,0.620116,0.105684,...,0.128079,0.672269,0.222191,0.625,0.267857,0.75,0.214286,0.52459,0.253552,1
4,101459,2010,0.575,0.099342,0.5,0.083333,0.643286,0.114234,0.666628,0.107797,...,0.145449,0.647059,0.230309,1.0,0.0,1.0,0.0,0.540984,0.252459,1


## 2011

In [15]:
# Question layout for 2011: form -> sub-category -> case code -> question labels.
# Case codes in table order; the trailing comment is the name attached to each.
_cases_2011 = [
    'AP',    # Marty Elliot
    'TW',    # Alex Miller
    'AMBN',  # Carol Whitman
    'WL',    # Corey Wolfe
    'SOB',   # Dana Mitchell
    'PCMS',  # Margaret Lockhart
    'CP',    # Leslie Keats
    'CS',    # Sam Swift/Grandparent or Sam Swift/Parent
    'KP',    # Shawn Clancy
    'LAP',   # Tessa Frost
]
# Carol Whitman & Margaret Lockhart did not do PE
_pe_cases_2011 = ['AP', 'TW', 'WL', 'SOB', 'CP', 'CS', 'KP', 'LAP']

dict_2011 = {
    'PPI': {
        # 'PCMS' has a shorter opening block, so each of its PPI ranges
        # sits four questions earlier than the other cases.
        'init': {
            case: generate_question_strings(1, 5 if case == 'PCMS' else 9)
            for case in _cases_2011
        },
        'info_gather': {
            case: (generate_question_strings(6, 12) if case == 'PCMS'
                   else generate_question_strings(10, 16))
            for case in _cases_2011
        },
        'closing': {
            case: (generate_question_strings(13, 18) if case == 'PCMS'
                   else generate_question_strings(17, 22))
            for case in _cases_2011
        },
    },
    'Hx': {
        # Questions run from Q1 to a case-specific last question.
        'history': {
            case: generate_question_strings(1, last_q)
            for case, last_q in [
                ('AP', 10), ('TW', 13), ('AMBN', 6), ('WL', 9), ('SOB', 14),
                ('PCMS', 18), ('CP', 14), ('CS', 10), ('KP', 9), ('LAP', 15),
            ]
        },
    },
    'PE': {
        'handwash': {
            case: generate_question_strings(1, 1) for case in _pe_cases_2011
        },
        # Q3 up to a case-specific last question (Q2 is 'modesty').
        'phys_check': {
            case: generate_question_strings(3, last_q)
            for case, last_q in [
                ('AP', 8), ('TW', 19), ('WL', 10), ('SOB', 5),
                ('CP', 10), ('CS', 12), ('KP', 7), ('LAP', 6),
            ]
        },
        'modesty': {
            case: generate_question_strings(2, 2) for case in _pe_cases_2011
        },
    },
    'PS': {
        # Q1 and Q3 are grouped together; Q2 is 'rec'.
        'personal': {
            case: generate_question_strings(1, 1) + generate_question_strings(3, 3)
            for case in _cases_2011
        },
        'rec': {
            case: generate_question_strings(2, 2) for case in _cases_2011
        },
    },
}

In [16]:
# Run the full pipeline for 2011 with the question grouping in dict_2011.
clean_dataset_2011 = load_and_clean_year_data(2011, dict_2011)

# Print the (rows, columns) shape, then display the first five rows.
print(clean_dataset_2011.shape)
clean_dataset_2011.head()

loading 2011 excel sheet....
Calculating Mean and Variance of Question Groups...
Deidentifying and adding scores...
Done!
(138, 21)


Unnamed: 0,ID,Year,mean_PS_personal,var_PS_personal,mean_PS_rec,var_PS_rec,mean_PPI_info_gather,var_PPI_info_gather,mean_PPI_init,var_PPI_init,...,var_PPI_closing,mean_Hx_history,var_Hx_history,mean_PE_modesty,var_PE_modesty,mean_PE_handwash,var_PE_handwash,mean_PE_phys_check,var_PE_phys_check,P/F
0,617243,2011,,,,,,,,,...,,,,,,,,,,1
1,469760,2011,0.85,0.015789,0.8,0.025,0.733714,0.089261,0.838023,0.051498,...,0.15996,0.711864,0.206867,1.0,0.0,0.5,0.285714,0.672131,0.224044,1
2,572949,2011,0.8125,0.051809,0.75,0.083333,0.681429,0.098465,0.748372,0.081277,...,0.124582,0.788136,0.168405,0.75,0.214286,0.625,0.267857,0.672131,0.224044,1
3,558483,2011,0.9,0.022368,0.875,0.03125,0.834143,0.053836,0.926744,0.037474,...,0.064665,0.847458,0.130378,1.0,0.0,0.875,0.125,0.852459,0.127869,1
4,640409,2011,0.525,0.045395,0.375,0.03125,0.656857,0.119715,0.650698,0.115512,...,0.116722,0.70339,0.210416,0.625,0.267857,0.25,0.214286,0.47541,0.253552,1


In [24]:
# Print each column name as :'<name>', -- one line per column, in a form that
# can be pasted as the value side of a dict literal.
for name in clean_dataset_2011.columns:
    print(":" + "'" + name + "',")

:'ID',
:'Year',
:'mean_PS_personal',
:'var_PS_personal',
:'mean_PS_rec',
:'var_PS_rec',
:'mean_PPI_info_gather',
:'var_PPI_info_gather',
:'mean_PPI_init',
:'var_PPI_init',
:'mean_PPI_closing',
:'var_PPI_closing',
:'mean_Hx_history',
:'var_Hx_history',
:'mean_PE_modesty',
:'var_PE_modesty',
:'mean_PE_handwash',
:'var_PE_handwash',
:'mean_PE_phys_check',
:'var_PE_phys_check',
:'P/F',


In [29]:


# Report-style column name -> column name produced by the pipeline.
rename_map = {
    'Student':'ID',
    'Year':'Year',
    'PersonSatisf_Mean':'mean_PS_personal',
    'PersonSatisf_Variance':'var_PS_personal',
    'RecSatisf_Mean':'mean_PS_rec',
    'RecSatisf_Variance':'var_PS_rec',
    'PPI_Infor_Mean':'mean_PPI_info_gather',
    'PPI_Infor_Variance':'var_PPI_info_gather',
    'PPI_Init_Mean':'mean_PPI_init',
    'PPI_Init_Variance':'var_PPI_init',
    'PPI_Close_Mean':'mean_PPI_closing',
    'PPI_Close_Variance':'var_PPI_closing',
    'Hx_Mean':'mean_Hx_history',
    'Hx_Variance':'var_Hx_history',
    'PE_Mod_Mean':'mean_PE_modesty',
    'PE_Mod_Variance':'var_PE_modesty',
    'PE_Hand_Mean':'mean_PE_handwash',
    'PE_Hand_Variance':'var_PE_handwash',
    'PE_Check_Mean':'mean_PE_phys_check',
    'PE_Check_Variance':'var_PE_phys_check',
    'Result':'P/F',
}

# Flip the mapping to pipeline name -> report name, the direction that
# DataFrame.rename(columns=...) expects. .items() works on Python 2 and 3.
rename_map = dict((y, x) for x, y in rename_map.items())

# list(...) keeps the printed form a plain list on both Python 2 and Python 3.
print(list(rename_map.values()))
clean_dataset_2011 = clean_dataset_2011.rename(index=str, columns=rename_map)

# Final column order of the renamed table.
correct_order = [
    'Student', 'Year', 'Result',
    'PersonSatisf_Mean', 'PersonSatisf_Variance',
    'RecSatisf_Mean', 'RecSatisf_Variance',
    'PPI_Init_Mean', 'PPI_Init_Variance',
    'PPI_Infor_Mean', 'PPI_Infor_Variance',
    'PPI_Close_Mean', 'PPI_Close_Variance',
    'PE_Hand_Mean', 'PE_Hand_Variance',
    'PE_Mod_Mean', 'PE_Mod_Variance',
    'PE_Check_Mean', 'PE_Check_Variance',
    'Hx_Mean', 'Hx_Variance',
]

clean_dataset_2011 = clean_dataset_2011[correct_order]

clean_dataset_2011

['PE_Hand_Mean', 'PPI_Close_Mean', 'PPI_Init_Variance', 'PPI_Init_Mean', 'PersonSatisf_Mean', 'Year', 'PE_Check_Mean', 'PE_Check_Variance', 'PPI_Infor_Mean', 'PE_Mod_Mean', 'PE_Mod_Variance', 'Hx_Mean', 'Hx_Variance', 'Result', 'PersonSatisf_Variance', 'RecSatisf_Variance', 'RecSatisf_Mean', 'PPI_Infor_Variance', 'PPI_Close_Variance', 'Student', 'PE_Hand_Variance']


Unnamed: 0,Student,Year,Result,PersonSatisf_Mean,PersonSatisf_Variance,RecSatisf_Mean,RecSatisf_Variance,PPI_Init_Mean,PPI_Init_Variance,PPI_Infor_Mean,...,PPI_Close_Mean,PPI_Close_Variance,PE_Hand_Mean,PE_Hand_Variance,PE_Mod_Mean,PE_Mod_Variance,PE_Check_Mean,PE_Check_Variance,Hx_Mean,Hx_Variance
0,617243,2011,1,,,,,,,,...,,,,,,,,,,
1,469760,2011,1,0.850000,0.015789,0.800000,0.025000,0.838023,0.051498,0.733714,...,0.589167,0.159960,0.500000,0.285714,1.000000,0.000000,0.672131,0.224044,0.711864,0.206867
2,572949,2011,1,0.812500,0.051809,0.750000,0.083333,0.748372,0.081277,0.681429,...,0.678000,0.124582,0.625000,0.267857,0.750000,0.214286,0.672131,0.224044,0.788136,0.168405
3,558483,2011,1,0.900000,0.022368,0.875000,0.031250,0.926744,0.037474,0.834143,...,0.872167,0.064665,0.875000,0.125000,1.000000,0.000000,0.852459,0.127869,0.847458,0.130378
4,640409,2011,1,0.525000,0.045395,0.375000,0.031250,0.650698,0.115512,0.656857,...,0.599667,0.116722,0.250000,0.214286,0.625000,0.267857,0.475410,0.253552,0.703390,0.210416
5,935954,2011,1,0.737500,0.029441,0.700000,0.025000,0.865116,0.045230,0.795571,...,0.800333,0.080064,0.875000,0.125000,1.000000,0.000000,0.672131,0.224044,0.771186,0.177966
6,979835,2011,1,0.833333,0.036765,0.777778,0.053819,0.818182,0.075078,0.767778,...,0.827593,0.074456,0.571429,0.285714,1.000000,0.000000,0.568627,0.250196,0.703704,0.210453
7,850841,2011,1,0.487500,0.042599,0.400000,0.058333,0.682326,0.094451,0.608857,...,0.578000,0.105647,0.625000,0.267857,0.750000,0.214286,0.508197,0.254098,0.745763,0.191221
8,990207,2011,1,0.775000,0.051974,0.750000,0.055556,0.810000,0.082878,0.781143,...,0.811000,0.095918,1.000000,0.000000,1.000000,0.000000,0.704918,0.211475,0.771186,0.177966
9,265990,2011,1,,,,,,,,...,,,,,,,,,,


## Playground

In [17]:
# Playground: two tables share the same columns but hold scores for different
# students. Each one is aligned to the full roster and then folded into
# finalDF with combine_first, which fills the gaps of one with the other.
all_names = ['Phil', 'Jane', 'Tony', 'Chico']
finalDF = pd.DataFrame()

# First table: only Phil and Jane have scores.
df1 = pd.DataFrame(
    [['Phil', .5, .3], ['Jane', .4, .75]],
    columns=['name', 'Q1_PPI', 'Q2_PPI']
)
# Index by the first column, expand to the full roster (students without
# scores become NaN rows), then turn the names back into a regular column.
df1 = df1.set_index(df1.columns[0]).reindex(all_names).reset_index()
print(df1)

finalDF = finalDF.combine_first(df1)

# Second table: same questions, but only Tony and Chico have scores.
df3 = pd.DataFrame(
    [['Tony', .1, .8], ['Chico', .1, .25]],
    columns=['name', 'Q1_PPI', 'Q2_PPI']
)
df3 = df3.set_index(df3.columns[0]).reindex(all_names).reset_index()
print(df3)

# An earlier attempt used pd.concat([finalDF, df3], axis=1) here;
# combine_first is used instead so the existing columns get their NaN cells
# filled in.
finalDF = finalDF.combine_first(df3)
finalDF

    name  Q1_PPI  Q2_PPI
0   Phil     0.5    0.30
1   Jane     0.4    0.75
2   Tony     NaN     NaN
3  Chico     NaN     NaN
    name  Q1_PPI  Q2_PPI
0   Phil     NaN     NaN
1   Jane     NaN     NaN
2   Tony     0.1    0.80
3  Chico     0.1    0.25


Unnamed: 0,name,Q1_PPI,Q2_PPI
0,Phil,0.5,0.3
1,Jane,0.4,0.75
2,Tony,0.1,0.8
3,Chico,0.1,0.25


In [18]:
# Playground, continued: a table with *new* question columns (Q1_PS, Q2_PS)
# that only covers Phil and Jane.
df4 = pd.DataFrame(
    [['Phil', .95, .03], ['Jane', .24, .85]],
    columns=['name', 'Q1_PS', 'Q2_PS']
)
# Align to the full roster so the rows line up with finalDF; students without
# scores become NaN rows.
df4 = df4.set_index(df4.columns[0]).reindex(all_names).reset_index()

# combine_first adds the columns finalDF does not have yet.
finalDF = finalDF.combine_first(df4)
finalDF

Unnamed: 0,Q1_PPI,Q1_PS,Q2_PPI,Q2_PS,name
0,0.5,0.95,0.3,0.03,Phil
1,0.4,0.24,0.75,0.85,Jane
2,0.1,,0.8,,Tony
3,0.1,,0.25,,Chico


In [19]:
# Playground, continued: the same Q1_PS/Q2_PS questions, this time with scores
# for Tony and Chico only.
df5 = pd.DataFrame(
    [['Tony', .5, .5], ['Chico', .4, .5]],
    columns=['name', 'Q1_PS', 'Q2_PS']
)
# Align to the full roster so the rows line up with finalDF.
df5 = df5.set_index(df5.columns[0]).reindex(all_names).reset_index()

# combine_first fills the cells that are still NaN in finalDF.
finalDF = finalDF.combine_first(df5)
finalDF

Unnamed: 0,Q1_PPI,Q1_PS,Q2_PPI,Q2_PS,name
0,0.5,0.95,0.3,0.03,Phil
1,0.4,0.24,0.75,0.85,Jane
2,0.1,0.5,0.8,0.5,Tony
3,0.1,0.4,0.25,0.5,Chico
