# NeuroLit Prediction Notebook
## Software Engineering for Data Scientists, Autumn 2017
## Maggie Clarke, Patrick Donnelly, & Sritam Kethireddy

In [1]:
# import necessary modules
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import os
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import neurolit as nlit



### Initialize dataset
Before applying ML techniques, select the variables of interest so that the analysis is narrowed down to the data it actually needs.

**data_folder**: choose a location to store temporary data files that are locally available

**selected_metalabels**: choose a group of variables of interest with which to build and test the model
    options include:
        * WJ - Woodcock Johnson IV Tests of Achievement
        * TOWRE - Test of Word Reading Efficiency
        * WASI - Wechsler Abbreviated Scale of Intelligence
        * CTOPP - Comprehensive Test of Phonological Processing
        
**metalabel_files**: this is the file that categorizes the variables with metalabels

**selected_features**: for future functionality, this variable can zero in on a specific variable of interest;
        in the current iteration, this can only be a test that does not fall within the selected metalabel
        
**outcome_variable**: the survey variable of interest to use as the outcome (e.g. 'Dyslexia Diagnosis')

**missingness_threshold**: set threshold for amount of missing data permitted for analysis

**max_missing_count**: set the maximum number of missing values permitted for a particular participant

In [None]:
# build the Dataset object that the following cells clean, reduce and classify;
# data_folder points at the 'data' directory inside the neurolit package itself
ilabs_data = nlit.Dataset(
    data_folder=os.path.join(nlit.__path__[0], 'data'),
    selected_metalabels='CTOPP',
    metalabel_files='readingdata_metalabels.csv',
    selected_features=None,
    outcome_variable='Dyslexia Diagnosis',
    missingness_threshold=0.4,
    max_missing_count=1,
)

In [None]:
# visualize how much of the dataset is missing before any cleaning is applied
# (output_directory is set to 'output')
nlit.dataset.Dataset.visualize_missingness(
    ilabs_data,
    output_directory='output',
)

In [None]:
# fill in missing values first, then normalize the imputed data for the PCA fit
ilabs_data = nlit.dataset.normalize_data(
    nlit.dataset.impute_missing(ilabs_data)
)

In [None]:
# run the PCA fit on the prepared dataset
pca, pca_data = nlit.reduce.perform_pca(ilabs_data)

# plot explained variance as a function of components
nlit.reduce.pca_variance_plot(
    pca,
    output_directory='output',
)

# replace pca_data with the extracted components, which the classifier cell uses
pca_data = nlit.reduce.extract_pca_components(pca_data)

In [None]:
# build a 'logistic_regression' classifier from the dataset and its PCA data,
# then plot the confusion matrix (output_directory is set to 'output')
linear_model = nlit.classify.Classifier(
    dataset_object=ilabs_data,
    pca_data=pca_data,
    model_type='logistic_regression',
)
linear_model.plot_confusion_matrix(output_directory='output')