In [8]:



import os
import sys
import time
import numpy as np
import pandas as pd
import seaborn as sns
from pathlib import Path
import matplotlib.pyplot as plt 
from collections import defaultdict 
from types import SimpleNamespace




In [9]:

# Add the parent directory to the import search path so that the `dd_package`
# imports below can be resolved (presumably the package lives one level up — confirm).
# NOTE(review): "../" is resolved against the kernel's working directory.
sys.path.append("../")

from dd_package.data.dyslexia_data import DyslexiaData
from dd_package.data.preprocess import preprocess_data

from dd_package.models.regression_estimators import RegressionEstimators
from dd_package.models.classification_estimators import ClassificationEstimators

from dd_package.common.utils import save_a_dict, load_a_dict, print_the_evaluated_results




In [11]:
# Load the results dictionary stored under this name in the results folder.
tmp = load_a_dict(
    name="dd_ia_demo_phono-rf_reg--shuffled:True--exclude at risk:0",
    save_path="../../Results/",
)



In [12]:
# Print the evaluation summary of the loaded results, using the regression metrics.
print_the_evaluated_results(results=tmp, learning_method="regression")



   mae     rmse  	 mrae 	 r2_score  	 meape  	 jsd 
 Ave   std  Ave   std   Ave   std   Ave   std   Ave   std   Ave   std 
0.426 0.033 1.814 0.284 0.006 0.000 0.997 0.001 0.049 0.018 0.008 0.001


In [10]:


# ---------------------------------------------------------------------------
# Experiment configuration
# ---------------------------------------------------------------------------

# Root folder that holds every artefact sub-folder of the project.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
dd_root = Path("/home/soroosh/Programmes/DD")

# Settings identifying this run.
estimator_name = "SV_cls"
data_name = "DD_demo"
to_shuffle = True
learning_method = "classification"

# Run identifier of the form "<data_name>-<estimator_name>-<to_shuffle>".
specifier = "-".join((data_name, estimator_name, str(to_shuffle)))

# Build the namespace in one step so `configs` is never bound to a plain dict
# first; attribute names, order and values are the same as before.
configs = SimpleNamespace(
    models_path=dd_root / "Models",
    results_path=dd_root / "Results",
    figures_path=dd_root / "Figures",
    params_path=dd_root / "Params",
    n_repeats=10,
    n_splits=5,
    specifier=specifier,
    data_name=data_name,
    name_wb=data_name + ": " + specifier,
    learning_method=learning_method,
)
# configs.project = "DD_test"
# configs.group = "debug"




In [11]:

# res = load_a_dict(name="dd_demo-l_cls--shuffled:True--exclude at risk:0",
#                   save_path="/home/soroosh/Programmes/DD/Results/")


# print_the_evaluated_results(results=res, learning_method="classification")


# for k, v in res.items():
#     print("k:", k)
#     print("y_true:", v["y_test"])
#     print("y_pred:", v["y_pred"])
#     print(" ")
    


In [12]:

# from sklearn.preprocessing import OneHotEncoder

# enc = OneHotEncoder(sparse=False)
# y_true_ = v["y_test"].reshape(-1, 1)
# y_true_ = enc.fit_transform(y_true_)



In [13]:

# Data handler pointed at the datasets folder.
# NOTE(review): n_repeats=5 is hard-coded here while configs.n_repeats is 10 —
# confirm which value is intended.
dd = DyslexiaData(
    path="../../datasets/",
    n_repeats=5,
)

# Load the three report collections; each call prints the per-class shapes.
demos = dd.get_demo_datasets()
ias = dd.get_ia_datasets()
fixs = dd.get_fix_datasets()




Loading Demo data: 
  dyslexia (72, 9)
  norm (213, 9)
  risk (22, 9)
 
Loading IA_report data: 
  dyslexia (12641, 17)
  norm (38075, 17)
  risk (4148, 17)
 
Loading Fixation report data:
  dyslexia (59770, 7)
  norm (139507, 7)
  risk (26073, 7)
 


In [None]:

# Combine the per-class demo tables into one frame (presumably a pandas DataFrame — confirm).
# The index reset is left disabled.
demo = dd.concat_classes_demo()  # .reset_index(drop=True)

# Display the combined frame for inspection.
demo



In [None]:


# demo = demo.loc[demo.Group!=2]

# demo



In [None]:

# ia = dd.concat_classes_ia()  # .reset_index(drop=True)

# ia



In [None]:


# fix = dd.concat_classes_fix()  # .reset_index(drop=True)

# fix




In [None]:




# fix_demo = dd.concat_dfs(df1=fix,  
#                          df2=demo,
#                          features1=fix.columns, 
#                          features2=demo.columns[2:],
#                         )



# fix_demo





In [None]:




# ia_demo = dd.concat_dfs(df1=ia,  
#                         df2=demo,
#                         features1=ia.columns, 
#                         features2=demo.columns[2:],
#                         )

# ia_demo




In [None]:


# Split the demo frame into features and targets.
# "Sex" and "Grade" are passed as `c_features` (presumably the categorical
# columns to one-hot encode — confirm); "SubjectID" is passed as an indicator.
x_org, y_org = dd.get_onehot_features_targets(
    data=demo,
    c_features=["Sex", "Grade"],
    indicators=["SubjectID"],
)


In [None]:


# Preprocess the feature matrix only; the targets are not passed through here.
# NOTE(review): pp='mm' presumably selects min-max scaling, whereas the inline
# comment says "standardized" — confirm which one applies.
x = preprocess_data(x=x_org, pp='mm')  # only x is standardized



In [None]:

# Stratified k-fold splitter with configs.n_splits splits; handed to the estimator as `cv`.
cv = dd.get_stratified_kfold_cv(to_shuffle=to_shuffle, n_splits=configs.n_splits)



In [None]:


# Target vector: the values of the "Group" column of y_org.
y = y_org.Group.values


# Rescaling of the targets is disabled; the call is kept for reference.
# y = preprocess_data(x=y, pp='mm')  # only x is standardized


In [None]:


# Build the stratified train/test splits, stratifying on the Group labels.
# NOTE(review): n_splits is fed from configs.n_repeats (not configs.n_splits) —
# presumably one split per repeat; confirm this is intended.
data = dd.get_stratified_train_test_splits(
    x=x,
    y=y,
    labels=y_org.Group.values,
    to_shuffle=to_shuffle,
    n_splits=configs.n_repeats,
)



In [None]:


# Wrap features, targets, CV splitter and train/test splits in the estimator helper.
# NOTE(review): the variable is called `reg_est` although it holds a
# ClassificationEstimators instance; the name is kept because later cells use it.
reg_est = ClassificationEstimators(
    x=x,
    y=y,
    cv=cv,
    data=data,
    estimator_name=estimator_name,
    configs=configs,
)




In [None]:


# Set up the estimator and the parameters used for tuning (per the method name).
reg_est.instantiate_tuning_estimator_and_parameters()



In [None]:


# Run hyper-parameter tuning on the estimator object.
reg_est.tune_hyper_parameters()





In [None]:


# Instantiate the estimator for the train/test phase
# (presumably with the tuned parameters — confirm).
reg_est.instantiate_train_test_estimator()




In [None]:

# Train and test the tuned estimator
# (presumably over the splits passed in as `data` — confirm).
reg_est.train_test_tuned_estimator()





In [None]:

# reg_est.save_params_results()




In [None]:

# Print the results held by the estimator object.
reg_est.print_results()




In [None]:



# Load a stored results dictionary and print its classification summary.
# NOTE(review): the path is absolute and machine-specific, and the name refers
# to "L_cls" while estimator_name is set to "SV_cls" — confirm this is the
# intended file.
res = load_a_dict(
    name="DD_demo-L_cls-True_TEST",
    save_path="/home/soroosh/Programmes/DD/Results/",
)
print_the_evaluated_results(results=res, learning_method="classification")


In [None]:


# Print the predicted probabilities stored in each result entry.
# Only the values are needed, so iterate over them directly instead of
# unpacking a key that is never used.
for result in res.values():
    print("probs:", result["y_pred_prob"])
    

In [16]:
# Whether to drop the at-risk class (Group == 2) from the data.
to_exclude_at_risk = False

# Reload the demo tables; the call prints the per-class shapes. The returned
# dict of per-class DataFrames is not needed here, so it is discarded.
# The demo tables also contain the phonological columns.
_ = dd.get_demo_datasets()
demo_phono = dd.concat_classes_demo()

# Exclude the at-risk class with a boolean mask when requested.
# The flag is a bool, so test its truth value directly.
if to_exclude_at_risk:
    demo_phono = demo_phono.loc[demo_phono.Group != 2]

# Keep only the label, the subject identifier and the phonological/reading columns.
df_data_to_use = demo_phono.loc[
    :, ["Group", "SubjectID", "Sound_detection", "Sound_change", "Reading_speed"]
]
c_features = None
indicators = ["SubjectID"]
targets = ["Group", "Reading_speed"]

Loading Demo data: 
  dyslexia (72, 9)
  norm (213, 9)
  risk (22, 9)
 


In [17]:
# Display the selected columns for inspection.
df_data_to_use

Unnamed: 0,Group,SubjectID,Sound_detection,Sound_change,Reading_speed
0,1,10,0.620000,0.910000,102.0
1,1,108,1.000000,0.950000,139.0
2,1,11,1.000000,0.790000,110.0
3,1,11_norm1,0.950000,0.910000,71.0
4,1,12_norm1,0.950000,0.620000,60.0
...,...,...,...,...,...
212,1,varya_norm1,1.000000,0.920000,56.0
68,3,vpa21,0.916667,0.916667,22.0
69,3,yea20,0.875000,0.500000,20.0
70,3,zaa15,0.958333,0.916667,78.0


In [18]:
# Whether to drop the at-risk class (Group == 2) from the data.
to_exclude_at_risk = True

# Reload the demo tables; the call prints the per-class shapes. The returned
# dict of per-class DataFrames is not needed here, so it is discarded.
# The demo tables also contain the phonological columns.
_ = dd.get_demo_datasets()
demo_phono = dd.concat_classes_demo()

# Exclude the at-risk class with a boolean mask when requested.
# The flag is a bool, so test its truth value directly.
if to_exclude_at_risk:
    demo_phono = demo_phono.loc[demo_phono.Group != 2]

# Keep only the label, the subject identifier and the phonological/reading columns.
df_data_to_use = demo_phono.loc[
    :, ["Group", "SubjectID", "Sound_detection", "Sound_change", "Reading_speed"]
]
c_features = None
indicators = ["SubjectID"]
targets = ["Group", "Reading_speed"]

Loading Demo data: 
  dyslexia (72, 9)
  norm (213, 9)
  risk (22, 9)
 


In [19]:
# Display the selected columns for inspection.
df_data_to_use

Unnamed: 0,Group,SubjectID,Sound_detection,Sound_change,Reading_speed
0,1,10,0.620000,0.910000,102.0
1,1,108,1.000000,0.950000,139.0
2,1,11,1.000000,0.790000,110.0
3,1,11_norm1,0.950000,0.910000,71.0
4,1,12_norm1,0.950000,0.620000,60.0
...,...,...,...,...,...
212,1,varya_norm1,1.000000,0.920000,56.0
68,3,vpa21,0.916667,0.916667,22.0
69,3,yea20,0.875000,0.500000,20.0
70,3,zaa15,0.958333,0.916667,78.0


In [22]:
# Distinct Group labels present in the selected data.
set(df_data_to_use["Group"])

{1, 3}