In [24]:
import math
import pickle
import gzip
import numpy as np
import pandas as pd
import sklearn

from sklearn.utils import resample
%matplotlib inline
import matplotlib.pylab as plt

from sklearn.tree import DecisionTreeClassifier
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

extraction_settings = ComprehensiveFCParameters()

dftrain = pd.read_csv('./all_2/training_set.csv')
dfmetatrain =  pd.read_csv('./all_2/training_set_metadata.csv')
# dftrain.drop(dftrain.columns[0], axis=1, inplace=True)
idcounts = dftrain.iloc[:, 0].value_counts().to_frame('counts')

In [2]:
dftrain.head(5)

Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
0,615,59750.4229,2,-544.810303,3.622952,1
1,615,59750.4306,1,-816.434326,5.55337,1
2,615,59750.4383,3,-471.385529,3.801213,1
3,615,59750.445,4,-388.984985,11.395031,1
4,615,59752.407,2,-681.858887,4.041204,1


In [3]:
dfmetatrain.head(5)

Unnamed: 0,object_id,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target
0,615,349.046051,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,,0.017,92
1,713,53.085938,-27.784405,223.525509,-54.460748,1,1.8181,1.6267,0.2552,45.4063,0.007,88
2,730,33.574219,-6.579593,170.455585,-61.548219,1,0.232,0.2262,0.0157,40.2561,0.021,42
3,745,0.189873,-45.586655,328.254458,-68.969298,1,0.3037,0.2813,1.1523,40.7951,0.007,90
4,1124,352.711273,-63.823658,316.922299,-51.059403,1,0.1934,0.2415,0.0176,40.4166,0.024,90


In [15]:
len(idcounts)  #7848 total ids

def getVal(dftrain, id):
    bands, r = [0,1,2,3,4,5], {}
    for b in bands:
        locb = dftrain.loc[(dftrain.iloc[:,0]==id) & (dftrain.iloc[:,2]==b)]
        r[b] = {} 
        r[b]['mjd'] = locb.iloc[:,1].values
        r[b]['flux'] = locb.iloc[:,3].values
        r[b]['flux_err'] = locb.iloc[:,4].values
        r[b]['detected'] = locb.iloc[:,5].values
    return r


In [16]:
subset_ids = idcounts.index     #try max 352 datapoints 
df = pd.DataFrame()
for id in subset_ids:
    df = df.append({'id': id, 'info':getVal(dftrain, id)}, ignore_index=True)

df.head(5)

Unnamed: 0,id,info
0,104397.0,"{0: {'detected': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
1,7409.0,"{0: {'detected': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
2,248547.0,"{0: {'detected': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
3,238409.0,"{0: {'detected': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
4,52370.0,"{0: {'detected': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0..."


In [26]:
# print('The scikit-learn version is {}.'.format(sklearn.__version__))   #resample works for sklearn >=v20

def balance(df):     
    return resample(df,
                    replace=True, 
                    n_samples=2313,   #max target 90
                    random_state=123)

def bootstrap(df,cntindex):
    idcounts = df.iloc[:, cntindex].value_counts().to_frame('counts')
#     print(idcounts)
    df90 = df.loc[df.iloc[:,cntindex]==90]
    minority_class = [42, 65, 16, 15, 62, 88, 92, 67, 52, 95, 6, 64, 53]
    upsampled_list = [df90]  

    for m in minority_class:
        t = df.loc[df.iloc[:,cntindex]==m]
        upsampled_list.append(balance(t))
    df_upsampled = pd.concat([i for i in upsampled_list])
    
    return df_upsampled

#bootstrap for meta file 
dfmetatrain_upsampled = bootstrap(dfmetatrain, 11)
dfmetatrain_upsampled.drop(['distmod'], axis=1, inplace=True)
dfmetatrain_upsampled['target'].value_counts()

95    2313
15    2313
62    2313
92    2313
90    2313
42    2313
88    2313
6     2313
53    2313
52    2313
67    2313
65    2313
64    2313
16    2313
Name: target, dtype: int64

In [17]:
# for i in [0,1,2,3,4,5]:
#     print(len(df[:1]['info'].values[0][i]['mjd']))


def transform(df): 
    df_tf = pd.DataFrame()
    for i in df.id.values: 
        x = df.loc[df.iloc[:,0]==i]['info'].values[0]
        length = min(len(x[0]['mjd']), 
                     len(x[1]['mjd']), 
                     len(x[2]['mjd']), 
                     len(x[3]['mjd']), 
                     len(x[4]['mjd']), 
                     len(x[5]['mjd']))
        for j in range(length):
            b=0
            mjd = j
            b1 = x[b]['flux'][j]
            b2 = x[b+1]['flux'][j]
            b3 = x[b+2]['flux'][j]
            b4 = x[b+3]['flux'][j]
            b5 = x[b+4]['flux'][j]
            b6 = x[b+5]['flux'][j]
            df_tf = df_tf.append({'id': i, 'mjd': mjd, 'b1': b1, 'b2': b2, 'b3': b3, 'b4': b4, 'b5': b5,'b6': b6}, ignore_index=True)

    
    return df_tf

df_transformed = transform(df)
df_transformed[:10]

Unnamed: 0,b1,b2,b3,b4,b5,b6,id,mjd
0,-1.075714,8.335362,19.709843,40.885571,35.472851,17.899832,104397.0,0.0
1,-0.893701,8.9547,19.698114,37.090401,37.525723,22.316639,104397.0,1.0
2,4.724999,5.424593,17.612799,27.732584,27.094858,23.17811,104397.0,2.0
3,0.393617,6.549182,17.340786,27.444069,26.861343,15.569922,104397.0,3.0
4,-0.755467,6.400099,11.467354,23.10891,19.103769,10.787785,104397.0,4.0
5,-0.467898,4.041076,12.699963,18.051514,19.703526,8.640262,104397.0,5.0
6,0.13258,4.60768,10.395782,16.569839,11.925572,8.323359,104397.0,6.0
7,0.697035,3.300525,12.005974,11.976978,10.721906,6.971705,104397.0,7.0
8,1.372355,3.447134,10.484252,12.155567,9.668359,6.929439,104397.0,8.0
9,2.111459,4.307893,9.387548,13.230844,13.755565,9.797935,104397.0,9.0


In [2]:
#checkpoint 1

# df_transformed.to_pickle("./df_transformed.pkl")
unpickled_df = pd.read_pickle("./df_transformed.pkl")

In [13]:
# #Extract features
# %matplotlib inline
# import matplotlib.pylab as plt
# # import seaborn as sns
# from tsfresh.examples.robot_execution_failures import download_robot_execution_failures, load_robot_execution_failures
# from tsfresh import extract_features, extract_relevant_features, select_features
# from tsfresh.utilities.dataframe_functions import impute
# from tsfresh.feature_extraction import ComprehensiveFCParameters
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.cross_validation import train_test_split
# from sklearn.metrics import classification_report

# extraction_settings = ComprehensiveFCParameters()
X = extract_features(df_transformed, 
                     column_id='id', column_sort='mjd',
                     default_fc_parameters=extraction_settings,
                     impute_function= impute)
X.head(5)

Feature Extraction: 100%|██████████| 20/20 [33:15<00:00, 56.04s/it]


variable,b1__abs_energy,b1__absolute_sum_of_changes,"b1__agg_autocorrelation__f_agg_""mean""","b1__agg_autocorrelation__f_agg_""median""","b1__agg_autocorrelation__f_agg_""var""","b1__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""intercept""","b1__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""rvalue""","b1__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""slope""","b1__agg_linear_trend__f_agg_""max""__chunk_len_10__attr_""stderr""","b1__agg_linear_trend__f_agg_""max""__chunk_len_50__attr_""intercept""",...,b6__time_reversal_asymmetry_statistic__lag_1,b6__time_reversal_asymmetry_statistic__lag_2,b6__time_reversal_asymmetry_statistic__lag_3,b6__value_count__value_-inf,b6__value_count__value_0,b6__value_count__value_1,b6__value_count__value_inf,b6__value_count__value_nan,b6__variance,b6__variance_larger_than_standard_deviation
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
615.0,377891.561386,2959.923972,-0.119171,-0.182411,0.076302,94.873296,-0.096833,-2.402768,12.348419,125.182808,...,-7368693.0,3495249.0,2653816.0,0.0,0.0,0.0,0.0,0.0,85370.493377,1.0
713.0,2783.646129,165.239681,-0.138864,-0.162331,0.42084,15.19688,-0.977437,-4.336846,0.468605,14.509829,...,-62.5749,43.00838,47.09408,0.0,0.0,0.0,0.0,0.0,49.427194,1.0
730.0,188.050735,116.502597,0.009156,-0.042686,0.081196,4.059111,-0.516882,-0.433336,0.358844,5.942166,...,-1421.634,-2198.193,-2198.178,0.0,0.0,0.0,0.0,0.0,174.276892,1.0
745.0,1548.589675,179.9533,-0.050507,-0.105709,0.054559,6.958834,-0.079838,-0.263196,1.643062,18.014029,...,-15395.67,-21920.34,-22605.88,0.0,0.0,0.0,0.0,0.0,666.782529,1.0
1124.0,353.801546,155.457677,-0.022428,-0.009965,0.022863,2.741523,0.330871,0.245951,0.350738,5.330927,...,-6365.682,-7782.086,-5472.836,0.0,0.0,0.0,0.0,0.0,451.38282,1.0


In [15]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 7848 entries, 615.0 to 130779836.0
Columns: 4764 entries, b1__abs_energy to b6__variance_larger_than_standard_deviation
dtypes: float64(4764)
memory usage: 285.3 MB


In [43]:
#checkpoint 2 
X.to_csv('X.csv')
y = dfmetatrain['target']

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1)
cl = DecisionTreeClassifier()
cl.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [35]:
#Validation size 10% no bootsrap
print(classification_report(y_test, cl.predict(X_test)))   
#targets 6, 52, 64, 67 failed because of minimal number of observations

             precision    recall  f1-score   support

          6       0.00      0.00      0.00        22
         15       0.33      0.34      0.34        53
         16       0.79      0.80      0.80        95
         42       0.31      0.31      0.31       117
         52       0.00      0.00      0.00        23
         53       0.33      0.25      0.29         4
         62       0.14      0.19      0.16        43
         64       0.00      0.00      0.00        13
         65       0.52      0.53      0.53        98
         67       0.00      0.00      0.00        18
         88       0.61      0.66      0.63        29
         90       0.50      0.50      0.50       239
         92       0.68      0.83      0.75        18
         95       0.20      0.23      0.21        13

avg / total       0.43      0.44      0.44       785



In [3]:
y = dfmetatrain['target']
y.index = dfmetatrain['object_id']

X_filtered = extract_relevant_features(unpickled_df, y, 
                                       column_id='id', column_sort='mjd', 
                                       default_fc_parameters=extraction_settings)
X_filtered.head(5)

Feature Extraction: 100%|██████████| 20/20 [18:28<00:00, 38.38s/it]


variable,"b2__change_quantiles__f_agg_""mean""__isabs_True__qh_1.0__ql_0.2","b3__change_quantiles__f_agg_""mean""__isabs_True__qh_1.0__ql_0.2","b2__change_quantiles__f_agg_""mean""__isabs_True__qh_1.0__ql_0.0",b2__mean_abs_change,b2__absolute_sum_of_changes,"b3__change_quantiles__f_agg_""var""__isabs_False__qh_1.0__ql_0.2","b2__change_quantiles__f_agg_""mean""__isabs_True__qh_0.8__ql_0.0",b3__mean_abs_change,"b3__change_quantiles__f_agg_""mean""__isabs_True__qh_1.0__ql_0.0",b3__absolute_sum_of_changes,...,"b4__linear_trend__attr_""intercept""","b2__cwt_coefficients__widths_(2, 5, 10, 20)__coeff_9__w_10",b4__number_peaks__n_1,b2__autocorrelation__lag_7,"b1__cwt_coefficients__widths_(2, 5, 10, 20)__coeff_14__w_2","b6__cwt_coefficients__widths_(2, 5, 10, 20)__coeff_12__w_20",b3__c3__lag_2,b5__symmetry_looking__r_0.25,b5__symmetry_looking__r_0.30000000000000004,b1__symmetry_looking__r_0.30000000000000004
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
615.0,519.504357,423.716195,647.078397,647.078397,36236.390207,273618.311367,447.784514,503.257899,503.257899,28182.442351,...,-309.031313,-845.660896,16.0,-0.051752,-105.344718,-554.608005,15251370.0,1.0,1.0,1.0
713.0,2.138922,2.120542,2.241918,2.241918,123.305483,8.156167,2.024384,2.051365,2.051365,112.825074,...,8.056757,14.1078,17.0,0.677033,3.142531,11.621645,-89.40954,1.0,1.0,1.0
730.0,1.517284,2.015711,1.856014,1.856014,92.800712,16.361706,1.412415,1.888433,1.888433,94.421627,...,-2.154664,-1.488664,17.0,-0.043021,0.22155,-3.167655,213.844,1.0,1.0,1.0
745.0,6.382901,12.654187,8.632928,8.632928,466.17813,1877.64438,1.630198,9.276039,9.276039,500.906092,...,17.777045,-22.191762,15.0,-0.01277,-1.816087,50.504436,6531.625,1.0,1.0,1.0
1124.0,3.39024,6.607588,2.810284,2.810284,157.375895,388.366325,1.517726,4.686253,4.686253,262.430156,...,-5.657373,1.054442,18.0,0.255567,0.507083,-18.002683,10374.87,1.0,1.0,1.0


In [4]:
X_filtered.to_pickle('X_filtered.pkl')
X_filtered.info()           #avg 253*6 

<class 'pandas.core.frame.DataFrame'>
Float64Index: 7848 entries, 615.0 to 130779836.0
Columns: 1902 entries, b2__change_quantiles__f_agg_"mean"__isabs_True__qh_1.0__ql_0.2 to b1__symmetry_looking__r_0.30000000000000004
dtypes: float64(1902)
memory usage: 113.9 MB


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_filtered, y, test_size=.1)
cl = DecisionTreeClassifier()
cl.fit(X_train, y_train)
print(classification_report(y_test, cl.predict(X_test)))

              precision    recall  f1-score   support

           6       0.08      0.08      0.08        13
          15       0.28      0.27      0.27        48
          16       0.86      0.81      0.83       104
          42       0.30      0.30      0.30       114
          52       0.03      0.07      0.04        15
          53       1.00      0.50      0.67         2
          62       0.09      0.10      0.10        40
          64       0.00      0.00      0.00         5
          65       0.66      0.54      0.59       125
          67       0.00      0.00      0.00        21
          88       0.65      0.74      0.69        35
          90       0.59      0.56      0.57       229
          92       0.82      0.86      0.84        21
          95       0.12      0.15      0.13        13

   micro avg       0.48      0.48      0.48       785
   macro avg       0.39      0.35      0.37       785
weighted avg       0.51      0.48      0.50       785



In [27]:
#bootstrap training_set file
dfconcat = pd.concat([X_filtered, y], axis=1)
dfconcat = bootstrap(dfconcat, 1902)
idcounts = dfconcat.iloc[:, 1902].value_counts().to_frame('counts')
idcounts

Unnamed: 0,counts
95,2313
15,2313
62,2313
92,2313
90,2313
42,2313
88,2313
6,2313
53,2313
52,2313


In [28]:
X_train, X_test, y_train, y_test = train_test_split(dfconcat.iloc[:, :1902], dfconcat.iloc[:,1902], test_size=.1)
cl = DecisionTreeClassifier()
cl.fit(X_train, y_train)
print("Validation score: - {}".format(accuracy_score(y_test, cl.predict(X_test))))
print(classification_report(y_test, cl.predict(X_test))) 

Validation score: - 0.945353504167953
              precision    recall  f1-score   support

           6       0.98      1.00      0.99       213
          15       0.94      0.99      0.96       239
          16       0.98      1.00      0.99       231
          42       0.83      0.87      0.85       232
          52       0.96      1.00      0.98       236
          53       1.00      1.00      1.00       241
          62       0.91      0.98      0.94       214
          64       0.98      1.00      0.99       247
          65       0.88      0.91      0.90       223
          67       0.94      1.00      0.97       214
          88       1.00      1.00      1.00       230
          90       0.87      0.51      0.65       245
          92       1.00      1.00      1.00       241
          95       0.95      1.00      0.98       233

   micro avg       0.95      0.95      0.95      3239
   macro avg       0.94      0.95      0.94      3239
weighted avg       0.94      0.95      0.9