# Time Series Distance Measure

In [1]:
import pandas as pd
import numpy as np
import datetime
from collections import Counter

#ML Libraries
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn import preprocessing
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

#Distance Measures
from tslearn.neighbors import KNeighborsTimeSeriesClassifier, KNeighborsTimeSeries

## CSV reading

In [18]:
# Load the per-student feature table and build one multivariate series per student.
#
# The CSV files read here are generated by the notebook
# "2-Time Series Classification - Feature Extraction Approach", which keeps the
# processing done in this notebook small.

# Alternative dataset (disabled):
#name_file = "ARQ-BAQ-2002-0_Sem1_2_3_4_Var_grau"
name_file = "TimeSeriesProgramsToRunServer/Res_Feature_Extraction/ADM-BAN-2001-0_Sem1_2_3_4_5_6_7_8_Var_grau"

df_timeseries = pd.read_csv(name_file + '.csv', sep=';')
# The labels file has no header row; it is turned into {id: {'class': label}}.
df_timeseries_label = pd.read_csv(name_file + '_labels.csv', sep=';', header = None , names = ['id','class'], index_col = 'id' ).to_dict('index')

timeseries = []
timeseries_label = []
for matricula, df_aluno in df_timeseries.groupby('id'):
    # sort_values returns a NEW frame; the original code discarded that result,
    # so the rows were never actually ordered by semester. Use the sorted frame
    # and drop the bookkeeping columns so that only feature columns remain.
    df_aluno_dc = (
        df_aluno
        .sort_values('semestre')
        .drop(columns=['Unnamed: 0', 'id', 'semestre'])
    )
    # Transpose so that rows are feature columns and columns are semesters.
    timeseries.append(df_aluno_dc.values.transpose())
    timeseries_label.append(df_timeseries_label[matricula]['class'])

timeseries = np.asarray(timeseries)
timeseries_label = np.asarray(timeseries_label)

print("Dimensiones del dataset (num_matricu, dimensions (disciplinas), num_semes): ")
print(timeseries.shape)
print("Number of samples per class: ")
contador_class = Counter(timeseries_label)  # class counts of the full dataset, before any resampling
print(contador_class)




Dimensiones del dataset (num_matricu, dimensions (disciplinas), num_semes): 
(570, 45, 8)
Number of samples per class: 
Counter({0: 532, 1: 38})


## Undersampling

In [19]:

# Balance the two classes by random undersampling: keep every sample labelled 1
# (called "dropout" in this notebook) and draw an equally sized random subset,
# without replacement, of the samples labelled 0.
no_dropout_indices, = np.where(timeseries_label == 0)
dropout_indices, = np.where(timeseries_label == 1)

# Fixed seed so the same subset is drawn on every run.
np.random.seed(42)
# Size the draw from the indices computed in this cell rather than from a
# Counter built elsewhere, so the cell cannot pick up a stale class count.
random_no_dropout_indices = np.random.choice(no_dropout_indices, len(dropout_indices), replace=False)

under_sample_indices = np.concatenate([dropout_indices, random_no_dropout_indices])

timeseries_under_sample = timeseries[under_sample_indices]
timeseries_label_under_sample = timeseries_label[under_sample_indices]

print("Despues del Undersampling:")
print(timeseries_under_sample.shape)
print(Counter(timeseries_label_under_sample))


Despues del Undersampling:
(76, 45, 8)
Counter({0: 38, 1: 38})


## KNN Training

In [21]:

# Cross-validate a DTW-based k-nearest-neighbours classifier on the undersampled
# data. Timestamps are printed before and after to show how long it took.
print(datetime.datetime.now())

# Metrics computed on every fold; the keys become 'test_<key>' in the result.
scoring = {'accuracy': 'accuracy',
           'precision': 'precision',
           'recall': 'recall',
           'f1': 'f1'}

# KNN classifier using dynamic time warping as the distance.
# NOTE(review): with an even number of neighbours, votes can tie; confirm that
# n_neighbors=2 is intended.
# NOTE(review): tslearn documents its input as (n_ts, sz, d) with sz the time
# axis, while the dataset printout in this notebook describes the array as
# (students, disciplines, semesters) -- confirm that aligning along the
# discipline axis is the intended representation.
knn_clf = KNeighborsTimeSeriesClassifier(n_neighbors=2, metric="dtw")
scores = cross_validate(knn_clf, timeseries_under_sample, timeseries_label_under_sample, scoring=scoring, cv=2)

print(datetime.datetime.now())

print(scores.keys())
# Report the mean of each test metric across folds, in the original order.
for title, key in [("Accuracy Score:", 'test_accuracy'),
                   ("Recall Score:", 'test_recall'),
                   ("Precision Score:", 'test_precision'),
                   ("F1 Score:", 'test_f1')]:
    print(title)
    print(np.average(scores[key]))



2018-09-12 18:20:44.411826
2018-09-12 18:21:49.834522
['test_f1', 'train_accuracy', 'test_recall', 'score_time', 'train_f1', 'fit_time', 'train_precision', 'test_accuracy', 'train_recall', 'test_precision']
Accuracy Score:
0.75
Recall Score:
0.763157894736842
Precision Score:
0.7459893048128342
F1 Score:
0.7513550135501355


In [None]:
## Variant using the representation in which time is the semester.
# The triple-quoted string below is never executed; it holds a disabled copy of
# the pipeline in which each student's matrix is appended WITHOUT transposing,
# and the classifier uses n_neighbors=1 with cv=5.
# NOTE(review): as written, the disabled code never converts `timeseries` /
# `timeseries_label` to numpy arrays before using `.shape` and index arrays, and
# it discards the result of sort_values; both would need fixing before enabling.
"""

timeseries = []
timeseries_label = []
for matricula,df_aluno in df_timeseries.groupby('id'):
    df_aluno.sort_values('semestre')
    df_aluno_dc = df_aluno.copy() 
    df_aluno_dc.drop(columns=['Unnamed: 0', 'id', 'semestre'],inplace=True)
    #print df_aluno.values
    #print df_aluno_dc.values
    timeseries.append(df_aluno_dc.values)
    timeseries_label.append(df_timeseries_label[matricula]['class'])
    

print "Dimensiones del dataset (num_matricu, num_semes, dimensions (disciplinas)): "
print timeseries.shape
print "Number of samples per class: "
contador_class = Counter(timeseries_label) #Despues del Sampling
print contador_class

#Undersamplig criollo
no_dropout_indices, = np.where( timeseries_label == 0)
dropout_indices, = np.where( timeseries_label == 1)
np.random.seed(42)
random_no_dropout_indices = np.random.choice(no_dropout_indices,contador_class[1],replace=False)

under_sample_indices = np.concatenate([dropout_indices,random_no_dropout_indices])

timeseries_under_sample = timeseries[under_sample_indices]
timeseries_label_under_sample = timeseries_label[under_sample_indices]

print "Despues del Undersampling:"
print timeseries_under_sample.shape
print Counter(timeseries_label_under_sample)


print(datetime.datetime.now())
# Nearest neighbor classification
scoring = {'accuracy': 'accuracy',
           'precision': 'precision',
           'recall': 'recall',
           'f1': 'f1'}

#KNN Classifier
knn_clf = KNeighborsTimeSeriesClassifier(n_neighbors=1, metric="dtw")
scores = cross_validate(knn_clf, timeseries_under_sample, timeseries_label_under_sample, scoring=scoring, cv=5)

print(datetime.datetime.now())


print(scores.keys())
print "Accuracy Score:" 
print np.average(scores['test_accuracy'])
print "Recall Score:"
print np.average(scores['test_recall'])
print "Precision Score:" 
print np.average(scores['test_precision'])
print "F1 Score:" 
print np.average(scores['test_f1'])

"""