In [1]:
import pandas as pd
import numpy as np
import datetime
from collections import Counter

#ML Libraries
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn import preprocessing
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

## CSV reading

In [29]:
#name_file = "ARQ-BAQ-2002-0_Sem1_2_3_Var_grau"
name_file = "TimeSeriesProgramsToRunServer/Res_Feature_Extraction/ADM-BAN-2001-0_Sem1_Var_grau"

df_timeseries = pd.read_csv(name_file + '.csv', sep=';')
df_timeseries_label = pd.read_csv(name_file + '_labels.csv', sep=';', header = None , names = ['id','class'], index_col = 'id' ).to_dict('index')


lista_features = list(df_timeseries.columns.values) #Cada curso es un feature
lista_features.remove('Unnamed: 0')
lista_features.remove('id')
lista_features.remove('semestre')


dataset = []
dataset_label = []

for matricula,df_aluno in df_timeseries.groupby('id'):
    matricula_row = []
    df_aluno.sort_values('semestre')
    df_aluno_dc = df_aluno.copy() 
    df_aluno_dc.drop(columns=['Unnamed: 0', 'id', 'semestre'],inplace=True)
    num_semestres = df_aluno_dc.shape[0]
    
    for feature in lista_features:
        matricula_row.append(df_aluno_dc[feature].max())
        valores = df_aluno_dc[feature].value_counts()
        
        #TODO: SI SE INCLUYEN EL NUMERO DE INTENTOS?
        if -1 in valores.index:
            #El numero de intentos es igual al numero de semestres donde obtuvo nota. -1 indica que no inscribio la disciplina.
            num_intentos = num_semestres - valores[-1]
        else:
            num_intentos = num_semestres
             
    dataset.append(matricula_row)
    dataset_label.append(df_timeseries_label[matricula]['class'])


X = pd.DataFrame(dataset)
X.columns = lista_features
y = dataset_label

X.head()


Unnamed: 0,ADM1251_grau,ADM1258_grau,ADM1259_grau,ADM1271_grau,ADM1272_grau,ADM1276_grau,ADM1451_grau,ADM1551_grau,ADM1552_grau,ADM1951_grau,...,FIL0201_grau,JUR1016_grau,JUR1018_grau,LET1040_grau,MAT1127_grau,MAT1128_grau,MAT1129_grau,PSI1033_grau,SOC0201_grau,SOC0203_grau
0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,0.0
1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,0.0,-1.0,0.8,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.72,...,-1.0,-1.0,-1.0,-1.0,0.2,-1.0,-1.0,-1.0,-1.0,-1.0
3,0.38,-1.0,0.83,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.57,...,-1.0,-1.0,-1.0,-1.0,0.6,-1.0,-1.0,-1.0,-1.0,-1.0
4,0.55,-1.0,0.83,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.77,...,-1.0,-1.0,-1.0,-1.0,0.85,-1.0,-1.0,-1.0,-1.0,-1.0


## Machine Learning Training Models

In [30]:

print "Dimensiones antes del sampling: "
print Counter(y).items() #Antes del Sampling
#X_resampled, y_resampled = RandomOverSampler(random_state=42).fit_sample(X_selected, y)
X_resampled, y_resampled = SMOTE(random_state=42).fit_sample(X, y)
print "Dimensiones despues del sampling: "
print Counter(y_resampled).items() #Despues del Sampling


scoring = {'accuracy': 'accuracy',
           'precision': 'precision',
           'recall': 'recall',
           'f1': 'f1'}


clasificadores_score = {}

gnb = GaussianNB()
scores = cross_validate(gnb, X_resampled, y_resampled, scoring=scoring, cv=10)
res_tem = {"Acc" : np.average(scores['test_accuracy']), "Recall": np.average(scores['test_recall']),
          "Precision": np.average(scores['test_precision']), "F1": np.average(scores['test_f1'])}
clasificadores_score['GaussianNB'] = res_tem

svc = SVC(C=1)
scores = cross_validate(svc, X_resampled, y_resampled, scoring=scoring, cv=10)
res_tem = {"Acc" : np.average(scores['test_accuracy']), "Recall": np.average(scores['test_recall']),
          "Precision": np.average(scores['test_precision']), "F1": np.average(scores['test_f1'])}
clasificadores_score['SVC'] = res_tem

rf = RandomForestClassifier(n_estimators = 200)
scores = cross_validate(rf, X_resampled, y_resampled, scoring=scoring, cv=10)
res_tem = {"Acc" : np.average(scores['test_accuracy']), "Recall": np.average(scores['test_recall']),
          "Precision": np.average(scores['test_precision']), "F1": np.average(scores['test_f1'])}
clasificadores_score['RandomForestClassifier'] = res_tem


gbc = GradientBoostingClassifier(learning_rate=0.1,n_estimators=200,max_depth=10)
scores = cross_validate(gbc, X_resampled, y_resampled, scoring=scoring, cv=10)
res_tem = {"Acc" : np.average(scores['test_accuracy']), "Recall": np.average(scores['test_recall']),
          "Precision": np.average(scores['test_precision']), "F1": np.average(scores['test_f1'])}
clasificadores_score['GradientBoostingClassifier'] = res_tem



print "RESULTADOS DE LOS MODELOS DE CLASIFICACION"
print clasificadores_score


Dimensiones antes del sampling: 
[(0, 790), (1, 186)]
Dimensiones despues del sampling: 
[(0, 790), (1, 790)]
RESULTADOS DE LOS MODELOS DE CLASIFICACION
{'GaussianNB': {'Acc': 0.5943037974683544, 'Recall': 0.8240506329113926, 'Precision': 0.5662915556847105, 'F1': 0.6704594578413859}, 'SVC': {'Acc': 0.7455696202531646, 'Recall': 0.6974683544303798, 'Precision': 0.7762803577571381, 'F1': 0.733247419698899}, 'GradientBoostingClassifier': {'Acc': 0.8537974683544304, 'Recall': 0.8696202531645569, 'Precision': 0.8447405625660869, 'F1': 0.8539821406138911}, 'RandomForestClassifier': {'Acc': 0.8620253164556961, 'Recall': 0.8696202531645569, 'Precision': 0.8601869044951714, 'F1': 0.8615119195047877}}


In [31]:

#Mean decrease impurity
rf.fit(X_resampled, y_resampled)
feature_importances = pd.DataFrame(rf.feature_importances_, index = lista_features, columns=['importance']).sort_values('importance',ascending=False)
feature_importances.head(10)


Unnamed: 0,importance
ADM1251_grau,0.185813
ADM1951_grau,0.165721
MAT1127_grau,0.155514
ECO1101_grau,0.139214
ADM1259_grau,0.131322
SOC0201_grau,0.036361
FIL0201_grau,0.026612
ADM1952_grau,0.017733
ADM1271_grau,0.015858
MAT1128_grau,0.013951
