In [None]:
import glob as gl
import os
import time

import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.inspection import permutation_importance
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

In [None]:
# Directory with the result CSVs; every file whose name starts with "20"
# (e.g. 2011-03-24_15.csv) is loaded and stacked into a single frame.
path = "src/results"

# glob returns matches in arbitrary filesystem order. Sorting keeps the row
# order of `df` stable across machines, which matters because the seeded
# train/test split further down selects rows by position.
csv_files = sorted(gl.glob(os.path.join(path, '20*.csv')))
if not csv_files:
    # Fail with an explicit message instead of pd.concat's generic
    # "No objects to concatenate".
    raise FileNotFoundError(f"No CSV files matching '20*.csv' found in {path!r}")

dfs = [pd.read_csv(csv_file) for csv_file in csv_files]

df = pd.concat(dfs, ignore_index=True)
df.count()  # author's note: each CSV has 160920 rows

## Label Encoder

In [3]:
# Integer-encode the model labels stored in positional column 8.
modelo = LabelEncoder().fit_transform(df.iloc[:, 8].values)

In [None]:
# Integer-encode the datetime labels stored in positional column 14.
datetime = LabelEncoder().fit_transform(df.iloc[:, 14].values)

In [4]:
# Swap the raw string columns (positions 8 and 14) for their encoded versions,
# which are appended at the end of the frame.
# NOTE(review): this cell reassigns `df`, so running it twice drops two further
# columns by position — re-run the loading cell first if it must be repeated.
raw_label_columns = df.columns[[8, 14]]
df = df.drop(columns=raw_label_columns)
df['modelo'] = modelo
df['datetime'] = datetime

# Target: first column (overflow values, per the original author's note).
y = df.iloc[:, 0].values

# Predictors: every remaining column except the ones excluded by name below.
excluded_columns = ['datetime', 'timestamp', 'load', 'DC Array Output (W)', 'Pavg', 'dif']
X = df.iloc[:, 1:].drop(columns=excluded_columns)
X.columns

Index(['cap', 'dist', 'origen_id', 'dest_id', 'len_origen_tag', 'len_dest_tag',
       'criterion', 'degree', 'total_balance', 'abs_flux', 'h',
       'Beam Irradiance (W/m2)', 'Diffuse Irradiance (W/m2)',
       'Ambient Temperature (C)', 'Plane of Array Irradiance (W/m2)',
       'Cell Temperature (C)', 'modelo'],
      dtype='object')

## Split

In [5]:
# Hold out 20 % of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Standardise using statistics learned from the training split only, then
# apply the same transformation to the test split.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

## PCA

In [7]:
# Number of principal components to keep (the author also experimented with 4).
N_PCA_COMPONENTS = 2

# Fit the projection on the training split and reuse it for the test split.
pca = PCA(n_components=N_PCA_COMPONENTS)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

In [None]:
# Elbow method.
# The projection PCA above keeps only 2 components, so its variance curve has
# just two points and cannot show an elbow. Fit a separate PCA that keeps every
# component purely for this diagnostic plot.
pca_elbow = PCA().fit(X_train)

# explained_variance_ratio_ holds fractions; convert to percent to match the axis label.
explained_pct = pca_elbow.explained_variance_ratio_ * 100
component_number = np.arange(1, len(explained_pct) + 1)  # 1-based component index

fig = plt.figure(1, figsize=(12, 6))
fig.clf()
ax = fig.add_axes([.2, .2, .7, .7])
ax.plot(component_number, explained_pct, linewidth=2, marker='o')
ax.axis('tight')
ax.set_xlabel('Número de componentes')
ax.set_title('Aplicación de Análisis de Componentes Principales (PCA)')
ax.set_ylabel('Varianza (%)')
plt.show()

## Univariate feature selection

In [None]:
# Univariate selection with the chi-squared score.
# Number of features to keep. Author's note: k=5 was used as a first trial;
# 8 is the optimum found afterwards with RFECV.
N_SELECTED_FEATURES = 8

# chi2 requires non-negative inputs, so the standardised training features are
# rescaled to [0, 1] for scoring only.
scaler = MinMaxScaler()
X_train_minmax = scaler.fit_transform(X_train)

select_feature = SelectKBest(chi2, k=N_SELECTED_FEATURES).fit(X_train_minmax, y_train)

# Map the selected column indices back to the original column names.
selected_features = X.columns[select_feature.get_support(indices=True)]

print('Selected features:', selected_features)

In [None]:
# Keep only the columns chosen by the fitted selector in both splits.
X_train_fs, X_test_fs = (
    select_feature.transform(split) for split in (X_train, X_test)
)

## SVM

In [8]:
# Time the fit: `fincl - inicl` is reused later to estimate the grid-search cost.
# Author's recorded timings: 487 min on the full feature set, 69 min with PCA,
# 7237 s on the lab machine (without PCA).
inicl = time.time()

classifier = SVC(kernel='rbf', random_state=0)

# Trained on the PCA projection. To train on the full standardised features
# instead, fit on X_train (and evaluate on X_test accordingly).
classifier.fit(X_train_pca, y_train)
# The former `classifier.estimators_` line was removed: SVC has no such
# attribute, so it raised AttributeError and `fincl` was never set.

fincl = time.time()
print(fincl - inicl)

### Evaluación de features (classifier sin aplicar PCA o FS)

In [None]:
# Feature importance derived from the support vectors.
# NOTE(review): dual_coef_ @ support_vectors_ is the exact weight vector only
# for a linear kernel; with the RBF kernel used here it is a rough heuristic.

support_vectors = classifier.support_vectors_  # support vectors
dual_coef = classifier.dual_coef_  # associated dual coefficients
print("Vectores soporte y coeficientes: ")
print(support_vectors)
print(dual_coef)

importances = np.abs(np.dot(dual_coef, support_vectors)).flatten()

# Label the bars according to what the classifier was actually trained on.
# Using X.columns unconditionally raised a length-mismatch error when the
# classifier had been fitted on PCA components or on a selected subset.
n_inputs = support_vectors.shape[1]
if n_inputs == len(X.columns):
    feature_labels = X.columns
else:
    feature_labels = [f'feature_{i + 1}' for i in range(n_inputs)]

importances_series = pd.Series(importances, index=feature_labels)
plt.figure(figsize=(12, 6))
importances_series.plot(kind='bar')
plt.ylabel('Importancia')
plt.title('Importancia de las características del SVC con kernel RBF')
plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Feature importance based on feature permutation.

# The classifier must be scored on the same representation it was fitted on.
# Pick the test matrix (and matching labels) whose width equals the
# classifier's input width; the first match wins if two widths coincide.
n_inputs = classifier.support_vectors_.shape[1]
eval_sets = [
    (X_test, list(X.columns)),
    (X_test_pca, [f'PC{i + 1}' for i in range(X_test_pca.shape[1])]),
    (X_test_fs, list(selected_features)),
]
matches = [(matrix, labels) for matrix, labels in eval_sets if matrix.shape[1] == n_inputs]
if not matches:
    raise ValueError(
        f"No test matrix with {n_inputs} columns matches the fitted classifier"
    )
X_eval, feature_labels = matches[0]

result = permutation_importance(
    classifier, X_eval, y_test, n_repeats=5, random_state=42, n_jobs=2
)

importances_mean_series = pd.Series(result.importances_mean, index=feature_labels)

plt.figure(figsize=(12, 6))
importances_mean_series.plot(kind='bar')
plt.ylabel('Importancia')
# Title fixed: the model being inspected is the SVC, not a random forest.
plt.title('Importancia de las características del SVC a partir de la permutación')
plt.xticks(rotation=45, ha='right')
plt.show()

## Evaluación

### Matriz de confusión

In [None]:
# Predict on the representation the classifier was fitted on. It is trained on
# X_train_pca, so the test set must be X_test_pca; predicting on X_test_fs
# raised a feature-count mismatch. Use X_test or X_test_fs only when the
# classifier was fitted on X_train or X_train_fs respectively.
y_pred = classifier.predict(X_test_pca)

cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

# Results recorded by the author:
# 0.9780040807026266 || [[377074     70] [  8425    639]] without PCA
# 0.9765307813406248 || [[377144      0] [  9064      0]] with PCA (2)
# (no result recorded)                                    with PCA (4)

### K-Fold

In [13]:
# 5-fold cross-validation on the full standardised training features.
# Swap X_train for X_train_pca or X_train_fs to validate the other pipelines.
accuracies = cross_val_score(classifier, X_train, y_train, cv=5)

mean_pct = accuracies.mean() * 100
std_pct = accuracies.std() * 100
print("Accuracy: {:.2f} %".format(mean_pct))
print("Standard Deviation: {:.2f} %".format(std_pct))

## Grid Search

### Estimación de tiempos

In [None]:
# Hyper-parameter grids: GridSearchCV explores each dict independently.
parameters = [{'C': [0.25, 0.5, 0.75, 1], 'kernel': ['poly']},
              {'C': [0.25, 0.5, 0.75, 1], 'kernel': ['rbf']},
              {'C': [0.25, 0.5, 0.75, 1], 'kernel': ['sigmoid']}]

processors = 32  # parallel workers assumed for the estimate
cv = 5           # number of cross-validation folds

# Number of candidate settings = sum over grids of the product of the option
# counts in that grid. `parameters` is a list, so the former
# `parameters.values()` call raised AttributeError.
combos = 0
for grid in parameters:
    grid_size = 1
    for options in grid.values():
        grid_size *= len(options)
    combos += grid_size

# Rough wall-clock estimate: fits per worker times the duration of the single
# fit timed earlier (fincl - inicl).
num_models = combos * cv / processors
seconds = num_models * (fincl-inicl)
minutes = seconds / 60
hours = minutes / 60

print("{:.6f}".format(hours), "| {:.6f}".format(minutes), "| {:.6f}".format(seconds))

In [None]:
# Exhaustive search over `parameters` with 5-fold CV, using every available core.
grid_search = GridSearchCV(
    estimator=classifier,
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

print(grid_search.best_estimator_)

# Evaluate the refitted best estimator on the held-out test split.
y_pred_gs = grid_search.predict(X_test)
# classification_report returns a multi-line string; print it so the table is
# rendered instead of a repr with literal "\n" escapes.
print(classification_report(y_test, y_pred_gs))