In [37]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [38]:
data = pd.read_csv("wind_ava.csv")
#Esto quita todas las columnas que no terminan en .13
data = data.filter(regex='.*\.13|energy|datetime')
train, test = train_test_split(data, test_size=1099, shuffle=False)



In [39]:
quantile_75per_train = np.quantile(train['energy'], 0.75)
quantile_75per_test = np.quantile(test['energy'], 0.75)

train['energy'] = train['energy'].apply(lambda x: 1 if x > quantile_75per_train else 0)
test['energy'] = test['energy'].apply(lambda x: 1 if x > quantile_75per_test else 0)

X_train = train.drop(['datetime', 'energy'], axis='columns')
y_train = train['energy']

X_test = test.drop(['datetime', 'energy'], axis='columns')
y_test = test['energy']

# Elección del Modelo

In [52]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import NuSVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix


best_scores = {}
best_params = {}

## KNN

In [41]:
param_knn = {'knn__n_neighbors': np.arange(2,20,1),
              'knn__weights':  ['uniform','distance'],
              'knn__p': [1,2],
              }

pipeline_knn = Pipeline([('scaler', StandardScaler()), ('knn', KNeighborsClassifier())])
knn_model = GridSearchCV(pipeline_knn, param_knn, cv=TimeSeriesSplit(n_splits=3), scoring= 'f1')

knn_model.fit(X_train, y_train)

best_params['Params elegidos para knn con HPO'] = knn_model.best_params_
best_scores['Error para knn con HPO'] = knn_model.best_score_


print(f"Mejores parametros para KNN con HPO: {knn_model.best_params_}")
print(f"Error para KNN con HPO: {knn_model.best_score_}")

Mejores parametros para KNN con HPO: {'knn__n_neighbors': 11, 'knn__p': 1, 'knn__weights': 'uniform'}
Error para KNN con HPO: 0.6798475570431824


## Árboles de Decisión

In [42]:
tree_params = {'max_depth': np.arange(4,10,2),
               'min_samples_split':np.arange(10,42,2),
               'min_impurity_decrease':[0.0, 1.0],
              'class_weight':['balanced']}

pipeline_tree = DecisionTreeClassifier()
tree_model = GridSearchCV(pipeline_tree, tree_params, cv=TimeSeriesSplit(n_splits=3), scoring= 'f1')

tree_model.fit(X_train, y_train)

best_params['Params elegidos para arbol con HPO'] = tree_model.best_params_
best_scores['Error para arbol con HPO'] = tree_model.best_score_

print(f"Mejores parametros para arbol con HPO: {tree_model.best_params_}")
print(f"Error para arbol con HPO: {tree_model.best_score_}")

Mejores parametros para arbol con HPO: {'class_weight': 'balanced', 'max_depth': 6, 'min_impurity_decrease': 0.0, 'min_samples_split': 10}
Error para arbol con HPO: 0.7019022881090203


## Regresión Logística

In [43]:
logistic_reg_params = {'logistic_reg__penalty':["l2"],
                'logistic_reg__C':np.arange(0.1,3,0.5),
                'logistic_reg__class_weight': ["balanced"],
                'logistic_reg__max_iter': [5000, 10000, 20000, 30000],
                'logistic_reg__solver': ["lbfgs","newton-cholesky"]}
pipeline_logistic_reg = Pipeline([('scaler', StandardScaler()), ('logistic_reg', LogisticRegression())])
logistic_reg_model = GridSearchCV(pipeline_logistic_reg, logistic_reg_params, cv=TimeSeriesSplit(n_splits=3), scoring= 'f1')

logistic_reg_model.fit(X_train, y_train)

best_params['Params elegidos para regresión logística con HPO'] = logistic_reg_model.best_params_
best_scores['Error para regresión logística con HPO'] = logistic_reg_model.best_score_

print(f"Mejores parametros para regresión logística con HPO: {logistic_reg_model.best_params_}")
print(f"Error para regresión logística con HPO: {logistic_reg_model.best_score_}")

Mejores parametros para regresión logística con HPO: {'logistic_reg__C': 1.1, 'logistic_reg__class_weight': 'balanced', 'logistic_reg__max_iter': 5000, 'logistic_reg__penalty': 'l2', 'logistic_reg__solver': 'newton-cholesky'}
Error para regresión logística con HPO: 0.5451683802560995


## Máquinas de Vector Soporte

In [44]:
svm_params = {'svm__kernel': ["linear","poly","rbf","sigmoid"],
            'svm__degree': np.arange(1,6,1),
            'svm__class_weight':['balanced'],
            'svm__nu': np.arange(0.1,0.5,0.1)}

pipeline_svm = Pipeline([('scaler', StandardScaler()), ('svm', NuSVC())])
svm_model = GridSearchCV(pipeline_svm, svm_params, cv=TimeSeriesSplit(n_splits=3), scoring= 'f1')

svm_model.fit(X_train, np.ravel(y_train))

best_params['Params elegidos para SVM con HPO'] = svm_model.best_params_
best_scores['Error para SVM con HPO'] = svm_model.best_score_

print(f"Mejores parametros para SVM con HPO: {svm_model.best_params_}")
print(f"Error para SVM con HPO: {svm_model.best_score_}")

Mejores parametros para SVM con HPO: {'svm__class_weight': 'balanced', 'svm__degree': 2, 'svm__kernel': 'poly', 'svm__nu': 0.30000000000000004}
Error para SVM con HPO: 0.7075713886675891


In [45]:
print("Mejores parametros elegidos para cada modelo:")
for modelo, param in best_params.items():
    print(f"\t{modelo}: {param}")

print("Best Score de cada modelo:")
for modelo, score in best_scores.items():
    print(f"\t{modelo}: {score}")

Mejores parametros elegidos para cada modelo:
	Params elegidos para knn con HPO: {'knn__n_neighbors': 11, 'knn__p': 1, 'knn__weights': 'uniform'}
	Params elegidos para arbol con HPO: {'class_weight': 'balanced', 'max_depth': 6, 'min_impurity_decrease': 0.0, 'min_samples_split': 10}
	Params elegidos para regresión logística con HPO: {'logistic_reg__C': 1.1, 'logistic_reg__class_weight': 'balanced', 'logistic_reg__max_iter': 5000, 'logistic_reg__penalty': 'l2', 'logistic_reg__solver': 'newton-cholesky'}
	Params elegidos para SVM con HPO: {'svm__class_weight': 'balanced', 'svm__degree': 2, 'svm__kernel': 'poly', 'svm__nu': 0.30000000000000004}
Best Score de cada modelo:
	Error para knn con HPO: 0.6798475570431824
	Error para arbol con HPO: 0.7019022881090203
	Error para regresión logística con HPO: 0.5451683802560995
	Error para SVM con HPO: 0.7075713886675891


# Evaluación del Modelo

## Evaluación

In [58]:
y_pred=svm_model.predict(X_test)
conf_mat = confusion_matrix(y_test, y_pred)

outer_score =f1_score(y_test, y_pred)
#precision, recall, fbeta_score, support = precision_recall_fscore_support(y_test, y_pred)
print(conf_mat)
print(outer_score)
#print(precision, recall, fbeta_score)



[[779  45]
 [113 162]]
0.6721991701244814
