In [4]:
### Load libraries ###

# inline plotting (static figures rendered inside the notebook, as SVG)
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

# plotting libraries
import seaborn as sns
import matplotlib.pyplot as plt

# Data management libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Machine learning libraries
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import sklearn.mixture as mixture

# Other
import joblib


# Load data
# NOTE: `df` is an alias of `eolica` (same object, not a copy); both names are
# kept because the following cell filters `eolica` directly.
eolica = pd.read_csv("eolica.csv")
df = eolica

numeric_inputs = ['Tmax','Tmin','Tmed','Rmed','Vmax','Pmed_00_24','Pmed_00_06','Pmed_06_12', 'Pmed_12_18','Pmed_18_24']
X = df[numeric_inputs]
Y = df[['gen']]

# Fixed seed for the stochastic estimators below. GaussianMixture initialises
# its components randomly, so without a seed the persisted model (and every
# cluster label derived from it) would differ from one run to the next.
RANDOM_STATE = 0

# Preprocessing the values to perform PCA: standardise every input column
# (zero mean, unit variance) so that no variable dominates because of its scale.
numeric_features = X.select_dtypes(include=['int64','float64']).columns.values.tolist()
scaler = StandardScaler()
X_transformed = scaler.fit_transform(X=X)

## PCA -----------------------------------------------------------
# 5 components; per the original author's note these retain ~95% of the
# variance (check pca.explained_variance_rat_.cumsum() to confirm).
pca = PCA(n_components=5, random_state=RANDOM_STATE)
X_pca = pca.fit_transform(X_transformed)

## GMM -----------------------------------------------------------
# Gaussian mixture with 6 components fitted on the PCA scores.
gmm = mixture.GaussianMixture(n_components=6,
                              covariance_type='full',
                              random_state=RANDOM_STATE)
gmm.fit(X_pca)

# Persist the three fitted objects. To score new data they must be applied in
# the same order they were fitted: scaler -> pca -> gmm.

# Save PCA model
joblib.dump(pca, 'pca_model.pkl')

# Save GMM model
joblib.dump(gmm, 'gmm_model.pkl')

#Save scaler
joblib.dump(scaler, 'scaler_model.pkl')

['scaler_model.pkl']

In [5]:
### Load libraries ###
import pandas as pd

# plotting libraries
import matplotlib.pyplot as plt

# Data management libraries
import numpy as np # linear algebra

# Machine learning libraries
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error

# others
from mltools.regression_tools import LinearRegressor
import math
from sklearn.cross_decomposition import PLSRegression

# Predictor columns and target column used by the PLS regression models.
INPUTS = ['Tmax', 'Tmin', 'Tmed', 'Rmed', 'Vmax', 'Pmed_00_24', 'Pmed_00_06', 'Pmed_06_12', 'Pmed_12_18', 'Pmed_18_24']
OUTPUT = 'gen'
errores = pd.DataFrame()

# Subset "lluviosos" (Spanish for "rainy"): Galicia rows with Pmed_00_24 >= 1.
# `eolica` is the DataFrame loaded in the previous cell.
lluviosos = eolica[(eolica.ccaa == "Galicia") & (eolica.Pmed_00_24 >= 1)]
X = lluviosos[INPUTS]
y = lluviosos[OUTPUT]

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,  #percentage of test data
                                                    random_state=0) #seed for replication
## Create dataset to store model predictions
# Copies of the train/test inputs with the target attached.

dfTR_eval = X_train.copy()
dfTR_eval[OUTPUT] = y_train
dfTS_eval = X_test.copy()
dfTS_eval[OUTPUT] = y_test

## Inputs of the model. Change accordingly to perform variable selection
# 'number' selects every numeric dtype. Listing only int64/float64 would leave
# out e.g. int32/float32 columns, and the ColumnTransformer below silently drops
# any column that is not assigned to a transformer.
INPUTS_LR_NUM = X_train.select_dtypes(include='number').columns.tolist()
INPUTS_LR_CAT = X_train.select_dtypes(include=['category']).columns.tolist()
INPUTS_LR = INPUTS_LR_NUM + INPUTS_LR_CAT

# Prepare the numeric variables by standardising them (no imputation is done)
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Prepare the categorical variables by one-hot encoding the categories;
# categories unseen during fit are ignored at transform time
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Create a preprocessor to perform the steps defined above
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, INPUTS_LR_NUM),
        ('cat', categorical_transformer, INPUTS_LR_CAT)
        ])

# Grid of candidate numbers of PLS components
param = {'PLSR_model__n_components': [1,2,3,4,5,6,7,8]}
pipe = Pipeline([
            ('preprocessor', preprocessor),
            ('PLSR_model', PLSRegression())
        ])
# We use Grid Search Cross Validation to find the best parameter for the model in the grid defined
nFolds = 10
PLSR_fit_ll = GridSearchCV(estimator=pipe, # Structure of the model to use
                    param_grid=param, # Defined grid to search in
                    n_jobs=-1, # Number of cores to use (parallelize)
                    scoring='neg_mean_squared_error', # Negated MSE (not RMSE); higher is better. https://scikit-learn.org/stable/modules/model_evaluation.html
                    cv=nFolds) # Number of Folds
PLSR_fit_ll.fit(X_train[INPUTS_LR], y_train) # Search in grid

# Save the model (the whole fitted GridSearchCV object, refit best pipeline included)
joblib.dump(PLSR_fit_ll, 'PLSR_fit_ll.pkl')

# Subset "No_lluviosos" (Spanish for "not rainy"): Galicia rows with Pmed_00_24 < 1.
# NOTE: from here on X, y, the train/test split, dfTR_eval/dfTS_eval and the
# pipeline building blocks are rebound, replacing the values created for the
# `lluviosos` model earlier in this cell.
No_lluviosos = eolica[(eolica.ccaa == "Galicia") & (eolica.Pmed_00_24 < 1)]
X = No_lluviosos[INPUTS]
y = No_lluviosos[OUTPUT]

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,  #percentage of test data
                                                    random_state=0) #seed for replication
## Create dataset to store model predictions
# Copies of the train/test inputs with the target attached.

dfTR_eval = X_train.copy()
dfTR_eval[OUTPUT] = y_train
dfTS_eval = X_test.copy()
dfTS_eval[OUTPUT] = y_test

## Inputs of the model. Change accordingly to perform variable selection
# 'number' selects every numeric dtype. Listing only int64/float64 would leave
# out e.g. int32/float32 columns, and the ColumnTransformer below silently drops
# any column that is not assigned to a transformer.
INPUTS_LR_NUM = X_train.select_dtypes(include='number').columns.tolist()
INPUTS_LR_CAT = X_train.select_dtypes(include=['category']).columns.tolist()
INPUTS_LR = INPUTS_LR_NUM + INPUTS_LR_CAT

# Prepare the numeric variables by standardising them (no imputation is done)
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Prepare the categorical variables by one-hot encoding the categories;
# categories unseen during fit are ignored at transform time
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Create a preprocessor to perform the steps defined above
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, INPUTS_LR_NUM),
        ('cat', categorical_transformer, INPUTS_LR_CAT)
        ])

# Grid of candidate numbers of PLS components
param = {'PLSR_model__n_components': [1,2,3,4,5,6,7,8]}
pipe = Pipeline([
            ('preprocessor', preprocessor),
            ('PLSR_model', PLSRegression())
        ])
# We use Grid Search Cross Validation to find the best parameter for the model in the grid defined
nFolds = 10
PLSR_fit_nll = GridSearchCV(estimator=pipe, # Structure of the model to use
                    param_grid=param, # Defined grid to search in
                    n_jobs=-1, # Number of cores to use (parallelize)
                    scoring='neg_mean_squared_error', # Negated MSE (not RMSE); higher is better. https://scikit-learn.org/stable/modules/model_evaluation.html
                    cv=nFolds) # Number of Folds
PLSR_fit_nll.fit(X_train[INPUTS_LR], y_train) # Search in grid

# Save the model (the whole fitted GridSearchCV object, refit best pipeline included)
joblib.dump(PLSR_fit_nll, 'PLSR_fit_nll.pkl')



['PLSR_fit_nll.pkl']