<div align="center">
  <img src="http://sct.inf.utfsm.cl/wp-content/uploads/2020/04/logo_di.png" width="70%" style="min-width: 300px; max-width: 600px;">
  <h1>INF396 - Introducción a la Ciencia de Datos</h1>
  <h3 style="color: #555;">Tarea #2</h3>
  <p>
    <strong>Alessandro Bruno Cintolesi Rodriguez</strong><br>
    <em>202173541-0</em><br>
    <em>alessandro.cintolesi@usm.cl</em>
  </p>
</div>

<hr>

In [None]:
import h5py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import SGDRegressor, SGDClassifier
from sklearn.preprocessing import StandardScaler, RobustScaler, FunctionTransformer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from skimpy import skim
from statsmodels.graphics.tsaplots import plot_acf

In [None]:
def kfold_cv(pipeline, x, y, n_splits=10):
	"""Run shuffled K-fold cross-validation and print the mean MSE and model parameters.

	Parameters
	----------
	pipeline : object with ``fit``, ``predict`` and ``named_steps``
		Must contain a step named ``'Model'`` exposing ``coef_`` and
		``intercept_``. It is refit in place on every fold.
	x, y : array-like
		Features and targets, selected by row position. NumPy arrays and
		pandas objects (through ``.iloc``) are supported.
	n_splits : int, default 10
		Number of folds.

	Returns
	-------
	None
		Results are printed only.

	Notes
	-----
	The printed weights and bias come from the fit on the last fold only;
	they are not averaged across folds.
	"""
	def take_rows(data, index):
		# KFold yields positional indices; plain ``data[index]`` on a pandas
		# DataFrame would be interpreted as column labels, so use .iloc there.
		return data.iloc[index] if hasattr(data, "iloc") else data[index]

	kf = KFold(n_splits=n_splits, shuffle=True, random_state=123)
	mse_list = []

	for train_index, test_index in kf.split(x):
		x_train, x_test = take_rows(x, train_index), take_rows(x, test_index)
		y_train, y_test = take_rows(y, train_index), take_rows(y, test_index)

		pipeline.fit(x_train, y_train)
		y_pred = pipeline.predict(x_test)

		mse_list.append(mean_squared_error(y_test, y_pred))

	# After the loop the pipeline holds the model fitted on the last fold.
	model = pipeline.named_steps['Model']
	w = model.coef_
	b = model.intercept_

	avg_mse = np.mean(mse_list)
	print(f"Mean MSE (KFold) = {avg_mse:.10f}")
	print(f"Weights (w) = {w}")
	print(f"Bias (b) = {b}\n")

In [None]:
def generate_acf_plots(X, title="", flag=True, perc=0.1, lags=40, markersize=6):
	"""Draw autocorrelation (ACF) plots for the leading fraction of the data.

	Parameters
	----------
	X : array
		With ``flag=True``, a 2-D array whose columns are plotted one by one.
		With ``flag=False``, the whole array is plotted as a single series.
	title : str
		Suffix of the plot title; only used when ``flag`` is False.
	flag : bool
		Selects per-column mode (True) or single-series mode (False).
	perc : float
		Fraction of leading samples to keep (``round(n_rows * perc)``).
	lags, markersize
		Forwarded to ``plot_acf``.

	NaN values are removed from the retained samples before plotting.
	"""
	if flag:
		series = [
			(X[:, col], f"ACF Plot Feature {col + 1}")
			for col in range(X.shape[1])
		]
	else:
		series = [(X, f"ACF Plot {title}")]

	for values, plot_title in series:
		n_head = round(values.shape[0] * perc)
		head = values[:n_head]
		finite = head[~np.isnan(head)]

		plot_acf(x=finite, lags=lags, markersize=markersize, title=plot_title)
		plt.show()
		plt.close()


In [None]:
def generate_violin_plots(X, title="", flag=True):
	"""Draw violin plots of the non-NaN values in ``X``.

	Parameters
	----------
	X : array
		With ``flag=True``, a 2-D array; one plot per column, labelled
		"Feature 1", "Feature 2", ... (``title`` is ignored).
		With ``flag=False``, the whole array is drawn as one plot.
	title : str
		Label of the single plot when ``flag`` is False.
	flag : bool
		Selects per-column mode (True) or single-plot mode (False).
	"""
	if flag:
		panels = [(f"Feature {col + 1}", X[:, col]) for col in range(X.shape[1])]
	else:
		panels = [(title, X)]

	for label, values in panels:
		# Drop missing entries before handing the data to seaborn.
		frame = pd.DataFrame({label: values[~np.isnan(values)]})

		sns.violinplot(y=label, data=frame)
		plt.title(f"Violin Plot {label}")
		plt.ylabel("Value")
		plt.xlabel("")
		plt.show()
		plt.close()

# Load Datasets

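Pueden descargar el archivo de datos `datasets_train_651BDBFB.h5` desde este [link](https://usmcl-my.sharepoint.com/:f:/g/personal/camilo_nunezf_usm_cl/EjV-YI_uBatJh0gfB4pk6yIBNArCTaLq_MJOYtEcbId5rQ?e=U49wS9). El archivo debe quedar en el directorio de trabajo del notebook, ya que se carga mediante una ruta relativa.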

In [None]:
def cargar_datasets_desde_h5(archivo_h5):
    """Read every group of an HDF5 file into a nested dictionary.

    The file is opened read-only. For each top-level group, every entry is
    read in full with ``[:]``. The names of the groups and entries are
    printed as they are visited.

    Parameters
    ----------
    archivo_h5 : path to the HDF5 file.

    Returns
    -------
    dict
        ``{group_name: {entry_name: values}}``.
    """
    loaded = {}
    with h5py.File(archivo_h5, "r") as h5_file:
        for group_name in h5_file:
            print(f"\nDataSet: {group_name}")
            features = {}
            loaded[group_name] = features
            group = h5_file[group_name]
            for feature_name in group:
                print(f" - Feature: {feature_name}")
                # Slice with [:] so the values are read while the file is still open.
                features[feature_name] = group[feature_name][:]
    return loaded

# Load all training groups into a nested dict (group name -> entry name -> values).
# The path is relative, so the file must be in the current working directory.
datasets_train = cargar_datasets_desde_h5("datasets_train_651BDBFB.h5")

# Question 1

In [None]:
# Question 1 data from the 'mercado_financiero' group: features plus one
# regression target and one classification target.
X_train = datasets_train['mercado_financiero']['X']
y_train_reg = datasets_train['mercado_financiero']['y_reg']
y_train_class = datasets_train['mercado_financiero']['y_class']

In [None]:
# skimpy summary of the Question 1 feature matrix.
skim(pd.DataFrame(X_train)) 

In [None]:
# One violin plot per feature column (default flag=True).
generate_violin_plots(X=X_train)

In [None]:
# ACF plots for every feature column, then for each target as a single series
# (flag=False). With the default perc=0.1 only the first 10% of samples are used.
generate_acf_plots(X=X_train)
generate_acf_plots(X=y_train_reg, title="Return Reg", flag=False)
generate_acf_plots(X=y_train_class, title="Return Class", flag=False)

In [None]:
# Question 1 regression candidates: the same preprocessing (KNN imputation,
# then standardization) followed by an SGD regressor with a different loss.
# The final step must be named 'Model' because kfold_cv reads it by that name.
# random_state is fixed so SGD's data shuffling, and therefore the reported
# MSE and weights, are reproducible across runs.
reg_pipeline_squared = Pipeline([
	('Imputation', KNNImputer()),
	('Scaler', StandardScaler()),
	('Model', SGDRegressor(loss="squared_error", random_state=123))
])

reg_pipeline_epsilon = Pipeline([
	('Imputation', KNNImputer()),
	('Scaler', StandardScaler()),
	('Model', SGDRegressor(loss="epsilon_insensitive", random_state=123))
])

reg_pipeline_huber = Pipeline([
	('Imputation', KNNImputer()),
	('Scaler', StandardScaler()),
	('Model', SGDRegressor(loss="huber", random_state=123))
])

In [None]:
# Question 1 classification candidates: the same preprocessing (KNN imputation,
# then standardization) followed by an SGD classifier trained with a
# regression-style loss.
# The final step must be named 'Model' because kfold_cv reads it by that name.
# random_state is fixed so SGD's data shuffling, and therefore the reported
# metric and weights, are reproducible across runs.
cls_pipeline_squared = Pipeline([
	('Imputation', KNNImputer()),
	('Scaler', StandardScaler()),
	('Model', SGDClassifier(loss="squared_error", random_state=123))
])

cls_pipeline_epsilon = Pipeline([
	('Imputation', KNNImputer()),
	('Scaler', StandardScaler()),
	('Model', SGDClassifier(loss="epsilon_insensitive", random_state=123))
])

cls_pipeline_huber = Pipeline([
	('Imputation', KNNImputer()),
	('Scaler', StandardScaler()),
	('Model', SGDClassifier(loss="huber", random_state=123))
])

In [None]:
# Cross-validate the three regression losses on the regression target
# (10 folds by default); each call prints its mean MSE, weights and bias.
kfold_cv(reg_pipeline_squared, X_train, y_train_reg)
kfold_cv(reg_pipeline_epsilon, X_train, y_train_reg)
kfold_cv(reg_pipeline_huber, X_train, y_train_reg)

In [None]:
# Cross-validate the three classifiers on the classification target.
# NOTE(review): kfold_cv scores with mean_squared_error, so the printed metric
# is the MSE between predicted and true labels, not accuracy — confirm this is intended.
kfold_cv(cls_pipeline_squared, X_train, y_train_class)
kfold_cv(cls_pipeline_epsilon, X_train, y_train_class)
kfold_cv(cls_pipeline_huber, X_train, y_train_class)

# Question 2

In [None]:
# Question 2 data from the 'decaimiento_radioactivo' group.
# Note: this rebinds X_train, which held the Question 1 features until now.
X_train = datasets_train['decaimiento_radioactivo']['X']
y_train = datasets_train['decaimiento_radioactivo']['y']

In [None]:
def IQR(X, y, factor=1.5):
	"""Drop rows whose features lie outside the interquartile-range fences.

	For every column of ``X`` the fences are ``Q1 - factor * (Q3 - Q1)`` and
	``Q3 + factor * (Q3 - Q1)``. A row is kept only if all of its feature
	values fall inside the fences of their columns. The mask is built from
	``X`` only; ``y`` is filtered with the same mask so both stay aligned.

	Quartiles ignore NaN entries, so a few missing values no longer turn the
	fences into NaN and discard every row. Rows that contain a NaN feature
	are still dropped, because NaN never satisfies the fence comparisons.

	Parameters
	----------
	X : ndarray of shape (n_samples,) or (n_samples, n_features)
	y : ndarray with ``n_samples`` entries along its first axis
	factor : float, default 1.5
		Multiplier of the interquartile range used to place the fences.

	Returns
	-------
	tuple
		``(X_filtered, y_filtered)`` restricted to the retained rows.
	"""
	q1, q3 = np.nanpercentile(X, [25, 75], axis=0)
	spread = q3 - q1
	lower_bound = q1 - factor * spread
	upper_bound = q3 + factor * spread

	mask = (X >= lower_bound) & (X <= upper_bound)
	# A 1-D input has no feature axis to reduce over; its mask is already per row.
	valid_rows = mask if mask.ndim == 1 else np.all(mask, axis=1)

	return X[valid_rows], y[valid_rows]

In [None]:
# Question 2 candidates: SGD regressors combining two losses (squared error,
# Huber) with two penalties (L2, L1); no preprocessing steps.
# The step must be named 'Model' because kfold_cv reads it by that name.
# random_state is fixed so SGD's data shuffling, and therefore the reported
# MSE and weights, are reproducible across runs.
q2_pipeline_SE_l2 = Pipeline([
	('Model', SGDRegressor(loss="squared_error", penalty="l2", random_state=123))
])
q2_pipeline_SE_l1 = Pipeline([
	('Model', SGDRegressor(loss="squared_error", penalty="l1", random_state=123))
])
q2_pipeline_H_l2 = Pipeline([
	('Model', SGDRegressor(loss="huber", penalty="l2", random_state=123))
])
q2_pipeline_H_l1 = Pipeline([
	('Model', SGDRegressor(loss="huber", penalty="l1", random_state=123))
])

In [None]:
# skimpy summary of the Question 2 feature matrix.
skim(pd.DataFrame(X_train))

In [None]:
# Single violin plot of the Question 2 target (flag=False plots the array as one series).
generate_violin_plots(X=y_train, title="y", flag=False)

In [None]:
# Log-transformed target, log(1 + y).
# NOTE(review): np.log1p yields NaN for values below -1 — confirm the target's range.
y_train_log = np.log1p(y_train)
#generate_violin_plots(X=y_train_log, title="log(y)", flag=False)

In [None]:
# Remove rows with feature outliers, keeping each target aligned with its rows.
# IQR() builds its mask from X only, so both calls retain the same rows; they
# differ only in which target (raw or log) is filtered alongside.
X_train_iqr, y_train_iqr = IQR(X=X_train, y=y_train)
X_train_log_iqr, y_train_log_iqr = IQR(X=X_train, y=y_train_log)

In [None]:
# Squared-error loss + L1 penalty, raw target: all rows vs. IQR-filtered rows.
# The same pipeline object is refit by each call.
kfold_cv(q2_pipeline_SE_l1, x=X_train, y=y_train)
kfold_cv(q2_pipeline_SE_l1, x=X_train_iqr, y=y_train_iqr)

In [None]:
# Squared-error loss + L2 penalty, raw target: all rows vs. IQR-filtered rows.
kfold_cv(q2_pipeline_SE_l2, x=X_train, y=y_train)
kfold_cv(q2_pipeline_SE_l2, x=X_train_iqr, y=y_train_iqr)

In [None]:
# Huber loss + L1 penalty, raw target: all rows vs. IQR-filtered rows.
kfold_cv(q2_pipeline_H_l1, x=X_train, y=y_train)
kfold_cv(q2_pipeline_H_l1, x=X_train_iqr, y=y_train_iqr)

In [None]:
# Huber loss + L2 penalty, raw target: all rows vs. IQR-filtered rows.
kfold_cv(q2_pipeline_H_l2, x=X_train, y=y_train)
kfold_cv(q2_pipeline_H_l2, x=X_train_iqr, y=y_train_iqr)

In [None]:
# Squared-error loss + L1 penalty, log1p target: all rows vs. IQR-filtered rows.
# MSE here is measured on the log scale, so it is not directly comparable with the raw-target runs.
kfold_cv(q2_pipeline_SE_l1, x=X_train, y=y_train_log)
kfold_cv(q2_pipeline_SE_l1, x=X_train_log_iqr, y=y_train_log_iqr)

In [None]:
# Squared-error loss + L2 penalty, log1p target: all rows vs. IQR-filtered rows.
kfold_cv(q2_pipeline_SE_l2, x=X_train, y=y_train_log)
kfold_cv(q2_pipeline_SE_l2, x=X_train_log_iqr, y=y_train_log_iqr)

In [None]:
# Huber loss + L1 penalty, log1p target: all rows vs. IQR-filtered rows.
kfold_cv(q2_pipeline_H_l1, x=X_train, y=y_train_log)
kfold_cv(q2_pipeline_H_l1, x=X_train_log_iqr, y=y_train_log_iqr)

In [None]:
# Huber loss + L2 penalty, log1p target: all rows vs. IQR-filtered rows.
kfold_cv(q2_pipeline_H_l2, x=X_train, y=y_train_log)
kfold_cv(q2_pipeline_H_l2, x=X_train_log_iqr, y=y_train_log_iqr)

# Question 3

In [None]:
# Question 3 data from the 'dinamica_fluidos' group.
# Note: this rebinds X_train / y_train used by the previous question.
X_train = datasets_train['dinamica_fluidos']['X']
y_train = datasets_train['dinamica_fluidos']['y']

In [None]:
# skimpy summary of the Question 3 feature matrix.
skim(pd.DataFrame(X_train))

# Question 4

In [None]:
# Question 4 data from the 'fallas_maquinaria' group.
# Note: this rebinds X_train / y_train used by the previous question.
X_train = datasets_train['fallas_maquinaria']['X']
y_train = datasets_train['fallas_maquinaria']['y']

In [None]:
# skimpy summary of the Question 4 feature matrix.
skim(pd.DataFrame(X_train))

# Question 5

In [None]:
# Question 5 data from the 'fraude' group.
# Note: this rebinds X_train / y_train used by the previous question.
X_train = datasets_train['fraude']['X']
y_train = datasets_train['fraude']['y']

In [None]:
# skimpy summary of the Question 5 feature matrix.
skim(pd.DataFrame(X_train))