## Imports and read in as before

In [None]:
import numpy as np 
import pandas as pd 

from sklearn import preprocessing
import matplotlib.pyplot as plt 

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import IsolationForest
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, IterativeImputer


from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.impute import SimpleImputer

############# Load the training and test tables from disk #############
# Each CSV is parsed with pandas and immediately converted to a NumPy array.
train_features_raw = pd.read_csv("X_train.csv").to_numpy()
train_targets_raw = pd.read_csv("y_train.csv").to_numpy()
test_features_raw = pd.read_csv("X_test.csv").to_numpy()

# Column 0 of the feature tables is the sample id and is not a feature.
X_train = train_features_raw[:, 1:]
X_test = test_features_raw[:, 1:]
# Only column 1 of the target table is kept
# (presumably column 0 is the id again -- confirm against y_train.csv).
y_train = train_targets_raw[:, 1]

# Keep references to the untouched arrays; later cells rebind
# X_train / X_test to reduced versions.
X_train0, y_train0, X_test0 = X_train, y_train, X_test

## Removal of redundant features as before

In [None]:
# Drop features whose spread is negligible compared with their magnitude.
# For every column the NaN entries are ignored and the ratio
#     std / (|mean| + 1)
# is computed; the "+ 1" keeps the ratio finite for zero-mean columns.
normalized_std_threshold = 0.001

n_features = X_train.shape[1]
means_abs = np.empty(n_features)
normalized_std = np.empty(n_features)
for col_idx, column in enumerate(X_train.T):
    observed = column[~np.isnan(column)]  # non-missing values only
    means_abs[col_idx] = 1 + np.abs(np.mean(observed))
    normalized_std[col_idx] = np.std(observed) / means_abs[col_idx]

# True = keep the feature. Written as a negation so that a NaN ratio
# (e.g. a column without any observed value) is kept, not dropped.
mask = ~(normalized_std < normalized_std_threshold)

X_train, X_test = X_train[:, mask], X_test[:, mask]

# Scaling as before and then outlier rejection before initial imputation
### Imputation perhaps with MICE already

In [None]:
# Scaling: the scaler is fitted on the training set only and then applied
# to both the training and the test set.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


############ Imputation #############
# The imputation has to come first: the outlier detector below is fitted on
# the imputed matrix X_train_sc_imp0, which was previously referenced before
# it was assigned (NameError when the cell is run top to bottom).
#
# Alternative that was tried:
# imputer = KNNImputer(n_neighbors=50)
# X_train_sc_imp0 = imputer.fit_transform(X_train_scaled)
# X_test_sc_imp0 = imputer.transform(X_test_scaled)

Imputer = SimpleImputer(missing_values=np.nan, strategy='median')
X_train_sc_imp0 = Imputer.fit_transform(X_train_scaled)
# Re-use the medians learned on the training set; re-fitting on the test set
# would fill both sets with different statistics.
X_test_sc_imp0 = Imputer.transform(X_test_scaled)


############# Outlier detection: IsolationForest #############
outlier_det = IsolationForest(random_state=0, contamination=0.05, max_features=0.5).fit(X_train_sc_imp0)
anomaly_score = outlier_det.decision_function(X_train_sc_imp0)

### To determine the contamination parameter, first plot the anomaly score
### and then decide on the contamination ratio based on visual thresholding.
### HERE WE NEED TO VISUALLY DETERMINE A THRESHOLD:
anomaly_threshold = 0.0
plt.plot(anomaly_score)
plt.plot(np.ones(anomaly_score.shape)*anomaly_threshold)
plt.ylabel('anomaly score')
plt.show()

# Samples scoring below the threshold are treated as outliers and removed.
outliers = np.where(anomaly_score < anomaly_threshold)

# Row mask over the training samples (True = keep). It is read again by the
# second imputation step further down, so it must not be repurposed.
mask = np.ones(len(anomaly_score), dtype=bool)
mask[outliers[0]] = False
X_train_cl0 = X_train_sc_imp0[mask, :]
y_train_cl0 = y_train[mask]

print('Size of training set after initial preprocessing:', X_train_cl0.shape)



## Feature Selection

In [None]:
############# Feature selection using ExtraTreesRegressor #############
# Fit a tree ensemble on the cleaned training data and keep every feature
# whose importance exceeds a small threshold.
#
# NOTE: the former `max_samples=0.9` argument was dropped. It only takes
# effect when bootstrap=True, and bootstrap is left at its default (False)
# here, so it never influenced the fit; newer scikit-learn releases reject
# the combination with a ValueError. Use bootstrap=True together with
# max_samples if sub-sampling is actually wanted.
selector = ExtraTreesRegressor(n_estimators=50, random_state=0, min_samples_split=0.02)
selector = selector.fit(X_train_cl0, y_train_cl0)

importances = selector.feature_importances_

# Column indices (into the variance-filtered feature matrix) of the
# selected features.
support = np.where(importances > 1e-3)[0]
print('number of selected features:', len(support))

## Correlated Feature Removal

In [None]:

# corr_threshold = 0.9

# corr_matrix = np.abs(np.corrcoef(X_train_cl0[:,support], rowvar=False))
# mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
# corr_mat_triu = np.ma.masked_array(corr_matrix, mask=mask)

# # plt.imshow(corr_matrix, cmap='hot', interpolation='nearest')
# # plt.show()

# drop_features = [c for c in range(corr_mat_triu.shape[1]) if any(corr_mat_triu[:,c] > corr_threshold)]
# corr_mask = np.ones(corr_mat_triu.shape[1], dtype=bool)
# corr_mask[drop_features] = False
# support = support[corr_mask]

# Outlier Rejection and Data Imputation with relevant features
## Again with the order Outlier detection -> Imputation

In [None]:
# Imputation 2:
# Restrict the scaled (not yet imputed) data to the selected features and to
# the training rows that survived the first outlier rejection. `mask` is the
# row mask produced by that first pass.
# The imputation runs before the second outlier detection because the
# detector is fitted on X_train_sc_imp; previously X_train_sc_imp and
# y_train_sel were referenced before they were assigned.
if mask.shape[0] != X_train_scaled.shape[0]:
    raise ValueError(
        'mask must be the row mask of the first outlier rejection '
        '(one entry per training sample); re-run the first outlier cell.'
    )

X_train_sel = X_train_scaled[:, support]
X_train_sel = X_train_sel[mask, :]
X_test_sel = X_test_scaled[:, support]
y_train_sel = y_train[mask]

############# MICE imputation #############

imp_mean = IterativeImputer(random_state=0, n_nearest_features=None, sample_posterior=False, max_iter=100)
X_train_sc_imp = imp_mean.fit_transform(X_train_sel)
# Apply the imputation model learned on the training data; re-fitting on the
# test set would impute both sets with different models.
X_test_sc_imp = imp_mean.transform(X_test_sel)


# Outlier Rejection 2:
outlierdet = IsolationForest(random_state=0, contamination=0.05, max_features=0.5).fit(X_train_sc_imp)
anomaly_score2 = outlierdet.decision_function(X_train_sc_imp)

# Threshold chosen by inspecting the plot below.
anomaly_threshold2 = 0.02
plt.plot(anomaly_score2)
plt.plot(np.ones(anomaly_score2.shape)*anomaly_threshold2)
plt.ylabel('anomaly score after feature sel')
plt.show()

outliers = np.where(anomaly_score2 < anomaly_threshold2)

# Separate name for the second row mask so that `mask` (first pass) stays
# intact and this cell can be re-run without picking up a wrongly sized mask.
mask2 = np.ones(len(anomaly_score2), dtype=bool)
mask2[outliers[0]] = False
X_train_cl = X_train_sc_imp[mask2, :]
y_train_cl = y_train_sel[mask2]

print('Shape of the training set:', X_train_cl.shape)

# Regression
## Regression models and combination thereof
### Just implement it differently than the vectorized approach
### What is this split between decision trees and the other methods?

In [None]:
############# Shared cross-validation setup #############

cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)


def _cv_r2(model, label):
    """Cross-validate *model* on the cleaned training set and report R^2.

    Prints ``label: mean (std)`` of the R^2 scores over all folds of the
    module-level ``cv`` splitter and returns the array of fold scores.
    """
    scores = cross_val_score(model, X_train_cl, y_train_cl, scoring='r2', cv=cv, n_jobs=-1, error_score='raise')
    print('%s: %.3f (%.3f)' % (label, np.mean(scores), np.std(scores)))
    return scores


############# Regression using a decision tree #############
reg_model = DecisionTreeRegressor(random_state=0, min_samples_split=0.01)
# The score is R^2 (scoring='r2'); it used to be printed under the label 'MAE'.
n_scores = _cv_r2(reg_model, 'DecisionTree R2')

# Switches for the models below, in order:
# [ExtraTrees, HistGradientBoosting, SVR, AdaBoost, MLP].
# The submission cell averages the SVR and MLP predictions, so entries 2 and 4
# must stay enabled for it to run.
run_list = [0,0,1,0,1]

############# Regression using extremely randomised trees #############
if run_list[0]:
    rf_reg = ExtraTreesRegressor(n_estimators=100, random_state=0, min_samples_split=5, max_samples=None)
    n_scores = _cv_r2(rf_reg, 'ExtraTrees')
    regressor = rf_reg.fit(X_train_cl, y_train_cl)
    y_predicted_rf = regressor.predict(X_test_sc_imp)

############# Regression using HistGradientBoostingRegressor #############
if run_list[1]:
    HGB_reg = HistGradientBoostingRegressor(max_iter=200, learning_rate=0.1, l2_regularization=10)
    n_scores = _cv_r2(HGB_reg, 'HGB')
    regressor = HGB_reg.fit(X_train_cl, y_train_cl)
    y_predicted_hgb = regressor.predict(X_test_sc_imp)

############# Regression using SVR #############
if run_list[2]:
    # NOTE: per the scikit-learn docs, `degree` and `coef0` are not used by
    # the 'rbf' kernel; they are kept only so the call stays as it was.
    SVR_reg = SVR(kernel='rbf', degree=50, gamma=0.011, coef0=0.0, tol=0.1, C=100, epsilon=0.1, shrinking=True, cache_size=200)
    n_scores = _cv_r2(SVR_reg, 'SVR')
    regressor = SVR_reg.fit(X_train_cl, y_train_cl)
    y_predicted_svr = regressor.predict(X_test_sc_imp)

############# Regression using adaboost #############
# Only cross-validated; no test-set prediction is produced for this model.
if run_list[3]:
    ada_regr = AdaBoostRegressor(random_state=0, n_estimators=1000, loss='square', learning_rate=0.5)
    n_scores = _cv_r2(ada_regr, 'Ada')

############# Regression using MLP #############
if run_list[4]:
    # One hidden layer with 200 units; written as a proper 1-tuple.
    MLP_reg = MLPRegressor(random_state=10, max_iter=10000, activation='tanh', solver='sgd', alpha=10, hidden_layer_sizes=(200,))
    n_scores = _cv_r2(MLP_reg, 'MLP')
    regressor = MLP_reg.fit(X_train_cl, y_train_cl)
    y_predicted_mlp = regressor.predict(X_test_sc_imp)


# Evaluate Predictions and write to file


In [None]:
############# Export the final predictions as a csv file #############
# The submission is the plain average of the SVR and the MLP predictions,
# so both of those models must have been run beforehand.
y_predicted = 0.5 * (y_predicted_svr + y_predicted_mlp)

# Ids are simply the running index 0..n-1 of the predictions.
submission_columns = {
    'id': range(len(y_predicted)),
    'y': y_predicted,
}
y_predicted_df = pd.DataFrame(data=submission_columns)
y_predicted_df.to_csv("AMLES_submission_v6.csv", index=False)