In [4]:
!pip install pyod
!pip install dataheroes

# !pip install preprocess

Collecting pyod
  Downloading pyod-1.1.1.tar.gz (159 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.4/159.4 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: pyod
  Building wheel for pyod (setup.py) ... [?25ldone
[?25h  Created wheel for pyod: filename=pyod-1.1.1-py3-none-any.whl size=190078 sha256=af1b8ea91cdaee04ba39fe8eff3e33930afbe2f9bc90fdcee6ed92058b5cfda4
  Stored in directory: /root/.cache/pip/wheels/a3/42/d7/48a53ffc1466bd63932f28583c64ebf442114db14a0bfa8c95
Successfully built pyod
Installing collected packages: pyod
Successfully installed pyod-1.1.1
Collecting dataheroes
  Downloading dataheroes-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting licensing>=0.31 (from datahe

In [5]:
import numpy as np
import umap
from mlxtend.plotting.pca_correlation_graph import corr2_coeff
from pyod.models.ecod import ECOD
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.impute import SimpleImputer
from sklearn.linear_model import BayesianRidge
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.feature_selection import f_regression, SelectKBest, chi2, VarianceThreshold
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor, AdaBoostRegressor, StackingRegressor, RandomForestRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import RidgeCV, Lasso, ElasticNet, LinearRegression
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.impute import KNNImputer
from sklearn.svm import LinearSVR, SVR
from xgboost import XGBRegressor
# from preprocess import preprocess
from sklearn.metrics import r2_score
from dataheroes import CoresetTreeServiceDTC


In [47]:
##################### PRE-PROCESSSS ##################


def preprocess(X_train: np.array, y_train: np.array, X_test: np.array):
    X_train, X_test = impute_mv(X_train, X_test)
    print("Done with imputation")
    X_train, X_test = scale_data(X_train, X_test)
    print("Done with Scaling")

    # TODO
    X_train, X_test = detect_remove_outliers(X_train,y_train, X_test)
    print("Done with outliers")

    X_train, X_test = select_features(X_train, y_train, X_test)
    print("Done with Feature selection")

#     X_train, X_test = reduce_dim(X_train, X_test)
#     print("Done with reducing dim")

    return X_train, y_train, X_test


def reduce_dim(X_train, X_test, method: str = 'PCA'):
    if method == 'PCA':
        reducer = PCA(n_components='mle', svd_solver='auto')

    elif method == 'UMAP':
        reducer = umap.UMAP()

    else:
        return X_train, X_test

    X_train = reducer.fit_transform(X_train)
    X_test = reducer.transform(X_test)
    return X_train, X_test


def select_features(X_train: np.array, y_train: np.array, X_test: np.array):
    X_train, X_test = remove_correlated(X_train, X_test)

    # # Chi
    # f_p_values = chi2(X_train, y_train)
    # print(f_p_values)

    # Select k best
    fs = SelectKBest(score_func=f_regression, k=175)

    X_train = fs.fit_transform(X_train, y_train.ravel())
    X_test = fs.transform(X_test)
    return X_train, X_test


def remove_correlated(X_train: np.array, X_test: np.array):
    # Constant features
    var_threshold = VarianceThreshold(threshold=0)  # threshold = 0 for constant
    var_threshold.fit_transform(X_train)
    var_threshold.transform(X_test)

    # Correlated
    cor = corr2_coeff(X_train.T, X_train.T)
    p = np.argwhere(np.triu(np.isclose(cor, 1), 1))
    X_train = np.delete(X_train, p[:, 1], axis=1)
    X_test = np.delete(X_test, p[:, 1], axis=1)
    return X_train, X_test


def scale_data(X_train: np.array, X_test: np.array, method: str = 'min_max'):
    if method == 'robust':
        transformer = RobustScaler()
    elif method == 'min_max':
        transformer = MinMaxScaler()
    elif method == 'NONE':
        return X_train, X_test
    else:
        raise Exception(f"Scale: {method} is not implemented")

    X_train = transformer.fit_transform(X_train)
    X_test = transformer.transform(X_test)
    return X_train, X_test


def impute_mv(X_train: np.array, X_test: np.array, method: str = 'mean'):
    if method == 'median':
        imp = SimpleImputer(missing_values=np.nan, strategy='median')
    elif method == 'mean':
        imp = SimpleImputer(missing_values=np.nan,strategy='mean')
    elif method == 'KNN':
        K=3
        imp = KNNImputer(n_neighbors=K)
    elif method == 'iterative': #aka mice
        imp = IterativeImputer(estimator=BayesianRidge(), n_nearest_features=None, imputation_order='ascending')

    else:
        raise Exception(f"Impute: {method} is not implemented")

    X_train = imp.fit_transform(X_train)
    X_test = imp.fit_transform(X_test)

    return X_train, X_test


def detect_remove_outliers(X_train: np.array, y_train: np.array, X_test: np.array):
    # TODO
    train_pred_indices, test_pred = detect_outlier_obs(X_train, y_train, X_test)
    #TODO:: UPDATE TO GET THE SPECIFIC INDICES BACK
    X_train = X_train[train_pred_indices]
#     X_test = X_test[train_pred]
    return X_train, X_test


def detect_outlier_obs(X_train: np.array, y_train: np.array, X_test: np.array, method: str = 'coresets'):
    train_pred, test_pred = [], []
    if method == 'ECOD':
        for i in range(X_train.shape[1]):
            ecod = ECOD(contamination=0.05)
            ecod.fit(X_train[:, i].reshape(-1, 1))
            y_train_pred = np.array(ecod.labels_) == 0
            y_test_pred = np.array(ecod.predict(X_test[:, i].reshape(-1, 1))) == 0
            train_pred.append(y_train_pred)
            test_pred.append(y_test_pred)

    elif method == 'isolation_forest':
        for i in range(X_train.shape[1]):
            clf = IsolationForest(n_estimators=150, max_samples='auto', contamination=float(0.1))
            y_train_pred = np.array(clf.fit_predict(X_train[:, i].reshape(-1, 1))) == 1
            y_test_pred = np.array(clf.predict(X_test[:, i].reshape(-1, 1))) == 1
            train_pred.append(y_train_pred)
            test_pred.append(y_test_pred)

#             print(sum([t == -1 for t iisolation_forestn y_test_pred]))
    elif method=="coresets":
 
        tree = CoresetTreeServiceDTC(optimized_for = 'cleaning')
        tree = tree.build(X=X_train,y=y_train, chunk_size=-1)
        result = tree.get_cleaning_samples(20)
        print(result)
#         treeCoreset=tree.get_coreset(level=y_train.shape[0])
        tree.remove_samples(result['idx'])
#         print(tree.auto_preprocessing())
#         treeCoreset=tree.get_coreset(level=X_train.shape[0])
        res = tree.get_cleaning_samples(1212)
#         indices,xTrain,yTrain=treeCoreset['data']
        print(res['idx'].shape)
        train_pred.append(res['idx'])
#         print(xTrain.shape)
#         print(yTrain.shape)

    else:
        raise Exception(f"Detect: {method} is not implemented")
    print(train_pred)
    return train_pred, test_pred

In [48]:
# test
def read_data(X_train_path, y_train_path, X_test_path):
    X_train = np.genfromtxt(X_train_path, delimiter=",")
    y_train = np.genfromtxt(y_train_path, delimiter=",")
    X_test = np.genfromtxt(X_test_path, delimiter=",")
    return X_train, y_train, X_test

xTrainPath="/kaggle/input/aml-task1-allfiles/X_train.csv"
yTrainPath="/kaggle/input/aml-task1-allfiles/y_train.csv"
xTestPath="/kaggle/input/aml-task1-allfiles/X_test.csv"
X_train, y_train, X_test = read_data(X_train_path=xTrainPath,
                                     y_train_path=yTrainPath,
                                     X_test_path=xTestPath)
ids_train, ids_test = X_train[1:, 0], X_test[1:, 0].astype(int)
X_train, y_train, X_test = X_train[1:, 1:], y_train[1:, 1:].ravel(), X_test[1:, 1:]
X_train, y_train, X_test = preprocess(X_train, y_train, X_test)


Done with imputation
Done with Scaling
{'idx': array([ 514,  972,  219,  661,  396,  740,  257,  189,  304,  295,  967,
        973,  593,  492,  868,  522, 1152, 1098,  203,  167]), 'X': array([[0.53635734, 0.29711162, 0.45551874, ..., 0.51369909, 0.31247425,
        0.50399746],
       [0.42336666, 0.53088952, 0.62083674, ..., 0.64987716, 0.45522162,
        0.44872309],
       [0.41517125, 0.39780631, 0.33961371, ..., 0.36922128, 0.80351068,
        0.49648618],
       ...,
       [0.45500437, 0.37986422, 0.47377481, ..., 0.35594187, 0.16130835,
        0.46243459],
       [0.24742966, 0.31121444, 0.40946946, ..., 0.29630813, 0.5377157 ,
        0.51048071],
       [0.1839957 , 0.01986472, 0.        , ..., 0.13870795, 0.45843811,
        0.54971493]]), 'y': array([69., 65., 79., 78., 60., 60., 83., 58., 87., 52., 88., 49., 50.,
       48., 48., 45., 93., 94., 47., 44.]), 'importance': array([1.        , 1.        , 1.        , 1.        , 1.        ,
       0.98387393, 1.        , 1

ValueError: Found array with dim 3. VarianceThreshold expected <= 2.

In [6]:
###################### Train ######################
# import numpy as np
# from sklearn.ensemble import ExtraTreesRegressor, AdaBoostRegressor, StackingRegressor, RandomForestRegressor
# from sklearn.kernel_ridge import KernelRidge
# from sklearn.linear_model import RidgeCV, Lasso, ElasticNet, LinearRegression
# from sklearn.model_selection import KFold
# from sklearn.neighbors import KNeighborsRegressor
# from sklearn.svm import LinearSVR, SVR
# from xgboost import XGBRegressor
# from preprocess import preprocess
# from sklearn.metrics import r2_score

np.random.seed(42)


def read_data(X_train_path, y_train_path, X_test_path):
    X_train = np.genfromtxt(X_train_path, delimiter=",")
    y_train = np.genfromtxt(y_train_path, delimiter=",")
    X_test = np.genfromtxt(X_test_path, delimiter=",")
    return X_train, y_train, X_test


def get_model():
    estimators = [
        ('lr', RidgeCV()),
        ('lasso', Lasso(alpha=0.134694)),
        # ('enet', ElasticNet(alpha=0.201, l1_ratio=0.005)),
        # ('lm', LinearRegression()),
        # ('kernel_ridge', KernelRidge(alpha=2.0, kernel='polynomial', degree=1, coef0=0.005)),
        ('xgb', XGBRegressor(n_estimators=1000, max_depth=7, eta=0.1, colsample_bytree=0.8)),
        ('extratree', ExtraTreesRegressor(n_estimators=1000, random_state=0)),
        ('adaboost', AdaBoostRegressor(n_estimators=1000, random_state=0)),
        # ('svr_lin', SVR(kernel='linear')),
        # ['svr_rbf', SVR(kernel='rbf')],
        ('mn', KNeighborsRegressor()),
    ]
    model = StackingRegressor(estimators=estimators,
                           final_estimator=RandomForestRegressor(n_estimators=100, random_state=42))
    return model


def get_splits(X_train: np.array, nfolds: int = 10):
    kf = KFold(n_splits=nfolds, random_state=42, shuffle=True)
    return kf.split(X_train)


def main():
    methods={
        "impute_data": ['median','mean','iterative','KNN'],
        "outlier_detection":['ECOD','isolation_forest'],
        "feature_selection":[],
        "scale_data":["NONE"],
        "reduce_dim":["NONE"]
    }
    xTrainPath="/kaggle/input/aml-task1-allfiles/X_train.csv"
    yTrainPath="/kaggle/input/aml-task1-allfiles/y_train.csv"
    xTestPath="/kaggle/input/aml-task1-allfiles/X_test.csv"
    X_train, y_train, X_test = read_data(X_train_path=xTrainPath,
                                         y_train_path=yTrainPath,
                                         X_test_path=xTestPath)
    ids_train, ids_test = X_train[1:, 0], X_test[1:, 0].astype(int)
    X_train, y_train, X_test = X_train[1:, 1:], y_train[1:, 1:].ravel(), X_test[1:, 1:]
    X_train, y_train, X_test = preprocess(X_train, y_train, X_test)

    print("Preprocessed.")

    model = get_model()

    nfolds = 10
    splits = get_splits(X_train, nfolds)

    print("\nModels and folds.")

    r2 = 0
    for i, (train_index, test_index) in enumerate(splits):
        model.fit(X_train[train_index], y_train[train_index])
        pred = model.predict(X_train[test_index])
        score = r2_score(y_train[test_index], pred)
        r2 += score

        print(f"Fold {i} R2 score: {score}")

    print(f"\nAvg R2: {r2 / nfolds}")

    print("\nTrained.")

    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    res = np.column_stack((ids_test, pred))
    np.savetxt("data/out.csv", res, fmt=['%1i', '%1.4f'], delimiter=",", header="id,y", comments='')


if __name__ == "__main__":
    main()

Done with imputation
Done with Scaling
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

IndexError: boolean index did not match indexed array along dimension 0; dimension is 1212 but corresponding boolean dimension is 832