In [6]:
# This serves as a template which will guide you through the implementation of this task.  It is advised
# to first read the whole template and get a sense of the overall structure of the code before trying to fill in any of the TODO gaps
# First, we import necessary libraries:
import numpy as np
import pandas as pd
import xgboost
from xgboost import XGBRegressor


def iterativeImp(X, max_iter=20, tol=0.0001, initial_strategy='median', random_state=0):
    """
    Fills in the missing (NaN) entries of X with scikit-learn's IterativeImputer

    Parameters
    ----------
    X: matrix of floats, may contain NaN entries
    max_iter: int, maximum number of imputation rounds
    tol: float, tolerance of the imputer's stopping condition
    initial_strategy: str, strategy used for the first fill-in before iterating
    random_state: int or None, seed forwarded to the imputer

    Returns
    ----------
    matrix of floats, X with its missing entries imputed
    """
    # IterativeImputer is experimental: the enabling import has to run before the estimator import
    from sklearn.experimental import enable_iterative_imputer  # noqa: F401
    from sklearn.impute import IterativeImputer

    # The seed is set on the imputer that is actually fitted, not on a throw-away instance
    imp = IterativeImputer(
        max_iter=max_iter,
        tol=tol,
        initial_strategy=initial_strategy,
        random_state=random_state,
    )
    # Fit and impute in one pass instead of fitting first and transforming the same X again
    return imp.fit_transform(X)

def data_loading(train_path="train.csv", test_path="test.csv"):
    """
    This function loads the training and test data, one-hot encodes the 'season' column,
    imputes the missing feature values and drops the training rows whose label is missing

    Parameters
    ----------
    train_path: str, path of the training csv, must contain the 'season' and 'price_CHF' columns
    test_path: str, path of the test csv, expected to hold the same feature columns without 'price_CHF'

    Returns
    ----------
    X_train: matrix of floats, training input with features
    y_train: array of floats, training output with labels
    X_test: matrix of floats: dim = (100, ?), test input with features
    """
    from sklearn.preprocessing import OneHotEncoder

    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    # 1st: split the label off so that it never reaches the encoder or the imputer
    y = train_df['price_CHF'].to_numpy(dtype=float)
    train_features = train_df.drop(columns=['price_CHF'])
    n_train = len(train_features)

    # 2nd: stack train and test rows so both are encoded and imputed in exactly the same way
    # NOTE(review): the imputer therefore sees the test rows while fitting - confirm this is intended
    # Open questions left by the author: impute train and test separately? group by season?
    data = pd.concat([train_features, test_df])

    # 3rd: one-hot encode 'season' (the dummy columns come first), then impute every remaining NaN
    ohe = OneHotEncoder()
    data_season = ohe.fit_transform(data[['season']]).toarray()
    data_numeric = data.drop(columns=['season']).to_numpy()
    X = np.hstack([data_season, data_numeric])

    X = iterativeImp(X)

    # 4th: undo the stacking, the first n_train rows are the training rows
    train = X[:n_train, :]
    X_test = X[n_train:, :]

    # 5th: keep a training row only if its label is present (and no feature is still NaN)
    keep = ~(np.isnan(y) | np.isnan(train).any(axis=1))
    X_train = train[keep]
    y_train = y[keep]

    assert (X_train.shape[1] == X_test.shape[1]), "Invalid data shape:X_train X_test"
    assert  (X_train.shape[0] == y_train.shape[0]), "Invalid data shape:X_train y_train"
    assert (X_test.shape[0] == 100), "Invalid data shape:X_test"
    return X_train, y_train, X_test

def new_model(X_train,y_train):
    """
    This function runs a 5-fold cross-validated grid search over XGBRegressor hyperparameters,
    prints the winning estimator and its parameters and returns those parameters

    Parameters
    ----------
    X_train: matrix of floats, training input with features
    y_train: array of floats, training output

    Returns
    ----------
    best_params: dict, the parameter combination with the best mean cross-validated r2
    """
    from sklearn.model_selection import GridSearchCV

    search_space = {
        "n_estimators" : [100,200,500],
        "max_depth" : [3,6,9],
        "gamma" : [0.01, 0.1],
        "learning_rate" : [0.001, 0.01, 0.1, 1]
    }
    # Two metrics are recorded for every candidate; refit="r2" makes r2 the one that picks the winner
    grid_search = GridSearchCV(
        estimator=XGBRegressor(random_state=1002),
        param_grid=search_space,
        scoring=["r2", "neg_root_mean_squared_error"],
        refit="r2",
        cv=5,
        verbose=4,
    )
    # The target is passed as given; turning it into a column vector is not needed for fitting
    grid_search.fit(X_train, y_train)
    print(grid_search.best_estimator_)
    print(grid_search.best_params_)
    # Hand the result back so that callers can use it instead of copying it from the printout
    return grid_search.best_params_
    
def modeling_and_prediction(X_train, y_train, X_test, tune=False):
    """
    This function defines the model, fits training data and then does the prediction with the test data

    Parameters
    ----------
    X_train: matrix of floats, training input with features
    y_train: array of floats, training output
    X_test: matrix of floats: dim = (100, ?), test input with features
    tune: bool, if True the hyperparameter grid search of new_model() is run first and, when it
          returns a parameter dict, that dict overrides the defaults below. The search fits
          72 candidates x 5 folds, so it is off by default

    Returns
    ----------
    y_test: array of floats: dim = (100,), predictions on test set
    """
    # Presumably the best_params_ printed by an earlier grid-search run - re-check after changing the data
    params = {"n_estimators": 200, "max_depth": 3, "learning_rate": 0.1, "gamma": 0.01}

    if tune:
        searched = new_model(X_train, y_train)
        # new_model() may only print its findings and return None; keep the defaults in that case
        if searched:
            params.update(searched)

    model = XGBRegressor(random_state=1002, **params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(y_pred)
    assert y_pred.shape == (100,), "Invalid data shape"
    return y_pred

# Main function. You don't have to change this
if __name__ == "__main__":
    # Load, encode and impute the train/test matrices
    X_train, y_train, X_test = data_loading()
    # Fit the regressor on the training rows and predict the price for every test row
    y_pred = modeling_and_prediction(X_train, y_train, X_test)
    # Save the predictions as a single 'price_CHF' column, without the index, as required
    dt = pd.DataFrame(y_pred, columns=['price_CHF'])
    dt.to_csv('results.csv', index=False)
    print("\nResults file successfully generated!")



shape of data
(900, 11)
2700
<bound method NDFrame._add_numeric_operations.<locals>.max of 0      3
1      3
2      3
3      3
4      3
      ..
895    3
896    3
897    3
898    3
899    3
Length: 900, dtype: int64>
[-3.0068564  -2.858772   -2.918718   -2.7979836  -1.3434502  -1.0761555
  1.1553545  -0.8506837  -0.4288469   0.40301594  0.8011815   2.0647051
  3.1891823   3.366945    2.3130054   1.9745752   2.01099     1.4661626
  2.1034665   1.5077587   3.594775    3.8479419   3.3088171   2.8572178
  2.9606562   4.0249257   6.263088    8.056987    8.485607    9.186927
  8.82988     8.553162    8.157765    7.643731    7.869851    7.965272
  7.1436505   7.5689454   8.063204    7.496813    7.8920484   7.674539
  7.5197043   8.2170315   7.846694    7.948244    7.255376    7.9147577
  7.7807565   7.705129    8.12854     8.480243    8.460199    8.473308
  8.207515    9.090643    8.564186    8.030152    7.794072    7.1555595
  6.445611    6.251153    5.474255    5.074155    5.6037364   5.134