Importing Libraries

In [20]:
import os
import tensorflow as tf

# For data maniplution
import pandas as pd
import numpy as np

# for data visualization
import seaborn as sns
import matplotlib.pyplot as plt

# for dataset handling and MAE
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import FunctionTransformer, RobustScaler
from sklearn.base import clone
from sklearn.pipeline import make_pipeline

# importing models for training and testing
import lightgbm as lgbm
from lightgbm import *
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from catboost import CatBoostRegressor


In [21]:
# load train dataset to dataframe
data_train = pd.read_csv('train.csv').drop(['row_id', 'time_id'], axis=1)

In [39]:
# load test dataset to dataframe
data_test = pd.read_csv('test.csv').drop(['row_id', 'time_id'], axis=1)

In [None]:
# displaying the first 10 rows
data_train.head(10)

###Exploratory Data Analysis

In [None]:
# getting information about the train data
data_train.info()

In [None]:
# getting descriptive statistics about the train data
data_train.describe()

In [None]:
# Shape of the data: number of rows and columns
data_train.shape

In [None]:
# checking for null values
data_train.isna().sum()

In [None]:
# plotting the heatmap for the train dataset
plt.figure(figsize=(16,16))
sns.heatmap(data_train.corr(),annot=True,cmap='crest')
plt.title('Heatmap of the train dataset')
plt.show()

In [None]:
# visualize the time series data
plt.figure(figsize=(10, 6))
plt.plot(data_train['target'], label='Target')
plt.title('Time Series Data')
plt.legend()
plt.show()

In [22]:
# new dataframe with null values replaced by 0
train_df_0 = data_train.copy()
train_df_0.fillna(0, inplace=True)
train_df_0.head(10)

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target
0,0,0,0,3180602.69,1,0.999812,13380276.64,0.0,0.0,0.999812,60651.5,1.000026,8493.03,1.0,-3.029704
1,1,0,0,166603.91,-1,0.999896,1642214.25,0.0,0.0,0.999896,3233.04,1.00066,20605.09,1.0,-5.519986
2,2,0,0,302879.87,-1,0.999561,1819368.03,0.0,0.0,0.999403,37956.0,1.000298,18995.0,1.0,-8.38995
3,3,0,0,11917682.27,-1,1.000171,18389745.62,0.0,0.0,0.999999,2324.9,1.000214,479032.4,1.0,-4.0102
4,4,0,0,447549.96,-1,0.999532,17860614.95,0.0,0.0,0.999394,16485.54,1.000016,434.1,1.0,-7.349849
5,5,0,0,0.0,0,1.000635,13552875.92,0.0,0.0,0.999779,1962.72,1.000635,5647.65,1.0,6.779432
6,6,0,0,969969.4,1,1.000115,3647503.98,0.0,0.0,0.999506,6663.16,1.000283,3810.48,1.0,-2.499819
7,7,0,0,9412959.1,1,0.999818,21261245.87,0.0,0.0,0.999741,5139.2,1.00013,2570.6,1.0,-1.959801
8,8,0,0,2394875.85,1,0.999916,9473209.08,0.0,0.0,0.999022,52011.6,1.000041,2169.36,1.0,-5.970001
9,9,0,0,3039700.65,-1,1.000969,6248958.45,0.0,0.0,0.999354,6191.0,1.000646,6199.0,1.0,7.970333


In [23]:
# new dataframe with null values replaced by mean
train_df_mean = data_train.copy()
train_df_mean.fillna(train_df_mean.mean(), inplace=True)
train_df_mean.head(10)

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target
0,0,0,0,3180602.69,1,0.999812,13380276.64,1.001713,0.99966,0.999812,60651.5,1.000026,8493.03,1.0,-3.029704
1,1,0,0,166603.91,-1,0.999896,1642214.25,1.001713,0.99966,0.999896,3233.04,1.00066,20605.09,1.0,-5.519986
2,2,0,0,302879.87,-1,0.999561,1819368.03,1.001713,0.99966,0.999403,37956.0,1.000298,18995.0,1.0,-8.38995
3,3,0,0,11917682.27,-1,1.000171,18389745.62,1.001713,0.99966,0.999999,2324.9,1.000214,479032.4,1.0,-4.0102
4,4,0,0,447549.96,-1,0.999532,17860614.95,1.001713,0.99966,0.999394,16485.54,1.000016,434.1,1.0,-7.349849
5,5,0,0,0.0,0,1.000635,13552875.92,1.001713,0.99966,0.999779,1962.72,1.000635,5647.65,1.0,6.779432
6,6,0,0,969969.4,1,1.000115,3647503.98,1.001713,0.99966,0.999506,6663.16,1.000283,3810.48,1.0,-2.499819
7,7,0,0,9412959.1,1,0.999818,21261245.87,1.001713,0.99966,0.999741,5139.2,1.00013,2570.6,1.0,-1.959801
8,8,0,0,2394875.85,1,0.999916,9473209.08,1.001713,0.99966,0.999022,52011.6,1.000041,2169.36,1.0,-5.970001
9,9,0,0,3039700.65,-1,1.000969,6248958.45,1.001713,0.99966,0.999354,6191.0,1.000646,6199.0,1.0,7.970333


In [24]:
# new dataframe with null values replaced by first mode
train_df_mode = data_train.copy()

# selecting first mode if more than one
mode_values = train_df_mode.mode().iloc[0]
train_df_mode.fillna(mode_values, inplace=True)
train_df_mode.head(10)

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target
0,0,0,0,3180602.69,1,0.999812,13380276.64,1.0,1.0,0.999812,60651.5,1.000026,8493.03,1.0,-3.029704
1,1,0,0,166603.91,-1,0.999896,1642214.25,1.0,1.0,0.999896,3233.04,1.00066,20605.09,1.0,-5.519986
2,2,0,0,302879.87,-1,0.999561,1819368.03,1.0,1.0,0.999403,37956.0,1.000298,18995.0,1.0,-8.38995
3,3,0,0,11917682.27,-1,1.000171,18389745.62,1.0,1.0,0.999999,2324.9,1.000214,479032.4,1.0,-4.0102
4,4,0,0,447549.96,-1,0.999532,17860614.95,1.0,1.0,0.999394,16485.54,1.000016,434.1,1.0,-7.349849
5,5,0,0,0.0,0,1.000635,13552875.92,1.0,1.0,0.999779,1962.72,1.000635,5647.65,1.0,6.779432
6,6,0,0,969969.4,1,1.000115,3647503.98,1.0,1.0,0.999506,6663.16,1.000283,3810.48,1.0,-2.499819
7,7,0,0,9412959.1,1,0.999818,21261245.87,1.0,1.0,0.999741,5139.2,1.00013,2570.6,1.0,-1.959801
8,8,0,0,2394875.85,1,0.999916,9473209.08,1.0,1.0,0.999022,52011.6,1.000041,2169.36,1.0,-5.970001
9,9,0,0,3039700.65,-1,1.000969,6248958.45,1.0,1.0,0.999354,6191.0,1.000646,6199.0,1.0,7.970333


In [None]:
# checking outliers in the data
plt.figure(figsize=(6,6))
plt.title("Boxplot to detect outlier in the train data", fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
sns.boxplot(x=data_train['target'])
plt.show()

In [25]:
def find_outliers(dataframe, attribute):
    # determine the number of rows containing outliers
    # compute the 25th and 75th percentile value in target
    percentile25 = dataframe[attribute].quantile(0.25)
    percentile75 = dataframe[attribute].quantile(0.75)

    # compute the interquatile range in target
    iqr = percentile75 - percentile25

    # define upper limit and lower limit for non-outlier values
    upper_limit = percentile75 + (1.5 * iqr)
    lower_limit = percentile25 - (1.5 * iqr)
    print("Upper Limit:", upper_limit)
    print("Lower_Limit:", lower_limit)

    # identify the subset pf data containing outliers in target
    outliers = dataframe[(dataframe[attribute] > upper_limit) | (dataframe[attribute] < lower_limit)]

    # count how many rows in the data contain outliers in `target`
    print("Number of rows in the data containing outliers in ", attribute, ":", len(outliers))


In [27]:
print('For train_df_0: ')
find_outliers(train_df_0, "target")

print('\nFor train_df_mean: ')
find_outliers(train_df_mean, "target")

print('\nFor train_df_mode: ')
find_outliers(train_df_mode, "target")

For train_df_0: 
Upper Limit: 17.86351195
Lower_Limit: -18.013715649999998
Number of rows in the data containing outliers in  target : 292263

For train_df_mean: 
Upper Limit: 17.86351195
Lower_Limit: -18.013715649999998
Number of rows in the data containing outliers in  target : 292263

For train_df_mode: 
Upper Limit: 17.86351195
Lower_Limit: -18.013715649999998
Number of rows in the data containing outliers in  target : 292263


In [23]:
# handling outliers using log

train_df_0_log = train_df_0.copy()
train_df_0_log['target'] = np.log1p(pd.to_numeric(train_df_0_log['target'], errors='coerce'))

# train_df_mean_log = train_df_mean.copy()
# train_df_mean_log['target'] = np.log1p(pd.to_numeric(train_df_mean_log['target'], errors='coerce'))

# train_df_mode_log = train_df_mode.copy()
# train_df_mode_log['target'] = np.log1p(pd.to_numeric(train_df_mode_log['target'], errors='coerce'))

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [26]:
# creating a RobustScaler obj
scaler = RobustScaler()

# handling outliers using robust scaling
train_df_0_rob = train_df_0.copy()
train_df_0_rob['target'] = scaler.fit_transform(train_df_0_rob['target'].values.reshape(-1, 1))

# train_df_mean_rob = train_df_mean.copy()
# train_df_mean_rob['target'] = scaler.fit_transform(train_df_mean_rob['target'].values.reshape(-1, 1))

# train_df_mode_rob = train_df_mode.copy()
# train_df_mode_rob['target'] = scaler.fit_transform(train_df_mode_rob['target'].values.reshape(-1, 1))


In [11]:
# setting a seed value to ensure reproducibility across different runs
seed = 69

# creating a TimeSeriesSplit object with 10 splits. 
# TimeSeriesSplit is a cross-validator that provides train/test indices to split time series data.
tss = TimeSeriesSplit(10)

# setting the Python hash seed to the same value as the random seed
# ensuring reproducibility when using hash-based operations in Python.
os.environ['PYTHONHASHSEED'] = '69'

# setting the random seed for TensorFlow's Keras API to ensure reproducibility in neural network training. 
tf.keras.utils.set_random_seed(seed)

In [15]:
def feature_addition(x):
    """
    Function to add various features to the input DataFrame.

    Parameters:
    - x: Input DataFrame

    Returns:
    - DataFrame with additional features
    """
    # list of initial features
    list_of_features = ['seconds_in_bucket', 'imbalance_buy_sell_flag', 'imbalance_size', 'matched_size', 'bid_size', 'ask_size',
                        'reference_price','far_price', 'near_price', 'ask_price', 'bid_price', 'wap']
    
    # create a copy of the input DataFrame
    x_copy = x.copy()
    
    # finding imbalance ratios
    # measures the relative difference between the number of shares available for buying (bid) and selling (ask). 
    # it helps the model understand the balance between buyers and sellers in the order book.
    x_copy['imb_s1'] = x.eval('(bid_size - ask_size) / (bid_size + ask_size)')

    # captures the relative difference between the total unmatched shares and the shares that have found a match. 
    # this provides insights into the imbalance between supply and demand.
    x_copy['imb_s2'] = x.eval('(imbalance_size - matched_size) / (matched_size + imbalance_size)')
    
    # list of price-related features
    list_of_prices = ['reference_price','far_price', 'near_price', 'ask_price', 'bid_price', 'wap']
    
    # for each pair of prices the code calculates a ratio that expresses the imbalance between them. 
    # valuable for understanding how different price levels interact and influence each other.
    for i, a in enumerate(list_of_prices):
        for j, b in enumerate(list_of_prices):
            if i > j:
                # price differences as ratios
                x_copy[f'{a}_{b}_imb'] = x.eval(f'({a} - {b}) / ({a} + {b})')
                list_of_features.append(f'{a}_{b}_imb')

    # takes into account sets of three prices and calculates a ratio based on their maximum, minimum, and middle values. 
    # introduces a more sophisticated understanding of the relationships between multiple price levels.
    # in financial markets, understanding the dynamics between various price levels is crucial for predicting price movements and making informed trading decisions.               
    for i, a in enumerate(list_of_prices):
        for j, b in enumerate(list_of_prices):
            for k, c in enumerate(list_of_prices):
                if i > j and j > k:
                    # triple-wise price differences as ratios
                    max_ = x[[a, b, c]].max(axis=1)
                    min_ = x[[a, b, c]].min(axis=1)
                    mid_ = x[[a, b, c]].sum(axis=1) - min_ - max_
 
                    x_copy[f'{a}_{b}_{c}_imb2'] = (max_ - mid_) / (mid_ - min_)
                    list_of_features.append(f'{a}_{b}_{c}_imb2')
                    
    # additional Features
    x_copy['spread'] = x['ask_price'] - x['bid_price']
    x_copy['midpoint'] = 0.5 * (x['ask_price'] + x['bid_price'])
    x_copy['price_range'] = x['far_price'] - x['near_price']
    x_copy['wap_difference'] = x['wap'] - x['reference_price']
    
    # volatility Features
    x_copy['price_volatility'] = x['wap'].pct_change().rolling(window=10).std()
    x_copy['imbalance_volatility'] = x_copy['imbalance_size'].pct_change().rolling(window=10).std()
    x_copy['ask_size_mean'] = x['ask_size'].rolling(window=10).mean()
    x_copy['bid_size_mean'] = x['bid_size'].rolling(window=10).mean()

    # appending the newly created features into the list
    list_of_features.extend(['imb_s1', 'imb_s2', 'spread', 'midpoint', 'price_range', 'wap_difference', 'price_volatility', 'imbalance_volatility', 'ask_size_mean', 'bid_size_mean'])
 
    return x_copy[list_of_features]
 
FeatureAddition = FunctionTransformer(feature_addition)

In [None]:
def train_predict_impute(estimator, cv = tss, label = ''):
    
    X = data_train[~data_train.target.isna()].drop(['row_id', 'time_id'], axis = 1)
    y = X.pop('target')
    
    #initiate prediction arrays and score lists
    val_predictions = np.zeros((len(X)))
    #train_predictions = np.zeros((len(sample)))
    train_scores, val_scores = [], []
    
    #training model, predicting prognosis probability, and evaluating metrics
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        
        model = clone(estimator)
        
        #define train set
        X_train = X.iloc[train_idx]
        y_train = y.iloc[train_idx]
        
        #define validation set
        X_val = X.iloc[val_idx]
        y_val = y.iloc[val_idx]
        
        #train model
        model.fit(X_train, y_train)
        
        #make predictions
        train_preds = model.predict(X_train)
        val_preds = model.predict(X_val)
                  
        val_predictions[val_idx] += val_preds
        
        #evaluate model for a fold
        train_score = mean_absolute_error(y_train, train_preds)
        val_score = mean_absolute_error(y_val, val_preds)
        
        #append model score for a fold to list
        train_scores.append(train_score)
        val_scores.append(val_score)
    print('Imputed Train Dataframe')
    print(f'Val Score: {np.mean(val_scores):.5f} ± {np.std(val_scores):.5f} | Train Score: {np.mean(train_scores):.5f} ± {np.std(train_scores):.5f} | {label}')
    
    return val_scores, val_predictions

In [35]:
def train_predict_0(estimator, cv = tss, label = ''):
    
    X = train_df_0[~train_df_0.target.isna()]
    y = X.pop('target')
    
    #initiate prediction arrays and score lists
    val_predictions = np.zeros((len(X)))
    #train_predictions = np.zeros((len(sample)))
    train_scores, val_scores = [], []
    
    #training model, predicting prognosis probability, and evaluating metrics
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        
        model = clone(estimator)
        
        #define train set
        X_train = X.iloc[train_idx]
        y_train = y.iloc[train_idx]
        
        #define validation set
        X_val = X.iloc[val_idx]
        y_val = y.iloc[val_idx]
        
        #train model
        model.fit(X_train, y_train)
        
        #make predictions
        train_preds = model.predict(X_train)
        val_preds = model.predict(X_val)
                  
        val_predictions[val_idx] += val_preds
        
        #evaluate model for a fold
        train_score = mean_absolute_error(y_train, train_preds)
        val_score = mean_absolute_error(y_val, val_preds)
        
        #append model score for a fold to list
        train_scores.append(train_score)
        val_scores.append(val_score)
    print('Train Dataframe with NA replaced with 0')
    print(f'Val Score: {np.mean(val_scores):.5f} ± {np.std(val_scores):.5f} | Train Score: {np.mean(train_scores):.5f} ± {np.std(train_scores):.5f} | {label}')
    
    return val_scores, val_predictions

In [14]:
def train_predict_mean(estimator, cv = tss, label = ''):
    
    X = train_df_mean[~train_df_mean.target.isna()]
    y = X.pop('target')
    
    #initiate prediction arrays and score lists
    val_predictions = np.zeros((len(X)))
    #train_predictions = np.zeros((len(sample)))
    train_scores, val_scores = [], []
    
    #training model, predicting prognosis probability, and evaluating metrics
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        
        model = clone(estimator)
        
        #define train set
        X_train = X.iloc[train_idx]
        y_train = y.iloc[train_idx]
        
        #define validation set
        X_val = X.iloc[val_idx]
        y_val = y.iloc[val_idx]
        
        #train model
        model.fit(X_train, y_train)
        
        #make predictions
        train_preds = model.predict(X_train)
        val_preds = model.predict(X_val)
                  
        val_predictions[val_idx] += val_preds
        
        #evaluate model for a fold
        train_score = mean_absolute_error(y_train, train_preds)
        val_score = mean_absolute_error(y_val, val_preds)
        
        #append model score for a fold to list
        train_scores.append(train_score)
        val_scores.append(val_score)
    print('Train Dataframe with NA replaced with mean')
    print(f'Val Score: {np.mean(val_scores):.5f} ± {np.std(val_scores):.5f} | Train Score: {np.mean(train_scores):.5f} ± {np.std(train_scores):.5f} | {label}')
    
    return val_scores, val_predictions

In [18]:
def train_predict_mode(estimator, cv = tss, label = ''):
    
    X = train_df_mode[~train_df_mode.target.isna()]
    y = X.pop('target')
    
    #initiate prediction arrays and score lists
    val_predictions = np.zeros((len(X)))
    #train_predictions = np.zeros((len(sample)))
    train_scores, val_scores = [], []
    
    #training model, predicting prognosis probability, and evaluating metrics
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        
        model = clone(estimator)
        
        #define train set
        X_train = X.iloc[train_idx]
        y_train = y.iloc[train_idx]
        
        #define validation set
        X_val = X.iloc[val_idx]
        y_val = y.iloc[val_idx]
        
        #train model
        model.fit(X_train, y_train)
        
        #make predictions
        train_preds = model.predict(X_train)
        val_preds = model.predict(X_val)
                  
        val_predictions[val_idx] += val_preds
        
        #evaluate model for a fold
        train_score = mean_absolute_error(y_train, train_preds)
        val_score = mean_absolute_error(y_val, val_preds)
        
        #append model score for a fold to list
        train_scores.append(train_score)
        val_scores.append(val_score)
    print('Train Dataframe with NA replaced with mode')
    print(f'Val Score: {np.mean(val_scores):.5f} ± {np.std(val_scores):.5f} | Train Score: {np.mean(train_scores):.5f} ± {np.std(train_scores):.5f} | {label}')
    
    return val_scores, val_predictions

In [36]:
models = [
    # ('XGBoost', XGBRegressor(random_state = seed, objective = 'reg:absoluteerror', tree_method = 'gpu_hist', missing = np.nan)),
    ('LightGBM', LGBMRegressor(random_state = seed, objective = 'mae', device_type = 'gpu')),
    # ('CatBoost', CatBoostRegressor(random_state = seed, objective = 'MAE', verbose = 0))
]

for (label, model) in models:
    ans = train_predict_0(
        make_pipeline(
            FeatureAddition,
            model
        ),
        label = label
    )

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 12043
[LightGBM] [Info] Number of data points in the train set: 476180, number of used features: 49
[LightGBM] [Info] Using GPU Device: Intel(R) UHD Graphics, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 49 dense feature groups (23.61 MB) transferred to GPU in 0.036600 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score -0.060201
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 12043
[LightGBM] [Info] Number of data points in the train set: 952360, number of used features: 49
[LightGBM] [Info] Using GPU Device: Intel(R) UHD Graphics, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8


In [None]:
train_df_0_X = train_df_0[~train_df_0.target.isna()]
train_df_0_Y = train_df_0_X.pop('target')

In [174]:
model = make_pipeline(
    FeatureAddition,
    LGBMRegressor(random_state = seed, objective = 'mae', device_type = 'gpu',  n_estimators=250)
)

model.fit(train_df_0_X, train_df_0_Y)

sample_prediction = pd.read_csv('sample_submission.csv')
sample_prediction['target'] = model.predict(data_test)    

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 12043
[LightGBM] [Info] Number of data points in the train set: 5237980, number of used features: 49
[LightGBM] [Info] Using GPU Device: Intel(R) UHD Graphics, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 49 dense feature groups (259.76 MB) transferred to GPU in 0.288197 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score -0.060201


In [None]:
print(sample_prediction)

In [None]:
sample_prediction.isna().sum()

In [123]:
revealed_data = pd.read_csv('revealed_targets.csv')

In [None]:
revealed_data.isna().sum()

In [141]:
revealed_data_target = revealed_data['revealed_target']
revealed_data_target.head()

0    -2.310276
1   -12.850165
2    -0.439882
3     7.259846
4     4.780292
Name: revealed_target, dtype: float64

In [175]:
predicted_values = sample_prediction['target']

In [105]:
iqbal = pd.read_csv('C:/Users/avani/Downloads/submission.csv')
iqbaltarget = iqbal['target']

In [126]:
mae = mean_absolute_error(iqbaltarget, revealed_data_target)

# Print MAE
print(f'Mean Absolute Error: {mae:.5f}')

Mean Absolute Error: 5.49364


In [171]:
mae = mean_absolute_error(predicted_values, revealed_data_target)

# Print MAE
print(f'Mean Absolute Error: {mae:.5f}')

Mean Absolute Error: 5.81679


In [176]:
mae = mean_absolute_error(iqbaltarget, predicted_values)

# Print MAE
print(f'Mean Absolute Error: {mae:.5f}')

Mean Absolute Error: 0.59933


In [152]:
predicted_values.max()

26.909637888176167

In [177]:
r2 = r2_score(iqbaltarget, predicted_values)
print(f'R-squared: {r2:.4f}')

R-squared: 0.5628


In [5]:
data = pd.read_csv('train.csv').drop(['row_id', 'time_id'], axis=1)

data.fillna(data.mean(), inplace=True)


In [6]:
# Assume 'target' is your target variable column
X = data.drop('target', axis=1)
y = data['target']



In [7]:
# Split the data into training (80%), testing (10%), and validation (10%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


In [18]:
model = make_pipeline(
    FeatureAddition,
    LGBMRegressor(random_state = seed, objective = 'mae', n_estimators=500)
)

model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 3.035871 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14083
[LightGBM] [Info] Number of data points in the train set: 4190384, number of used features: 57
[LightGBM] [Info] Start training from score -0.060201


In [13]:
y_test_pred = model.predict(X_test)

In [19]:
# Evaluate the model using Mean Absolute Error (MAE) on the validation set
mae_val = mean_absolute_error(y_val, model.predict(X_val))

print(f'MAE on Validation Set: {mae_val:.4f}')

MAE on Validation Set: 6.2461


In [58]:
print(data.shape, X_train.shape, X_test.shape, X_val.shape, y_test_pred.shape)

(5237980, 15) (4190384, 14) (523798, 14) (523798, 14) (523798,)


In [59]:
y_val.max()

241.22

In [60]:
y_test_pred.max()

59.43255478622195

In [61]:
r2 = r2_score(y_val, model.predict(X_val))
print(f'R-squared: {r2:.4f}')

R-squared: 0.0688


In [2]:
y.max()

NameError: name 'y' is not defined