In [1]:
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings('ignore')

# Reading Data


In [15]:
TRAIN_DATA_PATH = "/content/Train.csv"
TEST_DATA_PATH = "/content/Test.csv"

train_df = pd.read_csv(TRAIN_DATA_PATH)
test_df = pd.read_csv(TEST_DATA_PATH)

In [17]:
import numpy as np
# Split inputs and targets
train_inputs = train_df.drop(columns=['target'])

train_targets = train_df['target']
test_inputs = test_df.copy()

# Select features
dropped_columns = ['id']
train_inputs.drop(columns=dropped_columns, inplace=True)
test_inputs.drop(columns=dropped_columns, inplace=True)

# Transform categorical features
categorical_features = ['area']
oe = OrdinalEncoder()
train_inputs[categorical_features] = oe.fit_transform(train_inputs[categorical_features])
test_inputs[categorical_features] = oe.transform(test_inputs[categorical_features])

# Missing value imputation with mean
train_inputs.fillna(0, inplace=True)
test_inputs.fillna(0, inplace=True)

def timestamp_to_date(timestamp):
    return pd.to_datetime(timestamp, unit='s')

train_inputs['timestamp'] = train_inputs['timestamp'].apply(timestamp_to_date)
test_inputs['timestamp'] = test_inputs['timestamp'].apply(timestamp_to_date)

def separate_date(df):
    df['minute'] = df['timestamp'].dt.minute
    df['second'] = df['timestamp'].dt.second

    df['minute_sin'] = np.sin(2 * np.pi * df['minute'] / 60)
    df['second_sin'] = np.sin(2 * np.pi * df['second'] / 60)

    return df

train_inputs = separate_date(train_inputs)
test_inputs = separate_date(test_inputs)


# Feature Engineering


In [18]:

def feature_engineer(df,X_train, train=True):

    df['distance_from_origin'] = ((df['Latitude'] - df['Latitude'].mean()) ** 2 +
                              (df['Longitude'] - df['Longitude'].mean()) ** 2) ** 0.5
    df['altitude_above_mean'] = df['Altitude'] - df['Altitude'].mean()

    # Frequency Difference
    def calculate_frequency_difference(primary_freq, secondary_freq):
        return abs(primary_freq - secondary_freq)

    # Operator Frequency Ratio
    def calculate_operator_frequency_ratio(primary_freq, secondary_freq):
        if secondary_freq == 0:
            return 0
        else:
            return primary_freq / secondary_freq


    df['frequency_difference'] = df.apply(lambda x: calculate_frequency_difference(x['PCell_freq_MHz'], x['SCell_freq_MHz']), axis=1)
    df['operator_frequency_ratio'] = df.apply(lambda x: calculate_operator_frequency_ratio(x['PCell_freq_MHz'], x['SCell_freq_MHz']), axis=1)

    if train:
        df['Pcell_identity_count'] = df.groupby('PCell_Cell_Identity')['PCell_Cell_Identity'].transform('count')
        df['Scell_identity_count'] = df.groupby('SCell_Cell_Identity')['SCell_Cell_Identity'].transform('count')
    else:
        df['Pcell_identity_count'] = df['PCell_Cell_Identity'].map(X_train.groupby('PCell_Cell_Identity')['PCell_Cell_Identity'].count())
        df['Scell_identity_count'] = df['SCell_Cell_Identity'].map(X_train.groupby('SCell_Cell_Identity')['SCell_Cell_Identity'].count())


    lag_features = [
        'PCell_RSRP_max',
        'PCell_RSRQ_max',
        'PCell_SNR_max',
        'SCell_RSRP_max',
        'SCell_RSRQ_max',
        'SCell_SNR_max',
        'SCell_Downlink_Average_MCS',
        'SCell_Downlink_Num_RBs',
        'SCell_RSSI_max',
        'PCell_Downlink_Average_MCS',
    ]

    for feature in lag_features:
        if train:
            df[f"{feature}_groupby_PCell_Cell_Identity_mean"] = df.groupby('PCell_Cell_Identity')[feature].transform('mean')
            df[f"{feature}_groupby_SCell_Cell_Identity_mean"] = df.groupby('SCell_Cell_Identity')[feature].transform('mean')
        else:
            df[f"{feature}_groupby_PCell_Cell_Identity_mean"] = df['PCell_Cell_Identity'].map(X_train.groupby('PCell_Cell_Identity')[feature].mean())
            df[f"{feature}_groupby_SCell_Cell_Identity_mean"] = df['SCell_Cell_Identity'].map(X_train.groupby('SCell_Cell_Identity')[feature].mean())

    def bin_cut(df, feature, bins):
        df[f'{feature}_binned'], train_bins = pd.cut(df[feature], bins=bins, labels=False,retbins=True)
        return df, train_bins


    bins = 5
    for feature in lag_features:
        if train:
            df, train_bins = bin_cut(df, feature, bins)
        else:
            _, train_bins = bin_cut(X_train, feature, bins)
            df[f'{feature}_binned'] = pd.cut(df[feature], bins=train_bins, labels=False)

    return df


def generate_lag(df, train=True):
    lag_features = [
        'PCell_RSRP_max',
        'PCell_RSRQ_max',
        'PCell_SNR_max',
        'SCell_RSRP_max',
        'SCell_RSRQ_max',
        'SCell_SNR_max',
        'SCell_Downlink_Average_MCS',
        'SCell_Downlink_Num_RBs',
        'SCell_RSSI_max',
        'PCell_Downlink_Average_MCS',
    ]
    shift_by = [1]
    df.sort_values(by=['timestamp', 'device'], inplace=True)
    if train:
        df = pd.concat([df[df['device'] == 'pc1'], df[df['device'] != 'pc1']], axis=0)
    else:
        df = pd.concat([df[df['device'] == 'pc2'], df[df['device'] != 'pc2']], axis=0)
    for feature in lag_features:
        for shift in shift_by:
            df[f'{feature}_lag_{shift}'] = df.groupby('device')[feature].shift(shift)


    df = df.drop(['device','timestamp','visibility','PCell_RSSI_max'], axis=1)
    df.sort_index(inplace=True)
    return df

train_inputs = generate_lag(train_inputs)
test_inputs = generate_lag(test_inputs, train=False)
test_inputs = feature_engineer(test_inputs,train_inputs, train=False)



# Model


In [19]:
import lightgbm as lgb


class Regressor:
    def __init__(self, n_estimators=50000, device="cpu", random_state=42):
        self.n_estimators = n_estimators
        self.device = device
        self.random_state = random_state
        self.models = self._define_model()
        self.models_name = list(self._define_model().keys())
        self.len_models = len(self.models)

    def _define_model(self):

        lgb_params = {
            'n_estimators': self.n_estimators,
            'metric': 'rmse',
            'device': self.device,
            'random_state': self.random_state,
            'reg_alpha': 9.271004546600699,
              'reg_lambda': 0.0010084442599664978,
                'colsample_bytree': 0.3,
                  'subsample': 0.7,
                    'learning_rate': 0.08,
                      'max_depth': 20,
                        'num_leaves': 20,
                        'min_child_samples': 143,
        }


        models = {
            'lgb': lgb.LGBMRegressor(**lgb_params)
        }

        return models

In [22]:
from copy import deepcopy
from sklearn.model_selection import StratifiedKFold
import random

n_splits = 7
fold = StratifiedKFold(n_splits=n_splits)
random_state = 42
n_estimators = 30000
early_stopping_rounds = 300
verbose = 500
device = 'cpu'

# Fix seed
random.seed(random_state)

# Initialize an array for storing test predictions
regressor = Regressor(n_estimators, device, random_state)
test_predss = np.zeros((test_inputs.shape[0]))
oof_predss = np.zeros((train_inputs.shape[0],))
ensemble_score, ensemble_score_ = [], []
weights = []
oof_each_predss = []
oof_each_preds = np.zeros((train_inputs.shape[0], regressor.len_models))
test_each_predss = []
test_each_preds = np.zeros((test_inputs.shape[0], regressor.len_models))
trained_models = {'lgb':[]}
score_dict = dict(zip(regressor.models_name, [[] for _ in range(regressor.len_models)]))

for i, (train_index, test_index) in enumerate(fold.split(train_inputs, train_inputs['operator'])):
    n = i % n_splits

    # Get a set of regressor models
    regressor = Regressor(n_estimators, device, random_state)
    models = regressor.models

    # Initialize lists to store oof and test predictions for each base model
    oof_preds = []
    test_preds = []
    print(f'Fold {i}')
    X_train, X_val = train_inputs.iloc[train_index], train_inputs.iloc[test_index]
    y_train, y_val = train_targets.iloc[train_index], train_targets.iloc[test_index]

    ### Feature Engineering
    X_train = feature_engineer(X_train, X_train, True)
    X_val = feature_engineer(X_val, X_train, False)

    # Loop over each base model and fit it to the training data, evaluate on validation data, and store predictions
    for name, model in models.items():
        if ('xgb' in name) or ('lgb' in name):
            if 'xgb' in name:
                print("Training XGB...")
                model.fit(
                    X_train, y_train,
                    eval_set=[(X_train,y_train),(X_val, y_val)],
                    early_stopping_rounds=early_stopping_rounds, verbose=verbose)
            elif 'lgb' in name:
                print("Training LGB...")
                model.fit(
                    X_train, y_train,
                    eval_set=[(X_train,y_train),(X_val, y_val)],
                    early_stopping_rounds=early_stopping_rounds, verbose=verbose)

        if name in trained_models.keys():
            trained_models[f'{name}'].append(deepcopy(model))

        test_pred = model.predict(test_inputs[X_train.columns])
        test_pred[test_pred<0] = 0
        y_val_pred = model.predict(X_val)

        score = mean_squared_error(y_val, y_val_pred, squared=False)
        score_dict[name].append(score)
        print(f'{name} [FOLD-{n} SEED-{random_state}] RMSE score: {score:.5f}')

        oof_preds.append(y_val_pred)

        test_preds.append(test_pred)


    oof_each_preds[X_val.index] = np.stack(oof_preds).T
    test_each_preds += np.array(test_preds).T / n_splits
    if n == (n_splits - 1):
        oof_each_predss.append(oof_each_preds)
        oof_each_preds = np.zeros((X_train.shape[0], regressor.len_models))
        test_each_predss.append(test_each_preds)
        test_each_preds = np.zeros((test_inputs.shape[0], regressor.len_models))



oof_each_predss = np.mean(np.array(oof_each_predss), axis=0)
test_each_predss = np.mean(np.array(test_each_predss), axis=0)
oof_each_predss = np.concatenate([oof_each_predss, oof_predss.reshape(-1, 1)], axis=1)
test_each_predss = np.concatenate([test_each_predss, test_predss.reshape(-1, 1)], axis=1)

Fold 0
Training LGB...
[500]	training's rmse: 6.08685e+06	valid_1's rmse: 7.26502e+06
[1000]	training's rmse: 5.07609e+06	valid_1's rmse: 6.84721e+06
[1500]	training's rmse: 4.44325e+06	valid_1's rmse: 6.66391e+06
[2000]	training's rmse: 4.00488e+06	valid_1's rmse: 6.59114e+06
[2500]	training's rmse: 3.65384e+06	valid_1's rmse: 6.5313e+06
[3000]	training's rmse: 3.36756e+06	valid_1's rmse: 6.4994e+06
[3500]	training's rmse: 3.12665e+06	valid_1's rmse: 6.48217e+06
[4000]	training's rmse: 2.91703e+06	valid_1's rmse: 6.4735e+06
[4500]	training's rmse: 2.73964e+06	valid_1's rmse: 6.46339e+06
[5000]	training's rmse: 2.58631e+06	valid_1's rmse: 6.46505e+06
lgb [FOLD-0 SEED-42] RMSE score: 6460969.79835
Fold 1
Training LGB...
[500]	training's rmse: 6.10934e+06	valid_1's rmse: 7.55204e+06
[1000]	training's rmse: 5.10361e+06	valid_1's rmse: 7.12243e+06
[1500]	training's rmse: 4.4835e+06	valid_1's rmse: 6.91935e+06
[2000]	training's rmse: 4.02676e+06	valid_1's rmse: 6.82366e+06
[2500]	training's

# Submission

In [23]:
test_predictions = test_each_predss[:,0]
test_predictions[test_predictions < 0] = 0
# Add index to results
predictions_df = pd.DataFrame({'id': test_df.id, 'target': test_predictions})
predictions_df.to_csv("Submission.csv", index = False)
predictions_df.head()

Unnamed: 0,id,target
0,Id_ln0e0hfrgx,22467420.0
1,Id_svf7nz9fxv,63828600.0
2,Id_ww2mh07gwj,88503090.0
3,Id_v88r4y03ww,18495310.0
4,Id_2u4y4kzglh,2862721.0


In [24]:
predictions_df.tail()

Unnamed: 0,id,target
18238,Id_thgt73179f,37757080.0
18239,Id_r6wd5ofvtt,21884460.0
18240,Id_p6sqas9rgd,22062050.0
18241,Id_yre3gwygul,15120480.0
18242,Id_e2duej4gcq,36184100.0
