In [49]:
# Import libraries

import os
import optuna
import json
import pandas as pd
import numpy as np
import warnings
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# datasets

train_df = pd.read_csv("../data/train/GNSS_raw_train.csv")
test_df = pd.read_csv("../data/test/GNSS_raw_test.csv")

In [3]:
# feature selection

feature_selection = ['Satelite_Code', 'Code_L1', 'Phase_L1', 'Doppler_L1', 'Cnr_L1', 'Cnr_L2']
feature_selection_train = feature_selection + ['Label']

In [4]:
# feature selected training dataframe

train_df = train_df[feature_selection_train]
test_df = test_df[feature_selection]

In [5]:
# processing data function

def DataMapper():
    with open('../data/satelite mapper/satelitecode_mapper.json', 'r') as f:
        map_data = json.load(f)
    return map_data

def DataPipeline(data):
    data_ = data.dropna()
    map_data_satelite_code = DataMapper()
    data_['Satelite_Code'] = data_['Satelite_Code'].map(map_data_satelite_code)
    return data_

In [6]:
# feature selected data with processed data

train_data = DataPipeline(train_df)
test_data = DataPipeline(test_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_['Satelite_Code'] = data_['Satelite_Code'].map(map_data_satelite_code)


In [7]:
# x and y data

x = train_data.iloc[:, :-1]
y = train_data.iloc[:, -1]

In [8]:
# train, val, eval data

xtrain, xtest, ytrain, ytest = train_test_split(x, y, train_size = 0.8, random_state = 42, shuffle = True, stratify = y)

In [39]:
xtrain_arr = np.array(xtrain)
xtest_arr = np.array(xtest)
ytrain_arr = np.array(ytrain)
ytest_arr = np.array(ytest)

In [43]:
def objective(trial, modelname):
    accuracy_folds = []
    cv_ = StratifiedKFold(n_splits = 7, shuffle = True, random_state = 42)
    for fold, (train_index, valid_index) in enumerate(cv_.split(xtrain_arr, ytrain_arr)):
        xtrain_arr_fold, xvalid_arr_fold = xtrain_arr[train_index], xtrain_arr[valid_index]
        ytrain_arr_fold, yvalid_arr_fold = ytrain_arr[train_index], ytrain_arr[valid_index]
        if modelname == "rf":
            _params = {
                "n_estimators": trial.suggest_int("n_estimators", 200, 1000),
                "max_depth": trial.suggest_int("max_depth", 3, 20),
                "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
                "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
                "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2"]),
                "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
                "n_jobs": -1,
                "random_state": 42}
            model_rf = RandomForestClassifier(**_params).fit(xtrain_arr_fold, ytrain_arr_fold)
            pred_rf = model_rf.predict(xvalid_arr_fold)
            metric_rf = accuracy_score(yvalid_arr_fold, pred_rf)
            accuracy_folds.append(metric_rf)
        elif modelname == "lgbm":
            _params = {
                "n_estimators": trial.suggest_int("n_estimators", 200, 1500),
                "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.2, log=True),
                "num_leaves": trial.suggest_int("num_leaves", 20, 256),
                "max_depth": trial.suggest_int("max_depth", 3, 15),
                "min_child_samples": trial.suggest_int("min_child_samples", 5, 50),
                "subsample": trial.suggest_float("subsample", 0.6, 1.0),
                "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
                "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
                "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
                "random_state": 42,
                "n_jobs": -1,
                "verbose": -1,
                "verbosity": -1}
            model_lgb = lgb.LGBMClassifier(**_params).fit(xtrain_arr_fold, ytrain_arr_fold)
            pred_lgb = model_lgb.predict(xvalid_arr_fold)
            metric_lgb = accuracy_score(yvalid_arr_fold, pred_lgb)
            accuracy_folds.append(metric_lgb)
    agg_metric = np.mean(accuracy_folds)
    return agg_metric

In [69]:
study_rf = optuna.create_study(direction="maximize")

study_rf.optimize(
    lambda trial: objective(
        trial,
        modelname="rf"),
    n_trials=5
)

In [50]:
study_lgb = optuna.create_study(direction="maximize")

study_lgb.optimize(
    lambda trial: objective(
        trial,
        modelname="lgbm"),
    n_trials=5
)

In [51]:
# inference from the best parameters

In [80]:
# random forst

In [72]:
best_params_rf = study_rf.best_params
best_params_rf.update({'random_state':42})

In [73]:
best_params

{'n_estimators': 982,
 'max_depth': 17,
 'min_samples_split': 17,
 'min_samples_leaf': 3,
 'max_features': 'log2',
 'bootstrap': True,
 'random_state': 42}

In [74]:
model_rf = RandomForestClassifier(**best_params_rf).fit(xtrain_arr, ytrain_arr)
pred_rf = model_rf.predict(xtest_arr)
metric_rf = accuracy_score(ytest_arr, pred_rf)
metric_rf

0.940946210433961

In [81]:
# lgbm classifier

In [None]:
best_params_lgb = study_lgb.best_params
best_params_lgb.update({'random_state':42})

In [None]:
best_params_lgb

In [None]:
model_lgb = lgb.LGBMClassifier(**best_params_lgb).fit(xtrain_arr, ytrain_arr)
pred_lgb = model_lgb.predict(xtest_arr)
metric_lgb = accuracy_score(ytest_arr, pred_lgb)
metric_lgb

In [82]:
# kaggle

In [None]:
test_data_arr = np.array(test_data)

In [None]:
model_rf_kaggle = RandomForestClassifier(**best_params_rf).fit(xtrain_arr, ytrain_arr)
pred_rf = model_rf_kaggle.predict(test_data_arr)

In [None]:
model_lgb_kaggle = lgb.LGBMClassifier(**best_params_lgb).fit(xtrain_arr, ytrain_arr)
pred_lgb = model_lgb.predict(test_data_arr)

In [78]:
sub_df = pd.read_csv("../data/submission/sample_submission.csv")
sub_df["Predict"] = [int(x) for x in pred_rf.tolist()]

In [79]:
sub_df.to_csv("submit_5.csv", index=False)