In [1]:
# Import libraries

import os
import json
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
# datasets

train_df = pd.read_csv("../data/train/GNSS_raw_train.csv")
test_df = pd.read_csv("../data/test/GNSS_raw_test.csv")

In [3]:
# feature selection

feature_selection = ['Satelite_Code', 'Code_L1', 'Phase_L1', 'Doppler_L1', 'Cnr_L1', 'Cnr_L2']
feature_selection_train = feature_selection + ['Label']

In [4]:
# feature selected training dataframe

train_df = train_df[feature_selection_train]
test_df = test_df[feature_selection]

In [5]:
# processing data function

def DataMapper():
    with open('../data/satelite mapper/satelitecode_mapper.json', 'r') as f:
        map_data = json.load(f)
    return map_data

def DataPipeline(data):
    data_ = data.dropna()
    map_data_satelite_code = DataMapper()
    data_['Satelite_Code'] = data_['Satelite_Code'].map(map_data_satelite_code)
    return data_

In [6]:
# feature selected data with processed data

train_data = DataPipeline(train_df)
test_data = DataPipeline(test_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_['Satelite_Code'] = data_['Satelite_Code'].map(map_data_satelite_code)


In [7]:
train_data.head()

Unnamed: 0,Satelite_Code,Code_L1,Phase_L1,Doppler_L1,Cnr_L1,Cnr_L2,Label
0,10,23009427.72,0.0,2573.797,27.0,0.0,0.0
1,18,25128493.65,0.0,-3386.467,18.0,0.0,0.0
2,40,22522476.85,0.0,-2432.885,27.0,0.0,0.0
3,12,24456093.34,128517615.3,-2530.973,33.0,0.0,0.0
4,21,22049460.63,0.0,327.015,24.0,0.0,0.0


In [8]:
test_data.head()

Unnamed: 0,Satelite_Code,Code_L1,Phase_L1,Doppler_L1,Cnr_L1,Cnr_L2
0,71,23637620.0,124216500.0,-4086.455,39,0
1,64,20660440.0,108571400.0,694.854,45,0
2,30,21594520.0,113480000.0,-1126.554,42,0
3,28,21091130.0,110834700.0,-882.245,42,0
4,56,20136570.0,107641500.0,1756.425,37,0


In [9]:
# x and y data

x = train_data.iloc[:, :-1]
y = train_data.iloc[:, -1]

In [10]:
# train, val, eval data

xtrain, xtemp, ytrain, ytemp = train_test_split(x, y, train_size = 0.6, random_state = 42, shuffle = True, stratify = y)
xval, xtest, yval, ytest = train_test_split(xtemp, ytemp, train_size = 0.6, random_state = 42, shuffle = True, stratify = ytemp)

In [11]:
xtrain.shape, ytrain.shape, xtemp.shape, ytemp.shape

((44451, 6), (44451,), (29634, 6), (29634,))

In [12]:
xval.shape, yval.shape, xtest.shape, ytest.shape

((17780, 6), (17780,), (11854, 6), (11854,))

In [13]:
np.unique(ytrain, return_counts=True)

(array([0., 1.]), array([17969, 26482]))

In [14]:
np.unique(yval, return_counts=True)

(array([0., 1.]), array([ 7188, 10592]))

In [15]:
np.unique(ytest, return_counts=True)

(array([0., 1.]), array([4792, 7062]))

In [16]:
# random forest classifier

In [17]:
model_rf = RandomForestClassifier(n_estimators=500).fit(xtrain, ytrain)
pred_rf = model_rf.predict(xval)

In [18]:
accuracy_score(yval, pred_rf)

0.9440382452193475

In [19]:
# xgboost classifier

In [20]:
model_xgb = xgb.XGBClassifier(objective='binary:logistic', eval_metric = 'logloss', 
                              random_state = 42).fit(xtrain, ytrain)
pred_xgb = model_xgb.predict(xval) 

In [21]:
accuracy_score(yval, pred_xgb)

0.9292463442069742

In [22]:
# lgbm classifier

In [25]:
model_lgb = lgb.LGBMClassifier(objective = 'binary', n_estimators = 500, learning_rate=0.1, class_weight = 'balanced').fit(xtrain, ytrain)
pred_lgb = model_lgb.predict(xval)

[LightGBM] [Info] Number of positive: 26482, number of negative: 17969
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000565 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 900
[LightGBM] [Info] Number of data points in the train set: 44451, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


In [26]:
accuracy_score(yval, pred_lgb)

0.9352080989876266

In [27]:
test_data

Unnamed: 0,Satelite_Code,Code_L1,Phase_L1,Doppler_L1,Cnr_L1,Cnr_L2
0,71,2.363762e+07,1.242165e+08,-4086.455,39,0
1,64,2.066044e+07,1.085714e+08,694.854,45,0
2,30,2.159452e+07,1.134800e+08,-1126.554,42,0
3,28,2.109113e+07,1.108347e+08,-882.245,42,0
4,56,2.013657e+07,1.076415e+08,1756.425,37,0
...,...,...,...,...,...,...
36183,19,2.523544e+07,1.326132e+08,-2366.205,40,0
36184,52,2.737342e+07,0.000000e+00,1473.338,30,0
36185,58,3.682122e+07,1.917378e+08,85.440,40,37
36186,35,3.687088e+07,1.919963e+08,33.936,37,37


In [58]:
pred_test_lgb = model_lgb.predict(test_data)

In [59]:
np.unique(pred_test_lgb, return_counts=True)

(array([0., 1.]), array([11577, 24611]))

In [60]:
sub_df = pd.read_csv("../data/submission/sample_submission.csv")

In [61]:
sub_df["Predict"] = [int(x) for x in pred_test_lgb.tolist()]

In [62]:
sub_df

Unnamed: 0,ID,Predict
0,0,1
1,1,1
2,2,1
3,3,1
4,4,1
...,...,...
36183,36183,1
36184,36184,0
36185,36185,1
36186,36186,1


* random forest -> 0.78 
* xgb -> 0.756
* lgb -> 0.77

In [63]:
sub_df.to_csv("submit_3.csv", index=False)

In [64]:
# selected models -> random forest and lgbm