In [42]:
# Import libraries

import os
import json
import pandas as pd
import numpy as np

In [2]:
# datasets

train_df = pd.read_csv("../data/train/GNSS_raw_train.csv")
test_df = pd.read_csv("../data/test/GNSS_raw_test.csv")

For model;
* model without gps time
* map satelite code to numerical form
* remove null data

In [5]:
# selecting the features
_features = ['Satelite_Code', 'Code_L1', 'Phase_L1', 'Doppler_L1',
       'Cnr_L1', 'code_L2', 'Phase_L2', 'Doppler_L2', 'Cnr_L2', 'Label']

In [7]:
# training data from selected features
train_data = train_df[_features]
test_data = test_df.copy()
test_data = test_data[_features[:-1]] #remove label from features

In [10]:
train_data.columns, test_data.columns

(Index(['Satelite_Code', 'Code_L1', 'Phase_L1', 'Doppler_L1', 'Cnr_L1',
        'code_L2', 'Phase_L2', 'Doppler_L2', 'Cnr_L2', 'Label'],
       dtype='object'),
 Index(['Satelite_Code', 'Code_L1', 'Phase_L1', 'Doppler_L1', 'Cnr_L1',
        'code_L2', 'Phase_L2', 'Doppler_L2', 'Cnr_L2'],
       dtype='object'))

In [12]:
# remove nans -> train data only
train_data = train_data.dropna()

In [40]:
# satelite code to numerical form
train_sc = train_data['Satelite_Code'].unique().tolist()
test_sc = test_data['Satelite_Code'].unique().tolist()
comb_sc = train_sc + test_sc
comb_sc = set(comb_sc)
map_sc = {k:v+1 for v, k in enumerate(comb_sc)}

In [43]:
with open('satelitecode_mapper.json', 'w') as f:
    json.dump(map_sc, f, indent=4)

In [44]:
# read json files

In [45]:
with open('satelitecode_mapper.json', 'r') as f:
    map_data = json.load(f)

In [58]:
map_data

{'R1': 1,
 'C16': 2,
 'C21': 3,
 'C25': 4,
 'G25': 5,
 'R6': 6,
 'G17': 7,
 'G27': 8,
 'C36': 9,
 'G1': 10,
 'G30': 11,
 'G9': 12,
 'R16': 13,
 'C12': 14,
 'G7': 15,
 'R14': 16,
 'R19': 17,
 'G4': 18,
 'E25': 19,
 'G28': 20,
 'G21': 21,
 'R22': 22,
 'E19': 23,
 'C19': 24,
 'R24': 25,
 'R18': 26,
 'C30': 27,
 'G32': 28,
 'R2': 29,
 'G31': 30,
 'E8': 31,
 'R3': 32,
 'E26': 33,
 'C8': 34,
 'C7': 35,
 'E21': 36,
 'E4': 37,
 'E2': 38,
 'C23': 39,
 'G8': 40,
 'G2': 41,
 'C24': 42,
 'G14': 43,
 'G16': 44,
 'C13': 45,
 'G22': 46,
 'E1': 47,
 'C10': 48,
 'G19': 49,
 'E27': 50,
 'R17': 51,
 'E30': 52,
 'C28': 53,
 'G3': 54,
 'R10': 55,
 'R5': 56,
 'R15': 57,
 'C6': 58,
 'R21': 59,
 'E5': 60,
 'G6': 61,
 'R20': 62,
 'E15': 63,
 'G26': 64,
 'C20': 65,
 'C27': 66,
 'E3': 67,
 'C9': 68,
 'R9': 69,
 'E7': 70,
 'G10': 71,
 'C11': 72,
 'R4': 73,
 'E13': 74}

In [47]:
map_data.keys()

dict_keys(['R1', 'C16', 'C21', 'C25', 'G25', 'R6', 'G17', 'G27', 'C36', 'G1', 'G30', 'G9', 'R16', 'C12', 'G7', 'R14', 'R19', 'G4', 'E25', 'G28', 'G21', 'R22', 'E19', 'C19', 'R24', 'R18', 'C30', 'G32', 'R2', 'G31', 'E8', 'R3', 'E26', 'C8', 'C7', 'E21', 'E4', 'E2', 'C23', 'G8', 'G2', 'C24', 'G14', 'G16', 'C13', 'G22', 'E1', 'C10', 'G19', 'E27', 'R17', 'E30', 'C28', 'G3', 'R10', 'R5', 'R15', 'C6', 'R21', 'E5', 'G6', 'R20', 'E15', 'G26', 'C20', 'C27', 'E3', 'C9', 'R9', 'E7', 'G10', 'C11', 'R4', 'E13'])

In [48]:
map_data.values()

dict_values([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74])

In [49]:
# update dataframes

In [59]:
train_data['Satelite_Code'].map(map_data).isna().sum(axis=0)

np.int64(0)

In [60]:
test_data['Satelite_Code'].map(map_data).isna().sum(axis=0)

np.int64(0)

In [61]:
train_data['Satelite_Code'] = train_data['Satelite_Code'].map(map_data)
test_data['Satelite_Code'] = test_data['Satelite_Code'].map(map_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['Satelite_Code'] = train_data['Satelite_Code'].map(map_data)


In [62]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 74085 entries, 0 to 74084
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Satelite_Code  74085 non-null  int64  
 1   Code_L1        74085 non-null  float64
 2   Phase_L1       74085 non-null  float64
 3   Doppler_L1     74085 non-null  float64
 4   Cnr_L1         74085 non-null  float64
 5   code_L2        74085 non-null  float64
 6   Phase_L2       74085 non-null  float64
 7   Doppler_L2     74085 non-null  float64
 8   Cnr_L2         74085 non-null  float64
 9   Label          74085 non-null  float64
dtypes: float64(9), int64(1)
memory usage: 6.2 MB


In [63]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36188 entries, 0 to 36187
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Satelite_Code  36188 non-null  int64  
 1   Code_L1        36188 non-null  float64
 2   Phase_L1       36188 non-null  float64
 3   Doppler_L1     36188 non-null  float64
 4   Cnr_L1         36188 non-null  int64  
 5   code_L2        36188 non-null  float64
 6   Phase_L2       36188 non-null  float64
 7   Doppler_L2     36188 non-null  float64
 8   Cnr_L2         36188 non-null  int64  
dtypes: float64(6), int64(3)
memory usage: 2.5 MB
