In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import os
import glob
from sklearn.preprocessing import StandardScaler
import json
import warnings
warnings.filterwarnings('ignore')

# Load Training Data

In [10]:
# Read the labelled training observations from the working directory.
train_csv_path = "./train.csv"
train = pd.read_csv(train_csv_path)

In [11]:
# Display the loaded training frame (pandas shows a truncated head/tail view).
train

Unnamed: 0,Station,Ob,value,measure,target,R_flag,I_flag,Z_flag,B_flag
0,AURO,1/2/2021 0:30,19.200,temp_wxt,False,2,-1,0,1
1,AURO,1/2/2021 4:30,19.800,temp_wxt,False,2,-1,0,1
2,AURO,1/2/2021 5:30,19.500,temp_wxt,False,2,-1,0,1
3,AURO,1/2/2021 7:30,18.500,temp_wxt,False,2,-1,0,1
4,AURO,2/16/2021 2:30,17.700,temp_wxt,False,2,-1,0,1
...,...,...,...,...,...,...,...,...,...
6593269,WINE,4/3/2021 6:26,-5.289,blackglobetemp,False,4,-1,-1,-1
6593270,WINE,4/3/2021 6:27,-5.189,blackglobetemp,False,4,-1,-1,-1
6593271,WINE,4/3/2021 6:28,-5.123,blackglobetemp,False,4,-1,-1,-1
6593272,WINE,4/3/2021 6:29,-5.090,blackglobetemp,False,4,-1,-1,-1


# Load Full Station Data and Merge with Training data
### The training data alone does not contain enough information, so we load each station's full observations from the full dataset and merge them with the training data on 'Station' and 'Ob'

# Standardize the data
### We use StandardScaler to standardize the applicable columns of each station file

In [14]:
# Locate the per-station CSV files under ./full (relative to the working directory).
path = os.getcwd()
# glob returns matches in arbitrary order; sort so the concatenation order is stable.
csv_files = sorted(glob.glob(os.path.join(path, "full/*.csv")))
if not csv_files:
    raise FileNotFoundError(f"No station CSV files found under {os.path.join(path, 'full')}")

each_Station = {}     # lower-cased file stem -> standardized station frame
station_frames = []   # the same frames in load order, used for the final concat

# Load each full file and standardize it on its own statistics.
for f in csv_files:
    # Derive the key from the file name itself. Slicing the absolute path at a
    # fixed offset (f[101:105]) only works for one specific directory depth;
    # anywhere else the keys collide and stations silently overwrite each other.
    station_key = os.path.splitext(os.path.basename(f))[0].lower()
    station_raw = pd.read_csv(f)

    # Columns 3..25 are scaled; 'Station' and 'Ob' are kept as the merge keys.
    feature_cols = station_raw.columns[3:26]
    station_ids = station_raw[['Station', 'Ob']]

    # Fit a fresh scaler per file, then replace any remaining NaNs with 0.
    station_scaled = pd.DataFrame(
        StandardScaler().fit_transform(station_raw[feature_cols]),
        columns=feature_cols,
        index=station_raw.index,
    ).fillna(0)

    station_frame = pd.concat([station_ids, station_scaled], axis=1, join='inner')
    each_Station[station_key] = station_frame
    station_frames.append(station_frame)

# Stack all stations into one frame (rows keep their per-file index labels).
all_stations = pd.concat(station_frames)

In [15]:
# Attach each station's standardized features to the matching training rows.
# The inner join keeps only rows whose ('Station', 'Ob') pair exists on both sides.
merge_keys = ['Station', 'Ob']
dfull = train.merge(all_stations, on=merge_keys, how="inner")

In [16]:
# Display the merged frame (training rows plus standardized station features).
dfull

Unnamed: 0,Station,Ob,value,measure,target,R_flag,I_flag,Z_flag,B_flag,temp_wxt,...,sm,temp10,ws02,wd02,gust02,ws06,wd06,gust06,leafwetness,blackglobetemp
0,AURO,1/2/2021 0:30,19.200,temp_wxt,False,2,-1,0,1,0.200332,...,1.299891,0.285183,2.134199,-0.188946,1.805884,2.412060,-0.164213,2.866926,1.336306,-0.081997
1,AURO,1/2/2021 0:30,0.516,sm,False,0,-1,-1,1,0.200332,...,1.299891,0.285183,2.134199,-0.188946,1.805884,2.412060,-0.164213,2.866926,1.336306,-0.081997
2,AURO,1/2/2021 4:30,19.800,temp_wxt,False,2,-1,0,1,0.272691,...,1.299891,0.265536,-0.318629,-0.020069,-0.165024,0.472992,0.095770,0.467511,0.528870,-0.049966
3,AURO,1/2/2021 4:30,0.516,sm,False,0,-1,-1,0,0.272691,...,1.299891,0.265536,-0.318629,-0.020069,-0.165024,0.472992,0.095770,0.467511,0.528870,-0.049966
4,AURO,1/2/2021 4:30,19.840,temp10,False,0,-1,-1,0,0.272691,...,1.299891,0.265536,-0.318629,-0.020069,-0.165024,0.472992,0.095770,0.467511,0.528870,-0.049966
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6593269,WINE,6/29/2021 7:51,3.905,ws06,False,0,0,2,-1,0.769594,...,0.071498,0.783270,-0.520629,-0.902249,-0.317398,1.982894,-1.365303,2.302407,0.860462,0.153844
6593270,WINE,6/29/2021 9:08,1.842,ws06,False,0,0,2,-1,0.881537,...,0.071498,0.901790,0.950665,-2.164559,0.944897,0.241764,-1.194010,0.588780,-0.140997,0.502130
6593271,WINE,6/29/2021 11:00,0.787,ws06,False,0,0,2,-1,1.007473,...,0.071498,1.010315,-0.520629,-0.747047,-0.485704,-0.648635,-1.261304,-0.693925,0.397455,0.662877
6593272,WINE,8/31/2021 0:30,0.601,ws06,False,0,4,0,-1,0.881537,...,-0.552324,0.901790,2.106683,-1.171266,2.627957,-0.805615,-0.015357,1.020332,0.631026,0.203140


### Save the full CSV if needed

In [17]:
# dfull.to_csv('full.csv', index=False)

# Handling Imbalanced Training Data
### Since the training data is heavily imbalanced, we build a balanced dataset by oversampling the `target == True` rows

In [18]:
# Partition the merged rows by label.
target_col = dfull['target']
dtrue = dfull[target_col == True]
dfalse = dfull[target_col == False]

In [19]:
# Oversample the positive rows by stacking repeated copies of dtrue.
# The count matches starting from one copy and appending five more in each of
# five rounds (1 + 5 * 5 = 26), but is built in a single concat instead of
# re-copying the growing frame on every pass.
true_copies = 1 + 5 * 5
train_true = pd.concat([dtrue] * true_copies, ignore_index=True)


In [20]:
# Add one more copy of the positive rows, then shuffle them.
train_true = pd.concat([train_true, dtrue], ignore_index=True)
# Fixed seed so a fresh Restart & Run All reproduces the same row order.
train_true = train_true.sample(frac=1, random_state=42).reset_index(drop=True)


In [21]:
# Shuffle the negative rows; the fixed seed keeps the order reproducible
# when the notebook is run from a fresh kernel.
dfalse = dfalse.sample(frac=1, random_state=42).reset_index(drop=True)

dfalse

Unnamed: 0,Station,Ob,value,measure,target,R_flag,I_flag,Z_flag,B_flag,temp_wxt,...,sm,temp10,ws02,wd02,gust02,ws06,wd06,gust06,leafwetness,blackglobetemp
0,BAHA,11/20/2021 6:59,0.257,sm,False,0,-1,-1,1,-1.925079,...,-0.324461,-1.940870,0.622460,-0.921663,3.491187,-0.399864,1.476926,-0.339364,-0.595005,-1.861200
1,WINE,3/27/2021 22:12,0.211,sm,False,0,-1,-1,2,0.475745,...,0.944850,0.510530,2.001590,-1.129879,2.207192,-0.716997,-0.713780,0.908984,0.778816,-0.100136
2,WILL,4/2/2021 17:06,7.553,temp10,False,2,-1,-1,0,-0.947249,...,1.055457,-1.064666,0.989978,-1.737565,2.005740,0.613313,1.424923,1.527944,-0.628373,-0.385333
3,NCAT,9/2/2021 5:06,0.455,sm,False,0,-1,-1,1,0.301380,...,-0.151725,0.307149,-0.229896,-1.355512,-0.186268,-0.337728,-1.564363,-0.370402,-0.343569,-0.037071
4,PLYM,4/28/2021 10:13,893.000,sr,False,1,0,-1,-1,0.990174,...,-0.967447,0.620632,0.771021,0.554417,1.218666,1.367492,0.577623,1.490037,-0.619560,1.039256
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6358097,HAML,10/16/2021 5:26,0.109,sm,False,2,-1,-1,0,0.113051,...,-0.368246,0.151883,-0.539393,0.395750,-0.544328,0.079052,0.303143,-0.084569,2.126114,-0.136358
6358098,FRYI,9/20/2021 12:19,1.500,ws02,False,0,-1,-1,-1,0.635594,...,-0.553613,0.352244,-0.070796,1.130773,-0.258408,0.231326,-0.371201,0.454393,2.193093,0.303954
6358099,MITC,7/8/2021 1:53,13.120,temp10,False,0,-1,-1,0,0.820681,...,-0.404157,0.827139,-0.869918,0.236151,-0.873462,-1.232698,-0.838753,-1.270636,-0.138252,0.120789
6358100,BALD,1/10/2021 8:24,360.700,par,False,3,0,-1,-1,-2.003181,...,1.588995,-2.070212,0.185799,-1.066552,0.055369,0.000000,0.000000,0.000000,-0.499515,0.000000


In [22]:
# Combine the oversampled positives with the negatives, then keep a random half
# of the rows. The fixed seed makes the saved dataset reproducible across runs.
new_train = pd.concat([train_true, dfalse], ignore_index=True)
new_train = new_train.sample(frac=0.5, random_state=42).reset_index(drop=True)

new_train

Unnamed: 0,Station,Ob,value,measure,target,R_flag,I_flag,Z_flag,B_flag,temp_wxt,...,sm,temp10,ws02,wd02,gust02,ws06,wd06,gust06,leafwetness,blackglobetemp
0,NEWL,2/14/2021 12:34,0.671,sm,True,0,-1,-1,2,-1.258843,...,2.772014,-1.385473,0.250615,-1.487211,0.038704,0.520901,-1.188249,0.614006,-0.185027,-1.062627
1,SPRU,4/8/2021 7:10,13.520,temp10,True,0,-1,-1,0,-0.434669,...,-0.347779,-0.174979,-0.188056,-0.133357,-0.449335,-0.636664,0.127069,-0.672008,-0.591830,0.000000
2,WINE,1/14/2021 16:34,30.320,sr,False,0,2,-1,-1,-0.615697,...,-0.240413,-0.596850,0.320111,0.349713,0.187520,0.560788,0.126367,0.440945,-0.745593,0.000000
3,WINE,2/1/2021 20:29,0.000,ws02,True,0,4,2,-1,-2.560702,...,-0.084457,-2.682381,-1.256277,-2.299067,-1.327234,2.474934,1.370275,1.895389,-0.446912,0.000000
4,BEAR,6/3/2021 22:04,0.134,sm,False,0,-1,-1,1,0.641464,...,-2.718316,0.371907,-0.273754,-0.163248,-0.315636,-0.246484,-0.175897,-0.095499,0.815865,0.269776
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6353868,NEWL,1/5/2021 4:54,0.636,sm,True,0,-1,-1,0,-1.737840,...,2.542184,-1.849172,-0.597638,-1.566183,-0.753927,-0.699913,-1.619639,-0.603107,-0.279829,-1.516225
6353869,NEWL,2/11/2021 8:52,0.571,sm,True,0,-1,-1,2,-0.790733,...,2.115355,-0.881768,-0.739014,-1.279910,-0.753927,0.012144,-1.016178,-0.285415,0.334700,-0.733212
6353870,SPRU,3/17/2021 5:38,13.570,temp10,True,0,-1,-1,1,-0.812166,...,-0.938590,-0.166786,-0.250338,-0.933852,-0.557897,-0.790340,-0.727592,-0.848420,3.125765,0.000000
6353871,GOLD,8/14/2021 14:31,39.980,st,False,2,4,0,1,1.904279,...,0.385904,1.855807,0.271490,0.155175,0.423814,0.433035,-0.103866,0.466815,-0.511260,2.117444


# Now save the new Training Dataset

In [27]:
# Check the class balance of the resampled training set.
new_train['target'].value_counts()

False    3178393
True     3175480
Name: target, dtype: int64

In [28]:
# Persist the balanced training set without writing the row index.
output_csv = 'new_train.csv'
new_train.to_csv(output_csv, index=False)