# **SETUP**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Libraries**

In [2]:
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
import sklearn
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA, FastICA, TruncatedSVD

import warnings
warnings.filterwarnings("ignore")

In [3]:
print("pandas==",pd.__version__)
print("numpy==",np.__version__)
print("scikit-learn==",sklearn.__version__)
print("lightgbm==",lgb.__version__)

pandas== 1.3.5
numpy== 1.21.6
scikit-learn== 1.0.2
lightgbm== 2.2.3


In [4]:
def rmse(y_true, y_pred):
    return np.sqrt(mse(y_true, y_pred))

# **Dataset**

In [5]:
csv_path = "/content/drive/MyDrive/TrailblazersQualificationChallenge/"
train = pd.read_csv(f'{csv_path}Train.csv')
test = pd.read_csv(f'{csv_path}Test.csv')
sub  = pd.read_csv(f'{csv_path}SampleSubmission.csv')

# **Feature Engineering**

In [6]:
shiftColumns = ["temperature_2m_above_ground","precipitable_water_entire_atmosphere",'L3_NO2_NO2_column_number_density', 'L3_O3_O3_column_number_density',
        'L3_HCHO_tropospheric_HCHO_column_number_density', 'L3_CO_CO_column_number_density', 'L3_CLOUD_surface_albedo', 'L3_CLOUD_cloud_optical_depth']

for column in shiftColumns:
  for period in range(2, 5):
    train[f"{column}_shif{period}"] = train[column].shift(periods = period)
    test[f"{column}_shif{period}"] = test[column].shift(periods = period)


tempColumns = ['precipitable_water_entire_atmosphere', 'temperature_2m_above_ground','relative_humidity_2m_above_ground', ]

for column in tempColumns:
  train[f"{column}_shift1"] = [np.NaN] + list(train[column][0:len(train)-1])
  for i in range (len(train)): 
    if i % 94 == 0:
      train[f"{column}_shift1"][i]=np.NaN

  test[f"{column}_shift1"] = [np.NaN] + list(test[column][0:len(test)-1])
  for i in range (len(test)): 
    if i % 94 == 0:
      test[f"{column}_shift1"][i]=np.NaN

dropColumns = ['L3_HCHO_sensor_azimuth_angle', 'L3_HCHO_sensor_zenith_angle', 'L3_HCHO_solar_azimuth_angle', 'L3_HCHO_solar_zenith_angle',
              'L3_CO_sensor_zenith_angle', 'L3_CO_solar_zenith_angle','L3_SO2_sensor_azimuth_angle', 'L3_SO2_sensor_zenith_angle',
              'L3_SO2_solar_azimuth_angle', 'L3_SO2_solar_zenith_angle','L3_CH4_solar_zenith_angle','L3_CH4_solar_azimuth_angle','L3_CH4_sensor_zenith_angle',
               'L3_CH4_sensor_azimuth_angle','L3_CH4_aerosol_optical_depth','L3_CH4_aerosol_height','L3_CH4_CH4_column_volume_mixing_ratio_dry_air']

train = train.drop(columns = dropColumns)
test = test.drop(columns = dropColumns)

l = list(test.columns)
p = [col for col in train.columns if col not in (l + ["target"])]
train = train.drop(p , axis=1)

train["Date"] = pd.to_datetime(train['Date'])
test["Date"] = pd.to_datetime(test['Date'])

for attr in ['day', 'month', 'week', 'dayofweek', 'weekofyear', 'days_in_month', 'is_month_start', 'is_month_end', 'dayofyear',"quarter"]:
  train[attr] = getattr(train['Date'].dt, attr)
  test[attr] = getattr(test['Date'].dt, attr)

train['is_weekend'] = (train['dayofweek'] >= 5)*1
train['quarter'] = train['day']%15
train['which_quarter'] = train['day']//15

test['is_weekend'] = (test['dayofweek'] >= 5)*1
test['quarter'] = test['day'] % 15
test['which_quarter'] = test['day'] // 15

def winterFunc(month):
  if month == 1 or month ==2:
    return 1
  else:
    return 0

train['winter'] = train["month"].apply(winterFunc)
test['winter'] = test["month"].apply(winterFunc)

train = train.fillna(method='ffill')
train = train.fillna(train.mean())

test = test.fillna(method ='ffill')
test = test.fillna(test.mean())

air_temperature_filler = pd.DataFrame(train.groupby(['Place_ID','month'])['temperature_2m_above_ground'].mean())
train['mean temp per month'] = 0
for i in range (len(train)):
  train['mean temp per month'][i]=air_temperature_filler.loc[(train['Place_ID'][i], train['month'][i]), :]

air_temperature_filler = pd.DataFrame(test.groupby(['Place_ID','month'])['temperature_2m_above_ground'].mean())
test['mean temp per month']=0
for i in range (len(test)):
  test['mean temp per month'][i]=air_temperature_filler.loc[(test['Place_ID'][i], test['month'][i]), :]

## PCA
pca = PCA(random_state=42,n_components=1)

pg_features =  train.filter(regex='L3_AER_.*')
train_pca = pca.fit_transform(pg_features)
train['PCA_AE'] = train_pca[:,0]
pg_features =  test.filter(regex='L3_AER.*')
test_pca = pca.transform(pg_features)
test['PCA_AE'] = test_pca[:,0]


pca = PCA(random_state=42,n_components=1)
pg_features =  train.filter(regex='L3_NO2_.*')
train_pca = pca.fit_transform(pg_features)
train['PCA_NO'] = train_pca[:,0]
pg_features =  test.filter(regex='L3_NO2.*')
test_pca = pca.transform(pg_features)
test['PCA_NO'] = test_pca[:,0]

trainag = train.copy()
testag = test.copy()

train.drop(['Place_ID X Date','Date','Place_ID'], inplace = True, axis = 1)
test.drop(['Place_ID X Date','Date','Place_ID'], inplace = True, axis = 1)
target = train['target']
train.drop('target', axis = 1, inplace = True)

feature=['precipitable_water_entire_atmosphere', 	'relative_humidity_2m_above_ground', 	'specific_humidity_2m_above_ground',
         'temperature_2m_above_ground', 	'u_component_of_wind_10m_above_ground', 	'v_component_of_wind_10m_above_ground', 	
         'L3_NO2_NO2_column_number_density', 'L3_SO2_SO2_column_number_density',	'L3_SO2_SO2_column_number_density_amf',
         'L3_SO2_SO2_slant_column_number_density', 	 'L3_NO2_tropospheric_NO2_column_number_density', 'L3_HCHO_tropospheric_HCHO_column_number_density',
         'L3_CO_CO_column_number_density', 'L3_HCHO_HCHO_slant_column_number_density'	,'L3_CO_H2O_column_number_density'	,'L3_O3_O3_column_number_density',
         'L3_NO2_NO2_slant_column_number_density']


train1 = train.copy()
test1 = test.copy()
dropCol2 = [c for c in list(feature) if c not in l]
train1.drop(dropCol2, inplace = True, axis = 1)
test1.drop(dropCol2, inplace = True, axis = 1)
    
pca = PCA(random_state = 42, n_components = 1)
train_pca = pca.fit_transform(train1)
train['pca_feature'] = train_pca[: , 0]
test_pca = pca.transform(test1)
test['pca_feature'] = test_pca[: , 0]

In [7]:
aggColumns = ['L3_HCHO_tropospheric_HCHO_column_number_density', 'L3_CO_CO_column_number_density', 'L3_O3_O3_column_number_density', 'L3_NO2_NO2_slant_column_number_density',
              'L3_NO2_NO2_column_number_density', 'L3_O3_cloud_fraction', 'L3_CO_H2O_column_number_density', 'L3_HCHO_HCHO_slant_column_number_density',
              'relative_humidity_2m_above_ground', 'u_component_of_wind_10m_above_ground', 'precipitable_water_entire_atmosphere', 'specific_humidity_2m_above_ground',
              'v_component_of_wind_10m_above_ground', 'temperature_2m_above_ground', 'PCA_AE']

aggs = {}

for columns in aggColumns:
  aggs[columns] = ['sum','max','min','mean','std']

agg_trans = trainag.groupby(['Place_ID']).agg(aggs)
agg_trans.columns = ['_'.join(col).strip() for col in agg_trans.columns.values]
agg_trans.reset_index(inplace=True)

df = (trainag.groupby('Place_ID')
          .size()
          .reset_index(name='{}transactions_count'.format('1')))

agg_trans = pd.merge(df, agg_trans, on='Place_ID', how='left')
trainag = pd.merge(trainag,agg_trans, on='Place_ID', how='left')

agg_trans = testag.groupby(['Place_ID']).agg(aggs)
agg_trans.columns = ['_'.join(col).strip() for col in agg_trans.columns.values]
agg_trans.reset_index(inplace=True)

df = (testag.groupby('Place_ID')
          .size()
          .reset_index(name='{}transactions_count'.format('1')))

agg_trans = pd.merge(df, agg_trans, on='Place_ID', how='left')
testag = pd.merge(testag,agg_trans, on='Place_ID', how='left')

trainag.drop(['Place_ID X Date' ,'Date' ,'Place_ID'], inplace = True, axis = 1)
testag.drop(['Place_ID X Date','Date','Place_ID'], inplace = True, axis = 1)
target = trainag['target']
trainag.drop('target', axis = 1, inplace = True)

feature=['precipitable_water_entire_atmosphere', 	'relative_humidity_2m_above_ground', 	'specific_humidity_2m_above_ground', 	'temperature_2m_above_ground', 
         'u_component_of_wind_10m_above_ground', 	'v_component_of_wind_10m_above_ground', 	'L3_NO2_NO2_column_number_density', 'L3_SO2_SO2_column_number_density',	
         'L3_SO2_SO2_column_number_density_amf' 	,'L3_SO2_SO2_slant_column_number_density', 	 'L3_NO2_tropospheric_NO2_column_number_density', 
         'L3_HCHO_tropospheric_HCHO_column_number_density', 'L3_CO_CO_column_number_density', 'L3_HCHO_HCHO_slant_column_number_density'	,'L3_CO_H2O_column_number_density'
         	,'L3_O3_O3_column_number_density'	,'L3_NO2_NO2_slant_column_number_density']

train1 = trainag.copy()
test1 = testag.copy()
dropCol3 = [c for c in list(feature) if c not in l]

train1.drop(dropCol3 , inplace = True , axis = 1)
test1.drop(dropCol3 , inplace = True , axis = 1)
    
pca = PCA(random_state=42,n_components=1)
train_pca = pca.fit_transform(train1)
trainag['pca_feature'] = train_pca[:,0]
test_pca = pca.transform(test1)
testag['pca_feature'] = test_pca[:,0]

# **Model Training**

In [8]:
class PARAM:
  SEED = 1901
  n_splits = 10

  lgbmParams =  {'num_leaves': 100, 'min_data_in_leaf': 50, 'objective':'regression',
          'max_depth': -1, 'learning_rate': 0.075, "boosting": "gbdt",
          "feature_fraction": 0.35, "metric": 'auc', "lambda_l1": 1,
          "lambda_l2": 2,  "random_state": 6, "verbosity": -1,   'metric' : 'rmse', 'num_iterations': 1500}

skfolds = StratifiedKFold(n_splits= PARAM.n_splits, random_state= PARAM.SEED, shuffle = True)

In [9]:
def trainLGBMModel(max_iter, folds, train, test, parameters, target):
  target = target
  train_preds = np.zeros(len(train))
  test_preds = np.zeros(len(test))
  split_y = pd.qcut(target, 10, labels=False, duplicates='drop')

  for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, split_y)):
      print(50*'-')
      print(f'Fold {fold_+1} / {PARAM.n_splits}' )
      X_trn, X_val, X_test = train.iloc[trn_idx], train.iloc[val_idx], test  
      y_trn, y_val = target.iloc[trn_idx], target.iloc[val_idx]
      trn_data = lgb.Dataset(X_trn, y_trn)
      val_data = lgb.Dataset(X_val, y_val)
          
      clf = lgb.train(parameters, trn_data, valid_sets = [trn_data, val_data], 
                      verbose_eval=200, early_stopping_rounds = 200)

      predTrain = clf.predict(X_val, num_iteration=clf.best_iteration)
      train_preds[val_idx] = predTrain
      print(f"RMSE : {rmse(y_val, predTrain)}")

      predTest = clf.predict(X_test, num_iteration=clf.best_iteration)
      predTest[predTest < 0] = 0
      test_preds += predTest
      print(50*'-')

  test_preds = test_preds / PARAM.n_splits
  print(f"Train RMSE : {rmse(target, train_preds)}")
  return test_preds

In [10]:
pred = trainLGBMModel(max_iter = PARAM.n_splits , folds = skfolds, train = trainag, 
                       test = testag, parameters = PARAM.lgbmParams, target = target)

--------------------------------------------------
Fold 1 / 10
Training until validation scores don't improve for 200 rounds.
[200]	training's rmse: 11.5664	valid_1's rmse: 25.1362
[400]	training's rmse: 7.83987	valid_1's rmse: 24.4848
[600]	training's rmse: 5.63183	valid_1's rmse: 24.1592
[800]	training's rmse: 4.16043	valid_1's rmse: 23.9622
[1000]	training's rmse: 3.1273	valid_1's rmse: 23.8828
[1200]	training's rmse: 2.39692	valid_1's rmse: 23.8247
[1400]	training's rmse: 1.86586	valid_1's rmse: 23.7766
Did not meet early stopping. Best iteration is:
[1500]	training's rmse: 1.64355	valid_1's rmse: 23.76
RMSE : 23.759986414201062
--------------------------------------------------
--------------------------------------------------
Fold 2 / 10
Training until validation scores don't improve for 200 rounds.
[200]	training's rmse: 11.9479	valid_1's rmse: 17.8708
[400]	training's rmse: 7.94268	valid_1's rmse: 17.5941
[600]	training's rmse: 5.60356	valid_1's rmse: 17.5146
[800]	training's 

# **Submission**

In [11]:
prediction = pred
prediction[prediction < 0] = 0

submission = pd.DataFrame()
submission['Place_ID X Date'] = sub['Place_ID X Date']
submission['target'] = prediction
submission.to_csv(f"LGBM_SOLUTION2.csv", index=False)

In [12]:
submission.head()

Unnamed: 0,Place_ID X Date,target
0,0OS9LVX X 2020-01-02,42.565036
1,0OS9LVX X 2020-01-03,33.61831
2,0OS9LVX X 2020-01-04,29.706889
3,0OS9LVX X 2020-01-05,32.216383
4,0OS9LVX X 2020-01-06,28.871596
