# **Installation**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%%capture
!pip install catboost

# **Libraries**

In [3]:
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import catboost
import matplotlib.pyplot as plt
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA, FastICA, TruncatedSVD

import warnings
warnings.filterwarnings("ignore")

In [4]:
print("pandas==",pd.__version__)
print("numpy==",np.__version__)
print("scikit-learn==",sklearn.__version__)
print("catboost==",catboost.__version__)

pandas== 1.3.5
numpy== 1.21.6
scikit-learn== 1.0.2
catboost== 1.0.6


# **Dataset**

In [5]:
csv_path = "/content/drive/MyDrive/TrailblazersQualificationChallenge/"
train = pd.read_csv(f'{csv_path}Train.csv')
test = pd.read_csv(f'{csv_path}Test.csv')
sub  = pd.read_csv(f'{csv_path}SampleSubmission.csv')

# **Feature Engineering**

In [6]:
shiftColumns = ["temperature_2m_above_ground","precipitable_water_entire_atmosphere",'L3_NO2_NO2_column_number_density', 'L3_O3_O3_column_number_density',
        'L3_HCHO_tropospheric_HCHO_column_number_density', 'L3_CO_CO_column_number_density', 'L3_CLOUD_surface_albedo', 'L3_CLOUD_cloud_optical_depth']

for column in shiftColumns:
  for period in range(2, 5):
    train[f"{column}_shif{period}"] = train[column].shift(periods = period)
    test[f"{column}_shif{period}"] = test[column].shift(periods = period)


tempColumns = ['precipitable_water_entire_atmosphere', 'temperature_2m_above_ground','relative_humidity_2m_above_ground', ]

for column in tempColumns:
  train[f"{column}_shift1"] = [np.NaN] + list(train[column][0:len(train)-1])
  for i in range (len(train)): 
    if i % 94 == 0:
      train[f"{column}_shift1"][i]=np.NaN

  test[f"{column}_shift1"] = [np.NaN] + list(test[column][0:len(test)-1])
  for i in range (len(test)): 
    if i % 94 == 0:
      test[f"{column}_shift1"][i]=np.NaN

dropColumns = ['L3_HCHO_sensor_azimuth_angle', 'L3_HCHO_sensor_zenith_angle', 'L3_HCHO_solar_azimuth_angle', 'L3_HCHO_solar_zenith_angle',
              'L3_CO_sensor_zenith_angle', 'L3_CO_solar_zenith_angle','L3_SO2_sensor_azimuth_angle', 'L3_SO2_sensor_zenith_angle',
              'L3_SO2_solar_azimuth_angle', 'L3_SO2_solar_zenith_angle','L3_CH4_solar_zenith_angle','L3_CH4_solar_azimuth_angle','L3_CH4_sensor_zenith_angle',
               'L3_CH4_sensor_azimuth_angle','L3_CH4_aerosol_optical_depth','L3_CH4_aerosol_height','L3_CH4_CH4_column_volume_mixing_ratio_dry_air']

train = train.drop(columns = dropColumns)
test = test.drop(columns = dropColumns)

l = list(test.columns)
p = [col for col in train.columns if col not in (l + ["target"])]
train = train.drop(p , axis=1)

train["Date"] = pd.to_datetime(train['Date'])
test["Date"] = pd.to_datetime(test['Date'])

for attr in ['day', 'month', 'week', 'dayofweek', 'weekofyear', 'days_in_month', 'is_month_start', 'is_month_end', 'dayofyear',"quarter"]:
  train[attr] = getattr(train['Date'].dt, attr)
  test[attr] = getattr(test['Date'].dt, attr)

train['is_weekend'] = (train['dayofweek'] >= 5)*1
train['quarter'] = train['day']%15
train['which_quarter'] = train['day']//15

test['is_weekend'] = (test['dayofweek'] >= 5)*1
test['quarter'] = test['day'] % 15
test['which_quarter'] = test['day'] // 15

def winterFunc(month):
  if month == 1 or month ==2:
    return 1
  else:
    return 0

train['winter'] = train["month"].apply(winterFunc)
test['winter'] = test["month"].apply(winterFunc)

train = train.fillna(method='ffill')
train = train.fillna(train.mean())

test = test.fillna(method ='ffill')
test = test.fillna(test.mean())

air_temperature_filler = pd.DataFrame(train.groupby(['Place_ID','month'])['temperature_2m_above_ground'].mean())
train['mean temp per month'] = 0
for i in range (len(train)):
  train['mean temp per month'][i]=air_temperature_filler.loc[(train['Place_ID'][i], train['month'][i]), :]

air_temperature_filler = pd.DataFrame(test.groupby(['Place_ID','month'])['temperature_2m_above_ground'].mean())
test['mean temp per month']=0
for i in range (len(test)):
  test['mean temp per month'][i]=air_temperature_filler.loc[(test['Place_ID'][i], test['month'][i]), :]

## PCA
pca = PCA(random_state=42,n_components=1)

pg_features =  train.filter(regex='L3_AER_.*')
train_pca = pca.fit_transform(pg_features)
train['PCA_AE'] = train_pca[:,0]
pg_features =  test.filter(regex='L3_AER.*')
test_pca = pca.transform(pg_features)
test['PCA_AE'] = test_pca[:,0]


pca = PCA(random_state=42,n_components=1)
pg_features =  train.filter(regex='L3_NO2_.*')
train_pca = pca.fit_transform(pg_features)
train['PCA_NO'] = train_pca[:,0]
pg_features =  test.filter(regex='L3_NO2.*')
test_pca = pca.transform(pg_features)
test['PCA_NO'] = test_pca[:,0]

trainag = train.copy()
testag = test.copy()

train.drop(['Place_ID X Date','Date','Place_ID'], inplace = True, axis = 1)
test.drop(['Place_ID X Date','Date','Place_ID'], inplace = True, axis = 1)
target = train['target']
train.drop('target', axis = 1, inplace = True)

feature=['precipitable_water_entire_atmosphere', 	'relative_humidity_2m_above_ground', 	'specific_humidity_2m_above_ground',
         'temperature_2m_above_ground', 	'u_component_of_wind_10m_above_ground', 	'v_component_of_wind_10m_above_ground', 	
         'L3_NO2_NO2_column_number_density', 'L3_SO2_SO2_column_number_density',	'L3_SO2_SO2_column_number_density_amf',
         'L3_SO2_SO2_slant_column_number_density', 	 'L3_NO2_tropospheric_NO2_column_number_density', 'L3_HCHO_tropospheric_HCHO_column_number_density',
         'L3_CO_CO_column_number_density', 'L3_HCHO_HCHO_slant_column_number_density'	,'L3_CO_H2O_column_number_density'	,'L3_O3_O3_column_number_density',
         'L3_NO2_NO2_slant_column_number_density']


train1 = train.copy()
test1 = test.copy()
dropCol2 = [c for c in list(feature) if c not in l]
train1.drop(dropCol2, inplace = True, axis = 1)
test1.drop(dropCol2, inplace = True, axis = 1)
    
pca = PCA(random_state = 42, n_components = 1)
train_pca = pca.fit_transform(train1)
train['pca_feature'] = train_pca[: , 0]
test_pca = pca.transform(test1)
test['pca_feature'] = test_pca[: , 0]

# **Model Training**

In [7]:
cb_model = CatBoostRegressor(iterations=30000,
                             learning_rate=0.045,
                             depth=8,
                             eval_metric='RMSE',
                             random_seed = 42,
                             bagging_temperature = 0.2,
                             od_type='Iter',
                             metric_period = 50,
                             od_wait=300)
cb_model.fit(train, target,
             use_best_model=True,
             verbose=50)

You should provide test set for use best model. use_best_model parameter has been switched to false value.


0:	learn: 46.1184150	total: 162ms	remaining: 1h 21m 5s
50:	learn: 32.7003280	total: 4.71s	remaining: 46m 5s
100:	learn: 29.7538029	total: 9.16s	remaining: 45m 10s
150:	learn: 28.1019135	total: 13.7s	remaining: 45m 8s
200:	learn: 26.8068896	total: 18.1s	remaining: 44m 50s
250:	learn: 25.6515596	total: 22.6s	remaining: 44m 39s
300:	learn: 24.6385400	total: 27.1s	remaining: 44m 29s
350:	learn: 23.7135960	total: 31.4s	remaining: 44m 12s
400:	learn: 22.9039912	total: 35.8s	remaining: 43m 59s
450:	learn: 22.1815837	total: 40.1s	remaining: 43m 50s
500:	learn: 21.5551343	total: 44.7s	remaining: 43m 51s
550:	learn: 20.9180291	total: 49.1s	remaining: 43m 45s
600:	learn: 20.3247191	total: 54.5s	remaining: 44m 27s
650:	learn: 19.7978598	total: 59.2s	remaining: 44m 30s
700:	learn: 19.3093031	total: 1m 3s	remaining: 44m 33s
750:	learn: 18.8515008	total: 1m 8s	remaining: 44m 23s
800:	learn: 18.4173405	total: 1m 12s	remaining: 44m 15s
850:	learn: 18.0107337	total: 1m 17s	remaining: 44m 5s
900:	learn: 

<catboost.core.CatBoostRegressor at 0x7fc3c5281390>

In [8]:
CATBoostPred1 = (cb_model.predict(test))

# **Model Training 2**

In [9]:
aggColumns = ['L3_HCHO_tropospheric_HCHO_column_number_density', 'L3_CO_CO_column_number_density', 'L3_O3_O3_column_number_density', 'L3_NO2_NO2_slant_column_number_density',
              'L3_NO2_NO2_column_number_density', 'L3_O3_cloud_fraction', 'L3_CO_H2O_column_number_density', 'L3_HCHO_HCHO_slant_column_number_density',
              'relative_humidity_2m_above_ground', 'u_component_of_wind_10m_above_ground', 'precipitable_water_entire_atmosphere', 'specific_humidity_2m_above_ground',
              'v_component_of_wind_10m_above_ground', 'temperature_2m_above_ground', 'PCA_AE']

aggs = {}

for columns in aggColumns:
  aggs[columns] = ['sum','max','min','mean','std']

agg_trans = trainag.groupby(['Place_ID']).agg(aggs)
agg_trans.columns = ['_'.join(col).strip() for col in agg_trans.columns.values]
agg_trans.reset_index(inplace=True)

df = (trainag.groupby('Place_ID')
          .size()
          .reset_index(name='{}transactions_count'.format('1')))

agg_trans = pd.merge(df, agg_trans, on='Place_ID', how='left')
trainag = pd.merge(trainag,agg_trans, on='Place_ID', how='left')

agg_trans = testag.groupby(['Place_ID']).agg(aggs)
agg_trans.columns = ['_'.join(col).strip() for col in agg_trans.columns.values]
agg_trans.reset_index(inplace=True)

df = (testag.groupby('Place_ID')
          .size()
          .reset_index(name='{}transactions_count'.format('1')))

agg_trans = pd.merge(df, agg_trans, on='Place_ID', how='left')
testag = pd.merge(testag,agg_trans, on='Place_ID', how='left')

trainag.drop(['Place_ID X Date' ,'Date' ,'Place_ID'], inplace = True, axis = 1)
testag.drop(['Place_ID X Date','Date','Place_ID'], inplace = True, axis = 1)
target = trainag['target']
trainag.drop('target', axis = 1, inplace = True)

feature=['precipitable_water_entire_atmosphere', 	'relative_humidity_2m_above_ground', 	'specific_humidity_2m_above_ground', 	'temperature_2m_above_ground', 
         'u_component_of_wind_10m_above_ground', 	'v_component_of_wind_10m_above_ground', 	'L3_NO2_NO2_column_number_density', 'L3_SO2_SO2_column_number_density',	
         'L3_SO2_SO2_column_number_density_amf' 	,'L3_SO2_SO2_slant_column_number_density', 	 'L3_NO2_tropospheric_NO2_column_number_density', 
         'L3_HCHO_tropospheric_HCHO_column_number_density', 'L3_CO_CO_column_number_density', 'L3_HCHO_HCHO_slant_column_number_density'	,'L3_CO_H2O_column_number_density'
         	,'L3_O3_O3_column_number_density'	,'L3_NO2_NO2_slant_column_number_density']

train1 = trainag.copy()
test1 = testag.copy()
dropCol3 = [c for c in list(feature) if c not in l]

train1.drop(dropCol3 , inplace = True , axis = 1)
test1.drop(dropCol3 , inplace = True , axis = 1)
    
pca = PCA(random_state=42,n_components=1)
train_pca = pca.fit_transform(train1)
trainag['pca_feature'] = train_pca[:,0]
test_pca = pca.transform(test1)
testag['pca_feature'] = test_pca[:,0]

In [10]:
cb_model = CatBoostRegressor(iterations=30000,
                             learning_rate=0.045,
                             depth=8,
                             eval_metric='RMSE',
                             random_seed = 42,
                             bagging_temperature = 0.2,
                             od_type='Iter',
                             metric_period = 50,
                             od_wait=300)
cb_model.fit(trainag,target,
             use_best_model=True,
             verbose=50)

You should provide test set for use best model. use_best_model parameter has been switched to false value.


0:	learn: 45.7870266	total: 245ms	remaining: 2h 2m 27s
50:	learn: 27.0609571	total: 9.85s	remaining: 1h 36m 25s
100:	learn: 23.7340465	total: 17.8s	remaining: 1h 27m 35s
150:	learn: 22.0927990	total: 25.6s	remaining: 1h 24m 16s
200:	learn: 20.9491942	total: 33.2s	remaining: 1h 21m 58s
250:	learn: 19.9667985	total: 40.8s	remaining: 1h 20m 37s
300:	learn: 19.0684453	total: 48.4s	remaining: 1h 19m 34s
350:	learn: 18.3215621	total: 56s	remaining: 1h 18m 46s
400:	learn: 17.6451892	total: 1m 3s	remaining: 1h 18m 10s
450:	learn: 17.0268232	total: 1m 11s	remaining: 1h 17m 44s
500:	learn: 16.4532818	total: 1m 18s	remaining: 1h 17m 23s
550:	learn: 15.9775571	total: 1m 26s	remaining: 1h 17m 4s
600:	learn: 15.5357174	total: 1m 34s	remaining: 1h 16m 54s
650:	learn: 15.1035748	total: 1m 42s	remaining: 1h 16m 47s
700:	learn: 14.7048300	total: 1m 50s	remaining: 1h 16m 39s
750:	learn: 14.3409758	total: 1m 57s	remaining: 1h 16m 24s
800:	learn: 13.9979812	total: 2m 5s	remaining: 1h 16m 7s
850:	learn: 13.

<catboost.core.CatBoostRegressor at 0x7fc3c55e4690>

# **Submission**

In [11]:
CATBoostPred2 = (cb_model.predict(testag))
blend =(CATBoostPred1*0.4 + CATBoostPred2*0.6)
submission3 = pd.DataFrame({"Place_ID X Date":sub['Place_ID X Date'], "target": np.clip(blend, 0, a_max=None)})
submission3.to_csv("CATBOOST_SOLUTION1.csv", index=False)