In [1]:
# Import Colab's Google Drive helper so files stored in Drive can be read from this runtime.
from google.colab import drive

# Mount Drive at /content/drive; this will prompt for authorization.
drive.mount('/content/drive')

Mounted at /content/drive


In [43]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
import xgboost

In [4]:
# List the datathon folder on the mounted Drive to confirm the CSV files are visible.
!ls "/content/drive/My Drive/wids_datathon_2023"

datathon_wids_main.ipynb  test_data.csv   wids_datathon_23.ipynb
sample_solution.csv	  train_data.csv


In [45]:
# Read the datathon CSVs from the mounted Drive folder.
# NOTE(review): `test` is loaded here but not used by any cell visible in this notebook section.
test = pd.read_csv("/content/drive/My Drive/wids_datathon_2023/test_data.csv")
train = pd.read_csv("/content/drive/My Drive/wids_datathon_2023/train_data.csv")

In [46]:
# Discard rows containing any missing value, separate the target from the
# features, and hold out 20% of the rows for validation (fixed seed).
# NOTE(review): a saved dtype listing further down shows an `index` column among
# the features — confirm it is meant to be a model input.
target_col = 'contest-tmp2m-14d__tmp2m'
train = train.dropna()
X = train.drop(columns=[target_col])
y = train[target_col]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=42)

In [47]:
def build_date_features(df, date_key='startdate'):
  """Add calendar features derived from a date column, modifying ``df`` in place.

  The ``date_key`` column is converted to pandas datetimes and two columns are
  added: ``day_of_year`` and ``month``.

  The date column itself stays in the frame. An assignment such as
  ``df = df.drop(...)`` inside this function would only rebind the local name
  (after copying the whole frame) and never reach the caller, so no drop is
  attempted here; remove the column explicitly at the call site once it is no
  longer needed.

  Args:
    df: DataFrame containing ``date_key``; mutated in place.
    date_key: name of the column holding the dates.

  Returns:
    None.
  """
  dates = pd.to_datetime(df[date_key])
  df[date_key] = dates
  df['day_of_year'] = dates.dt.day_of_year
  df['month'] = dates.dt.month

# Add the calendar columns to both splits; the helper edits each frame in place.
for split_frame in (X_train, X_val):
  build_date_features(split_frame, date_key='startdate')

In [48]:
def add_season(df):
  """Attach a ``season`` column derived from the existing ``month`` column.

  Months map to season codes as: Dec-Feb -> 0, Mar-May -> 1, Jun-Aug -> 2,
  Sep-Nov -> 3. ``df`` is modified in place and nothing is returned; a month
  value outside 1-12 raises ``KeyError``.
  """
  months_by_season = {
      0: (12, 1, 2),
      1: (3, 4, 5),
      2: (6, 7, 8),
      3: (9, 10, 11),
  }
  # Invert the grouping into a month -> season lookup table.
  season_of_month = {
      month: season
      for season, months in months_by_season.items()
      for month in months
  }
  df['season'] = df['month'].apply(lambda month: season_of_month[month])

# Derive the season code for both splits (in place).
for split_frame in (X_train, X_val):
  add_season(split_frame)

In [49]:
def sin_transformer(period):
    """Return a FunctionTransformer that maps x to sin(x / period * 2*pi)."""
    def _sine(values):
        return np.sin(values / period * 2 * np.pi)

    return FunctionTransformer(_sine)


def cos_transformer(period):
    """Return a FunctionTransformer that maps x to cos(x / period * 2*pi)."""
    def _cosine(values):
        return np.cos(values / period * 2 * np.pi)

    return FunctionTransformer(_cosine)

In [50]:
def encode_cyclical(df):
  """Add sine/cosine encodings of the calendar columns to ``df`` in place.

  ``day_of_year`` is encoded with a period of 365, ``month`` with a period of
  12 and ``season`` with a period of 4; each gets a ``*_sin`` and a ``*_cos``
  column. Returns None.
  """
  # (source column, period, sine column, cosine column)
  encodings = (
      ('day_of_year', 365, 'day_of_year_sin', 'day_of_year_cos'),
      ('month', 12, 'month_sin', 'month_cos'),
      ('season', 4, 'season_sin', 'season_cos'),
  )
  for source, period, sin_name, cos_name in encodings:
    values = df[source]
    df[sin_name] = sin_transformer(period).fit_transform(values)
    df[cos_name] = cos_transformer(period).fit_transform(values)

# Append the cyclical encodings to both splits (in place).
for split_frame in (X_train, X_val):
  encode_cyclical(split_frame)

In [51]:
def identify_correlated(df, threshold):
    """List the columns that are strongly correlated with a later column.

    The absolute pairwise correlation matrix is computed and only its strict
    lower triangle is inspected, so each pair is considered once and a column
    is reported when its correlation with any column positioned after it
    exceeds ``threshold``. For a correlated pair, the earlier column is the
    one reported.

    Non-numeric columns (for example object/string or datetime columns) are
    left out before correlating. ``DataFrame.corr()`` raises on such columns
    in pandas >= 2.0, whereas older versions skipped them silently; selecting
    the columns explicitly gives the same result on both.

    Args:
        df: DataFrame of candidate features.
        threshold: absolute-correlation cut-off; strictly greater values flag
            a column.

    Returns:
        list of column labels, in the order they appear in ``df``.
    """
    numeric = df.select_dtypes(include=['number', 'bool'])
    corr_matrix = numeric.corr().abs()
    # Blank out the diagonal and everything above it; the masked cells become
    # NaN, and NaN never compares greater than the threshold.
    upper_and_diagonal = np.triu(np.ones(corr_matrix.shape, dtype=bool))
    lower_triangle = corr_matrix.mask(upper_and_diagonal)
    flagged = (lower_triangle > threshold).any(axis=0)
    return [column for column, is_flagged in flagged.items() if is_flagged]

In [52]:
# Remove the raw date column from both splits.
X_train, X_val = (split.drop(columns=['startdate']) for split in (X_train, X_val))

In [53]:
# Number of feature columns before correlation-based pruning.
print(X_train.shape[1])

253


In [54]:
# Collect the training-split columns flagged by identify_correlated at a 0.80 cut-off,
# then report how many were flagged.
to_drop = identify_correlated(X_train, threshold=.80)  # NOTE(review): the author remarked that a 0.90 threshold gave better results — confirm 0.80 is intended
print(len(to_drop))

106


In [55]:
# Drop the flagged columns from both splits (the list was derived from the
# training split only) and print the remaining feature count for each.
x_train_reduced = X_train.drop(columns=to_drop)
x_val_reduced = X_val.drop(columns=to_drop)
print(x_train_reduced.shape[1])
print(x_val_reduced.shape[1])

147
147


In [56]:
# Object-dtype columns found in the training split are removed from both
# splits; the remaining feature count of each split is printed.
cat = x_train_reduced.select_dtypes("object").columns
x_train_reduced = x_train_reduced.drop(columns=cat)
x_val_reduced = x_val_reduced.drop(columns=cat)
print(x_train_reduced.shape[1])
print(x_val_reduced.shape[1])

146
146


In [27]:
# NOTE(review): this cell's execution count (27) is out of sequence and its saved output
# reports 180 columns while the cell above printed 146 — the output looks stale; re-run or remove.
print(x_train_reduced.dtypes)

lat                                   float64
lon                                   float64
contest-pevpr-sfc-gauss-14d__pevpr    float64
contest-rhum-sig995-14d__rhum         float64
nmme0-prate-56w__cancm30              float64
                                       ...   
season                                  int64
month_sin                             float64
month_cos                             float64
season_sin                            float64
season_cos                            float64
Length: 180, dtype: object


In [57]:
# Standardise the features: the scaler is fitted on the training split only and
# the same fitted transform is then applied to the validation split.
# NOTE: both names are rebound to the scaler's output, so re-running this cell
# on its own would fit the scaler on already-scaled data — re-run from the
# feature-selection cells instead.
scaler = StandardScaler()
x_train_reduced = scaler.fit_transform(x_train_reduced)
x_val_reduced = scaler.transform(x_val_reduced)

"rf_model = pipeline.fit(x_train_reduced, y_train)\n\ntrain_preds = rf_model.predict(x_train_reduced)\ntrain_score = mean_squared_error(y_train, train_preds, squared = False)\n\ntest_preds = rf_model.predict(x_val_reduced)\ntest_score = mean_squared_error(y_val, test_preds, squared = False)\n\nprint (f'Training Performance: {train_score}')\nprint (f'Test Performance: {test_score}')"

In [58]:
# Gradient-boosted tree regressor fitted on the scaled training features.
xgboost_model = xgboost.XGBRegressor(n_estimators=200, learning_rate=0.02, gamma=0, subsample=0.75,
                           colsample_bytree=0.4, max_depth=5)

xgboost_model.fit(x_train_reduced, y_train)

# RMSE is computed as sqrt(MSE). The `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed, so
# taking the square root explicitly works on old and new versions alike.
train_preds = xgboost_model.predict(x_train_reduced)
train_score = np.sqrt(mean_squared_error(y_train, train_preds))

# NOTE: the `test_*` names and the "Test Performance" label below refer to the
# held-out validation split (x_val_reduced / y_val), not to the separate test CSV.
test_preds = xgboost_model.predict(x_val_reduced)
test_score = np.sqrt(mean_squared_error(y_val, test_preds))

print (f'Training Performance: {train_score}')
print (f'Test Performance: {test_score}')

Training Performance: 1.611753882718399
Test Performance: 1.6326653586888924


In [18]:
# NOTE(review): this cell's execution count (18) is out of sequence and its saved output still
# lists `startdate` with 254 columns, whereas earlier cells drop `startdate` and print 253 —
# the output looks stale; re-run or remove.
print(X_train.dtypes)

index                                          int64
lat                                          float64
lon                                          float64
startdate                             datetime64[ns]
contest-pevpr-sfc-gauss-14d__pevpr           float64
                                           ...      
day_of_year_cos                              float64
month_sin                                    float64
month_cos                                    float64
season_sin                                   float64
season_cos                                   float64
Length: 254, dtype: object
