In [26]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import math
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_selection import RFECV

In [2]:
# Raw-GitHub locations of the sample CSVs, unpacked as (train, test).
train_url, test_url = (
    'https://raw.githubusercontent.com/teresadatta100/data-wids-arthur-tutorial/main/wids_sample_train.csv',
    'https://raw.githubusercontent.com/teresadatta100/data-wids-arthur-tutorial/main/wids_sample_test.csv',
)

In [3]:
# Read both CSVs over HTTP, train first and then test.
# NOTE(review): the info() output further down lists an 'Unnamed: 0' column,
# which looks like a saved row index that then travels along as a model
# feature -- confirm whether it should be read with index_col=0 instead.
train_split, test_split = (pd.read_csv(url) for url in (train_url, test_url))

In [4]:
# Structural overview of the training split: column count, then the
# per-column non-null counts and dtypes.
print(len(train_split.columns))
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() only appends a stray "None" line.
train_split.info()

86
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31784 entries, 0 to 31783
Data columns (total 86 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         31784 non-null  int64  
 1   lat                31784 non-null  int64  
 2   lon                31784 non-null  int64  
 3   start_date         31784 non-null  object 
 4   cancm3_0_x         31784 non-null  float64
 5   cancm4_0_x         31784 non-null  float64
 6   ccsm3_0_x          31784 non-null  float64
 7   ccsm4_0_x          31784 non-null  float64
 8   cfsv2_0_x          31784 non-null  float64
 9   gfdl-flor-a_0_x    31784 non-null  float64
 10  gfdl-flor-b_0_x    31784 non-null  float64
 11  gfdl_0_x           31784 non-null  float64
 12  nasa_0_x           31784 non-null  float64
 13  nmme0_mean_x       31784 non-null  float64
 14  cancm3_x           31784 non-null  float64
 15  cancm4_x           31784 non-null  float64
 16  ccsm3_x            

In [5]:
# Predictors: everything except the regression target ('tmp2m') and the raw
# 'start_date' column, which info() reports with dtype object.
x_train = train_split.drop(columns=['tmp2m', 'start_date'])
x_test = test_split.drop(columns=['tmp2m', 'start_date'])

# Targets: the 'tmp2m' column of each split.
y_train = train_split['tmp2m']
y_test = test_split['tmp2m']

In [7]:
# Baseline model: standardise every feature, then fit a random forest on the
# full, un-engineered feature set.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    # A fixed seed makes the bootstrap samples and feature sub-sampling
    # repeatable, so the reported scores can be reproduced on a re-run.
    ('regressor', RandomForestRegressor(random_state=42)),
])
# fit() returns the fitted pipeline, so `model` and `pipeline` refer to the
# same object until `pipeline` is rebound in a later cell.
model = pipeline.fit(x_train, y_train)

In [9]:
# Predict on both splits with the baseline model.
train_preds = model.predict(x_train)
test_preds = model.predict(x_test)

# R^2 on each split; a large gap between the two indicates overfitting.
train_score = r2_score(y_train, train_preds)
test_score = r2_score(y_test, test_preds)

print(f'Training Performance: {train_score}')
print(f'Test Performance: {test_score}')

Training Performance: 0.9995223112070886
Test Performance: 0.6713501817571363


In [11]:
def build_date_features(df, ref_set, date_key='start_date'):
  """Add calendar features to ``df`` in place.

  Parses ``ref_set[date_key]`` as datetimes, stores the parsed values on
  ``df`` under ``date_key`` and derives two integer columns from them:
  ``day_of_year`` and ``month``.

  The parsed ``date_key`` column is left on ``df``. Rebinding ``df`` to a
  dropped copy inside this function would never reach the caller's frame, so
  no such statement is kept here; callers that do not want the datetime
  column must drop it themselves.

  Args:
    df: DataFrame to receive the new columns (mutated in place).
    ref_set: DataFrame that holds the raw date column; assignment onto
      ``df`` aligns on the index.
    date_key: Name of the date column in ``ref_set`` and of the parsed
      column written to ``df``.

  Returns:
    None.
  """
  df[date_key] = pd.to_datetime(ref_set[date_key])
  # Read the dates back from df so both features reflect index alignment.
  parsed_dates = df[date_key].dt
  df['day_of_year'] = parsed_dates.day_of_year
  df['month'] = parsed_dates.month

# Enrich each feature frame in place, pairing it with the raw split that
# still carries the 'start_date' column.
for _features, _raw_split in ((x_train, train_split), (x_test, test_split)):
  build_date_features(_features, _raw_split)

In [12]:
# Inspect the column dtypes after the date features were added; the parsed
# 'start_date' column is still part of the frame at this point.
print(x_train.dtypes)

Unnamed: 0                 int64
lat                        int64
lon                        int64
cancm3_0_x               float64
cancm4_0_x               float64
                       ...      
nasa_0_y.1               float64
nmme0_mean_y.1           float64
start_date        datetime64[ns]
day_of_year                int64
month                      int64
Length: 87, dtype: object


In [13]:
def add_season(df):
  """Add an integer ``season`` column to ``df`` in place, based on ``month``.

  Season codes by month number:
    0 -> 12, 1, 2
    1 -> 3, 4, 5
    2 -> 6, 7, 8
    3 -> 9, 10, 11

  A ``month`` value outside 1..12 raises ``KeyError``. Returns None.
  """
  # (m % 12) // 3 folds December (12 % 12 == 0) into the same bucket as
  # January and February, then groups the remaining months in threes.
  season_by_month = {m: (m % 12) // 3 for m in range(1, 13)}

  def season_of(month):
    return season_by_month[month]

  df['season'] = df['month'].apply(season_of)

# Add the season code to both feature frames (mutates them in place).
for _frame in (x_train, x_test):
  add_season(_frame)

In [15]:
def sin_transformer(period):
    """Return a FunctionTransformer that maps x to sin(x / period * 2 * pi).

    ``period`` is the length of one full cycle in the units of ``x``.
    """
    def _sine(x):
        return np.sin(x / period * 2 * np.pi)

    return FunctionTransformer(_sine)


def cos_transformer(period):
    """Return a FunctionTransformer that maps x to cos(x / period * 2 * pi).

    ``period`` is the length of one full cycle in the units of ``x``.
    """
    def _cosine(x):
        return np.cos(x / period * 2 * np.pi)

    return FunctionTransformer(_cosine)

In [16]:
def encode_cyclical(df):
  """Add sine/cosine encodings of the calendar columns to ``df`` in place.

  Reads ``day_of_year``, ``month`` and ``season`` and writes a ``*_sin`` and
  ``*_cos`` column for each, so that values at either end of a cycle end up
  close together. Returns None.
  """
  # (source column, cycle length, sine column, cosine column)
  encodings = (
      ('day_of_year', 365, 'day_of_year_sin', 'day_of_year_cos'),
      ('month', 12, 'month_sin', 'month_cos'),
      ('season', 4, 'season_sin', 'season_cos'),
  )
  for source, period, sin_name, cos_name in encodings:
    df[sin_name] = sin_transformer(period).fit_transform(df[source])
    df[cos_name] = cos_transformer(period).fit_transform(df[source])

# Add the sine/cosine calendar encodings to both feature frames in place.
for _frame in (x_train, x_test):
  encode_cyclical(_frame)

In [17]:
# Inspect the column dtypes again now that the season and sine/cosine
# columns have been appended.
print(x_train.dtypes)

Unnamed: 0           int64
lat                  int64
lon                  int64
cancm3_0_x         float64
cancm4_0_x         float64
                    ...   
day_of_year_cos    float64
month_sin          float64
month_cos          float64
season_sin         float64
season_cos         float64
Length: 94, dtype: object


In [18]:
def identify_correlated(df, threshold):
    """List the columns of ``df`` that are highly correlated with a later column.

    Computes the absolute pairwise correlation matrix and, for every column,
    looks only at the entries below the diagonal (i.e. its correlation with
    columns that come after it). A column is reported when any of those
    entries is strictly greater than ``threshold``, so for each correlated
    pair the earlier column is the one flagged.

    Args:
        df: DataFrame whose columns are compared via ``df.corr()``.
        threshold: Absolute-correlation cut-off (exclusive).

    Returns:
        List of column labels, in column order.
    """
    abs_corr = df.corr().abs()
    # True strictly below the diagonal; everything else becomes NaN so each
    # pair is considered once and self-correlation is ignored.
    below_diagonal = np.tril(np.ones(abs_corr.shape, dtype=bool), k=-1)
    pairwise = abs_corr.where(below_diagonal)
    # NaN > threshold is False, so the blanked-out cells never flag a column.
    flagged = (pairwise > threshold).any(axis=0)
    return [column for column, hit in flagged.items() if hit]

In [19]:
# Remove the datetime column before the correlation and modelling steps.
# Not idempotent: re-running this cell raises KeyError, because the column
# is already gone after the first run.
x_train = x_train.drop(columns=['start_date'])
x_test = x_test.drop(columns=['start_date'])

In [20]:
# Number of feature columns after the date engineering steps.
print(x_train.shape[1])

93


In [21]:
# Columns whose absolute correlation with a later column exceeds 0.96.
# The list is derived from the training features only.
to_drop = identify_correlated(x_train, threshold=.96)

In [22]:
# Training features with the highly-correlated columns removed; drop()
# already returns a new DataFrame.
x_train_reduced = x_train.drop(columns=to_drop)

In [23]:
# Number of feature columns left after correlation pruning.
print(x_train_reduced.shape[1])

48


In [24]:
# Apply the same drop list to the test features so both frames keep
# identical columns, then report the resulting column count.
x_test_reduced = x_test.drop(columns=to_drop)
print(x_test_reduced.shape[1])

48


In [25]:
# Same scaler + random-forest pipeline as the baseline, refit on the
# correlation-pruned feature set.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    # A fixed seed keeps the forest, and therefore the printed scores,
    # repeatable between runs.
    ('regressor', RandomForestRegressor(random_state=42)),
])

rf_model2 = pipeline.fit(x_train_reduced, y_train)

# R^2 on the training split.
train_preds = rf_model2.predict(x_train_reduced)
train_score = r2_score(y_train, train_preds)

# R^2 on the held-out split.
test_preds = rf_model2.predict(x_test_reduced)
test_score = r2_score(y_test, test_preds)

print(f'Training Performance: {train_score}')
print(f'Test Performance: {test_score}')

Training Performance: 0.9998777052188643
Test Performance: 0.7876629439295164


In [27]:
# Recursive feature elimination with cross-validation on the pruned features.
# The estimator is seeded so that the feature importances, and with them the
# selected feature set, are repeatable between runs.
estimator = RandomForestRegressor(random_state=42)
# step=8: eliminate 8 features per iteration; cv=3: 3-fold cross-validation.
selector = RFECV(estimator, step=8, cv=3)
selector = selector.fit(x_train_reduced, y_train)

In [28]:
# One rank per column of x_train_reduced; rank 1 marks the features the
# selector kept (they line up with the True entries of get_support()).
print("Feature ranking: ", selector.ranking_)

Feature ranking:  [1 1 3 1 1 4 2 3 1 3 1 1 3 2 4 2 2 1 4 5 2 4 3 2 4 1 3 5 1 2 1 2 3 4 1 3 1
 4 4 1 1 5 5 1 5 5 5 5]


In [29]:

# Boolean array with one entry per column of x_train_reduced; True marks a
# feature retained by the selector.
mask = selector.get_support()
mask

array([ True,  True, False,  True,  True, False, False, False,  True,
       False,  True,  True, False, False, False, False, False,  True,
       False, False, False, False, False, False, False,  True, False,
       False,  True, False,  True, False, False, False,  True, False,
        True, False, False,  True,  True, False, False,  True, False,
       False, False, False])

In [30]:
# Column labels as a NumPy array so they can be filtered with the boolean
# support mask; best_features holds the names of the retained columns.
features = np.array(x_train_reduced.columns)
best_features = features[mask]

In [31]:
# Display the names of the selected features.
best_features

array(['lat', 'lon', 'ccsm3_y', 'ccsm4_y', 'cancm3_0_y', 'ccsm3_0_y',
       'ccsm4_0_y', 'nmme0_mean_y', 'gfdl_0_x.1', 'tmp2m_std',
       'cancm4_x.1', 'gfdl_x.1', 'gfdl-flor-b_x.1', 'nmme_mean_y.1',
       'nmme0_mean_y.1', 'day_of_year_cos'], dtype=object)

In [32]:
# Restrict both frames to the features retained by the RFECV selector.
x_train_reduced_import_ft = x_train_reduced[best_features]
x_test_reduced_import_ft = x_test_reduced[best_features]


# Same scaler + random-forest pipeline, refit on the selected features only.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    # A fixed seed keeps the forest, and therefore the printed scores,
    # repeatable between runs.
    ('regressor', RandomForestRegressor(random_state=42)),
])

# NOTE: this rebinds `rf_model2`, replacing the model fitted on the full
# correlation-pruned feature set.
rf_model2 = pipeline.fit(x_train_reduced_import_ft, y_train)

# R^2 on the training split.
train_preds = rf_model2.predict(x_train_reduced_import_ft)
train_score = r2_score(y_train, train_preds)

# R^2 on the held-out split.
test_preds = rf_model2.predict(x_test_reduced_import_ft)
test_score = r2_score(y_test, test_preds)

print(f'Training Performance: {train_score}')
print(f'Test Performance: {test_score}')

Training Performance: 0.9998703581569808
Test Performance: 0.7894588868628487
