In [74]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy.signal import savgol_filter
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import zscore
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Preprocessing the dataset

In [75]:
train_df = pd.read_csv(r'E:\NB\Machine Learning\UOM - DataCrunch\RoboZen404_DataCrunch\dataset\train.csv') #replace with your file name.
test_df = pd.read_csv(r'E:\NB\Machine Learning\UOM - DataCrunch\RoboZen404_DataCrunch\dataset\test.csv')

# Load your train and test datasets
train_df['is_test'] = 0  # Mark training data
test_df['is_test'] = 1   # Mark test data

Check for missing values

In [76]:
# List of missing columns in test data
missing_cols = ['latitude', 'longitude', 'Avg_Temperature', 'Avg_Feels_Like_Temperature', 
                'Temperature_Range', 'Feels_Like_Temperature_Range', 'Radiation', 
                'Rain_Amount', 'Rain_Duration', 'Wind_Speed', 'Wind_Direction', 'Evapotranspiration']

# Add them with NaN values in test data
for col in missing_cols:
    test_df[col] = np.nan

print(train_df.shape)
print(train_df.head())

print(test_df.shape)
print(test_df.head())

(84960, 18)
   ID  Year  Month  Day   kingdom   latitude  longitude  Avg_Temperature  \
0   1     1      4    1   Arcadia  24.280002 -37.229980            25.50   
1   2     1      4    1  Atlantis  22.979999 -37.329990           299.65   
2   3     1      4    1    Avalon  22.880000 -37.130006            26.30   
3   4     1      4    1   Camelot  24.180003 -36.929994            24.00   
4   5     1      4    1     Dorne  25.780002 -37.530000            28.00   

   Avg_Feels_Like_Temperature  Temperature_Range  \
0                       30.50                8.5   
1                      305.15                5.9   
2                       31.50                5.2   
3                       28.40                8.2   
4                       32.80                5.7   

   Feels_Like_Temperature_Range  Radiation  Rain_Amount  Rain_Duration  \
0                          10.3      22.52        58.89             16   
1                           8.2      22.73        11.83             12

In [77]:
print(train_df.isnull().sum())

ID                              0
Year                            0
Month                           0
Day                             0
kingdom                         0
latitude                        0
longitude                       0
Avg_Temperature                 0
Avg_Feels_Like_Temperature      0
Temperature_Range               0
Feels_Like_Temperature_Range    0
Radiation                       0
Rain_Amount                     0
Rain_Duration                   0
Wind_Speed                      0
Wind_Direction                  0
Evapotranspiration              0
is_test                         0
dtype: int64


There are no any missing values in the train dataset. So we can concatinate

In [78]:
# Concatenate train and test data
full_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

Scalling the dataset

In [79]:
from sklearn.preprocessing import StandardScaler

# Define columns that should NOT be scaled (date-related + target variables)
date_cols = ['Year', 'Month', 'Day', 'is_test', 'ID']
target_cols = ['Avg_Temperature', 'Radiation', 'Rain_Amount', 'Wind_Speed', 'Wind_Direction']

# Get only numerical features (excluding target variables and date columns)
feature_cols = [col for col in full_df.select_dtypes(include=np.number).columns if col not in date_cols + target_cols]

# Apply StandardScaler only to feature columns
scaler = StandardScaler()
full_df[feature_cols] = scaler.fit_transform(full_df[feature_cols])

# Now, train the model as before without modifying the target variables
print(full_df[feature_cols].describe())



           latitude     longitude  Avg_Feels_Like_Temperature  \
count  8.496000e+04  8.496000e+04                8.496000e+04   
mean   1.784992e-15 -1.646687e-14               -8.346535e-17   
std    1.000006e+00  1.000006e+00                1.000006e+00   
min   -1.406599e+00 -9.477272e-01               -9.036781e-01   
25%   -4.048639e-01 -7.432270e-01               -8.170698e-01   
50%   -2.796493e-01 -5.386612e-01               -8.006441e-01   
75%    3.464329e-01  2.795403e-01                1.225318e+00   
max    3.226416e+00  3.143342e+00                1.268622e+00   

       Temperature_Range  Feels_Like_Temperature_Range  Rain_Duration  \
count       8.496000e+04                  8.496000e+04   8.496000e+04   
mean        1.552221e-16                  9.634478e-17   9.366853e-17   
std         1.000006e+00                  1.000006e+00   1.000006e+00   
min        -2.449927e+00                 -2.344662e+00  -1.230131e+00   
25%        -7.813450e-01                 -7.84708

In [80]:
window_size = 3  # Adjust based on need

smooth_features = ['Avg_Temperature', 'Avg_Feels_Like_Temperature', 
                   'Temperature_Range', 'Feels_Like_Temperature_Range',
                   'Radiation', 'Evapotranspiration']

for feature in smooth_features:
    full_df[feature] = full_df[feature].rolling(window=window_size, min_periods=1).mean()


In [81]:
alpha = 0.3  # Smoothing factor (0 < alpha < 1)

for feature in smooth_features:
    full_df[feature] = full_df[feature].ewm(alpha=alpha, adjust=False).mean()

Outlier removal

In [82]:
# Check first few rows of full_df
print(full_df.head())

   ID  Year  Month  Day   kingdom  latitude  longitude  Avg_Temperature  \
0   1     1      4    1   Arcadia  0.346433   0.075040        25.500000   
1   2     1      4    1  Atlantis -1.281385  -0.129534        66.622500   
2   3     1      4    1    Avalon -1.406599   0.279540        81.780750   
3   4     1      4    1   Camelot  0.221218   0.688672        92.241525   
4   5     1      4    1     Dorne  2.224680  -0.538661        72.399067   

   Avg_Feels_Like_Temperature  Temperature_Range  \
0                   -0.815577           1.595120   
1                   -0.507987           1.397924   
2                   -0.394457           1.158761   
3                   -0.316554           0.976178   
4                   -0.465365           0.838257   

   Feels_Like_Temperature_Range  Radiation  Rain_Amount  Rain_Duration  \
0                      1.660623  22.520000        58.89       0.982415   
1                      1.527817  22.551500        11.83       0.429278   
2             

In [83]:
# Step 1: Split full_df into train and test
train_df = full_df[full_df['is_test'] == 0].drop(columns=['is_test'])
test_df = full_df[full_df['is_test'] == 1].drop(columns=['is_test', 'Avg_Temperature', 'Radiation', 'Rain_Amount', 'Wind_Speed', 'Wind_Direction'])


scaled_features = ['Avg_Temperature', 'Avg_Feels_Like_Temperature', 'Temperature_Range',
                   'Feels_Like_Temperature_Range', 'Radiation', 'Rain_Amount', 'Rain_Duration',
                   'Wind_Speed', 'Evapotranspiration']


scaled_features = [feature for feature in scaled_features if feature in full_df.columns]



In [84]:
print(train_df.shape)
print(train_df.head())

(84960, 17)
   ID  Year  Month  Day   kingdom  latitude  longitude  Avg_Temperature  \
0   1     1      4    1   Arcadia  0.346433   0.075040        25.500000   
1   2     1      4    1  Atlantis -1.281385  -0.129534        66.622500   
2   3     1      4    1    Avalon -1.406599   0.279540        81.780750   
3   4     1      4    1   Camelot  0.221218   0.688672        92.241525   
4   5     1      4    1     Dorne  2.224680  -0.538661        72.399067   

   Avg_Feels_Like_Temperature  Temperature_Range  \
0                   -0.815577           1.595120   
1                   -0.507987           1.397924   
2                   -0.394457           1.158761   
3                   -0.316554           0.976178   
4                   -0.465365           0.838257   

   Feels_Like_Temperature_Range  Radiation  Rain_Amount  Rain_Duration  \
0                      1.660623  22.520000        58.89       0.982415   
1                      1.527817  22.551500        11.83       0.429278   
2 

In [85]:
z_threshold = 3  # Common threshold (|Z-score| > 3 is considered an outlier)
full_df = full_df[(np.abs(zscore(full_df[scaled_features])) < z_threshold).all(axis=1)]

print("Dataset shape after outlier removal:", full_df.shape)

Dataset shape after outlier removal: (0, 18)


In [86]:
# Step 2: Concatenate the cleaned train_df and the test_df back into full_df
# We add the 'is_test' column again for clarity, if you need it later
train_df['is_test'] = 0  # Mark the train data
test_df['is_test'] = 1   # Mark the test data

# Concatenate train and test data back into full_df
full_df = pd.concat([train_df, test_df], ignore_index=True)

# Step 3: Check the shape and preview of the concatenated data
print("Full dataset shape after concatenation:", full_df.shape)
print(full_df.head())

Full dataset shape after concatenation: (89490, 18)
   ID  Year  Month  Day   kingdom  latitude  longitude  Avg_Temperature  \
0   1     1      4    1   Arcadia  0.346433   0.075040        25.500000   
1   2     1      4    1  Atlantis -1.281385  -0.129534        66.622500   
2   3     1      4    1    Avalon -1.406599   0.279540        81.780750   
3   4     1      4    1   Camelot  0.221218   0.688672        92.241525   
4   5     1      4    1     Dorne  2.224680  -0.538661        72.399067   

   Avg_Feels_Like_Temperature  Temperature_Range  \
0                   -0.815577           1.595120   
1                   -0.507987           1.397924   
2                   -0.394457           1.158761   
3                   -0.316554           0.976178   
4                   -0.465365           0.838257   

   Feels_Like_Temperature_Range  Radiation  Rain_Amount  Rain_Duration  \
0                      1.660623  22.520000        58.89       0.982415   
1                      1.527817  22.

# Feature Engineering

Lag Features: We created lag features to capture temporal dependencies without relying on actual time. These included shifts of key variables like temperature, wind speed, and rain amount (e.g., 1-day lag).

Rolling Features: We introduced rolling statistics, such as moving averages and rolling sums, to identify trends over a specified window (e.g., 7-day moving averages for temperature and rain).

Seasonal Features: To account for cycles, we grouped the data into hypothetical seasons based on the "Year" variable, using a modulo operation to classify records into 4 seasons.

Aggregated Features by Kingdom: We aggregated key variables like temperature and wind speed by "kingdom" to capture regional patterns and trends.

External Variables: We considered adding external features like radiation levels or indicators based on thresholds (e.g., high radiation) to help the model capture important weather patterns.

In [87]:
# Creating a lag of 1 day for 'Avg_Temperature'
full_df['Avg_Temperature_Lag_1'] = full_df['Avg_Temperature'].shift(1)
full_df['Wind_Speed_Lag_1'] = full_df['Wind_Speed'].shift(1)


In [88]:
full_df['Avg_Temperature_Lag_2'] = full_df['Avg_Temperature'].shift(2)


In [89]:
# Moving average over 7 days for 'Avg_Temperature'
full_df['Avg_Temperature_MA_7'] = full_df['Avg_Temperature'].rolling(window=7).mean()
full_df['Rain_Amount_MA_7'] = full_df['Rain_Amount'].rolling(window=7).mean()


In [90]:
# Rolling sum over 7 days
full_df['Rain_Amount_Rolling_Sum_7'] = full_df['Rain_Amount'].rolling(window=7).sum()

# Rolling standard deviation
full_df['Wind_Speed_Rolling_Std_7'] = full_df['Wind_Speed'].rolling(window=7).std()


Seasonal Features (Non-time-based)

In [91]:
full_df['Season'] = (full_df['Year'] % 4)  # Group into 4 seasons based on the hypothetical year


Aggregated Features by Kingdom

In [92]:
full_df['Avg_Temperature_Kingdom_Mean'] = full_df.groupby('kingdom')['Avg_Temperature'].transform('mean')
full_df['Wind_Speed_Kingdom_Mean'] = full_df.groupby('kingdom')['Wind_Speed'].transform('mean')


External Variables

In [93]:
full_df['High_Radiation'] = (full_df['Radiation'] > full_df['Radiation'].median()).astype(int)


In [94]:
from statsmodels.tsa.stattools import adfuller

#result = adfuller(full_df['Avg_Temperature'])
#print(f'ADF Statistic: {result[0]}')
#print(f'p-value: {result[1]}')

Above values have tested ans since the ADF statistic is -31.50 and the p-value is 0.0, this means that the data is stationary. No need to do any transformations

Split the data into train and test

In [95]:
### --- SPLIT BACK INTO TRAIN AND TEST --- ###
train_df = full_df[full_df['is_test'] == 0].drop(columns=['is_test'])
test_df = full_df[full_df['is_test'] == 1].drop(columns=['is_test', 'Avg_Temperature', 'Radiation', 'Rain_Amount', 'Wind_Speed', 'Wind_Direction'])

In [96]:
print(train_df.shape)
print(train_df.head())

# print(test_df.shape)
# print(test_df.head())

(84960, 28)
   ID  Year  Month  Day   kingdom  latitude  longitude  Avg_Temperature  \
0   1     1      4    1   Arcadia  0.346433   0.075040        25.500000   
1   2     1      4    1  Atlantis -1.281385  -0.129534        66.622500   
2   3     1      4    1    Avalon -1.406599   0.279540        81.780750   
3   4     1      4    1   Camelot  0.221218   0.688672        92.241525   
4   5     1      4    1     Dorne  2.224680  -0.538661        72.399067   

   Avg_Feels_Like_Temperature  Temperature_Range  ...  Wind_Speed_Lag_1  \
0                   -0.815577           1.595120  ...               NaN   
1                   -0.507987           1.397924  ...               8.6   
2                   -0.394457           1.158761  ...              15.8   
3                   -0.316554           0.976178  ...              15.8   
4                   -0.465365           0.838257  ...               6.4   

   Avg_Temperature_Lag_2  Avg_Temperature_MA_7  Rain_Amount_MA_7  \
0                 

# Test the model

XGBoost

In [97]:
import xgboost as xgb
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Define features and targets
features = ['Avg_Feels_Like_Temperature', 'Temperature_Range', 'Feels_Like_Temperature_Range',
            'Rain_Duration', 'Evapotranspiration', 'Avg_Temperature_Lag_1', 'Wind_Speed_Lag_1', 
            'Avg_Temperature_Lag_2', 'Avg_Temperature_MA_7', 'Rain_Amount_MA_7', 'Rain_Amount_Rolling_Sum_7',
            'Wind_Speed_Rolling_Std_7', 'Season', 'Avg_Temperature_Kingdom_Mean', 'Wind_Speed_Kingdom_Mean',
            'High_Radiation']  # You can modify this list based on your feature selection

# Define the target columns
targets = ['Avg_Temperature', 'Radiation', 'Rain_Amount', 'Wind_Speed', 'Wind_Direction']

# Create a dictionary to store the models
models = {}

# Training the XGBoost model for each target
for target in targets:
    X_train = train_df[features]
    y_train = train_df[target]
    
    # Train-test split for validation (optional)
    X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
    
    # Instantiate the model
    model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
    
    # Fit the model
    model.fit(X_train_split, y_train_split)
    
    # Store the model
    models[target] = model
    
    # Evaluate model (optional)
    y_pred = model.predict(X_val_split)
    mse = mean_squared_error(y_val_split, y_pred)
    print(f'Mean Squared Error for {target}: {mse}')

# Now, predict the target columns for test_df
predictions = {}

for target in targets:
    model = models[target]
    X_test = test_df[features]
    y_pred_test = model.predict(X_test)
    predictions[target] = y_pred_test

# Combine predictions with 'ID' from test_df
output_df = test_df[['ID']].copy()
for target in targets:
    output_df[target] = predictions[target]

# Save to CSV
output_df.to_csv('predictions.csv', index=False)

print("Predictions saved to predictions.csv")


Mean Squared Error for Avg_Temperature: 0.026664106499411484
Mean Squared Error for Radiation: 0.20084916122674065
Mean Squared Error for Rain_Amount: 49.263232409159855
Mean Squared Error for Wind_Speed: 4.987807104210428
Mean Squared Error for Wind_Direction: 3674.54806153635
Predictions saved to predictions.csv
