In [1]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb

# Load data
# Single place to change when the notebook is run outside this Kaggle competition.
DATA_DIR = '/kaggle/input/predicta-1-0-predict-the-unpredictable'

historical_weather = pd.read_csv(f'{DATA_DIR}/historical_weather.csv')
sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
submission_key = pd.read_csv(f'{DATA_DIR}/submission_key.csv')

# Data Understanding and Preprocessing

In [2]:
# Preprocess historical weather data: parse the date column and derive the
# calendar features the model is trained on.
historical_weather['date'] = pd.to_datetime(historical_weather['date'])
historical_weather['year'] = historical_weather['date'].dt.year
historical_weather['month'] = historical_weather['date'].dt.month
historical_weather['day'] = historical_weather['date'].dt.day
historical_weather['day_of_year'] = historical_weather['date'].dt.dayofyear
historical_weather['week_of_year'] = historical_weather['date'].dt.isocalendar().week
historical_weather['weekday'] = historical_weather['date'].dt.weekday

# Cyclical encodings so that the end of a period sits next to its start.
historical_weather['month_sin'] = np.sin(2 * np.pi * historical_weather['month'] / 12)
historical_weather['month_cos'] = np.cos(2 * np.pi * historical_weather['month'] / 12)
historical_weather['day_of_year_sin'] = np.sin(2 * np.pi * historical_weather['day_of_year'] / 365)
historical_weather['day_of_year_cos'] = np.cos(2 * np.pi * historical_weather['day_of_year'] / 365)

# Keep the first run of digits of each city_id as an integer.
# - astype(str) keeps the cell re-runnable once city_id is already numeric
#   (and mirrors how the submission key is processed).
# - the raw string avoids the invalid "\d" escape sequence.
# - expand=False returns a Series instead of a one-column DataFrame.
historical_weather['city_id'] = (
    historical_weather['city_id']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)
    .astype(int)
)

# Keep only rows that are both in ISO week 1 and in January. ISO week 1 can
# start in late December, and early-January days can belong to week 52/53,
# so both conditions are required. .copy() detaches the result from the
# original frame so later column assignments do not raise
# SettingWithCopyWarning.
historical_weather = historical_weather[
    (historical_weather['week_of_year'] == 1) &
    (historical_weather['month'] == 1)
].copy()

# isocalendar() yields a nullable UInt32 column; store a plain integer dtype
# so the feature matrix converts cleanly to a numeric NumPy array.
historical_weather['week_of_year'] = historical_weather['week_of_year'].astype(int)

In [3]:
# Fill missing avg_temp_c using the mean of min_temp_c and max_temp_c.
# Vectorised replacement for the former row-wise apply: when either bound is
# missing the midpoint is NaN, so fillna leaves that gap untouched, exactly
# as before.
historical_weather['avg_temp_c'] = historical_weather['avg_temp_c'].fillna(
    (historical_weather['min_temp_c'] + historical_weather['max_temp_c']) / 2
)

In [4]:
# Impute the remaining gaps in the weather measurements.

# Measurement columns that are passed through the imputer
columns_with_missing = [
    'avg_temp_c', 'min_temp_c', 'max_temp_c', 'precipitation_mm',
    'snow_depth_mm', 'avg_wind_dir_deg', 'avg_wind_speed_kmh'
]

# Measurement columns plus the derived city/calendar features.
# NOTE(review): no cell visible in this notebook reads this list - confirm
# before removing it.
numerical_columns = columns_with_missing + ['city_id', 'year', 'month', 'day', 'day_of_year', 'week_of_year', 'weekday', 'month_sin', 'month_cos', 'day_of_year_sin', 'day_of_year_cos']

# Multivariate imputer; random_state is fixed so reruns give the same values
imputer = IterativeImputer(max_iter=10, random_state=0)

# Fit on the measurement columns and write the completed values back.
# NOTE(review): avg_temp_c is the prediction target and the imputer is fitted
# on all rows before the train/validation split, so validation targets can
# influence imputed training targets - consider dropping rows whose target is
# missing instead of imputing them.
historical_weather[columns_with_missing] = imputer.fit_transform(historical_weather[columns_with_missing])

# Fail fast if any gap survived imputation (replaces the commented-out manual check)
assert not historical_weather[columns_with_missing].isna().any().any(), \
    'imputation left missing values'



# Feature Selection and Engineering

In [5]:
# Column the model learns to predict, and the city/calendar predictors it uses
target = 'avg_temp_c'
features = [
    'city_id',
    'year', 'month', 'day', 'day_of_year', 'week_of_year', 'weekday',
    'month_sin', 'month_cos',
    'day_of_year_sin', 'day_of_year_cos',
]

# Separate the predictor matrix from the target vector
X, y = historical_weather[features], historical_weather[target]

# Hold out 20% of the rows for validation; the seed keeps the split reproducible
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Model Selection and Training

In [6]:
# Gradient-boosted tree regressor for avg_temp_c: many shallow trees with a
# small learning rate, capped by early stopping.
reg = xgb.XGBRegressor(base_score=0.8, booster='gbtree',
                       n_estimators=20000,
                       early_stopping_rounds=100,
                       # 'reg:linear' is the deprecated name of this same
                       # squared-error objective; use the current name.
                       objective='reg:squarederror',
                       max_depth=4,
                       learning_rate=0.003)
# Both sets are evaluated every round and logged every 200 rounds; early
# stopping is driven by the last eval_set entry (the validation split).
reg.fit(X_train, y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        verbose=200)

[0]	validation_0-rmse:17.64498	validation_1-rmse:16.98176




[200]	validation_0-rmse:13.03898	validation_1-rmse:12.61104
[400]	validation_0-rmse:10.91583	validation_1-rmse:10.73678
[600]	validation_0-rmse:9.93935	validation_1-rmse:9.89567
[800]	validation_0-rmse:9.26677	validation_1-rmse:9.26338
[1000]	validation_0-rmse:8.69550	validation_1-rmse:8.70284
[1200]	validation_0-rmse:8.22250	validation_1-rmse:8.22816
[1400]	validation_0-rmse:7.81982	validation_1-rmse:7.82771
[1600]	validation_0-rmse:7.46030	validation_1-rmse:7.47685
[1800]	validation_0-rmse:7.11411	validation_1-rmse:7.12938
[2000]	validation_0-rmse:6.78527	validation_1-rmse:6.79231
[2200]	validation_0-rmse:6.48969	validation_1-rmse:6.49106
[2400]	validation_0-rmse:6.21277	validation_1-rmse:6.20700
[2600]	validation_0-rmse:5.99210	validation_1-rmse:5.98407
[2800]	validation_0-rmse:5.78542	validation_1-rmse:5.77725
[3000]	validation_0-rmse:5.61806	validation_1-rmse:5.61733
[3200]	validation_0-rmse:5.38237	validation_1-rmse:5.39127
[3400]	validation_0-rmse:5.17100	validation_1-rmse:5.187

# Evaluate the model

In [7]:
def _report_rmse(label, model, X_part, y_part):
    """Predict on one data split, print its RMSE figures and return them.

    Prints the RMSE and the RMSE expressed as a percentage of the mean of
    ``y_part``, each prefixed with ``label``.

    Returns a tuple ``(predictions, rmse, mean_of_y, rmse_percentage)``.
    """
    predicted = model.predict(X_part.values)
    rmse = np.sqrt(mean_squared_error(y_part, predicted))
    mean_value = np.mean(y_part)
    rmse_percentage = (rmse / mean_value) * 100
    print(f'{label} RMSE: {rmse}')
    print(f'{label} RMSE as Percentage: {rmse_percentage:.2f}%')
    return predicted, rmse, mean_value, rmse_percentage


# Training-set fit quality
y_train_pred, train_rmse, train_mean, train_rmse_percentage = _report_rmse(
    'Training', reg, X_train, y_train
)

# Validation-set (held-out) quality
y_val_pred, val_rmse, val_mean, val_rmse_percentage = _report_rmse(
    'Validation', reg, X_val, y_val
)

Training RMSE: 1.7473997377841097
Training RMSE as Percentage: 12.61%
Validation RMSE: 2.3967803375135666
Validation RMSE as Percentage: 18.07%


# Prepare the final submission

In [8]:
# Prepare test data (submission_key.csv): derive the same calendar features
# that were built for the historical training data.
submission_key['date'] = pd.to_datetime(submission_key['date'])
submission_key['year'] = submission_key['date'].dt.year
submission_key['month'] = submission_key['date'].dt.month
submission_key['day'] = submission_key['date'].dt.day
submission_key['day_of_year'] = submission_key['date'].dt.dayofyear
# isocalendar() yields a nullable UInt32 column; cast to a plain integer so
# the feature matrix converts cleanly to a numeric NumPy array.
submission_key['week_of_year'] = submission_key['date'].dt.isocalendar().week.astype(int)
submission_key['weekday'] = submission_key['date'].dt.weekday

# Cyclical encodings so that the end of a period sits next to its start.
submission_key['month_sin'] = np.sin(2 * np.pi * submission_key['month'] / 12)
submission_key['month_cos'] = np.cos(2 * np.pi * submission_key['month'] / 12)
submission_key['day_of_year_sin'] = np.sin(2 * np.pi * submission_key['day_of_year'] / 365)
submission_key['day_of_year_cos'] = np.cos(2 * np.pi * submission_key['day_of_year'] / 365)

# Keep the first run of digits of each city_id as an integer. The raw string
# avoids the invalid "\d" escape sequence; expand=False returns a Series
# rather than a one-column DataFrame.
submission_key['city_id'] = (
    submission_key['city_id']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)
    .astype(int)
)

In [9]:
# Reuse the training feature list instead of a hand-copied duplicate.
# Predictions are made from a positional NumPy array, so the columns must be
# in exactly the order the model was trained on.
test_features = list(features)

X_test_final = submission_key[test_features]

In [10]:
# Score the submission rows with the trained regressor
predictions = reg.predict(X_test_final.values)

# Pair every submission_ID with its predicted average temperature
submission_output = submission_key[['submission_ID']].assign(avg_temp_c=predictions)

# Write the submission file without the index column
submission_output.to_csv('submission.csv', index=False)