In [1]:
import pandas as pd

# Read the three competition CSV files into DataFrames (same order as listed).
historical_weather, sample_submission, submission_key = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/historical_weather.csv',
        '/sample_submission.csv',
        '/submission_key.csv',
    )
)

# Preview the first rows of each frame, one after another, to see its columns.
print(
    historical_weather.head(),
    sample_submission.head(),
    submission_key.head(),
    sep='\n',
)

# Handle missing values.
# Gaps are filled per city: forward-fill first, then back-fill to cover gaps
# at the start of a city's rows. Grouping by `city_id` keeps one city's
# readings from being copied into another city's rows, which a frame-wide
# fill does wherever rows of two cities sit next to each other.
# `ffill()` / `bfill()` replace the deprecated `fillna(method=...)` spelling.
# A column that has no value at all for a city is left as NaN.
fill_columns = [col for col in historical_weather.columns if col != 'city_id']
historical_weather[fill_columns] = (
    historical_weather.groupby('city_id', sort=False)[fill_columns].ffill()
)
historical_weather[fill_columns] = (
    historical_weather.groupby('city_id', sort=False)[fill_columns].bfill()
)

# Parse `date` into datetimes, then derive the calendar features from it.
# Each keyword is evaluated in order, so the later lambdas see the parsed dates.
historical_weather = historical_weather.assign(
    date=lambda frame: pd.to_datetime(frame['date']),
    year=lambda frame: frame['date'].dt.year,
    month=lambda frame: frame['date'].dt.month,
    day=lambda frame: frame['date'].dt.day,
    day_of_week=lambda frame: frame['date'].dt.dayofweek,
)

# Create lag and rolling window features.
# Both features are built only from *earlier* days of the same city:
#   lag_1          -> the previous row's avg_temp_c
#   rolling_mean_7 -> mean of the 7 rows before the current one
# The rolling window is taken over the shifted series so the current day's
# avg_temp_c (the prediction target) never feeds its own feature.
# The features are computed on a (city_id, date)-sorted view so "previous"
# means chronologically previous; results are written back by index
# alignment, so the row order of `historical_weather` itself is unchanged.
chronological = historical_weather.sort_values(['city_id', 'date'])
previous_temp = chronological.groupby('city_id')['avg_temp_c'].shift(1)
historical_weather['lag_1'] = previous_temp
historical_weather['rolling_mean_7'] = (
    previous_temp
    .groupby(chronological['city_id'])
    .transform(lambda temps: temps.rolling(window=7).mean())
)

# Chronological hold-out: rows dated before 2018-01-01 train the model,
# rows dated on or after it are kept for validation.
observation_dates = historical_weather['date']
train_data = historical_weather.loc[observation_dates < '2018-01-01']
val_data = historical_weather.loc[observation_dates >= '2018-01-01']

# Model inputs and the regression target (`avg_temp_c`).
# NOTE(review): several inputs are same-day measurements (min/max temperature,
# precipitation, wind, ...) — confirm they are known for the dates to predict.
features = [
    'month',
    'day',
    'day_of_week',
    'lag_1',
    'rolling_mean_7',
    'min_temp_c',
    'max_temp_c',
    'precipitation_mm',
    'snow_depth_mm',
    'avg_wind_dir_deg',
    'avg_wind_speed_kmh',
]
X_train, y_train = train_data[features], train_data['avg_temp_c']
X_val, y_val = val_data[features], val_data['avg_temp_c']

import lightgbm as lgb
from sklearn.metrics import mean_squared_error

# Create the LightGBM datasets; the validation set is built with the training
# set as its reference.
train_set = lgb.Dataset(X_train, label=y_train)
val_set = lgb.Dataset(X_val, label=y_val, reference=train_set)

# Define parameters: gradient-boosted trees regressing avg_temp_c, scored by RMSE.
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}

# Train the model.
# Early stopping watches the validation RMSE and halts once it has not
# improved for 50 rounds, which also records `model.best_iteration` — the value
# the predict calls in this notebook pass as `num_iteration`. Without the
# callback every one of the 1000 rounds runs and no best round is selected.
model = lgb.train(
    params,
    train_set,
    num_boost_round=1000,
    valid_sets=[train_set, val_set],
    callbacks=[lgb.early_stopping(stopping_rounds=50)],
)

# Validate the model on the hold-out rows.
y_pred = model.predict(X_val, num_iteration=model.best_iteration)
# RMSE is taken as the square root of the MSE: the `squared=False` shortcut of
# `mean_squared_error` was deprecated in scikit-learn 1.4 and is scheduled for
# removal, while this form works on old and new releases alike.
rmse = mean_squared_error(y_val, y_pred) ** 0.5
print(f'Validation RMSE: {rmse}')

# Prepare submission data.
# Parse the key's dates so they join against the datetime `date` column of
# the historical frame.
submission_key['date'] = pd.to_datetime(submission_key['date'])

# Keep one history row per (city, date) so the left join below cannot
# multiply submission rows.
history = historical_weather.drop_duplicates(subset=['city_id', 'date'], keep='last')

# Now perform the merge: every requested (city, date) keeps its row; dates
# absent from the history come back with NaN in all weather columns.
submission_data = pd.merge(submission_key, history, on=['city_id', 'date'], how='left')

# Fill the remaining gaps from the same city's most recent non-missing value
# in the history. A frame-wide forward/back fill cannot do this: it only sees
# the submission rows themselves and copies values across city boundaries.
# NOTE(review): `lag_1` / `rolling_mean_7` carried forward this way describe
# the last observed day rather than the day before each requested date —
# confirm this approximation is acceptable.
last_known = history.sort_values('date').groupby('city_id').last()
fallback = last_known.reindex(submission_data['city_id'])
fallback.index = submission_data.index  # align row-for-row with submission_data
fill_columns = [
    col for col in fallback.columns
    if col != 'date' and col in submission_data.columns
]
submission_data[fill_columns] = submission_data[fill_columns].fillna(fallback[fill_columns])

# Calendar features always come from the requested date itself.
submission_data['month'] = submission_data['date'].dt.month
submission_data['day'] = submission_data['date'].dt.day
submission_data['day_of_week'] = submission_data['date'].dt.dayofweek

# Predict the target for every requested row (replaces any merged-in value).
submission_data['avg_temp_c'] = model.predict(submission_data[features], num_iteration=model.best_iteration)

print(submission_data['avg_temp_c'].head())
# Prepare final submission file
final_submission = submission_data[['submission_ID', 'avg_temp_c']]
final_submission.to_csv('/my_submission.csv', index=False)

  city_id        date  avg_temp_c  min_temp_c  max_temp_c  precipitation_mm  \
0    C001  2014-01-01         6.6        -1.4        11.6               NaN   
1    C001  2014-01-02         9.3         6.3        13.3               NaN   
2    C001  2014-01-03         7.6         1.9        14.0               NaN   
3    C001  2014-01-04         7.6         3.9        13.3               NaN   
4    C001  2014-01-05         8.6         0.5        16.9               NaN   

   snow_depth_mm  avg_wind_dir_deg  avg_wind_speed_kmh  
0            NaN             168.0                 6.2  
1            NaN             155.0                10.0  
2            NaN               NaN                 5.8  
3            NaN             291.0                11.3  
4            NaN               NaN                 5.0  
   submission_ID  avg_temp_c
0              1         NaN
1              2         NaN
2              3         NaN
3              4         NaN
4              5         NaN
   submis