In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
from xgboost import DMatrix

# Load  historical weather data
file_path = 'historical_weather.csv'
data = pd.read_csv(file_path)

In [25]:
data.head()

Unnamed: 0,city_id,date,avg_temp_c,min_temp_c,max_temp_c,precipitation_mm,snow_depth_mm,avg_wind_dir_deg,avg_wind_speed_kmh
0,C001,2014-01-01,6.6,-1.4,11.6,,,168.0,6.2
1,C001,2014-01-02,9.3,6.3,13.3,,,155.0,10.0
2,C001,2014-01-03,7.6,1.9,14.0,,,,5.8
3,C001,2014-01-04,7.6,3.9,13.3,,,291.0,11.3
4,C001,2014-01-05,8.6,0.5,16.9,,,,5.0


In [26]:
data.isnull().sum()

city_id                    0
date                       0
avg_temp_c              1224
min_temp_c              5886
max_temp_c              7493
precipitation_mm       69744
snow_depth_mm         170100
avg_wind_dir_deg       35394
avg_wind_speed_kmh     22472
dtype: int64

In [27]:
data = data.drop(['snow_depth_mm', 'precipitation_mm', 'avg_wind_dir_deg', 'avg_wind_speed_kmh'], axis='columns')
data = data.dropna()

In [28]:
data.isnull().sum()

city_id       0
date          0
avg_temp_c    0
min_temp_c    0
max_temp_c    0
dtype: int64

In [29]:
data.head()

Unnamed: 0,city_id,date,avg_temp_c,min_temp_c,max_temp_c
0,C001,2014-01-01,6.6,-1.4,11.6
1,C001,2014-01-02,9.3,6.3,13.3
2,C001,2014-01-03,7.6,1.9,14.0
3,C001,2014-01-04,7.6,3.9,13.3
4,C001,2014-01-05,8.6,0.5,16.9


In [30]:
data['date'] = pd.to_datetime(data['date'])
data = data.sort_values(by=['city_id', 'date'])
data['year'] = data['date'].dt.year
data['month'] = data['date'].dt.month
data['day'] = data['date'].dt.day

In [31]:
first_week_data = data[data['date'].dt.strftime('%m-%d').isin(['01-01', '01-02', '01-03', '01-04', '01-05', '01-06', '01-07'])]

In [32]:
first_week_data.head()

Unnamed: 0,city_id,date,avg_temp_c,min_temp_c,max_temp_c,year,month,day
0,C001,2014-01-01,6.6,-1.4,11.6,2014,1,1
1,C001,2014-01-02,9.3,6.3,13.3,2014,1,2
2,C001,2014-01-03,7.6,1.9,14.0,2014,1,3
3,C001,2014-01-04,7.6,3.9,13.3,2014,1,4
4,C001,2014-01-05,8.6,0.5,16.9,2014,1,5


In [33]:
results_xgb = []

# Define the parameter grid for hyperparameter tuning
param_distributions = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9]
}

# Process each city
cities = first_week_data['city_id'].unique()
for city in cities:
    city_data = first_week_data[first_week_data['city_id'] == city]
    
    # Use all available data to train the XGBoost model
    X = city_data[['year', 'month', 'day']]
    y = city_data['avg_temp_c']
    
    # Initialize XGBoost model
    xgb_model = XGBRegressor()
    
    # Initialize Randomized Search with cross-validation
    randomized_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_distributions, cv=3, n_jobs=-1, verbose=2, n_iter=10, random_state=42)
    
    # Fit the model
    randomized_search.fit(X, y)
    
    # Get the best estimator
    best_model = randomized_search.best_estimator_
    
    # Forecast for the first week of 2019
    # Create a DataFrame for the first week of 2019
    forecast_dates = pd.date_range(start='2019-01-01', periods=7)
    forecast_data = pd.DataFrame({
        'date': forecast_dates,
        'year': forecast_dates.year,
        'month': forecast_dates.month,
        'day': forecast_dates.day
    })
    
    # Predict using the best model
    forecast = best_model.predict(forecast_data[['year', 'month', 'day']])
    
    # Store results
    for i, temp in enumerate(forecast):
        results_xgb.append({
            'city_id': city,
            'date': forecast_dates[i].strftime('%Y-%m-%d'),
            'avg_temp_c': temp
        })

# Convert results to DataFrame
results_xgb_df = pd.DataFrame(results_xgb)

# Add submission_ID column with sequential indices from 1 to len(results_xgb_df)
results_xgb_df['submission_ID'] = np.arange(1, len(results_xgb_df) + 1)

# Select and reorder columns to match the required format
submission_df = results_xgb_df[['submission_ID', 'avg_temp_c']]

# Save to CSV
submission_path = 'submission_xgbf2.csv'
submission_df.to_csv(submission_path, index=False)

submission_path


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Fitting 3 folds for each

'submission_xgbf2.csv'