In [None]:
# Mount Google Drive to access files
# NOTE: google.colab is only importable inside a Google Colab runtime. The CSV
# paths used later in this notebook all live under this '/content/drive' mount.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**statistics**



> **Building Model**



In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# 1. Load and preprocess the data
def load_and_preprocess_data(file_path):
    """Read a weather CSV and append calendar columns derived from ``date``.

    Args:
        file_path: Path (or anything ``pd.read_csv`` accepts) of a CSV file
            that contains a ``date`` column.

    Returns:
        The loaded DataFrame with ``date`` converted to datetime and four
        extra columns appended in this order: ``year``, ``month``, ``day``
        and ``dayofyear``.
    """
    frame = pd.read_csv(file_path)
    frame['date'] = pd.to_datetime(frame['date'])

    # Every calendar column is simply the same-named attribute of the
    # datetime accessor, so derive them in one pass.
    stamps = frame['date'].dt
    for part in ('year', 'month', 'day', 'dayofyear'):
        frame[part] = getattr(stamps, part)

    return frame

# Load the historical weather CSV from the mounted Drive; the loader also adds
# the year / month / day / dayofyear columns derived from 'date'.
historical_data = load_and_preprocess_data('/content/drive/MyDrive/historical_weather.csv')

# 2. Feature engineering
def create_features(df):
    df['temp_1d_lag'] = df.groupby('city_id')['avg_temp_c'].shift(1)
    df['temp_1w_lag'] = df.groupby('city_id')['avg_temp_c'].shift(7)
    df['temp_1m_lag'] = df.groupby('city_id')['avg_temp_c'].shift(30)
    df['temp_1y_lag'] = df.groupby('city_id')['avg_temp_c'].shift(365)

    # Create rolling averages
    df['temp_7d_avg'] = df.groupby('city_id')['avg_temp_c'].rolling(window=7).mean().reset_index(0, drop=True)
    df['temp_30d_avg'] = df.groupby('city_id')['avg_temp_c'].rolling(window=30).mean().reset_index(0, drop=True)

    return df

# Add the per-city lag and rolling-average temperature columns that the model
# uses as inputs (see the `features` list below).
historical_data = create_features(historical_data)

# 3. Prepare the data for modeling
features = ['year', 'month', 'day', 'dayofyear', 'temp_1d_lag', 'temp_1w_lag', 'temp_1m_lag', 'temp_1y_lag', 'temp_7d_avg', 'temp_30d_avg']
target = 'avg_temp_c'

# Drop rows that miss any feature OR the target. Filtering on the features
# alone only removes missing targets indirectly (via a feature that happens to
# be NaN on the same row); a NaN that slipped into y would make fit() fail.
model_rows = historical_data.dropna(subset=features + [target])
X = model_rows[features]
y = model_rows[target]

# Split the data
# NOTE(review): this is a random split. Because the features are lags of the
# target, neighbouring days of the same city land on both sides of the split,
# so the test RMSE is probably optimistic compared with forecasting truly
# unseen future dates. Consider a date-based hold-out for a stricter estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features (the scaler is fitted on the training rows only and then
# reused for the test rows and, later, for the submission rows)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# 5. Evaluate the model
train_predictions = model.predict(X_train_scaled)
test_predictions = model.predict(X_test_scaled)

# RMSE = square root of the mean squared error, in degrees Celsius
train_rmse = np.sqrt(mean_squared_error(y_train, train_predictions))
test_rmse = np.sqrt(mean_squared_error(y_test, test_predictions))

print(f"Train RMSE: {train_rmse}")
print(f"Test RMSE: {test_rmse}")


Train RMSE: 0.655091459516315
Test RMSE: 1.734284941685503


**Get Predictions**

In [None]:
# 6. Generate predictions for the submission period
def generate_submission_data(model, scaler, submission_key_path, history=None):
    """Predict ``avg_temp_c`` for every row of the submission key.

    The temperature features of the submission rows are not known, so they
    are approximated from the most recent historical values of each city:
    all four lag features are set to the city's latest ``avg_temp_c`` and the
    two rolling averages are copied from the city's latest ``temp_7d_avg`` /
    ``temp_30d_avg``. The same values are reused for every submission date of
    that city.

    Args:
        model: Fitted estimator exposing ``predict``.
        scaler: Fitted transformer exposing ``transform``; it must have been
            fitted on the same feature columns, in the same order.
        submission_key_path: CSV path with ``submission_ID``, ``city_id`` and
            ``date`` columns.
        history: DataFrame with ``city_id``, ``avg_temp_c``, ``temp_7d_avg``
            and ``temp_30d_avg`` columns (and ideally ``date``). Defaults to
            the notebook-level ``historical_data`` frame.

    Returns:
        DataFrame with columns ``submission_ID`` and ``avg_temp_c`` (the
        predictions), one row per submission-key row, in the key's order.
    """
    if history is None:
        # Backward-compatible default: use the frame built earlier in the notebook.
        history = historical_data

    # Load the submission key
    submission_key = pd.read_csv(submission_key_path)
    submission_key['date'] = pd.to_datetime(submission_key['date'])

    # Create features for the submission period
    submission_key['year'] = submission_key['date'].dt.year
    submission_key['month'] = submission_key['date'].dt.month
    submission_key['day'] = submission_key['date'].dt.day
    submission_key['dayofyear'] = submission_key['date'].dt.dayofyear

    # We need to get the lag features from the historical data.
    # This assumes that the historical data covers the period just before the submission period.
    # Order by date first so "last" really means "most recent" and does not
    # depend on the row order of the CSV; undated rows are never the latest.
    if 'date' in history.columns:
        history = history.sort_values('date', kind='mergesort', na_position='first')
    # GroupBy.last() takes the last non-null value of each column per city.
    last_historical_data = history.groupby('city_id').last().reset_index()

    # Left merge keeps every submission row, in the submission key's order.
    submission_data = pd.merge(submission_key, last_historical_data[['city_id', 'avg_temp_c', 'temp_7d_avg', 'temp_30d_avg']],
                               on='city_id', how='left')

    # Approximation: every lag is filled with the city's latest temperature.
    for lag_column in ('temp_1d_lag', 'temp_1w_lag', 'temp_1m_lag', 'temp_1y_lag'):
        submission_data[lag_column] = submission_data['avg_temp_c']

    # Prepare features for prediction (same columns and order as in training)
    features = ['year', 'month', 'day', 'dayofyear', 'temp_1d_lag', 'temp_1w_lag', 'temp_1m_lag', 'temp_1y_lag', 'temp_7d_avg', 'temp_30d_avg']
    X_submission = submission_data[features]
    X_submission_scaled = scaler.transform(X_submission)

    # Make predictions
    predictions = model.predict(X_submission_scaled)

    # Create submission dataframe; IDs are taken from the same merged frame the
    # predictions were computed from, so they are aligned by construction.
    submission = pd.DataFrame({
        'submission_ID': submission_data['submission_ID'],
        'avg_temp_c': predictions
    })

    return submission

# Build the predictions for the submission key stored on the mounted Drive
submission_key_path = '/content/drive/MyDrive/submission_key.csv'
submission_output_path = '/content/drive/MyDrive/submission.csv'
submission = generate_submission_data(model, scaler, submission_key_path)

# Persist the predictions next to the inputs, without the DataFrame index column
submission.to_csv(submission_output_path, index=False)

print("Submission file created successfully.")

Submission file created successfully.
