# Importing Libraries

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Load the Dataset

In [None]:
# Read the historical weather observations from the CSV in the working directory
data = pd.read_csv(filepath_or_buffer='historical_weather.csv')

Display the first few rows of the dataset

In [None]:
# Plain-text preview of the first five records
print(data.head(n=5))

  city_id        date  avg_temp_c  min_temp_c  max_temp_c  precipitation_mm  \
0    C001  2014-01-01         6.6        -1.4        11.6               NaN   
1    C001  2014-01-02         9.3         6.3        13.3               NaN   
2    C001  2014-01-03         7.6         1.9        14.0               NaN   
3    C001  2014-01-04         7.6         3.9        13.3               NaN   
4    C001  2014-01-05         8.6         0.5        16.9               NaN   

   snow_depth_mm  avg_wind_dir_deg  avg_wind_speed_kmh  
0            NaN             168.0                 6.2  
1            NaN             155.0                10.0  
2            NaN               NaN                 5.8  
3            NaN             291.0                11.3  
4            NaN               NaN                 5.0  


Check the size of the dataset (rows, columns)

In [None]:
# Dataset dimensions as (number of rows, number of columns)
len(data.index), len(data.columns)

(182338, 9)

# Handling Null values

In [None]:
# Count the missing entries in each column
print(data.isna().sum())

city_id                    0
date                       0
avg_temp_c              1224
min_temp_c              5886
max_temp_c              7493
precipitation_mm       69744
snow_depth_mm         170100
avg_wind_dir_deg       35394
avg_wind_speed_kmh     22472
dtype: int64


Fill missing values

In [None]:
# Fill missing numerical values with the mean of their own column.
#
# The assignment goes through the frame (`data[cols] = ...`) instead of calling
# `data[col].fillna(..., inplace=True)` on a selected column: that chained
# in-place form emits a FutureWarning on pandas 2.x and leaves `data` unchanged
# when Copy-on-Write is enabled (the default from pandas 3).
#
# NOTE(review): `avg_temp_c` is later used as the prediction target, so rows
# whose target is imputed here will be trained and scored on the column mean —
# confirm this is intended.
numeric_columns = [
    'avg_temp_c',
    'min_temp_c',
    'max_temp_c',
    'precipitation_mm',
    'snow_depth_mm',
    'avg_wind_dir_deg',
    'avg_wind_speed_kmh',
]
# `DataFrame.mean()` yields one mean per column; `fillna` aligns that Series
# with the columns by label, so each column is filled with its own mean.
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].mean())

Checking the null values after filling

In [None]:
# Confirm that no column has missing entries left after imputation
print(data.isna().sum())

city_id               0
date                  0
avg_temp_c            0
min_temp_c            0
max_temp_c            0
precipitation_mm      0
snow_depth_mm         0
avg_wind_dir_deg      0
avg_wind_speed_kmh    0
dtype: int64


# Feature Engineering

In [None]:
# Parse the 'date' column into pandas datetimes so the .dt accessor can be used
data['date'] = data['date'].pipe(pd.to_datetime)

In [None]:
# Derive one calendar feature column per date component (year, month, day)
for date_part in ('year', 'month', 'day'):
    data[date_part] = getattr(data['date'].dt, date_part)

# Encode Categorical Variables

In [None]:
# Replace the city identifiers with integer codes. The fitted encoder is kept
# in `label_encoder` so the same mapping can be applied to other frames.
label_encoder = LabelEncoder()
city_codes = label_encoder.fit_transform(data['city_id'])
data['city_id'] = city_codes

# Splitting Data for Training and Testing

In [None]:
# Target: the daily average temperature
y = data['avg_temp_c']
# Features: every remaining column except the raw timestamp and the target
X = data.drop(['date', 'avg_temp_c'], axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train the Model

In [None]:
# Random forest with 100 trees; the seed is fixed so repeated runs build the same forest
model = RandomForestRegressor(
    random_state=42,
    n_estimators=100,
)

# Fit on the training split (left as the last expression so the cell shows the estimator)
model.fit(X_train, y_train)

# Evaluate the Model

In [None]:
# Score the held-out rows
y_pred = model.predict(X_test)

# Root-mean-squared error: square root of the mean squared error
test_mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(test_mse)
print(f'RMSE: {rmse}')

RMSE: 1.037188595889998


# Load Submission Key and Prepare Features

In [None]:
# Read the rows that need a prediction
submission_key = pd.read_csv('submission_key.csv')

# Parse 'date' into pandas datetimes
submission_key['date'] = pd.to_datetime(submission_key['date'])

# Map 'city_id' through the encoder fitted on the historical data so the
# integer codes match the ones the model was trained on
encoded_cities = label_encoder.transform(submission_key['city_id'])
submission_key['city_id'] = encoded_cities

In [None]:
# Work on a copy so `submission_key` itself is left without the extra columns
submission_data = submission_key.copy()

# Add the same calendar features (year, month, day) that the training frame has
for date_part in ('year', 'month', 'day'):
    submission_data[date_part] = getattr(submission_data['date'].dt, date_part)

# Ensure All Features are Present

In [None]:
# Build the prediction matrix with exactly the columns the model was trained on.
required_features = X_train.columns

# Start from `submission_data`, not `submission_key`: only `submission_data`
# carries the engineered year/month/day columns. Building from `submission_key`
# would leave those columns missing, so they would be overwritten by the
# placeholder default below and the calendar date would never reach the model.
X_submission = submission_data.drop(columns=['submission_ID', 'date'])

# Any training feature the submission rows do not provide is filled with the
# training-set mean of that feature. This matches the mean imputation applied
# to the historical data and keeps the value inside the range the model saw,
# unlike a constant 0.
# NOTE(review): these are still constant stand-ins for covariates that are
# unknown at prediction time — confirm the model should depend on them.
for col in required_features:
    if col not in X_submission.columns:
        X_submission[col] = X_train[col].mean()

# Reorder columns to match the training data
X_submission = X_submission[required_features]


# Make Predictions

In [None]:
# Predict the average temperature for every submission row
submission_predictions = model.predict(X=X_submission)

# Prepare and Save the Submission File

In [None]:
# Pair each submission ID with its predicted average temperature
# (`assign` returns a new frame, so `submission_key` is not modified)
submission = submission_key[['submission_ID']].assign(
    avg_temp_c=submission_predictions
)

# Write the result without the index column
submission.to_csv('Submission_P239.csv', index=False)

print("Submission file created successfully!")

Submission file created successfully!
