# Importing Libraries


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np


Import the dataset

In [2]:
# Read the historical weather observations from the competition input directory.
historical_weather_path = "/kaggle/input/predicta-1-0-predict-the-unpredictable/historical_weather.csv"
data = pd.read_csv(historical_weather_path)

# Bare expression so the notebook renders a preview of the loaded frame.
data


Unnamed: 0,city_id,date,avg_temp_c,min_temp_c,max_temp_c,precipitation_mm,snow_depth_mm,avg_wind_dir_deg,avg_wind_speed_kmh
0,C001,2014-01-01,6.6,-1.4,11.6,,,168.0,6.2
1,C001,2014-01-02,9.3,6.3,13.3,,,155.0,10.0
2,C001,2014-01-03,7.6,1.9,14.0,,,,5.8
3,C001,2014-01-04,7.6,3.9,13.3,,,291.0,11.3
4,C001,2014-01-05,8.6,0.5,16.9,,,,5.0
...,...,...,...,...,...,...,...,...,...
182333,C112,2018-12-27,22.0,15.9,27.9,0.0,,100.0,13.2
182334,C112,2018-12-28,21.9,14.9,28.2,0.0,,91.0,12.6
182335,C112,2018-12-29,22.4,16.3,28.2,0.0,,61.0,14.2
182336,C112,2018-12-30,21.6,18.5,26.6,1.6,,70.0,17.0


In [3]:
# Summarize column dtypes and non-null counts to see which columns have gaps.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 182338 entries, 0 to 182337
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   city_id             182338 non-null  object 
 1   date                182338 non-null  object 
 2   avg_temp_c          181114 non-null  float64
 3   min_temp_c          176452 non-null  float64
 4   max_temp_c          174845 non-null  float64
 5   precipitation_mm    112594 non-null  float64
 6   snow_depth_mm       12238 non-null   float64
 7   avg_wind_dir_deg    146944 non-null  float64
 8   avg_wind_speed_kmh  159866 non-null  float64
dtypes: float64(7), object(2)
memory usage: 12.5+ MB


# Data cleaning

In [4]:
# Count the missing values in each column.
data.isna().sum()

city_id                    0
date                       0
avg_temp_c              1224
min_temp_c              5886
max_temp_c              7493
precipitation_mm       69744
snow_depth_mm         170100
avg_wind_dir_deg       35394
avg_wind_speed_kmh     22472
dtype: int64

# Data cleaning

Drop the `snow_depth_mm` and `precipitation_mm` columns, which are mostly missing, then drop every remaining row that still contains a missing value.

In [5]:
# Columns carried forward; snow_depth_mm and precipitation_mm are left out.
kept_columns = [
    'city_id',
    'date',
    'avg_temp_c',
    'min_temp_c',
    'max_temp_c',
    'avg_wind_dir_deg',
    'avg_wind_speed_kmh',
]
# Restrict to those columns, then discard any row with a missing value in them.
data = data.loc[:, kept_columns].dropna()

# Confirm that no missing values remain.
data.isna().sum()

city_id               0
date                  0
avg_temp_c            0
min_temp_c            0
max_temp_c            0
avg_wind_dir_deg      0
avg_wind_speed_kmh    0
dtype: int64

# Extract features from the date

In [6]:
# Parse the date column once, then derive calendar features from the parsed values.
data['date'] = pd.to_datetime(data['date'])
parsed_dates = data['date'].dt
data['year'] = parsed_dates.year
data['month'] = parsed_dates.month
data['day'] = parsed_dates.day
data['day_of_week'] = parsed_dates.dayofweek  # pandas convention: Monday=0 ... Sunday=6

Drop the original date column and convert city_id to numeric value

In [7]:
# The date column is not used as a model input; only the derived calendar features are.
data = data.drop(columns='date')

# Keep only the digits of the city identifier (e.g. "C001" -> 1) so it can be
# used as a numeric feature.
city_digits = data['city_id'].str.extract(r'(\d+)', expand=False)
data['city_id'] = pd.to_numeric(city_digits)

# Split into features and target variables

In [8]:
# Model inputs: the city plus the calendar features.
feature_columns = ['city_id', 'year', 'month', 'day', 'day_of_week']
# Model outputs: the weather variables to be predicted from those inputs.
target_columns = ['min_temp_c', 'max_temp_c', 'avg_wind_dir_deg', 'avg_wind_speed_kmh']

X = data[feature_columns]
y = data[target_columns]


# Split into training and testing sets

In [9]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Initialize models for each target variable

In [10]:
# One independent random forest per weather variable, keyed by the target
# column name it is fitted on.
# A fixed random_state makes the forests' internal randomness repeatable, so
# re-running the notebook reproduces the same models and the same scores.
models = {
    target_name: RandomForestRegressor(random_state=42)
    for target_name in (
        'min_temp_c',
        'max_temp_c',
        'avg_wind_dir_deg',
        'avg_wind_speed_kmh',
    )
}


# Train each model


In [11]:
# Fit each regressor on the shared training inputs and its own target column.
for target_name, regressor in models.items():
    regressor.fit(X_train, y_train[target_name])

# Predict the features


In [12]:
# Test-set predictions of every first-stage model, keyed by target column name.
predictions = {
    target_name: regressor.predict(X_test)
    for target_name, regressor in models.items()
}

Convert predictions to DataFrame


In [13]:
# Gather the per-target test-set predictions into one frame, one column per target.
predicted_features = pd.DataFrame(predictions)

# Split the data to get average temperature as target


In [14]:
# Second stage: the predicted weather variables become the model inputs.
X_avg_temp = predicted_features
# Target: the observed average temperature of the same held-out rows.
# NOTE(review): predicted_features presumably has a default 0..n-1 index while
# this Series keeps the original row labels, so the two line up by row
# position only -- confirm before joining or aligning them by index.
y_avg_temp = data['avg_temp_c'].loc[X_test.index]

Split into training and testing sets for average temperature


In [15]:
# Split the second-stage inputs and target 80/20, again with a fixed random_state for repeatability.
X_avg_temp_train, X_avg_temp_test, y_avg_temp_train, y_avg_temp_test = train_test_split(X_avg_temp, y_avg_temp, test_size=0.2, random_state=42)


Initialize and train the model


In [16]:
# Second-stage regressor: maps the predicted weather variables to the average
# temperature. The fixed random_state makes the fitted forest, and therefore
# the reported RMSE and the submission values, repeatable across runs.
avg_temp_model = RandomForestRegressor(random_state=42)
avg_temp_model.fit(X_avg_temp_train, y_avg_temp_train)


# Predict the average temperature


In [17]:
# Predict the average temperature for the second-stage held-out rows.
avg_temp_predictions = avg_temp_model.predict(X_avg_temp_test)

In [18]:
# Display the predicted average temperatures.
avg_temp_predictions

array([22.383, 26.921, 30.692, ...,  9.829, 32.764,  8.053])

In [19]:
# Display the observed average temperatures for comparison with the predictions.
y_avg_temp_test

9851      18.7
178376    28.9
161265    30.5
157141     7.7
79840     28.7
          ... 
35744     28.7
126556    27.0
77000     10.3
116425    33.9
80816      1.4
Name: avg_temp_c, Length: 5869, dtype: float64

# Evaluate weather feature models

In [20]:
# Report the test-set mean squared error of each first-stage model.
for feature, predicted_values in predictions.items():
    mse = mean_squared_error(y_test[feature], predicted_values)
    print(f'MSE for {feature}: {mse}')


MSE for min_temp_c: 5.609324403782586
MSE for max_temp_c: 6.967529732833533
MSE for avg_wind_dir_deg: 8314.988398756177
MSE for avg_wind_speed_kmh: 16.58357706931334


# Evaluate average temperature model

In [21]:
# Mean squared error of the second-stage model on its held-out rows.
avg_temp_mse = mean_squared_error(
    y_true=y_avg_temp_test,
    y_pred=avg_temp_predictions,
)

# The square root puts the error back in the units of the target.
avg_temp_rmse = np.sqrt(avg_temp_mse)
print(f'RMSE for average temperature: {avg_temp_rmse}')

RMSE for average temperature: 2.20790174201717


# Reading the submission key file

In [22]:
# Load the rows for which a prediction has to be submitted.
submission_key_path = "/kaggle/input/predicta-1-0-predict-the-unpredictable/submission_key.csv"
submission_key = pd.read_csv(submission_key_path)

# Parse the dates and derive the same calendar features that the models were trained on.
submission_key['date'] = pd.to_datetime(submission_key['date'])
submission_dates = submission_key['date'].dt
submission_key['year'] = submission_dates.year
submission_key['month'] = submission_dates.month
submission_key['day'] = submission_dates.day
submission_key['day_of_week'] = submission_dates.dayofweek

Convert city_id(Object) to numeric

In [23]:
# Reduce the city identifier to its digits and store it as a number, matching
# the encoding applied to the training data.
submission_city_digits = submission_key['city_id'].str.extract(r'(\d+)', expand=False)
submission_key['city_id'] = pd.to_numeric(submission_city_digits)

In [24]:
# Same input columns, in the same order, as the training features.
X1 = submission_key[['city_id', 'year', 'month', 'day', 'day_of_week']]
# Run every first-stage model on the submission rows, keyed by target column name.
predictions1 = {
    target_name: regressor.predict(X1)
    for target_name, regressor in models.items()
}

# Predict average temperature for the first week of 2019

In [25]:
# Convert predictions to DataFrame
predicted_features1 = pd.DataFrame(predictions1)

X_avg_temp1 = predicted_features1
# Predict the average temperature
avg_temp_predictions1 = avg_temp_model.predict(X_avg_temp1)
avg_temp_predictions1.shape

(700,)

In [26]:
# Load the submission template that will receive the predicted values.
sample_submission_path = "/kaggle/input/predicta-1-0-predict-the-unpredictable/sample_submission.csv"
sample_submission = pd.read_csv(sample_submission_path)

Update avg_temp_c with predicted value

In [27]:
# Overwrite the avg_temp_c column with the predictions, matched by row position.
# NOTE(review): this assumes the rows of sample_submission are in the same order
# as the rows of submission_key that produced the predictions -- confirm.
sample_submission['avg_temp_c'] = avg_temp_predictions1

In [28]:
# Display the filled-in submission frame.
sample_submission

Unnamed: 0,submission_ID,avg_temp_c
0,1,9.308
1,2,9.060
2,3,10.827
3,4,11.341
4,5,14.078
...,...,...
695,696,20.504
696,697,20.566
697,698,20.642
698,699,20.108


# Save to CSV file

In [29]:
# sample_submission was loaded with pd.read_csv and is already a DataFrame, so it
# is written out directly instead of being re-wrapped in pd.DataFrame first.
# index=False keeps the pandas row index out of the file.
sample_submission.to_csv('/kaggle/working/P247-sample_submission.csv', index=False)