In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/gdsc-aiml-member-recruitments/train.csv
/kaggle/input/gdsc-aiml-member-recruitments/test.csv
/kaggle/input/gdsc-aiml-member-recruitments/sample_solution.csv


**Importing Libraries:**
Importing the required Python libraries for data manipulation, imputation, machine learning, and evaluation.

In [2]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor



**Data Loading:**
Loading the training and test data from CSV files into Pandas DataFrames for further processing.

In [3]:
# Load the training and test data
train_data = pd.read_csv('/kaggle/input/gdsc-aiml-member-recruitments/train.csv')
test_data = pd.read_csv('/kaggle/input/gdsc-aiml-member-recruitments/test.csv')


def add_week_numbers(frame, start_year, start_week):
    """Insert a NEW_WEEK column holding a running week counter per YEAR.

    Rows are walked in their existing order. The counter restarts at 1 each
    time YEAR changes to a different value and otherwise increases by one.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with a 'YEAR' column. It is modified in place: NEW_WEEK is
        inserted as the third column (position 2).
    start_year : float or None
        Year regarded as "current" before the first row (None for no year).
    start_week : int
        Value of the week counter before the first row.

    Raises
    ------
    ValueError
        If the frame already has a NEW_WEEK column (raised by DataFrame.insert).
    """
    current_year, current_week = start_year, start_week
    week_numbers = []
    for row_year in frame['YEAR'].to_numpy():
        # NaN compares unequal to every value, so a missing YEAR would otherwise
        # look like a year change twice: once on the gap itself and once on the
        # row after it. Treat a missing YEAR as a continuation of the current year.
        if pd.isna(row_year) or row_year == current_year:
            current_week += 1
        else:
            current_year = row_year
            current_week = 1
        week_numbers.append(current_week)
    frame.insert(loc=2, column='NEW_WEEK', value=week_numbers)


# Training data starts a fresh count, so its first row becomes week 1.
add_week_numbers(train_data, start_year=None, start_week=0)

# The test counter is seeded with week 30 of 2009, so a first row dated 2009
# becomes week 31.
add_week_numbers(test_data, start_year=2009, start_week=30)

**Creating Week Numbers:**
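Creating a new column, NEW_WEEK, that holds the week number within each year for both the training and test datasets. The counter restarts at 1 whenever YEAR changes; the test counter is seeded with week 30 of 2009, so its first row is week 31. The intent is to capture seasonal patterns in the data. Note, however, that later cells move YEAR and NEW_WEEK into the DataFrame index, so they are not passed to the models as input columns.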

In [4]:
# Show the first five rows of each frame (training first, then test).
for frame in (train_data, test_data):
    print(frame.head(5))

   ID    YEAR  NEW_WEEK   T2M_RANGE    T2M_MAX    T2M_MIN       RH2M  \
0   0  1982.0         1 -241.552430  29.798545  13.691128  71.652934   
1   1  1982.0         2 -241.819997  30.902708  15.298046  69.754933   
2   2  1982.0         3 -241.883202  30.296561  14.660419  66.661416   
3   3  1982.0         4 -242.007629  30.803224  15.246491  67.621173   
4   4  1982.0         5 -240.659332  32.500906  15.303059  62.097329   

          PS     WS10M      QV2M        T2M  VACATION_RATE  
0  89.725069  3.658959  8.474319  21.723403      64.184741  
1  89.865668  3.529113  8.988374  22.911522      42.649137  
2  89.846273  4.217007  8.474923  22.994887      52.819141  
3  90.029795  4.484080  9.005508  23.594655      39.445489  
4  89.825707  3.057211  8.307515  24.536776      46.444289  
     ID    YEAR  NEW_WEEK   T2M_RANGE    T2M_MAX    T2M_MIN       RH2M  \
0  1440  2009.0        31 -309.569706  32.919144  17.735611  74.415249   
1  1441  2009.0        32 -310.763533  32.340370  17.

**Handling Missing Values:**
Addressing missing values in specific columns (listed in impute_cols) by filling them with the mean value of each respective column. This ensures completeness of the data.

In [5]:
# Columns whose gaps are filled with the column mean
impute_cols = ['YEAR', 'T2M_RANGE', 'T2M_MAX', 'T2M_MIN', 'RH2M', 'PS', 'WS10M', "NEW_WEEK"]

# Mean-value imputer shared by both frames
imputer = SimpleImputer(strategy='mean')

# Fill the gaps in the training frame first, then in the test frame.
# NOTE(review): fit_transform re-fits the imputer on each frame, so test gaps
# are filled with test-set means rather than training means - confirm this is
# intended.
for frame in (train_data, test_data):
    frame[impute_cols] = imputer.fit_transform(frame[impute_cols])

**Splitting Data:**
Dividing the training data into features (X) and the two intermediate regression targets: y1 (T2M) and y2 (QV2M). X keeps every remaining column except VACATION_RATE, with YEAR and NEW_WEEK moved into the index.

In [6]:
# Stage-1 targets: T2M (y1) and QV2M (y2)
y1 = train_data['T2M']
y2 = train_data['QV2M']

# Stage-1 features: everything except the three target columns.
# YEAR and NEW_WEEK become the row index, so they are no longer feature columns.
X = (
    train_data
    .drop(columns=['VACATION_RATE', 'QV2M', 'T2M'])
    .set_index(['YEAR', 'NEW_WEEK'])
)

**Data Slicing for Training and Validation:**
Splitting the data chronologically into training and validation sets. The first 1147 rows (the years before 2004) are used for training, and the remaining rows (2004 onward) are used for validation in the regression tasks.

In [7]:
# Positional split: the first 1147 rows train the models, the rest validate them.
split_at = 1147

X_train, X_val = X.iloc[:split_at], X.iloc[split_at:]
y_train1, y_val1 = y1.iloc[:split_at], y1.iloc[split_at:]
y_train2, y_val2 = y2.iloc[:split_at], y2.iloc[split_at:]

In [8]:
# Show the first five rows of the training and validation feature frames.
for features in (X_train, X_val):
    print(features.head(5))

                 ID   T2M_RANGE    T2M_MAX    T2M_MIN       RH2M         PS  \
YEAR   NEW_WEEK                                                               
1982.0 1.0        0 -241.552430  29.798545  13.691128  71.652934  89.725069   
       2.0        1 -241.819997  30.902708  15.298046  69.754933  89.865668   
       3.0        2 -241.883202  30.296561  14.660419  66.661416  89.846273   
       4.0        3 -242.007629  30.803224  15.246491  67.621173  90.029795   
       5.0        4 -240.659332  32.500906  15.303059  62.097329  89.825707   

                    WS10M  
YEAR   NEW_WEEK            
1982.0 1.0       3.658959  
       2.0       3.529113  
       3.0       4.217007  
       4.0       4.484080  
       5.0       3.057211  
                   ID   T2M_RANGE    T2M_MAX    T2M_MIN       RH2M         PS  \
YEAR   NEW_WEEK                                                                 
2004.0 1.0       1147 -243.592558  31.740879  18.090290  70.579424  89.901119   
       

**Model Initialization and Training:**
Initializing two Random Forest regression models (model1 and model2) with specific hyperparameters and then training them using the training data and their respective target variables.

In [9]:
# Both stage-1 forests share the same hyperparameters and seed.
rf_settings = {'n_estimators': 1600, 'random_state': 80}

model1 = RandomForestRegressor(**rf_settings)  # trained on y_train1 (T2M)
model2 = RandomForestRegressor(**rf_settings)  # trained on y_train2 (QV2M)

# Fit each forest on the same training features with its own target.
model1.fit(X_train, y_train1)
model2.fit(X_train, y_train2)

**Prediction on Validation Set:**
Using the trained models to make predictions (y_pred1 and y_pred2) on the validation set, which can be used for performance evaluation.

In [10]:
# Predict T2M (model1) and QV2M (model2) on the validation set
y_pred1, y_pred2 = (forest.predict(X_val) for forest in (model1, model2))

In [11]:
# Show the first five validation predictions from each stage-1 model.
for predictions in (y_pred1, y_pred2):
    print(predictions[:5])

[25.31592891 23.38474732 24.20149582 26.42904374 28.22016416]
[10.89378003  9.18847467  6.05793486  7.51071958 10.81005375]


**Continue with Vacation Rate Prediction:**
Updating the feature set (X) and target variable (y3) to predict VACATION_RATE.

In [12]:
# Stage 2: VACATION_RATE is the target; T2M and QV2M are now kept as features.
y3 = train_data['VACATION_RATE']

# As before, YEAR and NEW_WEEK are moved into the row index.
X = (
    train_data
    .drop(columns=['VACATION_RATE'])
    .set_index(['YEAR', 'NEW_WEEK'])
)

**Data Slicing for Training and Validation (VACATION_RATE):**
Performing another data split, separating the data into training and validation sets for predicting VACATION_RATE.

In [13]:
# Positional split at the same row boundary: 1147 training rows, the rest validation.
split_at = 1147

X_train, X_val = X.iloc[:split_at], X.iloc[split_at:]
y_train3, y_val3 = y3.iloc[:split_at], y3.iloc[split_at:]

**Model Initialization and Training (VACATION_RATE):**
Initializing and training a Random Forest regression model (model3) to predict VACATION_RATE.

Random Forest is a reasonable choice for a seasonal time-series dataset because it can capture complex non-linear relationships, tolerate noise, provide feature-importance estimates, limit overfitting by averaging many trees, and train in parallel.

In [14]:
# Random Forest for VACATION_RATE; X_train here includes the observed T2M and QV2M columns.
model3_settings = {'n_estimators': 1600, 'random_state': 80}
model3 = RandomForestRegressor(**model3_settings)
model3.fit(X_train, y_train3)

**Prediction on Validation Set (VACATION_RATE):**
Using model3 to make predictions (y_pred3) for VACATION_RATE on the validation set.

In [15]:
# VACATION_RATE predictions from model3 for the validation rows
y_pred3 = model3.predict(X=X_val)

In [16]:
# Show the first five VACATION_RATE predictions on the validation set.
first_five = y_pred3[:5]
print(first_five)

[  8.32882878  40.48270232 107.13411362  45.6926572    3.67673144]


**Model Evaluation:**
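Calculating the Root Mean Squared Error (RMSE) to assess the performance of the model (model3) in predicting VACATION_RATE. A lower RMSE indicates better prediction accuracy. Note that the validation features contain the observed T2M and QV2M values, whereas the test set uses values predicted by model1 and model2, so this score may be optimistic.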

In [17]:
# Calculate RMSE
# The `squared=False` shortcut was deprecated in scikit-learn 1.4 and removed in
# 1.6, so take the square root of the MSE explicitly; this works on any version.
rmse = np.sqrt(mean_squared_error(y_val3, y_pred3))
print(f'RMSE on validation set: {rmse}')

RMSE on validation set: 4.168477747923908


**Preparing Test Data:**
Preparing the test data in three steps. First, the index is set to (YEAR, NEW_WEEK) to match the format of the training data. Next, the trained models model1 and model2 predict T2M and QV2M for the test rows, and these predictions are added to the test features as new columns. Finally, model3 uses the completed feature set to predict VACATION_RATE for the test set.

In [18]:
# Give the test features the same (YEAR, NEW_WEEK) index layout as the training features.
test_X = test_data.set_index(['YEAR', 'NEW_WEEK'])

# Stage-1 predictions on the test set: model1 was fit on T2M, model2 on QV2M.
test_predictions1 = model1.predict(test_X)
test_predictions2 = model2.predict(test_X)

In [19]:
# Show the first five test-set predictions from each stage-1 model.
for predictions in (test_predictions1, test_predictions2):
    print(predictions[:5])

[25.91071485 25.75024638 24.56178222 23.94261964 24.20541613]
[11.91114408 11.9816106  12.31660776 11.85279091 11.94352725]


In [20]:
# Add the predicted QV2M and T2M values to the test features.
# QV2M is appended before T2M; keep this order when editing, since model3 is
# given these columns right after the stage-1 features.
for column, values in (('QV2M', test_predictions2), ('T2M', test_predictions1)):
    test_X[column] = values

In [21]:
# VACATION_RATE predictions from model3 for the test rows
test_predictions3 = model3.predict(X=test_X)

In [22]:
# Show the first five VACATION_RATE predictions for the test set.
first_five = test_predictions3[:5]
print(first_five)

[1.63271767 1.48324138 0.80963379 6.76888269 4.76972889]


**Creating a Submission DataFrame and Saving the Submission CSV:**
Creating a DataFrame (submission_df) with the predicted VACATION_RATE values and corresponding IDs for the test data, which is necessary for submission.
Finally, saving the submission DataFrame to a CSV file named 'submission.csv', which can be submitted for evaluation.

In [23]:
# Pair each test ID with its predicted VACATION_RATE
submission_columns = {
    'ID': test_data['ID'],
    'VACATION_RATE': test_predictions3,
}
submission_df = pd.DataFrame(submission_columns)

# Write the submission file without the row index
submission_df.to_csv('submission.csv', index=False)