In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory recursively and print the full path
# of every file found underneath it.
input_root = '/kaggle/input'
for folder, _, names in os.walk(input_root):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/gdsc-aiml-member-recruitments/train.csv
/kaggle/input/gdsc-aiml-member-recruitments/test.csv
/kaggle/input/gdsc-aiml-member-recruitments/sample_solution.csv


In [2]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split



**Data Loading:**

Loading the training and test data from CSV files into Pandas DataFrames.

In [3]:
# Read the competition's train and test CSV files into two DataFrames
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/gdsc-aiml-member-recruitments/train.csv',
        '/kaggle/input/gdsc-aiml-member-recruitments/test.csv',
    )
)

**Handling Missing Values:**

Missing values in selected numeric columns of both the training and test datasets are replaced with column means.

In [4]:
# Handle missing values.
#
# The column means are learned from the training data only and then reused
# to fill the test data. Fitting a second imputer on the test set would fill
# test rows with different statistics than the ones the model is trained on,
# making train-time and predict-time preprocessing inconsistent.
imputer_train = SimpleImputer(strategy='mean')
# Kept as an alias so the name `imputer_test` still exists; it deliberately
# refers to the imputer fitted on the training data.
imputer_test = imputer_train

# Columns to impute (the same set for train and test)
impute_cols_train = ['T2M_RANGE', 'T2M_MAX', 'T2M_MIN', 'RH2M', 'PS', 'WS10M']
impute_cols_test = list(impute_cols_train)

# Fit on train (learn the means) and fill train; only *transform* test.
# NOTE(review): the means are computed on all training rows, including rows
# that are later held out for validation - consider fitting after the split.
train_data[impute_cols_train] = imputer_train.fit_transform(train_data[impute_cols_train])
test_data[impute_cols_test] = imputer_test.transform(test_data[impute_cols_test])

**Data Splitting:**

The features (X) are separated from the target variable (y) in the training data, and the result is then split into training and validation sets (80% / 20%).

In [5]:
# Target: the column the model has to predict
y = train_data['VACATION_RATE']

# Features: every column of the training frame except the ones listed here
# (the list includes the target itself)
excluded_columns = ['ID', 'YEAR', 'VACATION_RATE', 'QV2M', 'T2M']
X = train_data.drop(columns=excluded_columns)

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

**Model Initialization and Training:**

A Random Forest Regressor model is initialized and trained on the training data.

The Random Forest Regressor is chosen as it's an ensemble method suitable for regression tasks and capable of capturing complex relationships in the data.

In [6]:
# Random Forest configuration: 100 trees, fixed seed for repeatable results
rf_params = {'n_estimators': 100, 'random_state': 42}

# Build the regressor and fit it on the training split
model = RandomForestRegressor(**rf_params)
model.fit(X_train, y_train)

**Validation and RMSE Calculation:**

The trained model is used to make predictions on the validation set, and the Root Mean Squared Error (RMSE) is calculated as a measure of prediction accuracy.

Validation and RMSE calculation help assess how well the model performs on unseen data. RMSE quantifies the prediction error, which is a critical metric for regression tasks.

In [7]:
# Predict vacation rates on the validation set
y_pred = model.predict(X_val)

# Calculate RMSE as the square root of the MSE.
# The `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; taking the square root explicitly
# gives the same value and works on every scikit-learn version.
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f'RMSE on validation set: {rmse}')

RMSE on validation set: 6.739854441418397


**Test Data Preparation and Test Set Prediction:**

The test data is restricted to the same feature columns, in the same order, as the data used for training.

The trained model is applied to the test data to make predictions for the target variable, "VACATION_RATE."

In [8]:
# Select from the test data exactly the feature columns of the
# train/validation frames, in the same order
feature_names = X_val.columns
test_features = test_data[feature_names]

# Predict the target for every test row with the fitted model
test_predictions = model.predict(test_features)

**Submission File Creation and Submission CSV Output:**

A DataFrame is created to combine the "ID" column from the test data with the model's predictions for "VACATION_RATE."

The submission DataFrame is saved as a CSV file named 'submission.csv.'

In [9]:
# Pair each test ID with its predicted vacation rate
submission_columns = {
    'ID': test_data['ID'],
    'VACATION_RATE': test_predictions,
}
submission_df = pd.DataFrame(submission_columns)

# Write the submission to disk without the DataFrame index
submission_df.to_csv('submission.csv', index=False)