 # Import necessary libraries

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler

# Load train and test datasets

In [2]:
# Read the training CSV first, then the test CSV, from the working directory
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('carvan_train.csv', 'carvan_test.csv')
)

# Data preprocessing

In [3]:
# Count missing entries per column of the training frame
train_df.isna().sum()

V1     0
V2     0
V3     0
V4     0
V5     0
      ..
V82    0
V83    0
V84    0
V85    0
V86    0
Length: 86, dtype: int64

In [4]:
# Same per-column missing-value count, this time for the test frame
test_df.isna().sum()

V1     0
V2     0
V3     0
V4     0
V5     0
      ..
V81    0
V82    0
V83    0
V84    0
V85    0
Length: 85, dtype: int64

In [5]:
# Column V86 is the target; every other column is kept as a feature
y = train_df['V86']
X = train_df.drop('V86', axis=1)


In [6]:
# Hold out 20% of the labelled rows for validation. The split is stratified
# on y and seeded so it comes out the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [7]:
# Standardise the features: the scaler is fitted on the training split only
# and then applied, unchanged, to both the training and validation splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Model Training and Evaluation

In [8]:
# Base estimator for the hyperparameter search. The seed (same value as the
# train/validation split) makes the forest's randomness repeatable, so a
# Restart & Run All reproduces the same search result and scores.
rf_classifier = RandomForestClassifier(random_state=42)

# Hyperparameter candidates for GridSearchCV: 3 x 3 x 3 = 27 combinations
# NOTE(review): the validation F-beta recorded further down in this notebook
# is very low (~0.08); if the target is imbalanced, adding 'class_weight' to
# this grid may be worth trying -- confirm against the label distribution.
param_grid = {
    'n_estimators': [100, 200, 300],    # number of trees in the forest
    'max_depth': [None, 5, 10],         # None places no limit on tree depth
    'min_samples_split': [2, 5, 10],    # samples required to split a node
}


In [9]:
# Create a scorer for F-beta score; beta=2 weights recall more heavily than
# precision when candidates are ranked
scorer = make_scorer(fbeta_score, beta=2)

# Exhaustive search over param_grid with 5-fold cross-validation on the
# scaled training split; n_jobs=-1 uses all available cores
grid_search = GridSearchCV(
    estimator=rf_classifier,
    param_grid=param_grid,
    scoring=scorer,
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train_scaled, y_train)


In [10]:
# Best hyperparameter combination found by the grid search
best_params = grid_search.best_params_

# GridSearchCV refits the winning configuration on the whole of
# X_train_scaled / y_train once the search finishes (refit=True is the
# default), so reuse that fitted model instead of training another forest
# from scratch with the same settings.
best_rf_classifier = grid_search.best_estimator_
best_rf_classifier

In [11]:
# Predicted labels for the held-out (scaled) validation rows
y_val_pred = best_rf_classifier.predict(X=X_val_scaled)

In [12]:
# F-beta (beta=2) of the validation predictions against the true labels
fbeta = fbeta_score(y_true=y_val, y_pred=y_val_pred, beta=2)
print("F-beta score on validation set:", fbeta)

F-beta score on validation set: 0.08278145695364238


# Make Predictions on Test Set and Submission

In [13]:
# Make predictions on test set
# Select exactly the columns the scaler was fitted on, in the same order, so
# a reordered or extra column in the test file cannot misalign the features
# (a missing column fails loudly here with a KeyError).
X_test = test_df[X_train.columns]

# Scale test set features with the scaler fitted on the training split
X_test_scaled = scaler.transform(X_test)

# Predict target variable for test set
test_predictions = best_rf_classifier.predict(X_test_scaled)

# Create submission CSV: a single 'V86' column, written without the index
submission_df = pd.DataFrame({'V86': test_predictions})
submission_df.to_csv('submission.csv', index=False)

In [14]:
# Print a confirmation message naming the file the submission was written to
print("Test set predictions have been generated and saved to 'submission.csv'")

Test set predictions have been generated and saved to 'submission.csv'
