In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [29]:
cleaned_data = pd.read_csv('cleaned_file.csv')


In [30]:
test_data = pd.read_csv('CS98XRegressionTest.csv')

In [31]:
# Define features (X) and target (y) for training
X_train = cleaned_data[['bpm', 'nrgy', 'dnce', 'dB', 'live', 'val', 'dur', 'acous', 'spch']]
y_train = cleaned_data['pop']

In [32]:
# Define features for testing
X_test = test_data[['bpm', 'nrgy', 'dnce', 'dB', 'live', 'val', 'dur', 'acous', 'spch']]

In [33]:
# Split the training data into training and validation sets
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

In [34]:
param_grid = {
    'n_estimators': [100, 200, 300, 500],  
    'max_depth': [None, 10, 20, 30],  
    'min_samples_split': [2, 5, 10],  
    'min_samples_leaf': [1, 2, 4],  
    'max_features': [None, 'sqrt', 'log2']  # Removed 'auto'
}



In [35]:
# Initialize Random Forest Regressor
rf = RandomForestRegressor(random_state=42)



In [36]:
# Perform Randomized Search
rf_random = RandomizedSearchCV(
    estimator=rf, param_distributions=param_grid, 
    n_iter=20, cv=5, scoring='neg_mean_squared_error', 
    verbose=2, random_state=42, n_jobs=-1
)

In [37]:
# Fit the model
rf_random.fit(X_train_split, y_train_split)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [38]:
# Get the best parameters
best_params = rf_random.best_params_
print("Best Parameters:", best_params)

Best Parameters: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 10}


In [39]:
# Train the best model on full training data
best_model = RandomForestRegressor(**best_params, random_state=42)
best_model.fit(X_train, y_train)


In [40]:
# Predict on validation set
y_val_pred = best_model.predict(X_val_split)

In [41]:
# Evaluate model
mse = mean_squared_error(y_val_split, y_val_pred)
r2 = r2_score(y_val_split, y_val_pred)
print(f"Validation MSE: {mse:.4f}, R²: {r2:.4f}")

Validation MSE: 39.8040, R²: 0.7316


In [42]:
# Predict the popularity for the test data
y_pred = best_model.predict(X_test)

In [43]:
# Create a new DataFrame with Id and predicted popularity
predictions = pd.DataFrame({'Id': test_data['Id'], 'pop': y_pred})

In [49]:
# Save the predictions to a new CSV file
predictions.to_csv('RF_final_1.csv', index=False)
print("Predictions saved to 'RF_final_1.csv'")

Predictions saved to 'RF_final_1.csv'
