# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

# Load the data
df = pd.read_csv('/mnt/data/my_dataframe.csv')

# Fill missing values in each numeric column with that column's mean.
# numeric_only=True restricts the mean to numeric columns so it does not fail
# on text columns; missing values in non-numeric columns are left as they are.
df = df.fillna(df.mean(numeric_only=True))

# Perform one-hot encoding on the categorical variables
df_encoded = pd.get_dummies(df, columns=['country', 'network', 'operating_system', 'other examples'])

# Define the target
target_column = 'target metric'  # Change this to 'clicks' or 'signups' for other models
y = df_encoded[target_column]

# Define the features: drop the date and the metric columns, then make sure the
# target itself is not left among the features (that would leak the label into
# the model). errors='ignore' covers the case where the chosen target is one of
# the metric columns already dropped on the previous line.
X = df_encoded.drop(['date', 'metric1', 'metric2', 'metric3'], axis=1)
X = X.drop(columns=target_column, errors='ignore')

# Split the data into a training set (80%) and a test set (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Random Forest Regressor (fixed seed so runs are reproducible)
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict the target on the test set
y_pred = model.predict(X_test)

# Calculate the RMSE of the predictions.
# The square root of the MSE is taken explicitly: the `squared=False` keyword
# of mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas this form works on every version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5

# Calculate the R-squared score of the predictions
r2 = r2_score(y_test, y_pred)

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
}

# Set up the grid search: 3-fold cross-validation, scored by negated RMSE
# (scikit-learn maximizes scores, so error metrics are exposed negated)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='neg_root_mean_squared_error')

# Conduct the grid search
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

# Take the tuned model from the search itself. With the default refit=True,
# GridSearchCV has already retrained a copy of `model` with the best
# parameters on the full training set, so building and fitting another
# forest by hand would only repeat that work.
best_model = grid_search.best_estimator_

# Predict the target on the test set
y_pred_best = best_model.predict(X_test)

# Calculate the RMSE of the best model's predictions.
# The square root of the MSE is taken explicitly: the `squared=False` keyword
# of mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse_best = mean_squared_error(y_test, y_pred_best) ** 0.5

# Calculate the R-squared score of the best model's predictions
r2_best = r2_score(y_test, y_pred_best)

# Pull the per-feature importance scores out of the tuned model
importances_best = best_model.feature_importances_

# Pair each training column with its importance score and rank the rows
# from the most to the least important feature
importances_best_df = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': importances_best})
    .sort_values(by='Importance', ascending=False)
)