In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder

## Load the dataset

In [2]:
# Read the advertising records from the CSV file (path is relative to the
# notebook's working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer="advertising_dataset.csv")

## Inspecting the dataset (columns, dtypes, non-null counts)

In [7]:
# Show the column labels of the loaded frame.
print(f"Columns: {df.columns}\n")

# DataFrame.info() writes its report straight to stdout and returns None, so
# it must not be interpolated into an f-string: that prints the report first
# and then the literal text "DATASET INFO: \nNone". Print the header, then
# call info() as its own statement so the report follows the header.
print("DATASET INFO: ")
df.info()

Columns: Index(['user_id', 'timestamp', 'device_type', 'location', 'age_group',
       'gender', 'ad_id', 'content_type', 'ad_topic', 'ad_target_audience',
       'click_through_rate', 'conversion_rate', 'engagement_level',
       'view_time', 'cost_per_click', 'click_through_rate.1',
       'conversion_rate.1', 'ROI'],
      dtype='object')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   user_id               1000 non-null   int64  
 1   timestamp             1000 non-null   object 
 2   device_type           1000 non-null   object 
 3   location              1000 non-null   object 
 4   age_group             1000 non-null   object 
 5   gender                1000 non-null   object 
 6   ad_id                 1000 non-null   object 
 7   content_type          1000 non-null   object 
 8   ad_topic              1000 non-nul

## DATA PREPROCESSING

### 1.  Cleaning 

In [None]:
# Discard the columns that are not used as model inputs: the two ID columns,
# the timestamp, and the ".1"-suffixed rate columns (presumably duplicates of
# the un-suffixed rate columns -- TODO confirm against the raw CSV).
data = df.drop(
    columns=[
        'user_id',
        'ad_id',
        'timestamp',
        'conversion_rate.1',
        'click_through_rate.1',
    ]
)

# ROI is the prediction target; every remaining column is a feature.
y = data["ROI"]
X = data.drop(columns="ROI")

### 2. One Hot Encoding

In [16]:
# Partition the feature columns by dtype: object-typed columns are treated as
# categorical, everything else as numerical.
categorical_columns = X.select_dtypes(include='object').columns
numerical_columns = X.select_dtypes(exclude='object').columns

# One-hot encode the categorical columns.
#   drop="first"            -> omit the first category of each feature
#   sparse_output=False     -> return a dense array so it can be stacked below
#   handle_unknown="ignore" -> do not raise on categories unseen during fit
encoder = OneHotEncoder(
    drop="first",
    sparse_output=False,
    handle_unknown="ignore",
)
X_encoded = encoder.fit_transform(X[categorical_columns])

# Names of the generated dummy columns, in the same order as X_encoded's columns.
encoded_feature_names = encoder.get_feature_names_out(categorical_columns)

# Assemble the final feature matrix: numerical columns first, then the dummies.
# feature_names follows the same column order as X_f.
X_f = np.concatenate([X[numerical_columns].to_numpy(), X_encoded], axis=1)
feature_names = [*numerical_columns, *encoded_feature_names]

## Train/Test Split and Feature Scaling

In [None]:
# Hold out 30% of the rows as a test set; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_f,
    y,
    test_size=0.3,
    random_state=19,
)

# Standardise every column of the feature matrix (the one-hot dummy columns
# are scaled too, not only the numerical ones). The scaler is fitted on the
# training rows only and then applied unchanged to the test rows.
# NOTE(review): tree-based models are generally insensitive to feature
# scaling -- confirm whether this step is needed for the random forest.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Training Random Forest Regressor

In [40]:
# Random forest regressor with 200 trees; the fixed seed makes training
# reproducible.
random_forest_reg = RandomForestRegressor(
    random_state=19,
    n_estimators=200,
)

# Fit on the scaled training split (last expression, so the fitted estimator
# is displayed as the cell output).
random_forest_reg.fit(X_train, y_train)

### Predictions

In [41]:
# Predict ROI for the held-out test rows with the fitted forest.
y_pred = random_forest_reg.predict(X=X_test)

## Evaluation of Random Forest Regressor Model

In [42]:
# Score the test-set predictions: RMSE is the square root of the mean squared
# error, R2 is the coefficient of determination.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
R2 = r2_score(y_test, y_pred)

# Report both metrics rounded to four decimal places.
print("Random Forest with OneHotEncoder Results")
for label, value in (("R² Score:", R2), ("RMSE:", rmse)):
    print(label, round(value, 4))

Random Forest with OneHotEncoder Results
R² Score: 0.9398
RMSE: 0.3903


## Feature Importance

In [43]:
# Importance of each input column as reported by the fitted forest, plus the
# column positions sorted from most to least important.
importances = random_forest_reg.feature_importances_
indices = np.argsort(importances)[::-1]

# List at most ten features. Bounding by the number of available features
# avoids the IndexError a fixed range(10) raises when the model was trained on
# fewer than ten columns; with ten or more features the output is unchanged.
top_n = min(10, len(indices))

print(f"\nTop {top_n} Important Features for ROI Prediction:")
for idx in indices[:top_n]:
    # feature_names is aligned with the columns of the training matrix, so the
    # same position indexes both the name and its importance.
    print(f"{feature_names[idx]}: {round(importances[idx], 4)}")


Top 10 Important Features for ROI Prediction:
cost_per_click: 0.5285
click_through_rate: 0.2244
conversion_rate: 0.2
view_time: 0.0159
ad_topic_Fashion: 0.003
ad_target_audience_Fitness Lovers: 0.0028
device_type_Tablet: 0.0025
device_type_Mobile: 0.0022
gender_Male: 0.0021
ad_target_audience_Young Adults: 0.0019
