In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, mean_squared_error

In [2]:
# Dataset source: https://www.kaggle.com/datasets/henrysue/online-shoppers-intention
# NOTE(review): the path looks like a Google Drive mount inside Colab; adjust it
# when running elsewhere.
csv_path = '/content/drive/MyDrive/datasets/online_shoppers_intention.csv'
data = pd.read_csv(csv_path)
data

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.000000,0.200000,0.200000,0.000000,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.000000,0.000000,0.100000,0.000000,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.000000,0.200000,0.200000,0.000000,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.050000,0.140000,0.000000,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.500000,0.020000,0.050000,0.000000,0.0,Feb,3,3,1,4,Returning_Visitor,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12325,3,145.0,0,0.0,53,1783.791667,0.007143,0.029031,12.241717,0.0,Dec,4,6,1,1,Returning_Visitor,True,False
12326,0,0.0,0,0.0,5,465.750000,0.000000,0.021333,0.000000,0.0,Nov,3,2,1,8,Returning_Visitor,True,False
12327,0,0.0,0,0.0,6,184.250000,0.083333,0.086667,0.000000,0.0,Nov,3,2,1,13,Returning_Visitor,True,False
12328,4,75.0,0,0.0,15,346.000000,0.000000,0.021053,0.000000,0.0,Nov,2,2,3,11,Returning_Visitor,False,False


## Data pre-processing (if needed)

In [3]:
# (rows, columns) of the loaded frame.
data.shape

(12330, 18)

In [4]:
# How often each distinct Informational_Duration value occurs, most frequent first.
data['Informational_Duration'].value_counts()

0.00      9925
9.00        33
7.00        26
10.00       26
6.00        26
          ... 
246.80       1
274.00       1
13.40        1
223.15       1
211.25       1
Name: Informational_Duration, Length: 1258, dtype: int64

In [5]:
# Per-column dtype and non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12330 non-null  object 
 11  OperatingSystems         12330 non-null  int64  
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType           

In [6]:
# Number of missing values in each column.
data.isna().sum()

Administrative             0
Administrative_Duration    0
Informational              0
Informational_Duration     0
ProductRelated             0
ProductRelated_Duration    0
BounceRates                0
ExitRates                  0
PageValues                 0
SpecialDay                 0
Month                      0
OperatingSystems           0
Browser                    0
Region                     0
TrafficType                0
VisitorType                0
Weekend                    0
Revenue                    0
dtype: int64

## Model building (fitting and training)

In [7]:
# Split the frame into predictors (X) and the three columns to be predicted.
target_columns = ['Revenue', 'Weekend', 'Informational_Duration']

# Features: every column that is not one of the targets.
X = data.drop(columns=target_columns)

# Targets, in the same order as target_columns:
# Revenue and Weekend are classified, Informational_Duration is regressed.
y_revenue, y_weekend, y_informational_duration = (
    data[column] for column in target_columns
)

In [8]:
# One-hot encode the text-valued feature columns (Month and VisitorType here).
# With no `columns=` argument, get_dummies leaves numeric columns untouched, so
# integer-coded fields such as Browser or Region stay as plain numbers.
X_encoded = pd.get_dummies(X)

In [9]:
# Hold out 20% of the rows for testing. Features and all three targets go
# through a single call so that they are partitioned on the same rows.
(
    X_train, X_test,
    y_revenue_train, y_revenue_test,
    y_weekend_train, y_weekend_test,
    y_informational_duration_train, y_informational_duration_test,
) = train_test_split(
    X_encoded,
    y_revenue,
    y_weekend,
    y_informational_duration,
    test_size=0.2,
    random_state=42,
)


In [10]:
# Create (not yet fit) one Random Forest classifier per True/False target.
# Both use the same hyper-parameters and a fixed seed for repeatability.
classifier_params = {'n_estimators': 100, 'random_state': 42}
rf_revenue = RandomForestClassifier(**classifier_params)
rf_weekend = RandomForestClassifier(**classifier_params)

In [11]:
# Create (not yet fit) the Random Forest regressor for the continuous
# Informational_Duration target.
rf_informational_duration = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)

In [12]:
# Fit the models
# All three estimators learn from the same training features; only the target differs.
rf_revenue.fit(X_train, y_revenue_train)
rf_weekend.fit(X_train, y_weekend_train)
# Last expression in the cell: its return value is what the notebook displays.
rf_informational_duration.fit(X_train, y_informational_duration_train)


## Model Evaluation 

In [13]:
# Score the held-out features with each fitted model, in the order
# revenue -> weekend -> informational duration.
y_revenue_pred, y_weekend_pred, y_informational_duration_pred = (
    fitted_model.predict(X_test)
    for fitted_model in (rf_revenue, rf_weekend, rf_informational_duration)
)

In [14]:
# Evaluate the models
# Text summaries (precision / recall / F1 / support per class) for both classifiers.
revenue_classification_report = classification_report(y_revenue_test, y_revenue_pred)
weekend_classification_report = classification_report(y_weekend_test, y_weekend_pred)

# RMSE for the regressor, computed as sqrt(MSE). The former `squared=False`
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6, so passing it
# raises TypeError on current releases; taking the square root works on every version.
informational_duration_rmse = mean_squared_error(
    y_informational_duration_test, y_informational_duration_pred
) ** 0.5

print("Revenue Classification Report:\n", revenue_classification_report)
print("Weekend Classification Report:\n", weekend_classification_report)
print("Informational Duration Root Mean Squared Error:", informational_duration_rmse)

Revenue Classification Report:
               precision    recall  f1-score   support

       False       0.91      0.96      0.94      2055
        True       0.74      0.54      0.62       411

    accuracy                           0.89      2466
   macro avg       0.82      0.75      0.78      2466
weighted avg       0.88      0.89      0.88      2466

Weekend Classification Report:
               precision    recall  f1-score   support

       False       0.77      0.98      0.86      1846
        True       0.67      0.11      0.18       620

    accuracy                           0.76      2466
   macro avg       0.72      0.54      0.52      2466
weighted avg       0.74      0.76      0.69      2466

Informational Duration Root Mean Squared Error: 118.63910457404565


## Testing the model on new data

In [18]:
# Score one hand-written session with all three fitted models.
# The values are listed as plain scalars and wrapped into one-row columns below.
sample_values = {
    'Administrative': 0,
    'Administrative_Duration': 0.0,
    'Informational': 0,
    'ProductRelated': 1,
    'ProductRelated_Duration': 0.0,
    'BounceRates': 0.2,
    'ExitRates': 0.2,
    'PageValues': 0.0,
    'SpecialDay': 0.0,
    'Month': 'Feb',
    'OperatingSystems': 1,
    'Browser': 1,
    'Region': 1,
    'TrafficType': 1,
    'VisitorType': 'Returning_Visitor',
}
new_sample = pd.DataFrame(
    {column: [value] for column, value in sample_values.items()}
)

# One-hot encode the sample, then force it onto the training column layout:
# dummy columns the sample lacks are added as 0, and any column the models
# were not trained on is dropped.
new_sample_encoded = pd.get_dummies(new_sample)
new_sample_aligned = new_sample_encoded.reindex(
    columns=X_encoded.columns,
    fill_value=0,
)

# Each predict() returns an array with one entry per row; take the only one.
revenue_pred = rf_revenue.predict(new_sample_aligned)[0]
weekend_pred = rf_weekend.predict(new_sample_aligned)[0]
informational_duration_pred = rf_informational_duration.predict(new_sample_aligned)[0]

print("Predicted Revenue:", revenue_pred)
print("Predicted Weekend:", weekend_pred)
print("Predicted Informational Duration:", informational_duration_pred)


Predicted Revenue: False
Predicted Weekend: True
Predicted Informational Duration: 0.0
