In [None]:
#Importing original dataset via CSV (https://archive.ics.uci.edu/dataset/468/online+shoppers+purchasing+intention+dataset)
import pandas as pd
def load_dataset(csv):
  """Read a CSV file into a pandas DataFrame.

  Args:
    csv: Path of the CSV file (anything ``pd.read_csv`` accepts as a source).

  Returns:
    The DataFrame produced by ``pd.read_csv``.
  """
  # Read the path that was passed in; the file name used to be hard-coded here,
  # which made the `csv` argument a no-op.
  df = pd.read_csv(csv)
  return df
# Relative path of the dataset CSV handed to load_dataset.
csv = 'online_shoppers_intention.csv'
# Bind the loaded frame to the module-level name `df`, which later cells read directly.
df = load_dataset(csv)
# Preview the first rows as a quick sanity check of the load.
print(df.head())

In [None]:
#Imports
import pandas as pd
from scipy.stats import chi2_contingency
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import skew, kurtosis

In [None]:
# Exploratory report: build a ydata-profiling ProfileReport for the raw frame `df`
# and write it to an HTML file.
from ydata_profiling import ProfileReport
# The report is built from `df` as it exists when this cell runs.
profile = ProfileReport(df, title='Exploratory Analysis Report')
# Output path is relative, so the file lands in the current working directory.
profile.to_file('exploratory_analysis_report.html')

In [None]:
#Feature Engineering
#Converting Categorical into Integers and Use One-Hot Coding
# Work on an independent copy. `sessions = df` only creates a second name for the
# same object, so the casts below would also rewrite the raw frame, and re-running
# the cell would start from already-transformed data.
sessions = df.copy()
sessions['Revenue'] = sessions['Revenue'].astype(int)#converting revenue into 0 and 1
# Keep SpecialDay as a float: astype(int) truncates toward zero, which would collapse
# every fractional value to 0.
# NOTE(review): the UCI description lists SpecialDay as a fractional closeness score - confirm.
sessions['SpecialDay'] = sessions['SpecialDay'].astype(float)
# One-hot encode VisitorType and replace the original text column with the 0/1 dummies
visitor_type_dummies = pd.get_dummies(sessions['VisitorType'], prefix='VisitorType').astype(int)
sessions = pd.concat([sessions, visitor_type_dummies], axis=1).drop(columns='VisitorType')
sessions['Weekend'] = sessions['Weekend'].astype(int)
# Month labels -> calendar month number. The labels are not uniform ('June' is spelled
# out, July is 'Jul'); Jan/Apr are included so a fuller extract maps cleanly as well.
month_numbers = {'Jan':1, 'Feb':2, 'Mar':3, 'Apr':4, 'May':5, 'June':6, 'Jul':7, 'Aug':8, 'Sep':9, 'Oct':10, 'Nov':11, 'Dec':12}
# Series.map turns any label missing from the dict into NaN without warning, so fail
# loudly instead of letting NaN months leak into later cells.
unmapped_months = sorted(str(label) for label in set(sessions['Month'].unique()) - set(month_numbers))
if unmapped_months:
    raise ValueError(f"Month labels with no numeric mapping: {unmapped_months}")
sessions['Month'] = sessions['Month'].map(month_numbers)

sessions.head()

In [None]:
#Adding Relevant Columns
# Total duration = administrative + informational + product-related durations.
admin_time, info_time, product_time = (
    sessions[name]
    for name in ('Administrative_Duration', 'Informational_Duration', 'ProductRelated_Duration')
)
sessions['SessionDuration'] = admin_time + info_time + product_time

# Total pages = administrative + informational + product-related page counts.
admin_pages, info_pages, product_pages = (
    sessions[name] for name in ('Administrative', 'Informational', 'ProductRelated')
)
sessions['TotalPagesViewed'] = admin_pages + info_pages + product_pages

# Make sure the bounce rate is stored as a float column.
bounce_rates = sessions['BounceRates']
sessions['BounceRates'] = bounce_rates.astype(float)

sessions.head()

In [None]:
# Inspect the visitor categories present in the data.
# Read them from the raw frame `df`: the feature-engineering cell one-hot encodes
# 'VisitorType' and then drops it from `sessions`, so sessions['VisitorType'] raises
# KeyError on a fresh Run All.
df['VisitorType'].unique()
#print(df[(df['VisitorType'] == 'Other') & (df['Revenue'] == 1)].shape[0])

In [None]:
#Sessions with Revenue based on Pages Viewed
# Mask of sessions that ended in a purchase; every count below is this mask
# intersected with one extra condition, so build it once.
has_revenue = df['Revenue'] == 1

values_with_Rev = int(has_revenue.sum())
print("Total Values in dataset with Rev", values_with_Rev)

# --- at least one page of a given kind was viewed ---
values_with_Rev_Admin = int((has_revenue & (df['Administrative'] > 0)).sum())
print("Values with Rev and atleast one Administrative Page Viewed", values_with_Rev_Admin)

values_with_Rev_Product = int((has_revenue & (df['ProductRelated'] > 0)).sum())
print("Values with Rev and atleast one Product page viewed", values_with_Rev_Product)

values_with_Rev_Informational = int((has_revenue & (df['Informational'] > 0)).sum())
print("Values with Rev and atleast one Informational page viewed", values_with_Rev_Informational)

# --- time spent on a given kind of page is above that column's overall mean ---
admin_duration = df['Administrative_Duration']
values_with_Rev_GreaterThanAvgAdmin = int((has_revenue & (admin_duration > admin_duration.mean())).sum())
print("Values with Rev and Administrative Duration over the mean Duration", values_with_Rev_GreaterThanAvgAdmin)

product_duration = df['ProductRelated_Duration']
values_with_Rev_GreaterThanAvgProduct = int((has_revenue & (product_duration > product_duration.mean())).sum())
print("Values with Rev and ProductRelated_Duration  over the mean Duration", values_with_Rev_GreaterThanAvgProduct)

info_duration = df['Informational_Duration']
values_with_Rev_GreaterThanAvgInformational = int((has_revenue & (info_duration > info_duration.mean())).sum())
print("Values with Rev and Informational Duration over the mean Duration", values_with_Rev_GreaterThanAvgInformational)

In [None]:
#Months with Revenue vs Months without Revenue
# (label stored in df['Month'], name used in the printed message).
# July is stored as 'Jul' (see the Month mapping in the feature-engineering cell);
# filtering on 'July' matched nothing, so July always reported 0 sessions.
month_labels = [
    ('Feb', 'Feb'), ('Mar', 'March'), ('May', 'May'), ('June', 'June'), ('Jul', 'July'),
    ('Aug', 'August'), ('Sep', 'Sep'), ('Oct', 'Oct'), ('Nov', 'Nov'), ('Dec', 'Dec'),
]

# month label -> (sessions with revenue, sessions without revenue); both counts are
# kept instead of overwriting the first with the second.
revenue_by_month = {}
for month_code, month_name in month_labels:
    in_month = df['Month'] == month_code
    with_rev = int((in_month & (df['Revenue'] == 1)).sum())
    without_rev = int((in_month & (df['Revenue'] == 0)).sum())
    revenue_by_month[month_code] = (with_rev, without_rev)
    print(f"Values with Rev and {month_name}", with_rev)
    # The Revenue == 0 count used to be printed under the "with Rev" label as well.
    print(f"Values without Rev and {month_name}", without_rev)

In [None]:
#Revenue based on Different TrafficTypes [1-20]
traffic_types = df['TrafficType'].unique()

# The revenue masks do not depend on the traffic type, so compute them once up front.
purchased = df['Revenue'] == 1
not_purchased = df['Revenue'] == 0

# For every traffic type (in order of first appearance) report both session counts.
for traffic_type in traffic_types:
    of_this_type = df['TrafficType'] == traffic_type

    values_with_Rev = int((purchased & of_this_type).sum())
    print(f"Values with Rev and TrafficType {traffic_type}: {values_with_Rev}")

    values_without_Rev = int((not_purchased & of_this_type).sum())
    print(f"Values without Rev and TrafficType {traffic_type}: {values_without_Rev}")


In [None]:
#Revenue based on Different VisitorTypes[Returning, New, Other]
visitor_types = df['VisitorType'].unique()

# One summary row per visitor type: how many of its sessions did / did not convert.
results = []
for visitor_type in visitor_types:
    # Revenue flags of the sessions belonging to this visitor type only
    revenue_flags = df.loc[df['VisitorType'] == visitor_type, 'Revenue']
    values_with_Rev = int((revenue_flags == 1).sum())
    values_without_Rev = int((revenue_flags == 0).sum())

    row = {'VisitorType': visitor_type}
    row['Values with Revenue'] = values_with_Rev
    row['Values without Revenue'] = values_without_Rev
    results.append(row)

# Tabulate the rows for display
results_df = pd.DataFrame(results)

print(results_df)


In [None]:
#Revenue based on Operating System [1-8]
# Distinct operating-system codes, in order of first appearance
operating_systems = df['OperatingSystems'].unique()

# One summary row per operating system
results = []
for operating_system in operating_systems:
    # Narrow to this operating system first, then split by the revenue flag
    on_this_os = df[df['OperatingSystems'] == operating_system]
    values_with_Rev = len(on_this_os[on_this_os['Revenue'] == 1])
    values_without_Rev = len(on_this_os[on_this_os['Revenue'] == 0])

    row = {'operating_systems': operating_system}
    row['Values with Revenue'] = values_with_Rev
    row['Values without Revenue'] = values_without_Rev
    results.append(row)

# Tabulate the rows; also show the container type of the raw results
results_df = pd.DataFrame(results)
print(type(results))
print(results_df)



In [None]:
#Revenue based on Weekend
# Distinct values of the Weekend flag, in order of first appearance
weekend = df['Weekend'].unique()

# Revenue masks are the same for every flag value, so build them once
converted = df['Revenue'] == 1
not_converted = df['Revenue'] == 0

# One summary row per Weekend value
results = []
for weekend_type in weekend:
    matches_flag = df['Weekend'] == weekend_type
    values_with_Rev = int((converted & matches_flag).sum())
    values_without_Rev = int((not_converted & matches_flag).sum())

    row = {'weekend_type': weekend_type}
    row['Values with Revenue'] = values_with_Rev
    row['Values without Revenue'] = values_without_Rev
    results.append(row)

# Tabulate the rows; also show the container type of the raw results
results_df = pd.DataFrame(results)
print(type(results))
print(results_df)


In [None]:
#Revenue based on Browser [1-13]
# Distinct browser codes, in order of first appearance
browsers = df['Browser'].unique()

# One summary row per browser code
results = []
for browser_type in browsers:
    # Revenue flags of the sessions that used this browser
    revenue_flags = df.loc[df['Browser'] == browser_type, 'Revenue']
    values_with_Rev = int((revenue_flags == 1).sum())
    values_without_Rev = int((revenue_flags == 0).sum())

    row = {'Browser Type': browser_type}
    row['Values with Revenue'] = values_with_Rev
    row['Values without Revenue'] = values_without_Rev
    results.append(row)

# Tabulate the rows; also show the container type of the raw results
results_df = pd.DataFrame(results)
print(type(results))
print(results_df)

In [None]:
#Chi Square Test for Revenue vs Months (Long-way)
# Build the observed table from the data instead of hand-copied counts. The copied
# table carried an all-zero 'July' column (the data labels that month 'Jul') and then
# added 0.5 to every cell so the test would accept it, which distorts the statistic.
calendar_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'June', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
month_counts = pd.crosstab(df['Revenue'].astype(bool), df['Month'])

# Months present in the data, in calendar order; labels not in calendar_order are
# kept at the end rather than silently dropped.
months = [m for m in calendar_order if m in month_counts.columns]
months += [m for m in month_counts.columns if m not in calendar_order]

# Rows: sessions with revenue first, then sessions without revenue.
observed_df = month_counts.reindex(index=[True, False], columns=months, fill_value=0)
observed_df.index = ['Revenue', 'No Revenue']
observed_df.columns.name = None
observed = observed_df.to_numpy()
print("Observed Frequencies:\n", observed_df)

# Perform chi-square test of independence between month and revenue
chi2, p_value, dof, expected = chi2_contingency(observed)

# Display results
print("\nChi-square Statistic:", chi2)
print("p-value:", p_value)
print("Degrees of Freedom:", dof)
print("\nExpected Frequencies:\n", pd.DataFrame(expected, index=['Revenue', 'No Revenue'], columns=months))

In [None]:
#Chi Square- ['Month', 'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType', 'Weekend']
# Store Revenue as 0/1 integers (1 = revenue, 0 = no revenue)
df['Revenue'] = df['Revenue'].astype(int)

# Categorical columns whose association with Revenue is tested
categorical_vars = ['Month', 'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType', 'Weekend']

# variable name -> test outcome
results = {}
stat_names = ('Chi-square Statistic', 'p-value', 'Degrees of Freedom')

for var in categorical_vars:
    # Cross-tabulate the variable against Revenue and test for independence;
    # the expected-frequency table (`ex`) is returned but not stored.
    chi2, p, dof, ex = chi2_contingency(pd.crosstab(df[var], df['Revenue']))
    results[var] = dict(zip(stat_names, (chi2, p, dof)))

# Report one block of text per variable
for var, result in results.items():
    print(f"{var}:\nChi-square Statistic: {result['Chi-square Statistic']}, p-value: {result['p-value']}, Degrees of Freedom: {result['Degrees of Freedom']}\n")

In [None]:
#Graph Sessions Duration for Rev vs non-Rev
# Create the axes explicitly and hand them to pandas: DataFrame.boxplot(by=...) opens
# its own figure when no `ax` is given, which left the plt.figure() above it blank.
# (The stray plt.show() that used to open this cell had nothing to display.)
fig, ax = plt.subplots(figsize=(10, 6))
sessions.boxplot(column='SessionDuration', by='Revenue', ax=ax)
ax.set_xlabel('Rev')
ax.set_ylabel('Session Duration')
ax.set_title('Session Duration vs Revenue')
fig.suptitle('')  # Suppress the automatic "grouped by" title to keep the plot clean
plt.show()

# Bucket the duration into four equal-width intervals.
# NOTE(review): pd.cut(bins=4) splits the value range evenly, not the sessions; if the
# duration is strongly right-skewed nearly every session lands in 'Very Short'.
# Consider pd.qcut if equally populated buckets are intended.
sessions['SessionDuration_Binned'] = pd.cut(sessions['SessionDuration'], bins=4, labels=['Very Short', 'Short', 'Medium', 'Long'])
sessions.head()


In [None]:
# Compare the z-scored session duration with the z-scored log1p(session duration).
np.random.seed(0)
duration = sessions['SessionDuration']

# Standardize: subtract the mean, divide by the standard deviation
standardized_duration = (duration - duration.mean()) / duration.std()

# Log transform (log(1+x)) first, then standardize the transformed values
log_duration = np.log1p(duration)
log_standardized_duration = (log_duration - log_duration.mean()) / log_duration.std()

# Overlay both densities on one figure
plt.figure(figsize=(12, 6))
for series, curve_label, curve_color in (
    (standardized_duration, 'Standardized', 'blue'),
    (log_standardized_duration, 'Log-Standardized', 'red'),
):
    sns.kdeplot(series, label=curve_label, color=curve_color)

plt.title("Comparison of Standardized vs. Log-Standardized Session Duration")
plt.xlabel("Value")
plt.ylabel("Density")
plt.legend()
plt.show()

In [None]:
# Summary statistics of the total session duration
session_duration_summary = sessions['SessionDuration']

mean = session_duration_summary.mean()
median = session_duration_summary.median()
std_dev = session_duration_summary.std()
skewness = skew(session_duration_summary)
kurt = kurtosis(session_duration_summary)

print(f"Mean: {mean}")
print(f"Median: {median}")
print(f"Standard Deviation: {std_dev}")
print(f"Skewness: {skewness}")
print(f"Kurtosis: {kurt}")

# Log-transform the product-related duration; log1p = log(1+x), so zero durations are fine
sessions['Log_ProductRelated_Duration'] = np.log1p(sessions['ProductRelated_Duration'])

# Plot from `sessions`: the column is added there, not to `df`, so
# df['Log_ProductRelated_Duration'] raised KeyError.
# `fill=True` replaces seaborn's deprecated `shade=True`.
plt.figure(figsize=(10, 6))
sns.kdeplot(sessions['Log_ProductRelated_Duration'], fill=True)
plt.title('Kernel Density Estimate of Log Product Duration')
plt.xlabel('Log Product Duration')  # not standardized at this point
plt.ylabel('Density')
plt.show()

# Standardize the log-transformed product duration (z-score)
log_standardized_data = (sessions['Log_ProductRelated_Duration'] - sessions['Log_ProductRelated_Duration'].mean()) / sessions['Log_ProductRelated_Duration'].std()

# Plot the KDE of the standardized values; the labels name the product duration,
# which is what is plotted (they used to say "Session Duration").
plt.figure(figsize=(10, 6))
sns.kdeplot(log_standardized_data, fill=True)
plt.title('Kernel Density Estimate of Log and Standardized Product Duration')
plt.xlabel('Standardized Log Product Duration')
plt.ylabel('Density')
plt.show()



In [None]:
#Log and Standardizing Product Duration (Unfinished)
# z-score of the raw duration
df['productRelatedDuration_Standardized'] = (df['ProductRelated_Duration'] - df['ProductRelated_Duration'].mean()) / df['ProductRelated_Duration'].std()
# Log first, then standardize, as the plot titles below describe. Applying log1p to
# the z-scores is not a log transform of the duration, and it returns NaN for any
# z-score below -1.
log_product_duration = np.log1p(df['ProductRelated_Duration'])  # log(1+x)
df['Log_std_ProductRelated_Duration'] = (log_product_duration - log_product_duration.mean()) / log_product_duration.std()

# describe() is kept for display only. The statistics and the KDE below must use the
# column itself; they were computed on describe()'s 8 summary numbers.
log_std_product_duration = df['Log_std_ProductRelated_Duration']
product_duration_summary = log_std_product_duration.describe()

product_mean = log_std_product_duration.mean()
product_median = log_std_product_duration.median()
product_std_dev = log_std_product_duration.std()
product_skewness = skew(log_std_product_duration)
product_kurt = kurtosis(log_std_product_duration)

print(f"Mean: {product_mean}")
print(f"Median: {product_median}")
print(f"Standard Deviation: {product_std_dev}")
print(f"Skewness: {product_skewness}")
print(f"Kurtosis: {product_kurt}")
print(df['ProductRelated_Duration'].describe())
print(product_duration_summary)

# KDE of the standardized log values (`fill` replaces seaborn's deprecated `shade`)
plt.figure(figsize=(12, 6))
sns.kdeplot(log_std_product_duration, fill=True)
plt.title('Distribution of Standardized Log ProductRelated Duration')
plt.xlabel('Standardized Log ProductRelated Duration')
plt.ylabel('Density')
plt.show()

# KDE of the untransformed duration for comparison, on its own labelled figure
plt.figure(figsize=(12, 6))
sns.kdeplot(df['ProductRelated_Duration'], fill=True)
plt.title('Distribution of ProductRelated Duration')
plt.xlabel('ProductRelated Duration')
plt.ylabel('Density')
plt.show()


In [None]:
#Log and Standardizing Informational Duration (Unfinished)

import seaborn as sns
import matplotlib.pyplot as plt

# z-score of the raw informational duration
df['infoDuration_Standardized'] = (df['Informational_Duration'] - df['Informational_Duration'].mean()) / df['Informational_Duration'].std()
# Log first, then standardize, as the plot title describes. Applying log1p to the
# z-scores is not a log transform of the duration, and it returns NaN for any
# z-score below -1.
log_informational_duration = np.log1p(df['Informational_Duration'])  # log(1+x)
df['Log_std_Informational_Duration'] = (log_informational_duration - log_informational_duration.mean()) / log_informational_duration.std()

print(df['Informational_Duration'].describe())
informational_duration_summary = df['Log_std_Informational_Duration'].describe()
print(informational_duration_summary)

# Plot the density of the column itself; the KDE used to be drawn from describe()'s
# 8 summary numbers. `fill` replaces seaborn's deprecated `shade`.
plt.figure(figsize=(12, 6))
sns.kdeplot(df['Log_std_Informational_Duration'], fill=True)
plt.title('Distribution of Standardized Log informational Duration')
plt.xlabel('Standardized Log informational Duration')
plt.ylabel('Density')
plt.show()

# KDE of the untransformed duration for comparison, on its own labelled figure
plt.figure(figsize=(12, 6))
sns.kdeplot(df['Informational_Duration'], fill=True)
plt.title('Distribution of Informational Duration')
plt.xlabel('Informational Duration')
plt.ylabel('Density')
plt.show()


In [None]:
#Log and Standardizing Administrative Duration (Unfinished)
# z-score of the raw administrative duration
df['AdminDuration_Standardized'] = (df['Administrative_Duration'] - df['Administrative_Duration'].mean()) / df['Administrative_Duration'].std()
# Log first, then standardize, as the plot title describes. Applying log1p to the
# z-scores is not a log transform of the duration, and it returns NaN for any
# z-score below -1.
log_admin_duration = np.log1p(df['Administrative_Duration'])  # log(1+x)
df['Log_std_Admin_Duration'] = (log_admin_duration - log_admin_duration.mean()) / log_admin_duration.std()

print(df['Administrative_Duration'].describe())
Admin_duration_summary = df['Log_std_Admin_Duration'].describe()
print(Admin_duration_summary)

# Plot the density of the column itself; the KDE used to be drawn from describe()'s
# 8 summary numbers. `fill` replaces seaborn's deprecated `shade`.
plt.figure(figsize=(12, 6))
sns.kdeplot(df['Log_std_Admin_Duration'], fill=True)
plt.title('Distribution of Standardized Log Admin Duration')
plt.xlabel('Standardized Log Admin Duration')
plt.ylabel('Density')
plt.show()

# KDE of the untransformed duration for comparison, on its own labelled figure
plt.figure(figsize=(12, 6))
sns.kdeplot(df['Administrative_Duration'], fill=True)
plt.title('Distribution of Administrative Duration')
plt.xlabel('Administrative Duration')
plt.ylabel('Density')
plt.show()


In [None]:
#Clustering based on Groups
import numpy as np
import pandas as pd
from scipy.signal import find_peaks
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import KernelDensity

# Tunables for the density estimate and the peak search
KDE_BANDWIDTH = 500        # Gaussian kernel bandwidth, in the units of the duration column
KDE_GRID_POINTS = 1000     # number of evenly spaced points the density is evaluated on
MIN_PEAK_DISTANCE = 200    # minimum separation between peaks, in grid points

data = df['ProductRelated_Duration'].values

# Step 1: Kernel Density Estimation (KDE) over the observed range of durations
kde = KernelDensity(kernel='gaussian', bandwidth=KDE_BANDWIDTH).fit(data[:, np.newaxis])
x_d = np.linspace(data.min(), data.max(), KDE_GRID_POINTS)
log_dens = kde.score_samples(x_d[:, np.newaxis])
density = np.exp(log_dens)  # score_samples returns log-density; convert once and reuse

# Plot KDE to visualize peaks
plt.figure(figsize=(10, 6))
plt.fill(x_d, density, alpha=0.5)
plt.title('Kernel Density Estimation for ProductRelated Duration')
plt.xlabel('ProductRelated_Duration')
plt.ylabel('Density')
plt.show()

# Step 2: Identify the Peaks (indices into the evaluation grid)
peaks, _ = find_peaks(density, distance=MIN_PEAK_DISTANCE)
peak_values = x_d[peaks]

# The grouping below needs two peaks; fail with a clear message instead of an
# IndexError when the search finds fewer.
if len(peaks) < 2:
    raise ValueError(
        f"Expected at least 2 density peaks but found {len(peaks)}; "
        "adjust KDE_BANDWIDTH or MIN_PEAK_DISTANCE."
    )

# Order the peaks by density height, tallest first. Indexing the density with the
# peak indices avoids re-exponentiating and re-searching the grid for every peak;
# the stable sort keeps grid order between equally tall peaks.
tallest_first = np.argsort(-density[peaks], kind='stable')
sorted_peaks = list(peak_values[tallest_first])

first_peak = sorted_peaks[0]  # Highest peak
second_peak = sorted_peaks[1]  # Second highest peak

# Step 3: Group user sessions based on proximity to the nearest peak
def assign_peak_group(duration, first_peak, second_peak):
    """Label a duration by whichever of the two peaks it is closer to.

    Args:
        duration: A single duration value.
        first_peak: Location of the highest density peak.
        second_peak: Location of the second-highest density peak.

    Returns:
        'First Peak' when the duration is strictly closer to ``first_peak``,
        otherwise 'Second Peak' (ties go to the second peak).
    """
    if abs(duration - first_peak) < abs(duration - second_peak):
        return 'First Peak'
    return 'Second Peak'

df['Session_Group'] = df['ProductRelated_Duration'].apply(assign_peak_group, args=(first_peak, second_peak))

# View the classification results
print(df[['ProductRelated_Duration', 'Session_Group']].head())

In [None]:
#Multi-Grouped Categories
# Split the sessions by the peak group assigned in df['Session_Group']
session_group = df['Session_Group']
second_peak_values = df[session_group == 'Second Peak']
first_peak_values = df[session_group == 'First Peak']

# Size of each group, as a two-row table
group_sizes = {
    'First Peak': first_peak_values.shape[0],
    'Second Peak': second_peak_values.shape[0],
}
summary_table = pd.DataFrame({
    'Group': list(group_sizes.keys()),
    'Count': list(group_sizes.values()),
})
print(summary_table)
print(summary_table.shape)
print(first_peak_values.shape)

# Share of sessions with revenue inside each group, as a percentage
first_peak_with_revenue = int((first_peak_values['Revenue'] == 1).sum())
second_peak_with_revenue = int((second_peak_values['Revenue'] == 1).sum())
group_first_peak_revenue_percentage = (first_peak_with_revenue / first_peak_values.shape[0]) * 100
group_second_peak_revenue_percentage = (second_peak_with_revenue / second_peak_values.shape[0]) * 100

print(f"First Peak Revenue Percentage: {group_first_peak_revenue_percentage:.2f}%")
print(f"Second Peak Revenue Percentage: {group_second_peak_revenue_percentage:.2f}%")

#medianOfDf=df['ProductRelated_Duration'].median()
#print(medianOfDf)
#print(df[df['ProductRelated_Duration']>medianOfDf].count())

In [None]:
#Disregard this grouping
# NOTE(review): no visible cell creates df['Session_Category'], so on a fresh
# Restart & Run All this cell stopped the notebook with a KeyError. It is guarded so
# the rest of the notebook still runs. Delete the cell if the grouping is abandoned.
if 'Session_Category' not in df.columns:
    print("Skipping Session_Category summary: df has no 'Session_Category' column.")
else:
    # Display the unique session categories
    print(df['Session_Category'].unique())
    # Count the number of values in each category for 'Session_Category'
    category_counts = df['Session_Category'].value_counts()
    print(category_counts)

    # Separate names, so the Session_Group frames from the previous cell are not overwritten
    category_first_peak_values = df[df['Session_Category'] == 'First Peak']
    category_second_peak_values = df[df['Session_Category'] == 'Second Peak']

    print(category_first_peak_values.count())

    # Share of sessions with revenue per category; NaN (not ZeroDivisionError) when a category is empty
    category_first_peak_revenue_percentage = (
        float('nan') if category_first_peak_values.empty
        else (category_first_peak_values['Revenue'] == 1).mean() * 100
    )
    category_second_peak_revenue_percentage = (
        float('nan') if category_second_peak_values.empty
        else (category_second_peak_values['Revenue'] == 1).mean() * 100
    )
    print(f"First Peak Revenue Percentage: {category_first_peak_revenue_percentage:.2f}%")
    print(f"Second Peak Revenue Percentage: {category_second_peak_revenue_percentage:.2f}%")

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans

N_CLUSTERS = 3

# Columns that must not be clustering inputs:
#  - 'Revenue' is the outcome the clusters are compared on below; feeding it to
#    KMeans makes that comparison circular.
#  - 'Cluster' is this cell's own output; without the exclusion a re-run would
#    cluster on the previous run's labels.
# NOTE(review): derived columns added by earlier cells (standardized / log durations,
# 'Session_Group') are still included and duplicate raw features - decide whether
# they belong in the feature set.
excluded_columns = ['Revenue', 'Cluster']
feature_df = df.drop(columns=excluded_columns, errors='ignore')

#Define the preprocessing for numeric and non-numeric features
numeric_features = feature_df.select_dtypes(include=['int64', 'float64']).columns
non_numeric_features = feature_df.select_dtypes(include=['object', 'category', 'bool']).columns

# Create a column transformer to apply the appropriate transformations
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), non_numeric_features)
    ])

# Create a pipeline that first transforms the data and then applies KMeans.
# n_init is set explicitly because its default differs between scikit-learn versions.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('kmeans', KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=42))
])

# Fit the pipeline to the data
pipeline.fit(feature_df)

# Predict the cluster for each session
df['Cluster'] = pipeline.predict(feature_df)

#Visualize the clusters
# This has to run after the labels exist (it used to sit at the top of the cell).
# 'SessionDuration' lives in `sessions`, not `df`, so it is derived here on a
# plotting copy; `df` itself gets no extra column.
plot_df = df.assign(
    SessionDuration=df['Administrative_Duration'] + df['Informational_Duration'] + df['ProductRelated_Duration']
)
plt.figure(figsize=(12, 8))
sns.scatterplot(x='ProductRelated_Duration', y='SessionDuration', hue='Cluster', data=plot_df, palette='viridis')
plt.title('Clusters of Sessions based on ProductRelated Duration and Session Duration')
plt.xlabel('ProductRelated Duration')
plt.ylabel('Session Duration')
plt.legend(title='Cluster')
plt.show()

# Sessions of each cluster; KMeans labels are 0..N_CLUSTERS-1
cluster_frames = [df[df['Cluster'] == label] for label in range(N_CLUSTERS)]
cluster1, cluster2, cluster3 = cluster_frames

# Create a summary table with the number of sessions in each cluster
cluster_summary_table = pd.DataFrame({
    'Group': ['cluster1','cluster2','cluster3'],
    'Count': [cluster1.shape[0], cluster2.shape[0], cluster3.shape[0]]
})
print(cluster_summary_table)

# Fraction (0-1) of sessions with revenue inside each cluster
cluster1_revenue_percentage = (cluster1[cluster1['Revenue'] == 1].shape[0] / cluster1.shape[0])
cluster2_revenue_percentage = (cluster2[cluster2['Revenue'] == 1].shape[0] / cluster2.shape[0])
cluster3_revenue_percentage = (cluster3[cluster3['Revenue'] == 1].shape[0] / cluster3.shape[0])

print(df[df['Revenue'] == 1].shape[0] / df['Revenue'].shape[0])
print(f"Number of sessions with Revenue in entire dataset: {df[df['Revenue'] == 1].shape[0]:.2f}")
print(f"Clustering Comprises of {cluster1_revenue_percentage * cluster1.shape[0]+cluster2_revenue_percentage * cluster2.shape[0]+cluster3_revenue_percentage * cluster3.shape[0]:.2f}")

# One line per cluster, each under its own name (all three used to print as "cluster1")
for cluster_name, cluster_frame, revenue_fraction in (
    ('cluster1', cluster1, cluster1_revenue_percentage),
    ('cluster2', cluster2, cluster2_revenue_percentage),
    ('cluster3', cluster3, cluster3_revenue_percentage),
):
    print(f"{cluster_name} Revenue Percentage: {revenue_fraction * 100:.2f}% with {revenue_fraction * cluster_frame.shape[0]:.2f} sessions with rev. values")


