In [1]:
import pandas as pd
import plotly.express as px


# Define the schema as a dictionary for easier handling
schema = {
    "visitor_id": str,
    "site_url": str,
    "page_view_url": str,
    "timestamp": int
}

# Define the column names according to your schema
column_names = ["visitor_id", "site_url", "page_view_url", "timestamp"]

# Load the CSV files
files = ["input_1.csv", "input_2.csv", "input_3.csv"]  # Adjust paths as necessary
dfs = [pd.read_csv(file, names=column_names, dtype=schema) for file in files]

# Concatenate all DataFrames
df = pd.concat(dfs, ignore_index=True)

df = df.dropna().drop_duplicates()

# Ensure the timestamp is treated as datetime
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')


In [2]:
# Exploratory Data Analysis (EDA)
# Count total number of rows
total_rows = len(df)
print(f"Total number of rows: {total_rows}")

# Count number of unique visitors
unique_visitors = df["visitor_id"].nunique()
print(f"Number of unique visitors: {unique_visitors}")

# Count number of unique sites
unique_sites = df["site_url"].nunique()
print(f"Number of unique sites: {unique_sites}")

# Find the most visited site
most_visited_site = df["site_url"].value_counts().index[0]
print(f"Most visited site: {most_visited_site}")

timestamp_min_max = df[["timestamp"]].agg(["min", "max"])
print(timestamp_min_max)

Total number of rows: 145967
Number of unique visitors: 10000
Number of unique sites: 10
Most visited site: www.s_6.com
              timestamp
min 2012-09-17 01:14:02
max 2012-09-17 19:44:14


In [3]:
# Sort DF by timestamp for the overlaps, then also by visitor ID and Site URL and reset the index:
df = df.sort_values(['timestamp', 'visitor_id', 'site_url', 'page_view_url']).reset_index(drop=True)

# Create next_timestamp column to be used in sessions length. 
df['next_timestamp'] = df.groupby(['visitor_id', 'site_url'])['timestamp'].shift(-1)

# calculate every sessions length based on the next timestamp. 
#And assign 0 where session_length is null for the 1 ppage only sessions:
df['session_length'] = (df['next_timestamp'] - df['timestamp']).dt.total_seconds()
df.loc[df['session_length'].isnull(), 'session_length'] = 0

# Assigns a Unique Session ID to each session for each user on each site. 
#The session ID increments each time a new session starts (i.e., when the session length exceeds 30 minutes) 
# using a cumulative sum of a boolean condition (session length > 30 minutes) within each group defined by visitor_id and site_url.
df['new_session'] = (df['session_length'] > 30*60).astype(int)
df['session_id'] = df.groupby(['visitor_id', 'site_url'])['new_session'].cumsum() + 1

# Calculate the actual session length as the difference between the first and last timestamp in each session.
#Its the the difference between the first and last timestamp in each session. This ensures that the session length is calculated correctly.
df['session_start'] = df.groupby(['visitor_id', 'site_url', 'session_id'])['timestamp'].transform('min')
df['session_end'] = df.groupby(['visitor_id', 'site_url', 'session_id'])['timestamp'].transform('max')
df['actual_session_length'] = (df['session_end'] - df['session_start']).dt.total_seconds()

df_filtered = df[df['actual_session_length'].notnull()]


In [4]:
# Actual Session Length count BoxPlot
fig_session_length = px.box(df_filtered, y="actual_session_length")
fig_session_length.update_layout(
    title_text="Actual Session Length Distribution",
    yaxis_title="Length"
)
fig_session_length.show()

# Grouping by 'session_id' and counting occurrences bar graph
session_id_counts = df_filtered.groupby('session_id').size().reset_index(name='counts')

fig_session_id = px.bar(session_id_counts, x='session_id', y='counts', labels={'counts': 'Count'})
fig_session_id.update_layout(
    title_text="Sessions Distribution",
    xaxis_title="Session IDs",
    yaxis_title="Count"
)
fig_session_id.show()


In [5]:
# Calculate Q1, Q3, and IQR for actual_session_length
Q1_session_length = df_filtered["actual_session_length"].quantile(0.25)
Q3_session_length = df_filtered["actual_session_length"].quantile(0.75)
IQR_session_length = Q3_session_length - Q1_session_length

# Calculate lower and upper bounds for actual_session_length outliers
lower_bound_session_length = Q1_session_length - 1.5 * IQR_session_length
upper_bound_session_length = Q3_session_length + 1.5 * IQR_session_length

# Calculate percentage of outliers for actual_session_length
total_count = len(df_filtered)
outliers_count_session_length = len(df_filtered[(df_filtered["actual_session_length"] < lower_bound_session_length) | (df_filtered["actual_session_length"] > upper_bound_session_length)])
percentage_outliers_session_length = (outliers_count_session_length / total_count) * 100

# Calculate Q1, Q3, and IQR for session_id
Q1_session_id = df_filtered["session_id"].quantile(0.25)
Q3_session_id = df_filtered["session_id"].quantile(0.75)
IQR_session_id = Q3_session_id - Q1_session_id

# Calculate lower and upper bounds for session_id outliers
lower_bound_session_id = Q1_session_id - 1.5 * IQR_session_id
upper_bound_session_id = Q3_session_id + 1.5 * IQR_session_id

# Calculate percentage of outliers for session_id
outliers_count_session_id = len(df_filtered[(df_filtered["session_id"] < lower_bound_session_id) | (df_filtered["session_id"] > upper_bound_session_id)])
percentage_outliers_session_id = (outliers_count_session_id / total_count) * 100

# Display the results
print(f"Percentage of actual_session_length outliers: {percentage_outliers_session_length}%")
print(f"Percentage of session_id outliers: {percentage_outliers_session_id}%")


Percentage of actual_session_length outliers: 18.59872436920674%
Percentage of session_id outliers: 18.875499256681305%


In [12]:
# Requested queries:
def num_sessions(df_filtered, site_url):
    return df_filtered[df_filtered["site_url"] == site_url].groupby(['visitor_id', 'session_id']).ngroups

def num_unique_visited_sites(df_filtered, visitor_id):
    return df_filtered[df_filtered["visitor_id"] == visitor_id]["site_url"].nunique()

def median_session_length(df_filtered, site_url):
    session_lengths = df_filtered[df_filtered["site_url"] == site_url].drop_duplicates(subset=['visitor_id', 'site_url', 'session_id'])['actual_session_length']
    return session_lengths.median()

In [15]:
# Get user input
command = input("Enter command: ")
site_url = input("Enter site URL: ")
visitor_id = input("Enter visitor ID: ")

# Example usage based on the command input
if command == "num_sessions":
    if df_filtered[df_filtered["site_url"] == site_url].empty:
        print("Site URL not found")
    else:
        print(f"Executing num_sessions command for site URL: {site_url}, number of sessions is:")
        print(num_sessions(df_filtered, site_url))
elif command == "median_session_length":
    if df_filtered[df_filtered["site_url"] == site_url].empty:
        print("Site URL not found")
    else:
        print(f"Executing median_session_length command for site URL: {site_url}, median session length is:")
        print(median_session_length(df_filtered, site_url))
elif command == "num_unique_visited_sites":
    if df_filtered[df_filtered["visitor_id"] == visitor_id].empty:
        print("Visitor ID not found")
    else:
        print(f"Executing num_unique_visited_sites command for visitor ID: {visitor_id}, amount of unique visited sites is:")
        print(num_unique_visited_sites(df_filtered, visitor_id))
    
# For num_sessions and median_session_length fill only site URL, for num_unique_visited_sites fill bisitor ID.

Enter command: median_session_length
Enter site URL: www.s_8.com
Enter visitor ID: visitor_4297
Executing median_session_length command for site URL: www.s_8.com, median session length is:
1607.0


In [7]:
df

Unnamed: 0,visitor_id,site_url,page_view_url,timestamp,next_timestamp,session_length,new_session,session_id,session_start,session_end,actual_session_length
0,visitor_4297,www.s_8.com,www.s_8.com/page_1,2012-09-17 01:14:02,2012-09-17 01:23:42,580.0,0,1,2012-09-17 01:14:02,2012-09-17 01:35:30,1288.0
1,visitor_6267,www.s_10.com,www.s_10.com/page_1,2012-09-17 01:14:03,2012-09-17 01:23:13,550.0,0,1,2012-09-17 01:14:03,2012-09-17 01:57:37,2614.0
2,visitor_7565,www.s_6.com,www.s_6.com/page_1,2012-09-17 01:14:06,2012-09-17 01:23:51,585.0,0,1,2012-09-17 01:14:06,2012-09-17 01:30:40,994.0
3,visitor_3227,www.s_1.com,www.s_1.com/page_1,2012-09-17 01:14:10,2012-09-17 01:23:37,567.0,0,1,2012-09-17 01:14:10,2012-09-17 01:42:28,1698.0
4,visitor_3514,www.s_9.com,www.s_9.com/page_1,2012-09-17 01:14:11,2012-09-17 01:23:18,547.0,0,1,2012-09-17 01:14:11,2012-09-17 01:49:30,2119.0
...,...,...,...,...,...,...,...,...,...,...,...
145962,visitor_8335,www.s_9.com,www.s_9.com/page_1,2012-09-17 19:15:54,2012-09-17 19:23:41,467.0,0,1,2012-09-17 19:15:54,2012-09-17 19:44:14,1700.0
145963,visitor_8335,www.s_9.com,www.s_9.com/page_2,2012-09-17 19:23:41,2012-09-17 19:32:02,501.0,0,1,2012-09-17 19:15:54,2012-09-17 19:44:14,1700.0
145964,visitor_8335,www.s_9.com,www.s_9.com/page_3,2012-09-17 19:32:02,2012-09-17 19:37:45,343.0,0,1,2012-09-17 19:15:54,2012-09-17 19:44:14,1700.0
145965,visitor_8335,www.s_9.com,www.s_9.com/page_4,2012-09-17 19:37:45,2012-09-17 19:44:14,389.0,0,1,2012-09-17 19:15:54,2012-09-17 19:44:14,1700.0
