<a href="https://colab.research.google.com/github/R1shiY/Projects/blob/main/User_Journey.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
#Initial preprocessing

import pandas as pd
#import the library

# Load the raw export, drop rows whose journey string repeats an earlier row,
# and turn each dash-separated journey string into a list of page names.
pages = (
    pd.read_csv('user_journey_raw.csv')
    .drop_duplicates(subset=['user_journey'])
    .assign(user_journey=lambda frame: frame['user_journey'].str.split('-'))
)

# Preview the first rows (kept as the cell's last expression so it is rendered).
pages.head()

Unnamed: 0,user_id,session_id,subscription_type,user_journey
0,1516,2980231,Annual,"[Homepage, Log in, Log in, Log in, Log in, Log..."
1,1516,2980248,Annual,"[Other, Sign up, Sign up, Sign up, Sign up, Si..."
2,1516,2992252,Annual,"[Log in, Log in, Log in, Log in, Log in, Log in]"
3,1516,3070491,Annual,"[Homepage, Log in, Log in, Log in, Log in, Log..."
4,1516,3709807,Annual,"[Log in, Log in, Log in, Log in, Log in, Log i..."


In [8]:
#Page count

def explode_user_journey(pages_df, column_name):
    """Return ``pages_df`` with every element of the lists in ``column_name`` on its own row.

    Args:
        pages_df: DataFrame containing a list-valued column.
        column_name: Name of the column to explode.

    Returns:
        A new DataFrame produced by ``DataFrame.explode``; the other columns and
        the index label are repeated for each element of the exploded list.
    """
    exploded_df = pages_df.explode(column_name)
    return exploded_df

def count_page_occurrences(pages_df, column_name):
    """Count how many times each page occurs across all journeys.

    Every occurrence counts, including repeated visits within one journey.

    Args:
        pages_df: DataFrame whose ``column_name`` holds one list of pages per row.
        column_name: Name of the list-valued journey column.

    Returns:
        The ``value_counts()`` Series of the exploded column (page -> occurrences).
    """
    # One row per page visit, produced by the shared explode helper.
    visits = explode_user_journey(pages_df, column_name)[column_name]
    # Tally the rows per page name.
    return visits.value_counts()

# Tally every page occurrence across all journeys; the result is kept in
# page_counts so it can be reused later in the notebook.
page_counts = count_page_occurrences(pages_df=pages, column_name='user_journey')
# Show the tally.
print(page_counts)

user_journey
Courses                     6012
Career tracks               4507
Sign up                     3863
Log in                      3619
Career track certificate    2952
Checkout                    2721
Homepage                    2233
Resources center            2103
Pricing                     1754
Other                       1580
Coupon                      1435
Course certificate          1104
Success stories              604
Upcoming courses             188
Instructors                   76
Blog                          34
About us                      29
Name: count, dtype: int64


In [9]:
#Page presence

def process_user_journey(pages_df, column):
    """Count, for every page, how many journeys contain it at least once.

    Unlike a plain occurrence count, a page visited several times within the
    same journey contributes only 1 for that journey.

    Args:
        pages_df: DataFrame whose ``column`` holds one list of page names per row.
            The frame is not modified.
        column: Name of the column containing the journey lists.

    Returns:
        The ``value_counts()`` Series of the de-duplicated, exploded journeys
        (page -> number of journeys containing it).
    """
    # Build the de-duplicated journeys as a standalone Series instead of writing
    # a 'unique_pages' column onto pages_df: calling a reporting function should
    # not add columns to the caller's frame.
    unique_pages = pages_df[column].apply(set).rename('unique_pages')

    # One row per (journey, page) pair. An empty journey explodes to NaN, which
    # value_counts() drops by default.
    page_presence = unique_pages.explode().value_counts()

    return page_presence

# Show in how many journeys each page is present.
print(process_user_journey(pages_df=pages, column='user_journey'))


unique_pages
Homepage                    949
Sign up                     669
Courses                     663
Pricing                     610
Career tracks               599
Log in                      484
Other                       362
Career track certificate    331
Resources center            236
Checkout                    196
Course certificate          187
Upcoming courses            101
Coupon                       60
Success stories              49
Instructors                  26
About us                     19
Blog                         14
Name: count, dtype: int64


In [21]:
#Page destination

def calculate_page_transitions(pages_df, column):
    """Count how often each page is immediately followed by each other page.

    Args:
        pages_df: DataFrame whose ``column`` holds one list of page names per row.
        column: Name of the column containing the journey lists.

    Returns:
        DataFrame with the columns ``current_page``, ``follow_up`` and ``count``,
        one row per distinct (current page, next page) pair. Rows are ordered by
        the first appearance of the current page, then by the first appearance
        of the follow-up page for that current page. The three columns are
        present even when there are no transitions at all.
    """
    # transitions[current_page][follow_page] -> number of times seen
    transitions = {}

    for journey in pages_df[column]:
        # Pair every page with its successor; a journey of 0 or 1 pages yields no pairs.
        for current_page, follow_page in zip(journey, journey[1:]):
            follow_counts = transitions.setdefault(current_page, {})
            follow_counts[follow_page] = follow_counts.get(follow_page, 0) + 1

    # Flatten the nested dictionary into one record per transition.
    records = [
        {'current_page': current_page, 'follow_up': follow_page, 'count': count}
        for current_page, follow_counts in transitions.items()
        for follow_page, count in follow_counts.items()
    ]

    # Name the columns explicitly: with no records pandas would otherwise build a
    # frame without columns, and a later sort_values(by='count') would raise KeyError.
    return pd.DataFrame(records, columns=['current_page', 'follow_up', 'count'])

# Build the table of page-to-page transitions from every journey.
transition_df = calculate_page_transitions(pages_df=pages, column='user_journey')
# Order the transitions from most to least frequent.
transition_df_sorted = transition_df.sort_values('count', ascending=False)
# Show the ordered table.
print(transition_df_sorted)

                 current_page                 follow_up  count
103                   Courses                   Courses   5070
76              Career tracks             Career tracks   3522
41                    Sign up                   Sign up   3102
17                     Log in                    Log in   3065
115  Career track certificate  Career track certificate   2495
..                        ...                       ...    ...
135               Instructors                    Log in      1
16                   Homepage                    Coupon      1
14                   Homepage                  Checkout      1
154           Success stories                   Courses      1
191                  About us                  Homepage      1

[192 rows x 3 columns]


In [11]:
#Page sequences

def find_most_popular_sequence(pages_df, column, N):
    """Find the run of N consecutive pages that occurs in the most journeys.

    A sequence is counted at most once per journey, no matter how many times it
    repeats inside that journey. Ties are resolved in favour of the sequence
    that was encountered first.

    Args:
        pages_df: Object whose ``column`` yields one list of page names per journey.
        column: Name of the column containing the journey lists.
        N: Length of the sequences to look for; must be at least 1.

    Returns:
        A string of the form ``"Num pages: <N>, Sequence: <p1-p2-...>, Count: <count>"``.

    Raises:
        ValueError: If ``N`` is smaller than 1, or if no journey has at least
            ``N`` pages (so no sequence of that length exists).
    """
    if N < 1:
        raise ValueError("N must be at least 1, got N=" + str(N))

    # sequence string -> number of journeys containing it
    sequence_counts = {}

    for visit in pages_df[column]:
        # dict.fromkeys removes repeats within this journey while keeping the
        # first-seen order. The range is empty for journeys shorter than N.
        sequences_in_visit = dict.fromkeys(
            '-'.join(visit[start:start + N]) for start in range(len(visit) - N + 1)
        )
        for sequence in sequences_in_visit:
            sequence_counts[sequence] = sequence_counts.get(sequence, 0) + 1

    if not sequence_counts:
        # Without this guard max() fails with an unhelpful "empty sequence" message.
        raise ValueError(
            "No journey in column '" + str(column) + "' has at least N=" + str(N) + " pages"
        )

    # Key with the highest count; max() returns the first one on ties.
    most_popular_sequence = max(sequence_counts, key=sequence_counts.get)
    count = sequence_counts[most_popular_sequence]

    # Numbers are cast to str so the whole report fits in one concatenated line.
    return "Num pages: " + str(N) + ", Sequence: " + most_popular_sequence + ", Count: " + str(count)

# Report the most common run of three consecutive pages (N=3).
print(find_most_popular_sequence(pages_df=pages, column='user_journey', N=3))

Num pages: 3, Sequence: Courses-Courses-Courses, Count: 513


In [19]:
#Journey length

def calculate_mean_journey_length(pages_df, column):
    """Return the mean number of pages per journey.

    The length of a journey is the number of entries in its list, so repeated
    visits to the same page are all counted.

    Args:
        pages_df: DataFrame whose ``column`` holds one list of page names per row.
            The frame is not modified.
        column: Name of the column containing the journey lists.

    Returns:
        The mean of the per-row list lengths.
    """
    # Measure each journey on a temporary Series rather than storing a
    # 'journey_length' column on pages_df, so the caller's frame stays untouched.
    journey_lengths = pages_df[column].apply(len)

    return journey_lengths.mean()

# Compute the mean journey length and print it with a label.
print('Average Journey Length: ' + str(calculate_mean_journey_length(pages_df=pages, column='user_journey')))

Average Journey Length: 18.910374796306357


In [18]:
# EDA Report

# Introduction
# The goal of this exploratory data analysis (EDA) is to understand user behavior.
# The dataset user_journey_raw.csv contains records of user journeys,
# each journey being represented as a sequence of visited pages.

# Analysis

# 1. Page Count
print(page_counts)
# Insights:
# - The most frequently visited page is Courses, followed by Career tracks and Sign up.
# - The least visited pages are About us and Blog.

print()

# 2. Page Presence
print(process_user_journey(pages_df=pages, column='user_journey'))
# Insights:
# - The home page is present in more journeys than any other page (949), which is
#   roughly half of the journeys analysed rather than almost all of them.
# - Blog and About us are present in very few journeys.

print()

# 3. Page Destination
print(transition_df_sorted)
# Insights:
# - The most common transition is from Courses to Courses.
# - The transition from Career tracks to Career tracks is also frequent; the five
#   most frequent transitions all lead from a page back to the same page.
# - Rare transitions include About us to Homepage, which was seen only once.

print()

# 4. Page Sequences
print(find_most_popular_sequence(pages_df=pages, column='user_journey', N=3))
# Insights:
# - The most popular 3-page sequence is Courses-Courses-Courses.
# - Its count was 513 (each journey is counted at most once), so this pattern
#   shows up in a large number of journeys.

print()

# 5. Journey Length
print('Average Journey Length: ' + str(calculate_mean_journey_length(pages_df=pages, column='user_journey')))
# Insights:
# - The average journey length is approximately 19 pages (repeat visits to the same page included).
# - Journeys are long on average, which suggests users navigate for a while before reaching the desired information.

# Conclusion/Recommendations
# 1. Optimize High Traffic Pages: Focus on improving the user experience on the most frequently visited pages (Courses, Career tracks, Sign up).
# 2. Review Low Traffic Pages: Investigate and rework pages with few visits (Blog, About us) to make them more accessible and eye-catching.
# 3. Shorten User Paths: Since the average journey is long, make critical information reachable within a few clicks so journeys become shorter and simpler.

user_journey
Courses                     6012
Career tracks               4507
Sign up                     3863
Log in                      3619
Career track certificate    2952
Checkout                    2721
Homepage                    2233
Resources center            2103
Pricing                     1754
Other                       1580
Coupon                      1435
Course certificate          1104
Success stories              604
Upcoming courses             188
Instructors                   76
Blog                          34
About us                      29
Name: count, dtype: int64

unique_pages
Homepage                    949
Sign up                     669
Courses                     663
Pricing                     610
Career tracks               599
Log in                      484
Other                       362
Career track certificate    331
Resources center            236
Checkout                    196
Course certificate          187
Upcoming courses            101
Co