<a href="https://colab.research.google.com/github/R1shiY/Projects/blob/main/User_Journey.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
#Initial preprocessing

import pandas as pd
#import the library

pages = pd.read_csv('user_journey_raw.csv')
#import the dataset

pages = pages.drop_duplicates(subset = ['user_journey'])
#remove the duplicates

pages = pages.groupby(['session_id', 'user_id'])
#group the dataset by these two columns

pages.head()
#display the data

Unnamed: 0,user_id,session_id,subscription_type,user_journey
0,1516,2980231,Annual,Homepage-Log in-Log in-Log in-Log in-Log in-Lo...
1,1516,2980248,Annual,Other-Sign up-Sign up-Sign up-Sign up-Sign up-...
2,1516,2992252,Annual,Log in-Log in-Log in-Log in-Log in-Log in
3,1516,3070491,Annual,Homepage-Log in-Log in-Log in-Log in-Log in-Lo...
4,1516,3709807,Annual,Log in-Log in-Log in-Log in-Log in-Log in-Log ...
...,...,...,...,...
9907,509000,4811514,Annual,Homepage-Pricing-Pricing-Pricing-Pricing-Prici...
9910,509017,4685936,Annual,Homepage-Sign up-Homepage-Courses-Courses-Cour...
9914,509017,4840618,Annual,Other-Career tracks-Career tracks-Courses-Cour...
9923,509036,4841754,Annual,Other-Other-Coupon-Coupon-Coupon-Coupon-Coupon...


In [14]:
#Page count

pages = pd.read_csv('user_journey_raw.csv')
#import the data

pages = pages.drop_duplicates(subset=['user_journey'])
#drop the duplicates

pages['user_journey'] = pages['user_journey'].str.split('-')
#split each string in the user_journey column into an array of passages, instead of a singular string with dashes

pages.head()

#display data for testing purposes

split_pages = pages.explode('user_journey')
#explode(split) the array into individual strings per row

page_count = split_pages['user_journey'].value_counts()
#counts the occurences of each individual page

print(page_count)
#display the counts

user_journey
Courses                     6012
Career tracks               4507
Sign up                     3863
Log in                      3619
Career track certificate    2952
Checkout                    2721
Homepage                    2233
Resources center            2103
Pricing                     1754
Other                       1580
Coupon                      1435
Course certificate          1104
Success stories              604
Upcoming courses             188
Instructors                   76
Blog                          34
About us                      29
Name: count, dtype: int64


In [15]:
#Page presence

pages = pd.read_csv('user_journey_raw.csv')
#import the data

pages = pages.drop_duplicates(subset=['user_journey'])
#drop the duplicates

pages['user_journey'] = pages['user_journey'].str.split('-')
#split each string in the user_journey column into an array of passages, instead of a singular string with dashes

pages['unique_pages'] = pages['user_journey'].apply(set)
#converts each list in the user_journey column to a set, allowing for a singular occurence of the page from the list per row

split_pages = pages.explode('unique_pages')
#explode(split) the array into individual strings per row

page_presence = split_pages['unique_pages'].value_counts()
#counts the occurences of each individual page

print(page_presence)
#display the output

unique_pages
Homepage                    949
Sign up                     669
Courses                     663
Pricing                     610
Career tracks               599
Log in                      484
Other                       362
Career track certificate    331
Resources center            236
Checkout                    196
Course certificate          187
Upcoming courses            101
Coupon                       60
Success stories              49
Instructors                  26
About us                     19
Blog                         14
Name: count, dtype: int64


In [20]:
#Page destination

pages = pd.read_csv('user_journey_raw.csv')
#import the data

pages = pages.drop_duplicates(subset=['user_journey'])
#drop the duplicates

pages['user_journey'] = pages['user_journey'].str.split('-')
#split each string in the user_journey column into an array of passages, instead of a singular string with dashes

user = {}
#initialize an empty dictionary to record page transitions
for journey in pages['user_journey']:
 #loops through every journey
    for i in range(len(journey) - 1):
      #loops through each page in the journey besides last
        current_page = journey[i]
        #assigns current page in the journey
        follow_page = journey[i + 1]
        #assigns next page in the journey
        if current_page not in user:
          #if the current page is not in the user, initliaze an empty dictionary in place of the column of the new page
            user[current_page] = {}

        if follow_page not in user[current_page]:
          #if the follow page is not yet in the place in a row next to the current page, create that row and assign it a value of 1
            user[current_page][follow_page] = 1

        else:
          #if both are already present, increment the count by one due to the occurence
          user[current_page][follow_page] += 1

user_journey = []
#initialize an empty array to convert the dictionary
for current_page in user:
  # loop over each current page in the transitions dictionary
    follow_pages = user[current_page]
    #get the dictionary of next pages for the current page present
    for follow_page in follow_pages:
      #loop over the next pages
        count = follow_pages[follow_page]
        #get the count for the transition from current page to next page in the dictionary
        user_journey.append({'current_page': current_page, 'follow_up': follow_page, 'count': count})
        #append a dictionary of the row you just looped over to the list

transition_df = pd.DataFrame(user_journey)
#convert the list to a dataframe
print(transition_df)
#print the data frame
transition_df.head()
#shows categorical distributions on a graph

    current_page         follow_up  count
0       Homepage            Log in    112
1       Homepage           Pricing    258
2       Homepage          Homepage    950
3       Homepage     Career tracks    295
4       Homepage  Resources center     48
..           ...               ...    ...
187     About us       Instructors      1
188     About us           Courses      1
189     About us  Upcoming courses      1
190     About us   Success stories      1
191     About us          Homepage      1

[192 rows x 3 columns]


Unnamed: 0,current_page,follow_up,count
0,Homepage,Log in,112
1,Homepage,Pricing,258
2,Homepage,Homepage,950
3,Homepage,Career tracks,295
4,Homepage,Resources center,48



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.



In [27]:
#Page sequences

pages = pd.read_csv('user_journey_raw.csv')
#import the data

pages = pages.drop_duplicates(subset=['user_journey'])
#drop the duplicates

pages['user_journey'] = pages['user_journey'].str.split('-')
#split each string in the user_journey column into an array of passages, instead of a singular string with dashes

#initialize a dictionary to count sequences
sequence_counts = {}
#initialize a dictionary to count sequences

N = 3
#defining number for sequence length, can be changed if neccesary (3 is a placeholder)

for visit in pages['user_journey']:
  #loop through each user journey/visit
    unique_sequences = set()
    #ensures each sequence is only counted once per journey, since sets don't allow repeated values

    if len(visit) >= N:
    # only consider journeys with length >= N, as the sequence can't be counted
        for i in range(len(visit) - N + 1):
          #iterate through the journey to find sequences of length N to add/increase their count, and to make sure it doesn't exceed the index, the value stops N before the last index in the list
            sequence = '-'.join(visit[i:i+N])
             #create a string of N consecutive pages (in this case 3), with dashes in between each

            if sequence not in unique_sequences:
            #sees if the created sequence is not in the unique_sequences
                if sequence in sequence_counts:
                    sequence_counts[sequence] += 1
                #if the sequence is already in the dictionary, increase the count by 1
                else:
                    sequence_counts[sequence] = 1
                #else if the sequence is not in the dictionary, add it with a new count of 1
                unique_sequences.add(sequence)
                #add the sequence to the set of repeated sequences so it doesn't get repeated

most_popular_sequence = max(sequence_counts, key=sequence_counts.get)
#finds the most popular sequence entry in the dictionary using the sequence counts dictionary and the get attribute, which is used to retrieve the value associated with a given key (in this case, the max count)
count = sequence_counts[most_popular_sequence]
#gets the count of the most popular sequence entry from the dictionary
print("Num pages: " + str(N) + ", Sequence: " + most_popular_sequence + ", Count: " + str(count))
#displays the length of sequence, the most popular sequence that has the given length, and the count of that sequence. Casts int to string so it can be printed in the same line

Num pages: 3, Sequence: Courses-Courses-Courses, Count: 513


In [None]:
#Journey length

pages = pd.read_csv('user_journey_raw.csv')
#import the data

pages = pages.drop_duplicates(subset=['user_journey'])
#drop the duplicates

pages['user_journey'] = pages['user_journey'].str.split('-')
#split each string in the user_journey column into an array of passages, instead of a singular string with dashes


pages['journey_length'] = pages['user_journey'].apply(len)
#split the array into values of the pages's individual word lengths


journey_length = pages['journey_length'].mean()
#calculate the mean of all lengths

print(journey_length)
#display length

18.910374796306357
