In [253]:
import pandas as pd

In [273]:
def page_counts(data, target_column='user_journey', subscription_select=None):
    '''
    Tally how many times each page occurs across user journeys, counting every repeat visit.

    Parameters:
    data: DataFrame whose target_column holds journeys as hyphen-separated page names
    target_column: name of the journey column, 'user_journey' by default
    subscription_select: None (default) uses every row; otherwise only rows whose
                         'subscription_type' equals this value are counted

    Returns:
    A Series indexed by page name holding the number of visits, as produced by value_counts()
    '''
    # The 'subscription_type' column is only consulted when a filter was requested
    if subscription_select is None:
        journeys = [row[target_column] for _, row in data.iterrows()]
    else:
        journeys = [
            row[target_column]
            for _, row in data.iterrows()
            if row['subscription_type'] == subscription_select
        ]

    # Flatten every selected journey into one list of individual page visits
    visits = [page for journey in journeys for page in journey.split('-')]

    return pd.Series(visits).value_counts()

In [255]:
def page_presence(data, target_column='user_journey', subscription_select=None):
    '''
    Count, for every page, the number of user journeys in which it appears.
    A page is counted at most once per journey, however often it was revisited.

    Parameters:
    data: DataFrame whose target_column holds journeys as hyphen-separated page names
    target_column: name of the journey column, 'user_journey' by default
    subscription_select: None (default) uses every row; otherwise only rows whose
                         'subscription_type' equals this value are counted

    Returns:
    A Series indexed by page name holding the number of journeys containing that page
    '''
    # Filter up front so that rows of other subscription types are skipped entirely.
    # (Previously a non-matching row silently re-used the pages of the row before it.)
    if subscription_select is None:
        selected = data
    else:
        selected = data[data['subscription_type'] == subscription_select]

    mega_list = []
    for journey in selected[target_column]:
        # dict.fromkeys drops repeat visits while keeping first-seen order
        mega_list.extend(dict.fromkeys(journey.split('-')))

    # Each page now occurs once per journey, so value_counts yields journeys per page
    return pd.Series(mega_list).value_counts()

In [256]:
def page_destinations(data, target_column='user_journey', subscription_select=None, page=None):
    '''
    Record, for every page, which page was visited immediately afterwards and how often.

    Parameters:
    data: DataFrame whose target_column holds journeys as hyphen-separated page names
    target_column: name of the journey column, 'user_journey' by default
    subscription_select: None (default) uses every row; otherwise only rows whose
                         'subscription_type' equals this value are used
    page: None (default) returns the results for all pages; a page name returns
          only the destinations reached from that page

    Returns:
    If page is None, a dictionary {page: {next_page: count}} where count is the number
    of times next_page directly followed page (summed over all selected journeys).
    If page is given and was followed by at least one page, the inner dictionary
    {next_page: count} for it. Otherwise prints 'Page not found' and returns None.
    '''
    # Filter up front so that rows of other subscription types are skipped entirely.
    # (Previously a non-matching row silently re-used the pages of the row before it.)
    if subscription_select is None:
        selected = data
    else:
        selected = data[data['subscription_type'] == subscription_select]

    destinations = {}
    for journey in selected[target_column]:
        pages = journey.split('-')
        # Pair every page with its successor; the final page has none and is skipped
        for current_page, next_page in zip(pages, pages[1:]):
            followers = destinations.setdefault(current_page, {})
            followers[next_page] = followers.get(next_page, 0) + 1

    if page is None:
        return destinations
    if page in destinations:
        return destinations[page]

    # Pages that only ever end a journey are not keys, so they are reported here too
    print('Page not found')
    return None


In [257]:
def page_sequences(data, target_column='user_journey', subscription_select=None, start_page=None, run=2):
    '''
    Find the most common sequences of consecutively visited pages of a given length.
    Each distinct sequence is counted at most once per journey.

    Parameters:
    data: DataFrame whose target_column holds journeys as hyphen-separated page names
    target_column: name of the journey column, 'user_journey' by default
    subscription_select: None (default) uses every row; otherwise only rows whose
                         'subscription_type' equals this value are used
                         (e.g. 'Yearly', 'Monthly' or 'Quarterly')
    start_page: None (default) keeps every sequence; a page name keeps only the
                sequences that begin on that page
    run: number of pages in each sequence, 2 by default; must be at least 1

    Returns:
    Dictionary mapping each sequence (pages joined with '-') to the number of journeys
    containing it, ordered by descending count

    Raises:
    ValueError: if run is smaller than 1
    '''
    if run < 1:
        raise ValueError('run must be at least 1')

    # Filter up front so that rows of other subscription types are skipped entirely.
    # (Previously a non-matching row silently re-used the pages of the row before it.)
    if subscription_select is None:
        selected = data
    else:
        selected = data[data['subscription_type'] == subscription_select]

    sequences_count = {}
    for journey in selected[target_column]:
        pages = journey.split('-')

        # Sliding windows of `run` pages; journeys shorter than `run` yield none
        windows = (pages[i:i + run] for i in range(len(pages) - run + 1))

        # dict.fromkeys keeps one entry per distinct sequence in first-seen order
        journey_sequences = dict.fromkeys(
            '-'.join(window)
            for window in windows
            if start_page is None or window[0] == start_page
        )

        for sequence in journey_sequences:
            sequences_count[sequence] = sequences_count.get(sequence, 0) + 1

    # sorted() is stable, so sequences with equal counts keep their first-seen order
    return dict(sorted(sequences_count.items(), key=lambda item: item[1], reverse=True))

In [282]:
def journey_length(data, target_column='user_journey'):
    '''
    Compute the mean number of pages per user journey.
    Pages are counted exactly as they appear in the journey string; any removal of
    consecutive repeats is assumed to have happened during data cleaning.

    Parameters:
    data: DataFrame whose target_column holds journeys as hyphen-separated page names
    target_column: name of the journey column, 'user_journey' by default

    Returns:
    The average number of pages per journey as a float
    '''
    journeys = data[target_column]

    # Total pages over all journeys divided by the number of journeys
    pages_seen = sum(len(journey.split('-')) for journey in journeys)

    return pages_seen / len(journeys)

In [280]:
# Load the cleaned user journey data from CSV into a pandas DataFrame.
# Per the displayed output, the columns are user_id, subscription_type and
# user_journey, where user_journey is a hyphen-separated string of page names.
# NOTE(review): the 'Unnamed: 0' column looks like a previously saved index - confirm.
df_user_journeys = pd.read_csv('user_journey_23_01_to_23_03_CLEANED.csv')
# Bare expression so the notebook renders the DataFrame
df_user_journeys

Unnamed: 0,user_id,subscription_type,user_journey
0,1516,Yearly,Log in-Homepage-Log in-Other-Log in-Other-Sign...
1,3395,Yearly,Log in-Sign up-Log in-Pricing-Sign up-Homepage...
2,10107,Yearly,Homepage-Resources center-Other-Career tracks-...
3,11145,Monthly,Homepage-Log in-Homepage-Log in-Homepage-Log i...
4,12400,Monthly,Homepage-Career tracks-Sign up-Log in-Career t...
...,...,...,...
1432,509060,Yearly,Other
1433,509061,Yearly,Coupon
1434,509085,Yearly,Coupon
1435,509095,Yearly,Other


In [276]:
# Total page visits (repeat visits included) in journeys with subscription_type 'Quarterly'
page_counts(df_user_journeys, subscription_select='Quarterly')

Log in                      134
Homepage                    127
Sign up                      89
Checkout                     68
Career tracks                61
Courses                      56
Pricing                      35
Career track certificate     28
Other                        22
Course certificate           18
Resources center             16
Coupon                        7
Upcoming courses              5
Success stories               4
About us                      1
dtype: int64

In [277]:
# Most common 4-page sequences that start on the 'Pricing' page, across all subscription types
page_sequences(df_user_journeys, start_page='Pricing', run=4)

{'Pricing-Checkout-Pricing-Checkout': 56,
 'Pricing-Checkout-Homepage-Pricing': 47,
 'Pricing-Sign up-Homepage-Pricing': 26,
 'Pricing-Courses-Pricing-Courses': 25,
 'Pricing-Homepage-Pricing-Homepage': 25,
 'Pricing-Log in-Homepage-Log in': 22,
 'Pricing-Sign up-Pricing-Sign up': 21,
 'Pricing-Courses-Homepage-Courses': 18,
 'Pricing-Career tracks-Pricing-Career tracks': 17,
 'Pricing-Career tracks-Courses-Career tracks': 17,
 'Pricing-Homepage-Pricing-Checkout': 16,
 'Pricing-Log in-Homepage-Pricing': 16,
 'Pricing-Career tracks-Homepage-Career tracks': 15,
 'Pricing-Homepage-Pricing-Log in': 14,
 'Pricing-Homepage-Pricing-Courses': 13,
 'Pricing-Log in-Pricing-Log in': 13,
 'Pricing-Checkout-Log in-Homepage': 13,
 'Pricing-Courses-Career tracks-Courses': 13,
 'Pricing-Checkout-Log in-Checkout': 12,
 'Pricing-Sign up-Homepage-Sign up': 11,
 'Pricing-Courses-Homepage-Pricing': 11,
 'Pricing-Career track certificate-Pricing-Career track certificate': 10,
 'Pricing-Career tracks-Sign up

In [279]:
# Load a second journey extract with the same columns as df_user_journeys.
# Judging by the file name it presumably holds only each user's last 3 sessions - TODO confirm.
df_user_journeys_last_3 = pd.read_csv('user_journey_23_01_to_23_03_last_3_sessions.csv')
# Bare expression so the notebook renders the DataFrame
df_user_journeys_last_3

Unnamed: 0,user_id,subscription_type,user_journey
0,1516,Yearly,Checkout
1,3395,Yearly,Homepage-Pricing-Checkout
2,10107,Yearly,Checkout
3,11145,Monthly,Log in-Homepage-Log in
4,12400,Monthly,Resources center-Career track certificate-Reso...
...,...,...,...
1432,509060,Yearly,Other
1433,509061,Yearly,Coupon
1434,509085,Yearly,Coupon
1435,509095,Yearly,Other


In [283]:
# Mean number of pages per journey in the last-3-sessions extract
journey_length(df_user_journeys_last_3)

4.4022268615170494

In [284]:
# Number of journeys in the last-3-sessions extract that contain each page.
# NOTE(review): the output includes cut-off page names ('Checkou', 'Caree', 'C', ...),
# which suggests some journey strings in this CSV were truncated mid-name - verify
# the step that produced the file.
page_presence(df_user_journeys_last_3)

Checkout                    747
Log in                      702
Homepage                    579
Coupon                      534
Sign up                     331
Other                       296
Pricing                     204
Courses                     189
Career tracks               133
Career track certificate     74
Resources center             59
Course certificate           42
Upcoming courses             29
Success stories              12
Instructors                   9
About us                      5
Blog                          5
Checkou                       1
Career track certific         1
C                             1
Caree                         1
Course certi                  1
dtype: int64

In [286]:
# Most common 4-page sequences in the last-3-sessions extract, any start page
page_sequences(df_user_journeys_last_3, run=4)

{'Log in-Homepage-Log in-Checkout': 80,
 'Homepage-Log in-Homepage-Log in': 65,
 'Log in-Homepage-Log in-Homepage': 52,
 'Sign up-Homepage-Sign up-Checkout': 28,
 'Sign up-Log in-Sign up-Log in': 19,
 'Pricing-Checkout-Pricing-Checkout': 17,
 'Log in-Sign up-Log in-Sign up': 17,
 'Courses-Career tracks-Courses-Career tracks': 17,
 'Log in-Homepage-Log in-Coupon': 16,
 'Career tracks-Courses-Career tracks-Courses': 15,
 'Log in-Homepage-Sign up-Log in': 15,
 'Pricing-Checkout-Homepage-Pricing': 14,
 'Checkout-Log in-Homepage-Log in': 14,
 'Log in-Other-Log in-Checkout': 14,
 'Homepage-Sign up-Log in-Checkout': 13,
 'Sign up-Courses-Sign up-Checkout': 13,
 'Sign up-Homepage-Sign up-Log in': 13,
 'Sign up-Log in-Homepage-Sign up': 12,
 'Checkout-Homepage-Pricing-Checkout': 12,
 'Log in-Homepage-Log in-Other': 11,
 'Homepage-Log in-Other-Log in': 11,
 'Checkout-Homepage-Log in-Checkout': 11,
 'Sign up-Courses-Homepage-Courses': 11,
 'Homepage-Career tracks-Homepage-Career tracks': 10,
 'Ca