In [2]:
import pandas as pd
import numpy as np

In [3]:
file_path = r'C:\Users\PC\Desktop\Projects\User Journey Analysis in Python Project\project-files-user-journey-analysis-in-python (1)\user_journey_raw.csv';
df= pd.read_csv(file_path);
df

Unnamed: 0,user_id,session_id,subscription_type,user_journey
0,1516,2980231,Annual,Homepage-Log in-Log in-Log in-Log in-Log in-Lo...
1,1516,2980248,Annual,Other-Sign up-Sign up-Sign up-Sign up-Sign up-...
2,1516,2992252,Annual,Log in-Log in-Log in-Log in-Log in-Log in
3,1516,3070491,Annual,Homepage-Log in-Log in-Log in-Log in-Log in-Lo...
4,1516,3709807,Annual,Log in-Log in-Log in-Log in-Log in-Log in-Log ...
...,...,...,...,...
9930,509095,4487613,Annual,Other-Other-Other-Other-Other-Other-Other-Othe...
9931,509095,4842565,Annual,Other-Other-Other-Other-Other-Other-Other-Othe...
9932,509095,4843103,Annual,Other-Other
9933,509095,4845316,Annual,Other-Other-Other-Other-Other-Other-Other-Othe...


In [4]:
df.dtypes

user_id               int64
session_id            int64
subscription_type    object
user_journey         object
dtype: object

## Data Preprocessing

In [5]:
# Make a copy of the DataFrame to avoid modifying the original
# Split the journey string into individual pages
# Initialize a new list to store the cleaned pages
# Iterate over pages, adding each page only if it's different from the last added
# Join the cleaned pages back into a single string
# Apply the function to each row in the user_journey column
# Display the cleaned DataFrame

def remove_page_duplicates(data ,target_column='user_journey'):
    df_copy = data.copy()
    def remove_sequential_duplicates(journey):
        if isinstance(journey , str):
            pages = journey.split('-')
            cleaned_pages = []
            for page in pages:
                if not  cleaned_pages or page!=cleaned_pages[-1] :
                    cleaned_pages.append(page)
            return '-'.join(cleaned_pages)
        else :
            return journey
    df_copy[target_column] = df_copy[target_column].apply(remove_sequential_duplicates)
    return df_copy



In [6]:
cleaned_df = remove_page_duplicates(df, 'user_journey')
cleaned_df

Unnamed: 0,user_id,session_id,subscription_type,user_journey
0,1516,2980231,Annual,Homepage-Log in-Other
1,1516,2980248,Annual,Other-Sign up-Log in
2,1516,2992252,Annual,Log in
3,1516,3070491,Annual,Homepage-Log in
4,1516,3709807,Annual,Log in
...,...,...,...,...
9930,509095,4487613,Annual,Other
9931,509095,4842565,Annual,Other
9932,509095,4843103,Annual,Other
9933,509095,4845316,Annual,Other


In [7]:
# Create a copy of the DataFrame to avoid modifying the original
# Sort the DataFrame by the group_column and session_id to ensure proper ordering within each user's sessions
# Define a function to concatenate journeys based on specified sessions and count_from
# If sessions is 'All', concatenate all journeys for the user
# If sessions is an integer, select journeys based on count_from
# Concatenate the selected journeys into one big string
# Group by the group_column and apply the concatenate_journeys function
# Rename the columns for clarity

def group_by(data , group_column='user_id' , target_column='user_journey' , sessions='All' , count_from='last'):
    df_copy = data.copy()
    df_copy = df_copy.sort_values(by=[group_column, 'session_id']).reset_index(drop=True)
    def concatenate_journeys(group):
        if sessions == 'All':
            selected_journey = group[target_column]
        else :
            num_sessions = int(sessions)
            if count_from == 'first':
                selected_journey = group[target_column].head(num_sessions)
            elif count_from == 'last':
                selected_journey = group[target_column].tail(num_sessions)
            else:
                raise ValueError ("Count_from must be either 'first' or 'last'") 
        return '-'.join(selected_journey)
    grouped_df = df_copy.groupby(group_column,group_keys=False).apply(concatenate_journeys).reset_index()
    grouped_df.columns = [group_column , target_column]

    return grouped_df

In [8]:
"""
    Groups user sessions into a single journey string based on specified criteria.

    Parameters:
        data (pd.DataFrame): The DataFrame containing the journey data.
        group_column (str): The column to group by (default is 'user_id').
        target_column (str): The column containing journey strings (default is 'user_journey').
        sessions (int or str): The number of sessions to group (default is 'All' to include all sessions).
        count_from (str): Whether to count sessions from 'first' or 'last' (default is 'last').

    Returns:
        pd.DataFrame: A new DataFrame with grouped journeys for each user.
    """
def group_by(data, group_column='user_id', target_column='user_journey', sessions='All', count_from='last'):
    
    # Create a copy of the DataFrame to avoid modifying the original
    df_copy = data.copy()

    # Sort by group_column and session_id to ensure proper ordering within each user's sessions
    df_copy = df_copy.sort_values(by=[group_column, 'session_id']).reset_index(drop=True)

    # Define a function to concatenate journeys based on the specified sessions and count_from
    def concatenate_journeys(group):
        if sessions == 'All':
            selected_journey = group[target_column]
        else:
            num_sessions = int(sessions)
            if count_from == 'first':
                selected_journey = group[target_column].head(num_sessions)
            elif count_from == 'last':
                selected_journey = group[target_column].tail(num_sessions)
            else:
                raise ValueError("count_from must be either 'first' or 'last'")
        
        # Concatenate the selected journeys into one big string
        return '-'.join(selected_journey)
    
    # Group by the specified column and apply the concatenate_journeys function
    grouped_df = df_copy.groupby(group_column, group_keys=False).apply(concatenate_journeys).reset_index()

    # Rename the columns for clarity
    grouped_df.columns = [group_column, target_column]

    return grouped_df


In [21]:
### Assuming your DataFrame is called df
### Example 1: Group all sessions for each user
### Example 2: Group only the first 10 sessions for each user
### Example 3: Group only the last 3 sessions for each user
### Display the result


group_df_all = group_by(df , 'user_id' , 'user_journey' , sessions='All')
#print(group_df_all.head())

group_df_10 = group_by(df , 'user_id' , 'user_journey' , sessions=10 , count_from='last')
##print(group_df_10)

grouped_df_last_3 = group_by(df, group_column='user_id', target_column='user_journey', sessions=3, count_from='last')
print(grouped_df_last_3)



  grouped_df = df_copy.groupby(group_column, group_keys=False).apply(concatenate_journeys).reset_index()
  grouped_df = df_copy.groupby(group_column, group_keys=False).apply(concatenate_journeys).reset_index()


      user_id                                       user_journey
0        1516  Checkout-Checkout-Checkout-Checkout-Checkout-C...
1        3395  Homepage-Pricing-Pricing-Checkout-Checkout-Che...
2       10107  Checkout-Checkout-Checkout-Checkout-Checkout-C...
3       11145  Homepage-Log in-Log in-Log in-Homepage-Log in-...
4       12400  Other-Career track certificate-Career track ce...
...       ...                                                ...
1345   509060  Other-Other-Other-Other-Other-Other-Other-Othe...
1346   509061  Coupon-Coupon-Coupon-Coupon-Coupon-Coupon-Coup...
1347   509085  Coupon-Coupon-Coupon-Coupon-Coupon-Coupon-Coup...
1348   509095  Other-Other-Other-Other-Other-Other-Other-Othe...
1349   509096  Other-Other-Other-Other-Other-Other-Coupon-Cou...

[1350 rows x 2 columns]


  grouped_df = df_copy.groupby(group_column, group_keys=False).apply(concatenate_journeys).reset_index()


In [10]:
#The function should iterate over the rows in the specified column.
#It will split the journey by a specified delimiter (assuming the delimiter is '-' in this case).
#It will remove any pages that match those in the pages list.
#Finally, it returns the modified dataframe.

def remove_pages(data , pages , target_column='user_journey'):
    data_copy = data.copy()
    def filter_pages(journey):
        # Only process if journey is a string
        if isinstance(journey, str):
            journey_pages = journey.split('-')
            # Remove pages listed in the `pages` parameter
            filtered_pages = [page for page in journey_pages if page not in pages]
            return '-'.join(filtered_pages)
        else:
            return journey
            
    data_copy[target_column] = data_copy[target_column].apply(filter_pages)
    return data_copy

pages_to_remove = ['Log in']

cleaned_df = remove_pages(df , pages_to_remove)

##print(cleaned_df)


In [16]:
### Step 1: Apply the `group_by` function to group all sessions for each user
# Step 2: Remove sequential duplicates in the user journey using `remove_page_duplicates`
# Step 3: (Optional for now, based on your requirements) Remove specific pages
# If you don’t need to remove any pages yet, you can skip this step
# Otherwise, specify the pages to remove and apply `remove_pages`
# Step 4: Save the final cleaned DataFrame to a CSV file

grouped_df_no_duplicates = remove_page_duplicates(df, target_column='user_journey')

grouped_all_sessions_df = group_by(grouped_df_no_duplicates, group_column ='user_id' ,target_column='user_journey' , sessions='All');
pages_to_remove = ['Log in']
cleaned_df = remove_pages(grouped_all_sessions_df ,pages= pages_to_remove , target_column='user_journey')

cleaned_df.to_csv('cleaned_user_journey/cleaned_user_journey.csv', index=False)
print("CSV file 'cleaned_user_journey.csv' created successfully.")



CSV file 'cleaned_user_journey.csv' created successfully.


  grouped_df = df_copy.groupby(group_column, group_keys=False).apply(concatenate_journeys).reset_index()
