In [52]:
import json
import pandas as pd

## Loading and processing the datasets

In [54]:
def load_data(csv_path, nrows=None):
    """
    Load and preprocess the Google Analytics customer revenue dataset.

    The JSON-encoded columns are parsed and flattened into ``<column>_<key>``
    columns, the ``customDimensions`` and ``hits`` columns are left out, and a
    few columns are normalised (see Returns).

    Parameters:
        csv_path (str): Path to the CSV file.
        nrows (int, optional): Number of rows to load for testing purposes. Loads all rows if None.

    Returns:
        pd.DataFrame: Processed DataFrame in which
            - every JSON column is replaced by its flattened sub-columns,
            - ``totals_pageviews`` has missing values replaced by 0,
            - ``totals_transactionRevenue`` is an int64 column holding 0 where
              the value was missing (or for every row when none of the loaded
              rows carries the field),
            - ``device_isMobile`` is an integer column,
            - every object-dtype column is cast with ``astype(str)``.

    Raises:
        KeyError: If one of the JSON columns is absent from the CSV, or if the
            flattened data has no ``device_isMobile`` column.
    """

    # Define columns that contain JSON data
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']

    # Columns that are either not useful or too complex for initial analysis
    SKIPPED_COLUMNS = ('customDimensions', 'hits')

    # Load the CSV file with JSON columns parsed and fullVisitorId as a string
    df = pd.read_csv(
        csv_path,
        converters={column: json.loads for column in JSON_COLUMNS},  # Convert JSON columns to dictionaries
        dtype={'fullVisitorId': 'str'},  # Keep the ID as text so leading zeros and all digits survive
        # Skip the unwanted columns while reading instead of dropping them
        # afterwards: this also works when a file does not contain them.
        usecols=lambda name: name not in SKIPPED_COLUMNS,
        nrows=nrows  # Load specified number of rows if provided
    )

    # Flatten each JSON column and merge back into the main DataFrame
    for column in JSON_COLUMNS:
        # Convert JSON data in the column to a DataFrame (nested keys become "outer.inner")
        column_as_df = pd.json_normalize(df[column])

        # Rename the columns to indicate their source (e.g., device_isMobile)
        column_as_df.columns = [f"{column}_{subcolumn}" for subcolumn in column_as_df.columns]

        # Drop the original JSON column and merge the new flattened columns
        # (the join aligns on the row index shared by both frames)
        df = df.drop(column, axis=1).join(column_as_df)

    # A flattened column only exists if at least one loaded row contains the
    # key, so a small `nrows` sample may have no revenue column at all.
    if 'totals_transactionRevenue' not in df.columns:
        df['totals_transactionRevenue'] = 0

    # Fill missing values for key numerical columns with 0
    # - totals_pageviews: Number of pages viewed in a session, can be 0 if no pages were viewed
    # - totals_transactionRevenue: Revenue generated, fill missing with 0 as no revenue
    df = df.fillna({
        'totals_pageviews': 0,
        'totals_transactionRevenue': 0
    })

    # Convert specific columns to integers for consistency
    df['device_isMobile'] = df['device_isMobile'].astype(int)  # Convert isMobile to 1 or 0
    # Explicit 64-bit: plain `int` resolves to a 32-bit dtype on some platforms
    # (e.g. Windows with NumPy < 2), which cannot hold values above 2**31 - 1.
    df['totals_transactionRevenue'] = df['totals_transactionRevenue'].astype('int64')

    # Ensure all object (string) columns are consistently treated as strings
    # NOTE(review): this also stringifies any missing values left in those
    # columns (NaN typically becomes the text 'nan') - confirm that is intended.
    cat_cols = df.select_dtypes(include='object').columns
    df[cat_cols] = df[cat_cols].astype(str)

    return df


In [56]:
# Load a subset (the first 1,000,000 rows) of the data for testing purposes
df = load_data('../data/raw/dataset.csv', nrows=1_000_000)

# Inspect the first few rows to confirm the processing
display(df.head())

Unnamed: 0,channelGrouping,date,fullVisitorId,socialEngagementType,visitId,visitNumber,visitStartTime,device_browser,device_browserVersion,device_browserSize,...,trafficSource_adwordsClickInfo.criteriaParameters,trafficSource_referralPath,trafficSource_isTrueDirect,trafficSource_adContent,trafficSource_adwordsClickInfo.page,trafficSource_adwordsClickInfo.slot,trafficSource_adwordsClickInfo.gclId,trafficSource_adwordsClickInfo.adNetworkType,trafficSource_adwordsClickInfo.isVideoAd,trafficSource_campaignCode
0,Organic Search,20171016,3162355547410993243,Not Socially Engaged,1508198450,1,1508198450,Firefox,not available in demo dataset,not available in demo dataset,...,not available in demo dataset,,,,,,,,,
1,Referral,20171016,8934116514970143966,Not Socially Engaged,1508176307,6,1508176307,Chrome,not available in demo dataset,not available in demo dataset,...,not available in demo dataset,/a/google.com/transportation/mtv-services/bike...,,,,,,,,
2,Direct,20171016,7992466427990357681,Not Socially Engaged,1508201613,1,1508201613,Chrome,not available in demo dataset,not available in demo dataset,...,not available in demo dataset,,True,,,,,,,
3,Organic Search,20171016,9075655783635761930,Not Socially Engaged,1508169851,1,1508169851,Chrome,not available in demo dataset,not available in demo dataset,...,not available in demo dataset,,,,,,,,,
4,Organic Search,20171016,6960673291025684308,Not Socially Engaged,1508190552,1,1508190552,Chrome,not available in demo dataset,not available in demo dataset,...,not available in demo dataset,,,,,,,,,


In [58]:
# Check the dimensions of the loaded frame as (rows, columns)
df.shape

(1000000, 58)