<a href="https://colab.research.google.com/github/San-dra/Digital-Marketing-Performance-Dashboard-UrbanNest/blob/main/02_generate_backfill_websessions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta


from google.auth import default
import gspread_dataframe as gd
from google.colab import auth

from google.colab import userdata

from googleapiclient.discovery import build
import gspread_dataframe as gd

# gspread.authorize() is called below, but gspread itself was never imported,
# so this cell raised NameError on a fresh kernel.
import gspread

# Ask the Colab user to authorise access to their Google account.
auth.authenticate_user()

# Connect to Google Sheets
# default() returns a (credentials, project) pair; only the credentials are used.
creds, _ = default()
gc = gspread.authorize(creds)


In [None]:
# --- SETTINGS ---
# Backfill window; both endpoints are part of the generated range.
start_date = datetime(year=2022, month=1, day=1)
end_date = datetime(year=2025, month=4, day=28)
# Number of calendar days from start_date through end_date, counting both ends.
days = (end_date - start_date) // timedelta(days=1) + 1

In [None]:
# Country names in alphabetical order, grouped here by initial letter.
# 27 entries; this appears to be the EU-27 member states.
countries = [
    'Austria',
    'Belgium', 'Bulgaria',
    'Croatia', 'Cyprus', 'Czech Republic',
    'Denmark',
    'Estonia',
    'Finland', 'France',
    'Germany', 'Greece',
    'Hungary',
    'Ireland', 'Italy',
    'Latvia', 'Lithuania', 'Luxembourg',
    'Malta',
    'Netherlands',
    'Poland', 'Portugal',
    'Romania',
    'Slovakia', 'Slovenia', 'Spain', 'Sweden',
]

In [None]:
# Uplift factors for campaigns
# Each entry is a (low, high) range for the multiplier applied to a day's
# base session count: 'campaign' when a campaign is active, 'normal' otherwise.
uplift = dict(
    normal=(1.0, 1.2),
    campaign=(1.3, 2.0),
)

In [None]:
# Open the campaigns spreadsheet whose key is stored under the Colab secret
# CAMPAIGNS_SHEET_ID and read its first worksheet into a DataFrame.
campaigns_spreadsheet = gc.open_by_key(userdata.get('CAMPAIGNS_SHEET_ID'))
worksheet = campaigns_spreadsheet.sheet1
# dropna() with default arguments discards every row that has any missing value,
# so a campaign with a single blank cell is dropped as well.
df_campaigns = gd.get_as_dataframe(worksheet).dropna()

print(df_campaigns.head())


  campaign_id               campaign_name  start_date    end_date  \
0    camp_001  Spring Home Refresh Europe  2022-03-01  2022-04-10   
1    camp_002            Easter Home Sale  2022-04-01  2022-04-15   
2    camp_003        UK Bank Holiday Sale  2022-05-01  2022-05-08   
3    camp_004   French Bastille Day Promo  2022-07-10  2022-07-20   
4    camp_005  Summer Outdoor Living Sale  2022-06-15  2022-07-31   

                          target_countries  budget_usd     campaign_type  
0                                      All     15000.0          Seasonal  
1  ['Germany', 'France', 'Italy', 'Spain']      8000.0           Holiday  
2                              ['Ireland']      5000.0           Holiday  
3                               ['France']      7000.0  National Holiday  
4                                      All     12000.0          Seasonal  


In [None]:
def _listed_countries(raw):
    """Turn one ``target_countries`` cell into a set of country names.

    Cells read back from the sheet are text such as "['Germany', 'France']",
    so the value is parsed rather than searched as a raw string. Real
    list-like values and plain comma-separated text are accepted too.
    """
    import ast  # local import: only this helper needs it

    if isinstance(raw, (list, tuple, set, frozenset)):
        return {str(name).strip() for name in raw}

    text = str(raw).strip()
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError):
        parsed = None

    if isinstance(parsed, str):
        parsed = [parsed]
    elif not isinstance(parsed, (list, tuple, set, frozenset)):
        # Not a Python-style list literal (e.g. "France, Spain"): fall back to
        # a plain comma-separated reading.
        parsed = text.strip("[]").split(",")

    return {str(name).strip(" '\"") for name in parsed}


def find_campaign_id(date, country, campaigns=None):
    """Pick a campaign that is running on ``date`` and targets ``country``.

    Parameters
    ----------
    date : datetime-like
        Day to look up; compared inclusively against each campaign's
        ``start_date`` and ``end_date``.
    country : str
        Country name, matched exactly against the campaign's target list.
        A ``target_countries`` value of "All" matches every country.
    campaigns : pandas.DataFrame, optional
        Frame with ``campaign_id``, ``start_date``, ``end_date`` and
        ``target_countries`` columns. Defaults to the notebook-level
        ``df_campaigns``.

    Returns
    -------
    str
        One eligible ``campaign_id`` chosen at random, or the string 'None'
        when no campaign qualifies.
    """
    if campaigns is None:
        campaigns = df_campaigns

    # Parse both date columns once per call instead of once per row.
    when = pd.Timestamp(date)
    starts = pd.to_datetime(campaigns['start_date'])
    ends = pd.to_datetime(campaigns['end_date'])
    running = campaigns[(starts <= when) & (when <= ends)]

    eligible_campaigns = [
        camp_id
        for camp_id, targets in zip(running['campaign_id'], running['target_countries'])
        if (isinstance(targets, str) and targets.strip() == "All")
        # Exact membership: the previous substring test on the raw cell text
        # would let 'Ireland' match a cell listing 'Northern Ireland'.
        or country in _listed_countries(targets)
    ]
    return random.choice(eligible_campaigns) if eligible_campaigns else 'None'


In [None]:
# Weighted options
# Each pool repeats a value once per weight unit, so random.choice() over the
# pool picks values in proportion to their weights.
device_weights = [
    device
    for device, share in (('mobile', 60), ('desktop', 35), ('tablet', 5))
    for _ in range(share)
]
age_weights = [
    bracket
    for bracket, share in (('18-24', 15), ('25-34', 40), ('35-44', 30), ('45-54', 15))
    for _ in range(share)
]
gender_weights = [
    label
    for label, share in (('female', 60), ('male', 38), ('other', 2))
    for _ in range(share)
]

# Bigger markets weighted higher, small countries less
country_weights = [
    market
    for market, share in (
        ('Germany', 20), ('France', 18), ('Italy', 15), ('Spain', 15),
        ('Netherlands', 8), ('Poland', 8), ('Sweden', 5), ('Ireland', 5),
        ('Denmark', 4), ('Belgium', 4),
        ('Austria', 3), ('Finland', 3), ('Portugal', 3), ('Czech Republic', 3),
        ('Greece', 2), ('Hungary', 2), ('Romania', 2),
        ('Bulgaria', 1), ('Croatia', 1), ('Cyprus', 1), ('Estonia', 1),
        ('Latvia', 1), ('Lithuania', 1), ('Luxembourg', 1), ('Malta', 1),
        ('Slovakia', 1), ('Slovenia', 1),
    )
    for _ in range(share)
]

In [None]:
# --- GENERATE SESSION DATA ---
# Builds `rows`, a list with one dict per synthetic web session, for every day
# from start_date through end_date.

# Seed the generator so Restart & Run All reproduces the same backfill.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Traffic sources, each equally likely. Defined once instead of per session.
SOURCE_MEDIUMS = [
    'google/organic',
    'facebook/organic',
    'facebook/paid',
    'instagram/organic',
    'instagram/paid',
    'email/direct',
    'direct/none',
    'referral/other',
    'tiktok/organic',
    'youtube/organic'
]

rows = []
# Session ids already handed out. Drawing with replacement from 9,000,000
# values for roughly a million rows produced many duplicate ids.
used_session_numbers = set()

for day in range(days):
    current_date = start_date + timedelta(days=day)
    session_date = current_date.strftime("%Y-%m-%d")

    # Daily volume: a random base, uplifted when a campaign is running for one
    # weighted-random probe market on this date.
    base_sessions = random.randint(450, 650)
    country = random.choice(country_weights)
    camp_id = find_campaign_id(current_date, country)
    factor = random.uniform(*uplift['campaign' if camp_id != 'None' else 'normal'])
    num_sessions = int(base_sessions * factor)

    # Campaign per country for this day. Each session is tagged from its own
    # country; previously every session of the day carried the probe market's
    # campaign, even when the session's country was not targeted by it.
    # The cache keeps the lookup to at most one call per country per day.
    campaign_by_country = {country: camp_id}

    for _ in range(num_sessions):
        session_number = random.randint(1000000, 9999999)
        while session_number in used_session_numbers:
            session_number = random.randint(1000000, 9999999)
        used_session_numbers.add(session_number)

        session_country = random.choice(country_weights)
        if session_country not in campaign_by_country:
            campaign_by_country[session_country] = find_campaign_id(current_date, session_country)

        session = {
            'session_id': f"sess_{session_number}",
            'user_id': f"user_{random.randint(10000,99999)}",
            'session_date': session_date,
            'source_medium': random.choice(SOURCE_MEDIUMS),
            'campaign_id': campaign_by_country[session_country],
            'device_category': random.choice(device_weights),
            'country': session_country,
            'age_group': random.choice(age_weights),
            'gender': random.choice(gender_weights),
            'sessions': 1,
            'pageviews': random.randint(1,10),
            'bounce': random.choice([True, False]),
            'session_duration_seconds': random.randint(30,600),
            'goal_completion': random.choice([0,1]),
            # Roughly one session in five records revenue; the rest are 0.
            'transaction_revenue': round(random.uniform(25.0, 300.0),2) if random.random() < 0.2 else 0
        }
        rows.append(session)

In [None]:
# --- FINALIZE DATAFRAMES ---
# Materialise the accumulated session dicts as one frame, one row per session.
df_sessions = pd.DataFrame(data=rows)

In [None]:
# Preview
# Show the first five generated sessions.
df_sessions.head(n=5)

Unnamed: 0,session_id,user_id,session_date,source_medium,campaign_id,device_category,country,age_group,gender,sessions,pageviews,bounce,session_duration_seconds,goal_completion,transaction_revenue
0,sess_4038858,user_30880,2022-01-01,facebook/paid,,desktop,Germany,25-34,female,1,3,False,293,1,0.0
1,sess_9915305,user_44202,2022-01-01,facebook/organic,,desktop,Netherlands,35-44,female,1,9,False,83,0,0.0
2,sess_8940694,user_63201,2022-01-01,google/organic,,mobile,France,45-54,female,1,2,True,580,0,0.0
3,sess_6411131,user_81963,2022-01-01,email/direct,,mobile,Denmark,35-44,male,1,7,False,264,1,0.0
4,sess_3547725,user_25964,2022-01-01,instagram/paid,,desktop,France,45-54,female,1,5,False,112,1,0.0


In [None]:
# --- GROUP SESSIONS BY MONTH_YEAR AND SAVE TO INDIVIDUAL GOOGLE SHEETS IN FOLDER ---
# Creates one spreadsheet per calendar month of df_sessions, named
# NNN_sessions_Mon_YYYY, and moves each into the configured Drive folder.
# NOTE(review): re-running this cell creates a second set of spreadsheets with
# the same names; nothing here checks for files that already exist.

# Drive client, used only to move each new spreadsheet into the folder.
# NOTE(review): no credentials are passed to build(); presumably it relies on
# the Colab authorisation done earlier in the notebook — confirm.
drive_service = build('drive', 'v3')

# Folder ID where you want to store the session files
folder_id = userdata.get('WEB_SESSIONS_DATA_FOLDER')

# Create month_start column for real sorting
df_sessions['month_start'] = pd.to_datetime(df_sessions['session_date']).dt.to_period('M').dt.to_timestamp()

# Create month_year text for filenames
df_sessions['month_year'] = df_sessions['month_start'].dt.strftime('%b_%Y')

# Sort by real month_start. The sort must be stable: the default quicksort may
# reorder rows that share a month, scrambling the existing row order inside
# each uploaded sheet.
df_sessions = df_sessions.sort_values('month_start', kind='stable')

# sort=False keeps groups in first-appearance order, i.e. the chronological
# order established by the sort above (month_year text would sort alphabetically).
grouped = df_sessions.groupby('month_year', sort=False)


for i, (month_year, group) in enumerate(grouped, start=1):
    file_name = f"{i:03d}_sessions_{month_year}"  # <--- this is where names like 001_sessions_Jan_2022 are built
    spreadsheet = gc.create(file_name)
    worksheet = spreadsheet.get_worksheet(0)
    # NOTE(review): only month_year is dropped, so the month_start helper
    # column is written to every sheet — confirm that is intended.
    gd.set_with_dataframe(worksheet, group.drop(columns=['month_year']))
    # Move file into folder
    file_id = spreadsheet.id
    drive_service.files().update(
        fileId=file_id,
        addParents=folder_id,
        removeParents='root',
        fields='id, parents'
    ).execute()

    print(f'Uploaded and moved {file_name} into folder.')




Uploaded and moved 001_sessions_Jan_2022 into folder.
Uploaded and moved 002_sessions_Feb_2022 into folder.
Uploaded and moved 003_sessions_Mar_2022 into folder.
Uploaded and moved 004_sessions_Apr_2022 into folder.
Uploaded and moved 005_sessions_May_2022 into folder.
Uploaded and moved 006_sessions_Jun_2022 into folder.
Uploaded and moved 007_sessions_Jul_2022 into folder.
Uploaded and moved 008_sessions_Aug_2022 into folder.
Uploaded and moved 009_sessions_Sep_2022 into folder.
Uploaded and moved 010_sessions_Oct_2022 into folder.
Uploaded and moved 011_sessions_Nov_2022 into folder.
Uploaded and moved 012_sessions_Dec_2022 into folder.
Uploaded and moved 013_sessions_Jan_2023 into folder.
Uploaded and moved 014_sessions_Feb_2023 into folder.
