In [1]:
#pip install Faker

In [2]:
from faker import Faker
import random 
import pandas as pd
from datetime import datetime, timedelta
from datetime import date
import numpy as np

def generate_demographic_data(num_records):
    fake = Faker('en_US')
    Faker.seed(42)
    random.seed(42)
    demographic_data = []
    for i in range(1, num_records + 1):
        demographic_data.append({
            'ID': i,
            'Age': random.randint(18, 80),
            'Gender': random.choice(['Male', 'Female', 'Other']),
            'Location': fake.city(), #use predefined list instead, random choice 
            'Email': fake.email(), 
        })
    return pd.DataFrame(demographic_data)
# can add more features, and make income dependent on other factors besides just age

In [3]:
demo_df = generate_demographic_data(num_records = 1000)
demo_df

Unnamed: 0,ID,Age,Gender,Location,Email
0,1,58,Male,North Judithbury,donaldgarcia@example.net
1,2,19,Other,New Roberttown,robinsonwilliam@example.org
2,3,35,Male,Lake Debra,lrobinson@example.com
3,4,32,Male,Port Lindachester,joshua35@example.org
4,5,65,Male,South Colinstad,lindsay78@example.org
...,...,...,...,...,...
995,996,49,Other,South Samantha,kaylarhodes@example.com
996,997,60,Female,West Erinville,gbrooks@example.com
997,998,20,Male,North Christopher,rgibson@example.net
998,999,43,Other,West Taramouth,stacey92@example.org


In [4]:
def generate_income_by_age(age):
    if age < 25:
        return random.randint(20000, 50000)  # Young adults
    elif 25 <= age <= 40:
        return random.randint(50000, 100000)  # Mid-career
    elif 41 <= age <= 60:
        return random.randint(70000, 150000)  # Established professionals
    else:
        return random.randint(30000, 80000) 

demo_df['Income'] = demo_df.apply(lambda row: generate_income_by_age(row['Age']), axis=1)
demo_df

Unnamed: 0,ID,Age,Gender,Location,Email,Income
0,1,58,Male,North Judithbury,donaldgarcia@example.net,96794
1,2,19,Other,New Roberttown,robinsonwilliam@example.org,29876
2,3,35,Male,Lake Debra,lrobinson@example.com,63843
3,4,32,Male,Port Lindachester,joshua35@example.org,58993
4,5,65,Male,South Colinstad,lindsay78@example.org,46745
...,...,...,...,...,...,...
995,996,49,Other,South Samantha,kaylarhodes@example.com,132580
996,997,60,Female,West Erinville,gbrooks@example.com,112415
997,998,20,Male,North Christopher,rgibson@example.net,28128
998,999,43,Other,West Taramouth,stacey92@example.org,70895


In [5]:
#Viewership Data 
def generate_fake_viewership(demo_df, num_viewership_records, network_to_shows_genres, network_weights):
    fake = Faker('en_US')
    Faker.seed(42)
    random.seed(42)
    ids = demo_df['ID'].tolist()
    
    start_date = datetime(2022, 1,1, 0, 0, 0)
    end_date = datetime(2024, 12, 31, 21, 59, 59)

    viewership_data = []
    for _ in range(num_viewership_records):
        id_ = random.choice(ids)
        time_started = fake.date_time_between(start_date = start_date, end_date = end_date)
        duration = timedelta(minutes = fake.random_int(min=6, max= 120))
        time_ended = time_started + duration 
        
         # Select a network based on weights
        network = random.choices(list(network_to_shows_genres.keys()), weights=network_weights, k=1)[0]
        
        # Select a show and its corresponding genre
        show, genre = random.choice(list(network_to_shows_genres[network].items()))
        
        viewership_data.append({
            'ID': id_, 
            'Time Started': time_started,
            'Time Ended': time_ended, 
            'Show Name': show,
            'Network': network, 
            'Genre': genre
        })
    
    viewership_df = pd.DataFrame(viewership_data)
    return viewership_df 
    

In [6]:
# network_to_shows = {
#     'Lime Light': ['Fashion Frenzy: The Runway Wars', 'Home Makeover Rescue', 'Celebrity Chefs Showdown'],
#     'Pulse': ['Edge of Extinction', 'Highway Heist', 'Stormfront: The Final Mission'],
#     'ChillStream': ['Oceans Untamed: The Deep Unknown', 'Ancient Civilizations Uncovered', 'World Wonders: Nature''s Marvels'],
#     'RetroReel': ['Vintage Noir: Tale of the 50s', 'Hollywood Gold: The Best of the Silver Screen', 'The Golden Age of Television'],
#     'CineQuest': ['The Forgotten Kingdom', 'Unwritten Laws', 'Echoes of the Past'],
# }

# network_to_genres = {
#     'Lime Light': ['Reality TV', 'Lifestyle'],
#     'Pulse': ['Action', 'Thriller', 'Adventure'],
#     'ChillStream': ['Documentaries', 'Nature', 'Travel'],
#     'RetroReel': ['Classic Films'],
#     'CineQuest': ['Premium Films'],
# }

network_to_shows_genres = {
    'Lime Light': {
        'Fashion Frenzy: The Runway Wars': 'Reality TV',
        'Home Makeover Rescue': 'Lifestyle',
        'Celebrity Chefs Showdown': 'Reality TV',
    },
    'Pulse': {
        'Edge of Extinction': 'Action',
        'Highway Heist': 'Thriller',
        'Stormfront: The Final Mission': 'Adventure',
    },
    'ChillStream': {
        'Oceans Untamed: The Deep Unknown': 'Nature',
        'Ancient Civilizations Uncovered': 'Documentaries',
        'World Wonders: Nature\'s Marvels': 'Travel',
    },
    'RetroReel': {
        'Vintage Noir: Tale of the 50s': 'Classic Films',
        'Hollywood Gold: The Best of the Silver Screen': 'Classic Films',
        'The Golden Age of Television': 'Classic Films',
    },
    'CineQuest': {
        'The Forgotten Kingdom': 'Premium Films',
        'Unwritten Laws': 'Premium Films',
        'Echoes of the Past': 'Premium Films',
    },
}

network_weights = [20, 40, 20, 10, 10] #can adjust 
num_viewership_records = 100000
viewership_df = generate_fake_viewership(demo_df, num_viewership_records, network_to_shows_genres, network_weights)

viewership_df

#create histogram, total minutes viewed 

Unnamed: 0,ID,Time Started,Time Ended,Show Name,Network,Genre
0,655,2024-09-20 07:33:32,2024-09-20 07:53:32,Celebrity Chefs Showdown,Lime Light,Reality TV
1,282,2022-02-08 20:28:06,2022-02-08 22:08:06,Edge of Extinction,Pulse,Action
2,755,2023-03-04 05:50:10,2023-03-04 06:27:10,Celebrity Chefs Showdown,Lime Light,Reality TV
3,914,2022-12-13 17:53:58,2022-12-13 18:16:58,Stormfront: The Final Mission,Pulse,Adventure
4,433,2022-06-09 05:17:49,2022-06-09 06:49:49,Fashion Frenzy: The Runway Wars,Lime Light,Reality TV
...,...,...,...,...,...,...
99995,685,2023-10-17 07:12:22,2023-10-17 07:39:22,Home Makeover Rescue,Lime Light,Lifestyle
99996,701,2022-11-03 16:52:05,2022-11-03 18:21:05,Fashion Frenzy: The Runway Wars,Lime Light,Reality TV
99997,653,2022-03-18 08:59:53,2022-03-18 09:53:53,Edge of Extinction,Pulse,Action
99998,714,2024-01-21 03:44:46,2024-01-21 04:37:46,Fashion Frenzy: The Runway Wars,Lime Light,Reality TV


## Online Content Data

First, create subset of fake urls, which we will then sample from randomly in generate_fake_online_content()

In [7]:
shows = [
    "Fashion Frenzy: The Runway Wars",
    "Home Makeover Rescue",
    "Celebrity Chefs Showdown",
    "Style Seekers: Global Trends",
    "The Perfect Party Planner",
    "Behind the Glam: Celebrity Secrets",
    "Living Large: Luxury Homes Edition",
    "Fitness Gurus: Transformations",
    "Destination Wedding Dreams",
    "Extreme Makeovers: House Flips",
    "Edge of Extinction",
    "Highway Heist",
    "Stormfront: The Final Mission",
    "Deep Waters: Submarine Showdown",
    "Midnight Pursuit",
    "Target Locked",
    "Chasing Shadows: Elite Unit",
    "Blood Oath: Vigilante Justice",
    "Flight Risk",
    "The Last Outpost",
    "Oceans Untamed: The Deep Unknown",
    "Ancient Civilizations Uncovered",
    "World Wonders: Nature's Marvels",
    "The Amazon Trail: A Journey Through the Rainforest",
    "Surviving Antarctica: The Final Frontier",
    "Wildlife Warriors: Protectors of the Endangered",
    "Unseen Worlds: Microscopic Marvels",
    "Nomads of the North: Life in the Arctic",
    "Mount Everest: Beyond the Summit",
    "Wonders of the Coral Reef",
    "Vintage Noir: Tales of the 50s",
    "Hollywood Gold: The Best of the Silver Screen",
    "The Golden Age of Television",
    "Classic Cartoons Rewind",
    "Heroes of the West: Old Cowboy Adventures",
    "The Big Picture: Cinema’s Finest Years",
    "Love in Black and White: Romantic Classics",
    "Retro Sci-Fi: Space and Beyond",
    "Timeless Thrillers: Hitchcock’s Legacy",
    "1950s Sitcom Showcase",
    "The Forgotten Kingdom",
    "Unwritten Laws",
    "Echoes of the Past",
    "Shattered Mirror: A Tale of Betrayal",
    "Lies Beneath the Surface",
    "After the Storm: A Family’s Battle",
    "In the Shadow of Giants",
    "Fallen Crown: The Battle for the Throne",
    "Last Sunset: A Dystopian Love Story",
    "Whispers in the Dark"]

#start = "www"
subdomains = ["www", "stream", "watch", "view", "media", "online", "hub"]
suffixes = [
    "",
    "-season-1",
    "-exclusive",
    "-special",
    "-preview",
    f"?ref=source{random.randint(1, 10)}",
    f"?utm_campaign=ad{random.randint(1, 100)}",
    "-free-trial",
    "-new",
    "-episode-guide",
]
domains = ["example.com", "tvnetwork.com", "streamit.net", "watchhub.org", "myshow.tv"]

# Generate multiple variations per show
def generate_url_variations(show, num_variations=5):
    base_url = show.lower().replace(" ", "-").replace(":", "").replace("’", "").replace("'", "").replace(",", "")
    variations = []
    for _ in range(num_variations):
        subdomain = random.choice(subdomains)
        suffix = random.choice(suffixes)
        domain = random.choice(domains)
        variations.append(f"{subdomain}.{domain}/{base_url}{suffix}")
    return variations

# Generate URLs for all shows
all_urls = []
for show in shows:
    all_urls.extend(generate_url_variations(show, num_variations=random.randint(5,10)))  # Up to 5 variations per show

# Print a sample of the generated URLs
for url in random.sample(all_urls, 20):  # Show 20 random samples
    print(url)

# Total number of URLs
print(f"Total URLs generated: {len(all_urls)}")

stream.watchhub.org/whispers-in-the-dark-special
online.tvnetwork.com/the-golden-age-of-television-exclusive
media.tvnetwork.com/destination-wedding-dreams-free-trial
watch.example.com/chasing-shadows-elite-unit
view.myshow.tv/the-big-picture-cinemas-finest-years-preview
media.myshow.tv/deep-waters-submarine-showdown-season-1
online.myshow.tv/whispers-in-the-dark?ref=source2
media.myshow.tv/echoes-of-the-past-special
online.myshow.tv/ancient-civilizations-uncovered-new
online.tvnetwork.com/timeless-thrillers-hitchcocks-legacy-special
stream.myshow.tv/blood-oath-vigilante-justice-preview
watch.myshow.tv/fitness-gurus-transformations?ref=source2
hub.streamit.net/home-makeover-rescue-special
hub.example.com/after-the-storm-a-familys-battle?utm_campaign=ad33
www.watchhub.org/hollywood-gold-the-best-of-the-silver-screen?utm_campaign=ad33
stream.example.com/shattered-mirror-a-tale-of-betrayal-free-trial
online.streamit.net/unseen-worlds-microscopic-marvels?ref=source2
hub.example.com/celebri

In [8]:
all_urls

['watch.example.com/fashion-frenzy-the-runway-wars?ref=source2',
 'media.tvnetwork.com/fashion-frenzy-the-runway-wars-free-trial',
 'view.myshow.tv/fashion-frenzy-the-runway-wars-preview',
 'media.streamit.net/fashion-frenzy-the-runway-wars-new',
 'stream.watchhub.org/fashion-frenzy-the-runway-wars-special',
 'stream.watchhub.org/fashion-frenzy-the-runway-wars-episode-guide',
 'stream.watchhub.org/fashion-frenzy-the-runway-wars-exclusive',
 'online.watchhub.org/home-makeover-rescue?ref=source2',
 'stream.tvnetwork.com/home-makeover-rescue?ref=source2',
 'hub.streamit.net/home-makeover-rescue-special',
 'www.example.com/home-makeover-rescue-new',
 'hub.watchhub.org/home-makeover-rescue-episode-guide',
 'hub.streamit.net/home-makeover-rescue?utm_campaign=ad33',
 'view.watchhub.org/home-makeover-rescue-episode-guide',
 'watch.example.com/celebrity-chefs-showdown?ref=source2',
 'view.tvnetwork.com/celebrity-chefs-showdown?ref=source2',
 'hub.myshow.tv/celebrity-chefs-showdown?ref=source2',

In [9]:
#Online Content Data
def generate_fake_online_content(demo_df, num_online_content_records, all_urls):
    fake = Faker('en_US')
    Faker.seed(42)
    random.seed(42)
    
    ids = demo_df['ID'].tolist()
    
    start_date = datetime(2022, 1, 1)
    end_date = datetime(2024, 12, 31)
     
#     start_date = datetime(2022, 1,1, 0, 0, 0)
#     end_date = datetime(2024, 12, 31, 21, 59, 59)

    online_content_data = []
    for _ in range(num_online_content_records):
        id_ = random.choice(ids)
        date = fake.date_between(start_date = start_date, end_date = end_date)
        url = random.choice(all_urls) #sample urls from all_urls
        
        
        online_content_data.append({
            'ID': id_, 
            'DATE': date,
            'URL': url,
            'SCORE': None
        })
        
    mean = 0.5
    std_dev = 0.2
    scores = np.random.normal(loc = mean, scale=std_dev, size = num_online_content_records)
    scores = np.clip(scores, 0.0, 1.0)
    
#     for i in range(len(online_content_data)):
#         online_content_data[i]['SCORE'] = scores[i]
    
    for record, score in zip(online_content_data, scores):
        record['SCORE'] = score
        
    online_content_data = pd.DataFrame(online_content_data)
    return online_content_data 

In [11]:
num_online_content_records = 100000

online_content_df = generate_fake_online_content(demo_df, num_online_content_records, all_urls)

In [12]:
len(online_content_df['ID'].unique())

1000

In [13]:
online_content_df

Unnamed: 0,ID,DATE,URL,SCORE
0,655,2024-09-20,watch.watchhub.org/fitness-gurus-transformatio...,0.834853
1,26,2022-06-22,online.watchhub.org/flight-risk-free-trial,0.508279
2,251,2022-02-08,stream.tvnetwork.com/target-locked-episode-guide,0.738272
3,143,2023-03-04,media.watchhub.org/living-large-luxury-homes-e...,0.447552
4,693,2023-01-16,watch.streamit.net/1950s-sitcom-showcase-preview,0.102237
...,...,...,...,...
99995,901,2024-11-24,view.streamit.net/timeless-thrillers-hitchcock...,0.735416
99996,440,2022-04-09,online.example.com/nomads-of-the-north-life-in...,0.353572
99997,331,2024-05-11,online.example.com/1950s-sitcom-showcase,0.502180
99998,79,2024-09-02,watch.streamit.net/1950s-sitcom-showcase-preview,0.516853


In [14]:
#date(2024, 12, 31)

## Subscription Data

In [15]:
#Subscription Data 
#ID, BRAND, SUB START/SUB END (instead of start date?), TERM, STATUS  
#instead of replicating data, can just give them preprocessed data:
 #ie one ID per BRAND, start date, end date, resubscriber flag, frequnecy, current term, status 