### Data_Collection1

In [2]:
import pandas as pd
import random
from faker import Faker
from datetime import datetime, timedelta

# Initialize Faker with Indian locale
fake = Faker('en_IN')

# Seed every random source used by this cell so that Restart & Run All
# regenerates the same customer file. `random` drives ages, genders, locations
# and signup dates; Faker's shared generator drives the names.
SEED = 42
random.seed(SEED)
Faker.seed(SEED)

# Tamil Nadu District List (for location column)
tamil_nadu_districts = [
    "Chennai", "Coimbatore", "Madurai", "Tiruchirappalli", "Salem", "Erode", "Vellore", 
    "Tirunelveli", "Thoothukudi", "Dindigul", "Thanjavur", "Namakkal", "Karur", "Sivagangai", 
    "Virudhunagar", "Cuddalore", "Kanchipuram", "Tiruvallur", "Krishnagiri", "Dharmapuri",
    "Nagapattinam", "Tiruppur", "Ramanathapuram", "Nilgiris", "Ariyalur", "Perambalur",
    "Villupuram", "Tenkasi", "Pudukkottai", "Kallakurichi"
]

# Parameters
num_users = 10000                  # number of customer rows to generate
start_date = datetime(2024, 1, 1)  # earliest possible signup timestamp
end_date = datetime(2025, 1, 1)    # latest possible signup timestamp (inclusive)
# Helper: pick a random moment inside a closed datetime interval
def random_date(start, end):
    """Return a random datetime between ``start`` and ``end``.

    The offset from ``start`` is a whole number of seconds drawn with
    ``random.randint``, so both endpoints are possible results.
    """
    span_seconds = int((end - start).total_seconds())
    offset = random.randint(0, span_seconds)
    return start + timedelta(seconds=offset)

# Build one row per synthetic customer, then export the table.
customer_columns = ["User_ID", "Name", "Age", "Gender", "Location", "Signup_Date"]
gender_options = ["Male", "Female"]

data = []
for i in range(1, num_users + 1):
    # List items are evaluated left to right, so the random draws happen in the
    # order: name, age, gender, location, signup date.
    row = [
        f"U{str(i).zfill(5)}",                                   # zero-padded sequential id
        fake.name(),
        random.randint(18, 60),
        random.choice(gender_options),
        random.choice(tamil_nadu_districts),
        random_date(start_date, end_date).strftime("%Y-%m-%d"),  # date only, time dropped
    ]
    data.append(row)

# Assemble the table
customer_df = pd.DataFrame(data, columns=customer_columns)

# Persist without the pandas index column
customer_df.to_csv("tamilnadu_customer_data.csv", index=False)

# Short report: confirmation, preview, row count
print("✅ Tamil Nadu Customer Data Generated Successfully!")
print(customer_df.head(10))
print("\nTotal Records:", len(customer_df))


✅ Tamil Nadu Customer Data Generated Successfully!
  User_ID             Name  Age  Gender         Location Signup_Date
0  U00001      Charvi Lata   23    Male  Tiruchirappalli  2024-04-13
1  U00002  Widisha Sachdev   35    Male      Krishnagiri  2024-06-30
2  U00003    Darsh Sanghvi   30  Female          Chennai  2024-08-29
3  U00004       Finn Amble   21  Female       Tiruvallur  2024-12-11
4  U00005  Madhav Choudhry   35    Male       Dharmapuri  2024-11-04
5  U00006      Aadhya Sura   52  Female         Namakkal  2024-06-13
6  U00007    Daniel Bhakta   34    Male          Madurai  2024-05-12
7  U00008     Chameli Loke   20  Female       Villupuram  2024-08-01
8  U00009     Mitesh Banik   18  Female            Erode  2024-10-12
9  U00010         Urvi Lad   60    Male         Ariyalur  2024-01-31

Total Records: 10000


### Data_Collection2

In [8]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# -----------------------------
# CONFIGURATION
# -----------------------------
num_records = 10000
SEED = 42  # seeds Python's `random`, which drives every draw in this cell

# NOTE: the sampling window is anchored to the wall clock, so absolute
# timestamps shift from run to run even with a fixed seed. Pin these two values
# to literal datetimes if byte-identical CSVs are required.
start_date = datetime.now() - timedelta(days=365)
end_date = datetime.now()

# Sample options for categorical fields
channels = ["Google Ads", "Facebook", "Instagram", "LinkedIn", "Email", "Blog", "Organic Search", "Referral", "YouTube"]
# Each channel maps to exactly one medium
mediums = {
    "Google Ads": "Paid",
    "Facebook": "Social",
    "Instagram": "Social",
    "LinkedIn": "Social",
    "Email": "Email",
    "Blog": "Organic",
    "Organic Search": "Organic",
    "Referral": "Referral",
    "YouTube": "Video"
}
touch_types = ["Click", "Visit", "Impression", "Open", "Conversion"]
pages = [
    "/home", "/product", "/pricing", "/about", "/contact", 
    "/blog/marketing", "/blog/data", "/checkout", "/thank-you"
]

# -----------------------------
# DATA GENERATION
# -----------------------------
# Kept for any NumPy-based code that follows. It does NOT influence the stdlib
# `random` module, which is why the generator below carries its own seeded RNG.
np.random.seed(SEED)


def generate_touchpoint_rows(n, seed=SEED, start=None):
    """Generate ``n`` synthetic marketing touchpoint rows.

    Parameters
    ----------
    n : int
        Number of rows to produce.
    seed : int, optional
        Seed for the private ``random.Random`` instance; the same seed and the
        same ``start`` always produce the same rows.
    start : datetime, optional
        Beginning of the sampling window. Defaults to the module-level
        ``start_date``.

    Returns
    -------
    list[list]
        Rows ordered as: User_ID, TouchpointTimestamp, Channel, Campaign_ID,
        Campaign_Name, Medium, Touchpoint_Type, Page.
    """
    rng = random.Random(seed)
    window_start = start_date if start is None else start

    rows = []
    for _ in range(n):
        # NOTE(review): User_ID is an int in 1000-9999 here, whereas the
        # customer file uses "U00001"-style strings - confirm how the two
        # tables are meant to be joined.
        user_id = rng.randint(1000, 9999)

        # Random timestamp: 0-364 whole days plus a random time of day
        # after the start of the window
        random_days = rng.randint(0, 364)
        random_time = timedelta(
            hours=rng.randint(0, 23),
            minutes=rng.randint(0, 59),
            seconds=rng.randint(0, 59)
        )
        timestamp = (window_start + timedelta(days=random_days) + random_time).strftime("%Y-%m-%d %H:%M:%S")

        # Random channel; the medium is looked up from it
        channel = rng.choice(channels)
        medium = mediums[channel]

        # Campaign id and name are drawn independently of each other
        campaign_id = f"CAMP_{rng.randint(100, 999)}"
        campaign_name = f"{channel}_Promo_{rng.randint(1, 50)}"

        # Random touchpoint type and page
        touch_type = rng.choice(touch_types)
        page = rng.choice(pages)

        rows.append([
            user_id,
            timestamp,
            channel,
            campaign_id,
            campaign_name,
            medium,
            touch_type,
            page
        ])
    return rows


data = generate_touchpoint_rows(num_records)

# Create DataFrame
df = pd.DataFrame(data, columns=[
    "User_ID", "TouchpointTimestamp", "Channel", "Campaign_ID",
    "Campaign_Name", "Medium", "Touchpoint_Type", "Page"
])

# -----------------------------
# EXPORT TO CSV
# -----------------------------
df.to_csv("touchpoint_data.csv", index=False)
print(f"✅ Generated {len(df)} records and saved as 'touchpoint_data.csv'")
print(df.head(10))


✅ Generated 10000 records and saved as 'touchpoint_data.csv'
   User_ID  TouchpointTimestamp    Channel Campaign_ID       Campaign_Name  \
0     1063  2025-06-26 09:14:48   Referral    CAMP_460   Referral_Promo_30   
1     9829  2025-08-12 18:10:05   Facebook    CAMP_948   Facebook_Promo_38   
2     2383  2025-08-09 21:15:51  Instagram    CAMP_261  Instagram_Promo_45   
3     5829  2025-09-30 13:21:22    YouTube    CAMP_264     YouTube_Promo_6   
4     7104  2025-10-12 03:25:18    YouTube    CAMP_955    YouTube_Promo_46   
5     6503  2025-08-21 00:31:05   Referral    CAMP_327   Referral_Promo_26   
6     1270  2025-04-10 10:08:04   LinkedIn    CAMP_293   LinkedIn_Promo_20   
7     3297  2025-08-11 19:31:23   Facebook    CAMP_684    Facebook_Promo_4   
8     7820  2025-09-19 15:04:22   LinkedIn    CAMP_455    LinkedIn_Promo_5   
9     3549  2025-10-08 23:39:57      Email    CAMP_497      Email_Promo_21   

     Medium Touchpoint_Type             Page  
0  Referral      Conversion      

### Data_Collection3

In [9]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# -----------------------------
# CONFIGURATION
# -----------------------------
num_records = 10000
SEED = 42  # seeds Python's `random`, which drives every draw in this cell

# NOTE: the window is anchored to the wall clock, so absolute timestamps shift
# between runs even with a fixed seed.
start_date = datetime.now() - timedelta(days=365)
end_date = datetime.now()

# Possible conversion events and their approximate value ranges (low, high).
# A (0, 0) range marks an event that carries no monetary value.
conversion_events = {
    "Purchase": (50, 5000),
    "Signup": (0, 0),
    "Subscription": (100, 2000),
    "Download": (0, 0),
    "Upgrade": (200, 3000),
    "Add_to_Cart": (0, 0)
}

# -----------------------------
# DATA GENERATION
# -----------------------------
# Kept for any NumPy-based code that follows. It does NOT influence the stdlib
# `random` module, which is why the generator below carries its own seeded RNG.
np.random.seed(SEED)


def generate_conversion_rows(n, seed=SEED, start=None):
    """Generate ``n`` synthetic conversion rows.

    Parameters
    ----------
    n : int
        Number of rows to produce.
    seed : int, optional
        Seed for the private ``random.Random`` instance; the same seed and the
        same ``start`` always produce the same rows.
    start : datetime, optional
        Beginning of the sampling window. Defaults to the module-level
        ``start_date``.

    Returns
    -------
    list[list]
        Rows ordered as: Transaction_ID, User_ID, Conversion_Event,
        Conversion_Timestamp, Conversion_Value.
    """
    rng = random.Random(seed)
    window_start = start_date if start is None else start
    event_names = list(conversion_events)  # built once instead of once per row

    rows = []
    for i in range(n):
        transaction_id = f"TXN_{100000 + i}"  # sequential, starting at TXN_100000
        user_id = rng.randint(1000, 9999)
        event = rng.choice(event_names)

        # Random timestamp: 0-364 whole days plus a random time of day
        random_days = rng.randint(0, 364)
        random_time = timedelta(
            hours=rng.randint(0, 23),
            minutes=rng.randint(0, 59),
            seconds=rng.randint(0, 59)
        )
        timestamp = (window_start + timedelta(days=random_days) + random_time).strftime("%Y-%m-%d %H:%M:%S")

        # Conversion value depends on the event type; (0, 0) events stay at 0
        # and do not consume a random draw
        value_range = conversion_events[event]
        if value_range[0] == value_range[1] == 0:
            conversion_value = 0
        else:
            conversion_value = round(rng.uniform(value_range[0], value_range[1]), 2)

        rows.append([
            transaction_id,
            user_id,
            event,
            timestamp,
            conversion_value
        ])
    return rows


data = generate_conversion_rows(num_records)

# -----------------------------
# CREATE DATAFRAME
# -----------------------------
df = pd.DataFrame(data, columns=[
    "Transaction_ID",
    "User_ID",
    "Conversion_Event",
    "Conversion_Timestamp",
    "Conversion_Value"
])

# -----------------------------
# EXPORT TO CSV
# -----------------------------
df.to_csv("conversion_data.csv", index=False)
print(f"✅ Generated {len(df)} conversion records and saved as 'conversion_data.csv'")
print(df.head(10))


✅ Generated 10000 conversion records and saved as 'conversion_data.csv'
  Transaction_ID  User_ID Conversion_Event Conversion_Timestamp  \
0     TXN_100000     6492         Download  2025-09-25 06:13:42   
1     TXN_100001     6322          Upgrade  2025-01-07 18:02:24   
2     TXN_100002     2626           Signup  2025-01-08 20:37:40   
3     TXN_100003     3484         Purchase  2025-08-07 03:12:46   
4     TXN_100004     6515         Download  2024-11-24 16:05:10   
5     TXN_100005     6171     Subscription  2025-02-24 03:41:20   
6     TXN_100006     1208         Download  2025-10-17 04:15:15   
7     TXN_100007     6021         Purchase  2025-08-07 01:59:43   
8     TXN_100008     7963         Purchase  2025-06-23 16:50:20   
9     TXN_100009     2422           Signup  2025-09-28 15:39:20   

   Conversion_Value  
0              0.00  
1           2423.74  
2              0.00  
3           1750.09  
4              0.00  
5           1216.54  
6              0.00  
7            3

### Data_Collection4

In [10]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# -----------------------------
# CONFIGURATION
# -----------------------------
num_records = 10000
SEED = 42  # seeds Python's `random`, which drives every draw in this cell

# NOTE: the window is anchored to the wall clock, so absolute dates shift
# between runs even with a fixed seed.
start_date = datetime.now() - timedelta(days=365)
channels = ["Facebook Ads", "Google Ads", "Instagram", "LinkedIn", "Twitter", "Email", "YouTube", "Organic Search"]
mediums = ["Paid", "Social", "Organic", "Referral"]

# Cost range (low, high) per channel. Only the selected channel's range is
# sampled for a row, instead of drawing a cost for all eight channels and
# discarding seven of them.
cost_ranges = {
    "Facebook Ads": (100, 2000),
    "Google Ads": (200, 2500),
    "Instagram": (80, 1500),
    "LinkedIn": (150, 3000),
    "Twitter": (50, 1200),
    "Email": (30, 800),
    "YouTube": (100, 2500),
    "Organic Search": (0, 300)
}
assert set(channels) == set(cost_ranges), "every channel needs a cost range"

# Kept for any NumPy-based code that follows. It does NOT influence the stdlib
# `random` module, which is why the generator below carries its own seeded RNG.
np.random.seed(SEED)


# -----------------------------
# DATA GENERATION
# -----------------------------
def generate_channel_cost_rows(n, seed=SEED, start=None):
    """Generate ``n`` synthetic channel cost / performance rows.

    Parameters
    ----------
    n : int
        Number of rows to produce.
    seed : int, optional
        Seed for the private ``random.Random`` instance; the same seed and the
        same ``start`` always produce the same rows.
    start : datetime, optional
        Beginning of the sampling window. Defaults to the module-level
        ``start_date``.

    Returns
    -------
    list[list]
        Rows ordered as: Campaign_ID, Channel_Name, Medium, Date, Cost,
        Impressions, Clicks, Conversions, Revenue.
    """
    rng = random.Random(seed)
    window_start = start_date if start is None else start

    rows = []
    for _ in range(n):
        channel = rng.choice(channels)
        # NOTE(review): the medium is drawn independently of the channel, so
        # pairs such as ("Organic Search", "Paid") can occur - confirm this is
        # intended.
        medium = rng.choice(mediums)
        campaign_id = f"CAMP_{rng.randint(1000, 9999)}"

        # Random date within the window (date only, no time of day)
        random_days = rng.randint(0, 364)
        date = (window_start + timedelta(days=random_days)).strftime("%Y-%m-%d")

        # Random cost (range varies by channel)
        low, high = cost_ranges[channel]
        base_cost = rng.uniform(low, high)

        # Performance metrics form a funnel: clicks are at most a tenth of
        # impressions and conversions at most half of clicks. The lower bounds
        # are valid because impressions >= 1000 and clicks >= 50.
        impressions = rng.randint(1000, 50000)
        clicks = rng.randint(50, impressions // 10)
        conversions = rng.randint(5, clicks // 2)

        # Estimated revenue from conversions
        avg_revenue_per_conversion = rng.uniform(10, 200)
        revenue = round(conversions * avg_revenue_per_conversion, 2)

        rows.append([
            campaign_id,
            channel,
            medium,
            date,
            round(base_cost, 2),
            impressions,
            clicks,
            conversions,
            revenue
        ])
    return rows


data = generate_channel_cost_rows(num_records)

# -----------------------------
# CREATE DATAFRAME
# -----------------------------
df = pd.DataFrame(data, columns=[
    "Campaign_ID",
    "Channel_Name",
    "Medium",
    "Date",
    "Cost",
    "Impressions",
    "Clicks",
    "Conversions",
    "Revenue"
])

# -----------------------------
# EXPORT TO CSV
# -----------------------------
df.to_csv("channel_cost_data.csv", index=False)
print(f"✅ Generated {len(df)} records of Channel Cost/Budget Data and saved as 'channel_cost_data.csv'")
print(df.head(10))


✅ Generated 10000 records of Channel Cost/Budget Data and saved as 'channel_cost_data.csv'
  Campaign_ID    Channel_Name    Medium        Date     Cost  Impressions  \
0   CAMP_3756  Organic Search  Referral  2024-12-17   132.68         8204   
1   CAMP_2957         YouTube   Organic  2024-11-16   933.30        42696   
2   CAMP_6049  Organic Search      Paid  2025-02-07   109.88        44616   
3   CAMP_8874         YouTube    Social  2025-03-24  2180.59        42768   
4   CAMP_1106        LinkedIn   Organic  2025-10-08  1770.62        29371   
5   CAMP_8993  Organic Search      Paid  2025-05-28   174.87        31467   
6   CAMP_8540           Email      Paid  2025-07-17   413.05        24100   
7   CAMP_5366         YouTube  Referral  2025-03-28   694.91         4741   
8   CAMP_5959    Facebook Ads      Paid  2025-02-04   214.16        22967   
9   CAMP_3858    Facebook Ads      Paid  2025-06-08   672.44        45148   

   Clicks  Conversions   Revenue  
0     513          154  27

### Data_Collection5

In [11]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# -----------------------------
# CONFIGURATION
# -----------------------------
num_records = 10000
SEED = 42  # seeds Python's `random`, which drives every draw in this cell

# NOTE: the window is anchored to the wall clock, so absolute dates shift
# between runs even with a fixed seed.
start_date = datetime.now() - timedelta(days=365)

# Kept for any NumPy-based code that follows. It does NOT influence the stdlib
# `random` module, which is why the generator below carries its own seeded RNG.
np.random.seed(SEED)

# -----------------------------
# SAMPLE CONFIGURATIONS
# -----------------------------
locations = ["Chennai", "Bangalore", "Mumbai", "Delhi", "Hyderabad", "Kolkata", "Pune"]
subscription_types = ["Basic", "Premium", "Gold", "Enterprise"]


# -----------------------------
# DATA GENERATION
# -----------------------------
def generate_clv_rows(n, seed=SEED, start=None):
    """Generate ``n`` synthetic customer-lifetime-value rows.

    Parameters
    ----------
    n : int
        Number of customers to produce; ids run from CUST_00001 upward.
    seed : int, optional
        Seed for the private ``random.Random`` instance; the same seed and the
        same ``start`` always produce the same rows.
    start : datetime, optional
        Beginning of the purchase-date window. Defaults to the module-level
        ``start_date``.

    Returns
    -------
    list[list]
        Rows ordered as: Customer_ID, Location, Subscription_Plan,
        Total_Purchases, Last_Purchase_Date, Customer_Lifetime_Value,
        Subscription_Length_Months, Churned, Purchase_History_Dates,
        Purchase_Values.
    """
    rng = random.Random(seed)
    window_start = start_date if start is None else start

    rows = []
    for i in range(1, n + 1):
        customer_id = f"CUST_{i:05d}"
        location = rng.choice(locations)
        subscription_plan = rng.choice(subscription_types)

        # Generate number of purchases (1-10)
        num_purchases = rng.randint(1, 10)

        # Random purchase dates within the window, in chronological order.
        # ISO "YYYY-MM-DD" strings sort the same way as the dates they encode.
        purchase_dates = sorted([
            (window_start + timedelta(days=rng.randint(0, 364))).strftime("%Y-%m-%d")
            for _ in range(num_purchases)
        ])

        # Purchase values are drawn independently of the dates, so the k-th
        # value is paired with the k-th (sorted) date by position only.
        purchase_values = [round(rng.uniform(20, 500), 2) for _ in range(num_purchases)]

        # Compute total CLV
        total_value = round(sum(purchase_values), 2)

        # Subscription or churn info
        subscription_length_months = rng.randint(1, 12)
        churned = rng.choice(["Yes", "No"])

        rows.append([
            customer_id,
            location,
            subscription_plan,
            num_purchases,
            purchase_dates[-1],  # last purchase date
            total_value,
            subscription_length_months,
            churned,
            purchase_dates,
            purchase_values
        ])
    return rows


data = generate_clv_rows(num_records)

# -----------------------------
# CREATE DATAFRAME
# -----------------------------
df = pd.DataFrame(data, columns=[
    "Customer_ID",
    "Location",
    "Subscription_Plan",
    "Total_Purchases",
    "Last_Purchase_Date",
    "Customer_Lifetime_Value",
    "Subscription_Length_Months",
    "Churned",
    "Purchase_History_Dates",
    "Purchase_Values"
])

# -----------------------------
# EXPORT TO CSV
# -----------------------------
# NOTE: the two list-valued columns are written as their Python list repr, so a
# reader has to parse them back (e.g. with ast.literal_eval) after read_csv.
df.to_csv("customer_lifetime_data.csv", index=False)
print(f"✅ Generated {len(df)} Customer Lifetime / CLV records and saved as 'customer_lifetime_data.csv'")
print(df.head(10))


✅ Generated 10000 Customer Lifetime / CLV records and saved as 'customer_lifetime_data.csv'
  Customer_ID   Location Subscription_Plan  Total_Purchases  \
0  CUST_00001  Bangalore        Enterprise                7   
1  CUST_00002    Kolkata        Enterprise                3   
2  CUST_00003  Bangalore        Enterprise                3   
3  CUST_00004       Pune        Enterprise                3   
4  CUST_00005     Mumbai        Enterprise                9   
5  CUST_00006       Pune        Enterprise                9   
6  CUST_00007     Mumbai           Premium                4   
7  CUST_00008    Kolkata             Basic                7   
8  CUST_00009       Pune             Basic                1   
9  CUST_00010  Bangalore             Basic                5   

  Last_Purchase_Date  Customer_Lifetime_Value  Subscription_Length_Months  \
0         2025-08-06                  1667.20                           7   
1         2025-09-05                   737.11               