General Imports

In [None]:
!pip install pandas numpy boto3 faker
import pandas as pd
import numpy as np
import datetime
import random
from faker import Faker
import boto3
from google.colab import drive
import os
# Mount Google Drive at the relative path 'drive'.
# NOTE(review): nothing else in the visible notebook reads from or writes to
# this mount (the CSV is written under /content) — confirm it is still needed.
drive.mount('drive')

AWS Details

In [None]:
# Set up S3 client.
# Credentials come from the standard AWS environment variables. The bare names
# previously passed to boto3.client() are not assigned anywhere in the visible
# notebook (the call raised NameError), and reading them from the environment
# avoids hard-coding secrets in a cell. When a variable is unset, None is
# passed and boto3 falls back to its default credential provider chain.
aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID")
aws_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY")

s3 = boto3.client(
    's3',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    region_name='us-east-1'
)

# Destination bucket for the upload performed at the end of the notebook.
# NOTE(review): placeholder value — replace with a real bucket name. Current S3
# naming rules do not allow underscores in newly created bucket names.
bucket_name = "my_s3_bucket"

# Mock Ad Campaign Dataset Generation

In [None]:
# Faker instance used below to synthesize UUIDs and dates
fake = Faker()

# Platform labels. The last three entries are deliberate misspellings so the
# generated dataset contains dirty categorical values.
campaigns = [
    "Facebook",
    "Google Ads",
    "LinkedIn",
    "TikTok",
    "Twitter",
    "Gooogle Ads",
    "Facebok",
    "Tik-Tok",
]

# Sizing knobs
num_rows = 10_000_000    # rows to generate (before duplicates are added)
batch_size = 1_000_000   # rows built in memory per batch
chunk_size = 50_000      # rows per CSV write

# Output location: the timestamp gives every run its own file name
timestamp = format(datetime.datetime.now(), "%Y%m%d_%H%M%S")
file_name = f"ad_campaign_data_{timestamp}.csv"
output_file = f"/content/{file_name}"  # local Colab path

# Generate & Save Data in Batches
def _make_row():
    """Build one mock ad-campaign record with randomly injected missing values."""
    return {
        "campaign_id": fake.uuid4() if random.random() > 0.01 else np.nan,  # 1% missing
        "platform": random.choice(campaigns),  # Includes typos
        "date": fake.date_this_year() if random.random() > 0.02 else np.nan,  # 2% missing dates
        "impressions": random.randint(1000, 50000),
        "clicks": random.randint(100, 5000) if random.random() > 0.05 else np.nan,  # 5% missing
        "spend": round(random.uniform(10, 500), 2),
        "conversions": random.randint(1, 500) if random.random() > 0.05 else np.nan,  # 5% missing
    }


# Step through num_rows in batch_size strides so that a final, smaller batch is
# still produced when num_rows is not an exact multiple of batch_size (plain
# floor division silently dropped those trailing rows).
for batch, batch_start in enumerate(range(0, num_rows, batch_size)):
    rows_in_batch = min(batch_size, num_rows - batch_start)
    df = pd.DataFrame([_make_row() for _ in range(rows_in_batch)])

    # Introduce Duplicates (2% of each batch)
    duplicate_rows = df.sample(frac=0.02, random_state=42)
    df = pd.concat([df, duplicate_rows], ignore_index=True)

    # Append this batch to the CSV. `chunksize` makes pandas write chunk_size
    # rows at a time, and the header is emitted only once, with the first batch.
    df.to_csv(
        output_file,
        mode='a',
        index=False,
        header=(batch == 0),
        chunksize=chunk_size,
    )

    print(f"Batch {batch + 1} processed and written to {output_file}")

# Upload Dataset to S3; the object key places the file under the "raw/" prefix
s3_key = f"raw/{file_name}"
s3.upload_file(
    Filename=output_file,
    Bucket=bucket_name,
    Key=s3_key,
)
print(f"File uploaded to S3: s3://{bucket_name}/{s3_key}")

# Optional cleanup: delete the local copy after the upload call has returned,
# to free local storage
os.unlink(output_file)