# Script for collecting data to train the fusion model.

In [6]:
# Below is the correct script 

In [5]:
import requests
import pandas as pd
import time

# -------------------------------
# Configurations
# -------------------------------
# Channel IDs to process. Every entry ends with a trailing comma on purpose:
# without one, uncommenting/adding the next ID would make Python silently
# concatenate two adjacent string literals into a single bogus ID.
CHANNEL_IDS = [
    # Add your 150 channel IDs here
    "UCAo_wAxH1WT6rFmK5yCd1Cg",  # yashnhass
    # "UCYFn7BOOlmL21iNJ6q5IDfg",  # lochi
    # "UCJbxRq_IlWyzvB9KK0Mrs8A",  # ratta
    # "UCRp1csKx1dx8rsmcD6f9q1g",

    # ...
]

# Endpoints of the locally running API (127.0.0.1:8000).
USER_PROFILING_URL = "http://127.0.0.1:8000/user-profiling/"
CHANNEL_EMBED_URL = "http://127.0.0.1:8000/embed/channel-embedding"

# File the collected embeddings are written to at the end of the run.
OUTPUT_CSV = "channel_embedding_results.csv"

# -------------------------------
# Function to get user profiling
# -------------------------------
def get_user_profiling(channel_id, timeout=120.0):
    """Request the profiling payload for one channel from the local API.

    POSTs ``{"channel_id": channel_id}`` as JSON to ``USER_PROFILING_URL``.

    Args:
        channel_id: Channel identifier placed in the request body.
        timeout: Seconds ``requests.post`` may wait on the server before
            giving up. Without a timeout a single stalled request would
            block the whole collection run indefinitely.

    Returns:
        The decoded JSON body of the response, or ``None`` when the request
        fails, the server answers with an HTTP error status, or the body
        cannot be decoded. The error is printed, not raised.
    """
    payload = {"channel_id": channel_id}
    try:
        response = requests.post(USER_PROFILING_URL, json=payload, timeout=timeout)
        response.raise_for_status()  # turn 4xx/5xx answers into exceptions
        return response.json()
    except Exception as e:
        # Deliberately broad: this is a best-effort batch step, so one bad
        # channel is reported and skipped rather than aborting the run.
        print(f"Error profiling channel {channel_id}: {e}")
        return None

# -------------------------------
# Function to get channel embedding
# -------------------------------
def get_channel_embedding(user_profile_response, timeout=120.0):
    """Send a profiling payload to the embedding endpoint.

    POSTs ``user_profile_response`` as JSON to ``CHANNEL_EMBED_URL``.

    Args:
        user_profile_response: JSON-serialisable payload, as returned by the
            profiling step. If it is a dict, its ``'channel_title'`` entry is
            used to label error messages.
        timeout: Seconds ``requests.post`` may wait on the server before
            giving up, so a stalled request cannot hang the run.

    Returns:
        The decoded JSON body of the response, or ``None`` when the request
        fails, the server answers with an HTTP error status, or the body
        cannot be decoded. The error is printed, not raised.
    """
    try:
        response = requests.post(CHANNEL_EMBED_URL, json=user_profile_response, timeout=timeout)
        response.raise_for_status()  # turn 4xx/5xx answers into exceptions
        return response.json()
    except Exception as e:
        # Deliberately broad (best-effort batch step). The payload is not
        # guaranteed to be a dict, so look the title up defensively: calling
        # .get() on e.g. a list here would raise from inside the handler.
        channel_title = 'Unknown'
        if isinstance(user_profile_response, dict):
            channel_title = user_profile_response.get('channel_title', 'Unknown')
        print(f"Error embedding channel {channel_title}: {e}")
        return None

# -------------------------------
# Main loop
# -------------------------------
# One row per channel: {"channel_id": ..., "embedding_response": ...}.
# A row is recorded even when a step fails (with None as the embedding) so
# the CSV always lists every channel that was attempted.
results = []

for idx, channel_id in enumerate(CHANNEL_IDS, 1):
    print(f"[{idx}/{len(CHANNEL_IDS)}] Processing channel ID: {channel_id}")

    embedding = None  # stays None if any step below fails

    # 1. Get user profiling
    user_profile = get_user_profiling(channel_id)
    if user_profile is not None:
        # 2. Get channel embedding
        embedding_response = get_channel_embedding(user_profile)
        if isinstance(embedding_response, dict):
            # .get() instead of [...]: a response without an 'embedding'
            # field must not abort the run and lose all collected rows.
            embedding = embedding_response.get('embedding')
            if embedding is None:
                print(f"No 'embedding' field in response for channel {channel_id}")
        elif embedding_response is not None:
            print(f"Unexpected embedding response type for channel {channel_id}: "
                  f"{type(embedding_response).__name__}")

    # 3. Save the result
    results.append({"channel_id": channel_id, "embedding_response": embedding})

    # Throttle after every channel, including failed ones, so errors do not
    # turn into back-to-back requests against the server.
    time.sleep(0.1)  # sleep 100ms between requests

# -------------------------------
# Save results to CSV
# -------------------------------
df = pd.DataFrame(results)
df.to_csv(OUTPUT_CSV, index=False)
print(f"Saved results to {OUTPUT_CSV}")


[1/1] Processing channel ID: UCAo_wAxH1WT6rFmK5yCd1Cg
Saved results to channel_embedding_results.csv


In [7]:
# Read the exported CSV back from disk and display it as a quick sanity check.
# NOTE(review): list-valued cells are presumably read back as plain strings;
# parse them before any numeric use - confirm.
df = pd.read_csv(filepath_or_buffer='channel_embedding_results.csv')
df

Unnamed: 0,channel_id,embedding_response
0,UCAo_wAxH1WT6rFmK5yCd1Cg,"[0.04781635105609894, 0.06638624519109726, -0...."


In [3]:
import requests
import pandas as pd
import time

# -------------------------------
# Configurations
# -------------------------------

# Channel IDs to fetch. Every entry ends with a trailing comma on purpose:
# without one, appending the next ID would make Python silently concatenate
# two adjacent string literals into a single bogus ID.
CHANNEL_IDS = [
    "UCAo_wAxH1WT6rFmK5yCd1Cg",
    "UCYFn7BOOlmL21iNJ6q5IDfg",
    "UCJbxRq_IlWyzvB9KK0Mrs8A",
    # Add the rest of your 150 IDs
]

# FastAPI endpoint
USER_PROFILING_URL = "http://127.0.0.1:8000/user-profiling/"

# NOTE(review): this name is not read anywhere in this cell; the save step
# writes JSON lines to "channel_fetching_results.json" instead.
OUTPUT_CSV = "channel_fetching_results.csv"

# -------------------------------
# Functions
# -------------------------------
def get_user_profiling(channel_id, timeout=120.0):
    """Fetch channel details via FastAPI.

    POSTs ``{"channel_id": channel_id}`` as JSON to ``USER_PROFILING_URL``.

    Args:
        channel_id: Channel identifier placed in the request body.
        timeout: Seconds ``requests.post`` may wait on the server before
            giving up. Without a timeout a single stalled request would
            block the whole fetch loop indefinitely.

    Returns:
        The decoded JSON body of the response, or ``None`` when the request
        fails, the server answers with an HTTP error status, or the body
        cannot be decoded. The error is printed, not raised.
    """
    payload = {"channel_id": channel_id}
    try:
        response = requests.post(USER_PROFILING_URL, json=payload, timeout=timeout)
        response.raise_for_status()  # turn 4xx/5xx answers into exceptions
        return response.json()
    except Exception as e:
        # Deliberately broad: this is a best-effort batch step, so one bad
        # channel is reported and skipped rather than aborting the run.
        print(f"Error fetching channel {channel_id}: {e}")
        return None

# -------------------------------
# Main loop
# -------------------------------
import json

# Successful profiling responses, in the order the channels were requested.
# Channels whose request failed (None) are simply left out.
all_responses = []

for idx, channel_id in enumerate(CHANNEL_IDS, start=1):
    print(f"[{idx}/{len(CHANNEL_IDS)}] Fetching channel ID: {channel_id}")

    user_profile = get_user_profiling(channel_id)
    if user_profile is not None:
        all_responses.append(user_profile)

    # Pause after every channel, successful or not, to avoid rate limiting.
    time.sleep(0.2)

# -------------------------------
# Save results
# -------------------------------
# JSON lines layout: one serialized response per line, for easier later processing.
with open("channel_fetching_results.json", "w", encoding="utf-8") as f:
    f.writelines(json.dumps(item) + "\n" for item in all_responses)

print(f"Saved {len(all_responses)} channel responses to channel_fetching_results.json")


[1/3] Fetching channel ID: UCAo_wAxH1WT6rFmK5yCd1Cg
[2/3] Fetching channel ID: UCYFn7BOOlmL21iNJ6q5IDfg
[3/3] Fetching channel ID: UCJbxRq_IlWyzvB9KK0Mrs8A
Saved 3 channel responses to channel_fetching_results.json


In [4]:
# Load the embedding export from disk and display it for inspection.
df_emb = pd.read_csv(filepath_or_buffer='channel_embedding_results.csv')
df_emb

Unnamed: 0,channel_id,embedding_response
0,UCAo_wAxH1WT6rFmK5yCd1Cg,"{'embedding': [0.047871872782707214, 0.0662819..."
