### CSV Looper

In [1]:
import pandas as pd
import os
def ensure_project_root(marker=os.path.join("src", "sdk_research")):
    """Move the working directory up one level unless it already is the project root.

    Later cells use paths such as ``data/sdk_combined.csv`` and import from
    ``src.sdk_research``, so they expect the working directory to be the
    project root. A bare ``os.chdir('..')`` climbs one more directory every
    time the cell is re-run; checking for ``marker`` first makes the cell
    idempotent.

    Args:
        marker: Path, relative to the working directory, whose presence as a
            directory identifies the project root.
    """
    if not os.path.isdir(marker):
        os.chdir('..')


ensure_project_root()

In [2]:
import ast
# Read every column as text. Only empty cells are treated as missing, and
# those are then replaced with the literal string "NA".
df = pd.read_csv(
    "data/sdk_combined.csv",
    dtype=str,
    keep_default_na=False,
    na_values=[""],
).fillna("NA")

# "platforms" stores a list literal as text; parse it into a real Python list.
df["platform_list"] = df["platforms"].map(ast.literal_eval)

# Quick check on the first record: the parsed list is iterable element by
# element, while the raw "platforms" cell is still a plain string.
first_record = df.iloc[0]
for platform_name in first_record["platform_list"]:
    print(platform_name)

print(type(first_record["platforms"]))

android
ios
<class 'str'>


In [3]:
# Small working subset: the first 15 rows of the full table.
df_test = df.iloc[:15]
df_test

Unnamed: 0,android_id_from_ios_perspective,ios_id,android_id,ios_id_from_android_perspective,name,company,android_totins,ios_totins,totins,function,platforms,url,platform_list
0,962.0,4500.0,962.0,4500.0,Google Mobile Ads (AdMob),,1018573.0,282919.0,1301492,AD_NETWORK,"['android', 'ios']",,"[android, ios]"
1,566.0,350.0,566.0,350.0,Google,,1222566.0,54686.0,1277252,DEV_PLATFORM,"['android', 'ios']",google.com,"[android, ios]"
2,1001.0,4611.0,1001.0,4611.0,Firebase,,849357.0,245060.0,1094417,ANALYTICS,"['android', 'ios']",https://firebase.google.com/,"[android, ios]"
3,992.0,5146.0,992.0,5146.0,Google Sign-In,,965611.0,51321.0,1016932,SOCIAL,"['android', 'ios']",https://developers.google.com/android/,"[android, ios]"
4,1061.0,4612.0,1061.0,4612.0,Google Analytics for Firebase,,699395.0,222457.0,921852,ANALYTICS,"['android', 'ios']",https://firebase.google.com/docs/analytics/,"[android, ios]"
5,,,984.0,,Google Search Actions,,814302.0,,814302,DEV_TOOL,['android'],https://developers.google.com/android/,[android]
6,,5062.0,164459.0,,Miscellaneous,,278750.0,481238.0,759988,,"['android', 'ios']",,"[android, ios]"
7,974.0,4564.0,974.0,4564.0,Google Ad Manager (formerly DoubleClick for Pu...,,546541.0,197004.0,743545,MONETIZATION,"['android', 'ios']",https://admanager.google.com/home/,"[android, ios]"
8,572.0,387.0,572.0,387.0,Facebook (general),,510239.0,194255.0,704494,SOCIAL,"['android', 'ios']",facebook.com,"[android, ios]"
9,,5374.0,,,Apple Core Video,,,704442.0,704442,VIDEO,['ios'],https://developer.apple.com/documentation/core...,[ios]


In [None]:
import json
from typing import List
from src.sdk_research.core.schemas import SDK
from src.sdk_research.core.prompts import github_link_platform_specific_prompt, prompt_website_release_notes_general, metadata_extractor_prompt
from src.sdk_research.extractor_engine.release_notes.github.linkup_github_release_notes_extractor import LinkupGitHubReleaseNotesExtractor
from src.sdk_research.extractor_engine.release_notes.linkup_release_notes_extractor import LinkupWebsiteReleaseNotesExtractor
from src.sdk_research.extractor_engine.linkup_metadata_extractor import MetadataExtractor

# The Linkup API key is read from the environment so the secret never lives in
# the notebook or in version control. Fail fast with a clear message when it is
# missing instead of letting the extractors fail later with an auth error.
# NOTE: a key used to be hard-coded in this cell; treat it as leaked and rotate it.
LINKUP_API_KEY_ENV_VAR = "LINKUP_API_KEY"
linkup_api = os.environ.get(LINKUP_API_KEY_ENV_VAR, "").strip()
if not linkup_api:
    raise RuntimeError(
        f"Environment variable {LINKUP_API_KEY_ENV_VAR} is not set; "
        "export your Linkup API key before running this cell."
    )

# Domains handed to the website and metadata extractors as `exclude_list`.
EXCLUDED_DOMAINS = [
    "reddit.com",
    "x.com",
    "medium.com",
    "facebook.com",
    "linkedin.com",
]

# Extractor Engines
github_release_notes_extractor = LinkupGitHubReleaseNotesExtractor(linkup_api)
docs_release_notes_extractor = LinkupWebsiteReleaseNotesExtractor(linkup_api, exclude_list=EXCLUDED_DOMAINS)
metadata_extractor = MetadataExtractor(linkup_api, exclude_list=EXCLUDED_DOMAINS)

# Selected Prompts
# Aliases for the prompts used by scrape_all_fields_sdk, so a different prompt
# can be swapped in here without touching the scraping code.
GITHUB_LINK_PLATFORM_SPECIFIC_PROMPT = github_link_platform_specific_prompt
WEBSITE_RELEASE_NOTES_PROMPT = prompt_website_release_notes_general
METADATA_PS_PROMPT = metadata_extractor_prompt

def scrape_all_fields_sdk(sdk_name, platforms):
    """Run every extractor for one SDK and collect the raw results.

    The GitHub release-notes extractor is called once per entry in
    ``platforms``; the website release-notes extractor and the metadata
    extractor are each called once for the SDK as a whole, in that order.

    Args:
        sdk_name: Name of the SDK, passed to every extractor.
        platforms: Iterable of platform identifiers, one GitHub lookup each.

    Returns:
        A 3-tuple ``(release_notes, repo_links, metadata)`` where
        ``release_notes`` is the list of per-platform GitHub results followed
        by the website result, ``repo_links`` is the list of per-platform
        repository links, and ``metadata`` is the metadata extractor's result.
    """
    # Unpack each GitHub result as soon as it is produced so a malformed
    # return value fails immediately, before any further extractor call.
    github_lookups = (
        github_release_notes_extractor.extract(
            GITHUB_LINK_PLATFORM_SPECIFIC_PROMPT, sdk_name, target_platform
        )
        for target_platform in platforms
    )
    per_platform = [(notes, link) for notes, link in github_lookups]

    release_notes = [notes for notes, _ in per_platform]
    repository_links = [link for _, link in per_platform]

    # The website result always goes last, after all GitHub results.
    release_notes.append(
        docs_release_notes_extractor.extract(WEBSITE_RELEASE_NOTES_PROMPT, sdk_name)
    )

    sdk_metadata = metadata_extractor.extract(METADATA_PS_PROMPT, sdk_name)

    return release_notes, repository_links, sdk_metadata

# Columns copied unchanged from the CSV row into the SDK record, listed in the
# order they are passed to the SDK constructor.
PASSTHROUGH_COLUMNS = (
    "android_id_from_ios_perspective",
    "ios_id",
    "android_id",
    "ios_id_from_android_perspective",
    "company",
    "android_totins",
    "ios_totins",
    "totins",
    "function",
    "platforms",
)

# Main Loop
# One SDK record per row of df_test: scrape first, then merge the scraped
# fields with the values already present in the row.
final_results: List[SDK] = []
for index, row in df_test.iterrows():
    sdk_name = row["name"]
    supported_platforms = row["platform_list"]
    url = row["url"]

    release_notes, repository_urls, sdk_metadata = scrape_all_fields_sdk(
        sdk_name, supported_platforms
    )

    copied_fields = {column: row[column] for column in PASSTHROUGH_COLUMNS}

    final_results.append(
        SDK(
            index=index,
            sdk_name=sdk_name,
            platform=supported_platforms,
            **copied_fields,
            url=url,
            metadata=sdk_metadata,
            repository_url=repository_urls,
            all_release_notes=release_notes,
        )
    )

In [7]:
# Serialize the collected SDK records to a timestamped JSON file under data/.
from datetime import datetime

# Filesystem-safe timestamp, e.g. 2024-05-31_14-07-09. The previous format
# ("%Y-%m-%d %H:%M:%S") put a space and colons into the file name, and colons
# are not allowed in Windows file names.
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# mode="json" makes each record dump to JSON-compatible values; ensure_ascii=False
# keeps non-ASCII text readable in the output file.
json_string = json.dumps(
    [sdk.model_dump(mode="json") for sdk in final_results],
    indent=2,
    ensure_ascii=False
)

with open(f"data/final_results_{timestamp}.json", "w", encoding="utf-8") as f:
    f.write(json_string)