In [12]:
## BASIC PROFILE DATA ## --- Calculate daily follower counts

import requests
import pandas as pd
import datetime
import pytz  # For timezone handling

import os  # imported here so this config block works on its own

# Accounts whose profile snapshot is fetched by this cell.
usernames = ["naukridotcom", "swiggyindia", "zomato", "instagram"]

# SECURITY: the RapidAPI key must never be stored in the notebook itself.
# It is read from the INSTAGRAM_API_KEY environment variable; any key that was
# previously committed in this file should be treated as leaked and rotated.
api_key = os.environ.get("INSTAGRAM_API_KEY", "")
if not api_key:
    print(
        "Warning: INSTAGRAM_API_KEY is not set; "
        "API requests will most likely be rejected."
    )

# Endpoint used by get_instagram_data() to look up a profile by username.
url = "https://starapi1.p.rapidapi.com/instagram/user/get_web_profile_info"


def get_instagram_data(username, api_key):
    """Fetch the web-profile record for one Instagram username.

    Posts ``{"username": username}`` to the module-level ``url``. When the
    decoded payload has ``status == "done"`` and a non-empty user object under
    ``response.body.data.user``, that user dict is returned with two extra keys:

    * ``profile_id``    - copy of the user's ``id`` value (``None`` if absent).
    * ``timestamp_ist`` - timezone-aware fetch time in Asia/Kolkata.

    Args:
        username: Instagram handle to look up.
        api_key: RapidAPI key sent in the ``x-rapidapi-key`` header.

    Returns:
        The user dict (mutated in place), or ``None`` when the request fails,
        the body is not JSON, or the payload does not have the expected shape.
        Every failure is reported with ``print`` instead of raising, so one bad
        username does not abort a multi-user run.
    """
    payload = {"username": username}
    headers = {
        "x-rapidapi-key": api_key,
        "x-rapidapi-host": "starapi1.p.rapidapi.com",
        "Content-Type": "application/json",
    }

    try:
        # A timeout keeps a stalled connection from hanging the notebook forever.
        response = requests.post(url, json=payload, headers=headers, timeout=30)
        # No raise_for_status(): error bodies are still decoded so they can be
        # shown in the detailed error message below.
        data = response.json()
    except requests.exceptions.RequestException as exc:
        print(f"Request error for {username}: {exc}")
        return None
    except ValueError as exc:  # body was not valid JSON
        print(f"Invalid JSON in response for {username}: {exc}")
        return None

    try:
        user_data = (
            data["response"]["body"]["data"]["user"]
            if data and data["status"] == "done"
            else None
        )
    except (KeyError, TypeError):
        # Payload is missing one of the expected nesting levels.
        user_data = None

    if not user_data or not isinstance(user_data, dict):
        print(f"Error fetching data for {username}: {data}")  # Detailed error
        return None

    user_data["profile_id"] = user_data.get("id")  # Add 'profile_id' for later use

    # Add timestamp in IST
    ist = pytz.timezone("Asia/Kolkata")
    user_data["timestamp_ist"] = datetime.datetime.now(ist)
    return user_data


def create_dataframe(data):
    """Build a username-indexed DataFrame from fetched profile records.

    Args:
        data: Mapping of ``username -> user record dict``. Falsy records are
            skipped. The input dicts are copied, not mutated.

    Returns:
        ``None`` when ``data`` itself is empty; otherwise a DataFrame (possibly
        empty if every record was falsy) indexed by ``username`` with:

        * ``edge_followed_by.count`` / ``edge_follow.count`` - the ``count``
          value pulled out of the corresponding nested dict column, or ``None``
          where the cell is not a dict or the source column does not exist.
        * ``is_verified`` / ``is_private`` cast to ``bool`` when present.
        * ``timestamp_ist`` converted with ``pd.to_datetime`` when present.
    """
    if not data:
        print("No data to create DataFrame.")
        return None

    # Copy each record and fall back to the mapping key when the record itself
    # carries no "username", so set_index below cannot fail on a missing column.
    records = []
    for username, user_data in data.items():
        if not user_data:
            continue
        record = dict(user_data)
        record.setdefault("username", username)
        records.append(record)

    df = pd.DataFrame(records)
    if df.empty:
        return df

    df = df.set_index("username")  # usernames make a stable, readable index

    # DataFrame.get(col, {}) would hand back a plain dict for a missing column
    # (which has no .apply), so the column's presence is checked explicitly.
    for source_col in ("edge_followed_by", "edge_follow"):
        if source_col in df.columns:
            counts = df[source_col].apply(
                lambda cell: cell.get("count") if isinstance(cell, dict) else None
            )
        else:
            counts = None
        df[f"{source_col}.count"] = counts

    for flag_col in ("is_verified", "is_private"):
        if flag_col in df.columns:
            df[flag_col] = df[flag_col].astype(bool)

    if "timestamp_ist" in df.columns:
        df["timestamp_ist"] = pd.to_datetime(df["timestamp_ist"])

    return df


def main():
    """Fetch every configured profile, print the resulting table and return it.

    Returns:
        The profile DataFrame, or ``None`` when nothing could be fetched.
    """
    # Fetch lazily, one username at a time, keeping only successful lookups.
    fetched = ((name, get_instagram_data(name, api_key)) for name in usernames)
    all_data = {name: profile for name, profile in fetched if profile}

    df = create_dataframe(all_data)
    if df is None or df.empty:
        return None

    print(df)
    return df  # handed back so later cells can reuse it


if __name__ == "__main__":
    instagram_df = main()

    if instagram_df is not None:
        # Show how later code can look up the stored profile id per user.
        for username in instagram_df.index:
            profile_id = instagram_df.loc[username, "profile_id"]
            print(f"Profile ID for {username}: {profile_id}")

        # Show how to pull a handful of columns out of the wide profile table.
        subset_df = instagram_df.loc[
            :, ["full_name", "profile_id", "is_verified", "timestamp_ist"]
        ]
        print("\nSubset of DataFrame:")
        print(subset_df)

             ai_agent_type                                          bio_links  \
username                                                                        
naukridotcom          None  [{'link_type': 'external', 'lynx_url': 'https:...   
swiggyindia           None                                                 []   
zomato                None                                                 []   
instagram             None  [{'link_type': 'external', 'lynx_url': 'https:...   

                                                      biography  \
username                                                          
naukridotcom  ✨Ambition starts here - job search, resume tip...   
swiggyindia   khaana khau raat bhar 🥰\ncrazy collabs karu ha...   
zomato                   Serving meals to those making history.   
instagram                   Discover what's new on Instagram 🔎✨   

                                        biography_with_entities  \
username                                   

In [13]:
import requests
import pandas as pd

import os  # imported here so this config block works on its own

# --- Assumes the 'instagram_df' DataFrame is available from the profile-fetching cell ---
# SECURITY: the RapidAPI key must never be stored in the notebook itself.
# It is read from the INSTAGRAM_API_KEY environment variable; any key that was
# previously committed in this file should be treated as leaked and rotated.
api_key = os.environ.get("INSTAGRAM_API_KEY", "")
if not api_key:
    print(
        "Warning: INSTAGRAM_API_KEY is not set; "
        "API requests will most likely be rejected."
    )


def get_user_media(profile_id, api_key, count=20, timeout=30):
    """Fetch the last ``count`` media items for a given profile ID.

    Args:
        profile_id: Value sent as ``id`` in the request body.
        api_key: RapidAPI key sent in the ``x-rapidapi-key`` header.
        count: Number of media items requested.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        The decoded JSON payload, or ``None`` when the request fails, the
        server answers with a 4xx/5xx status, or the body is not valid JSON.
        Failures are reported with ``print`` rather than raised.
    """
    media_url = "https://starapi1.p.rapidapi.com/instagram/user/get_media"
    payload = {"id": profile_id, "count": count}
    headers = {
        "x-rapidapi-key": api_key,
        "x-rapidapi-host": "starapi1.p.rapidapi.com",
        "Content-Type": "application/json",
    }

    try:
        response = requests.post(
            media_url, json=payload, headers=headers, timeout=timeout
        )
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching media for profile {profile_id}: {e}")
        return None
    except ValueError as e:
        # Older requests versions raise a plain ValueError for non-JSON bodies.
        print(f"Invalid JSON in media response for profile {profile_id}: {e}")
        return None


# --- Example usage: fetch and show the recent media of every profile in `instagram_df` ---
if "instagram_df" not in globals():
    # At cell (module) level locals() and globals() are the same namespace,
    # so one membership test covers both.
    print(
        "Error: 'instagram_df' DataFrame not found.  Please run the profile data fetching code first."
    )
elif not isinstance(instagram_df, pd.DataFrame) or instagram_df.empty:
    print("Error: 'instagram_df' DataFrame is empty or not properly initialized.")
else:
    # One media request per profile, in index order.
    for username in instagram_df.index:
        profile_id = instagram_df.loc[username, "profile_id"]
        print(f"\nFetching media for {username} (Profile ID: {profile_id})...")
        media_data = get_user_media(profile_id, api_key)

        if not media_data:
            print(f"Failed to retrieve media for {username}")
        else:
            print(media_data)  # raw payload; later cells flatten it into rows


Fetching media for naukridotcom (Profile ID: 2247292209)...
{'status': 'done', 'response': {'status_code': 200, 'content_type': 'application/json', 'body': {'auto_load_more_enabled': True, 'items': [{'all_previous_submitters': [], 'boost_unavailable_identifier': None, 'boost_unavailable_reason': None, 'boost_unavailable_reason_v2': None, 'can_modify_carousel': False, 'can_reply': False, 'can_see_insights_as_brand': False, 'can_view_more_preview_comments': False, 'can_viewer_reshare': True, 'can_viewer_save': True, 'caption': {'bit_flags': 0, 'content_type': 'comment', 'created_at': 1751433402, 'created_at_utc': 1751433402, 'did_report_as_spam': False, 'is_covered': False, 'is_ranked_comment': False, 'media_id': '3667611642747161357', 'pk': '17918601894124257', 'private_reply_status': 0, 'share_enabled': False, 'status': 'Active', 'strong_id__': '17918601894124257', 'text': 'Most people stop when it’s slow.\nCreators like Abhishek? They keep going.\nThat’s why he’s our Creator of the M

In [14]:
import datetime
import pytz
import pandas as pd

# Assumes `instagram_df`, `get_user_media`, and `api_key` are already defined
ist = pytz.timezone("Asia/Kolkata")
# Label used for each numeric media_type code found on an item.
media_type_map = {1: "post", 2: "reel", 8: "carousel"}

all_rows = []
for username in instagram_df.index:
    profile_id = instagram_df.loc[username, "profile_id"]
    media_data = get_user_media(profile_id, api_key)
    if not media_data:
        continue

    # `or {}` / `or []` instead of .get defaults: a key that is present with a
    # null value must be treated exactly like a missing key.
    items = (
        ((media_data.get("response") or {}).get("body") or {}).get("items") or []
    )

    for item in items:
        # taken_at → IST datetime string
        ts = item.get("taken_at", 0) or 0
        dt_ist = datetime.datetime.fromtimestamp(ts, ist)
        taken_str = dt_ist.strftime("%Y-%m-%d %H:%M")

        # clickable link
        code = item.get("code", "") or ""
        link = f"https://www.instagram.com/p/{code}/"

        # collaboration
        coauthors = item.get("coauthor_producers") or []
        is_collab = bool(coauthors)
        collab_with = ", ".join(c.get("username", "") for c in coauthors) if is_collab else 0

        # audio
        music_meta = item.get("music_metadata") or {}
        mi = music_meta.get("music_info") or {}
        audio_title = item.get("title") or mi.get("title") or 0
        audio_url = (
            item.get("progressive_download_url")
            or item.get("fast_start_progressive_download_url")
            or music_meta.get("cover_artwork_uri")
            or 0
        )

        # play count for reels
        play_count = item.get("play_count") or 0

        row = {
            # "user" and "caption" may be present but null (e.g. a post without
            # a caption); item.get(key, {}) would return None and then crash.
            "full_name":            (item.get("user") or {}).get("full_name") or 0,
            "username":             (item.get("user") or {}).get("username") or 0,
            "pk":                   item.get("pk") or 0,
            "id":                   item.get("id") or 0,
            "link":                 link,
            "media_type":           media_type_map.get(item.get("media_type"), 0),
            "is_video":             item.get("is_video") or False,
            "carousel_media_count": item.get("carousel_media_count") or 0,
            "caption":              (item.get("caption") or {}).get("text") or "",
            "taken_at_ist":         taken_str,
            "like_count":           item.get("like_count") or 0,
            "comment_count":        item.get("comment_count") or 0,
            "reshare_count":        item.get("reshare_count") or 0,
            "share_count_disabled": item.get("share_count_disabled") or False,
            "play_count":           play_count,
            "is_collab":            is_collab,
            "collab_with":          collab_with,
            "audio_title":          audio_title,
            "audio_url":            audio_url,
            "raw_data":             item,
        }
        all_rows.append(row)

df_media = pd.DataFrame(all_rows)
# ensure missing/NaN filled as 0 or appropriate defaults
df_media = df_media.fillna({
    "full_name": 0,
    "username": 0,
    "pk": 0,
    "id": 0,
    "link": "",
    "media_type": 0,
    "is_video": False,
    "carousel_media_count": 0,
    "caption": "",
    "taken_at_ist": "",
    "like_count": 0,
    "comment_count": 0,
    "reshare_count": 0,
    "share_count_disabled": False,
    "play_count": 0,
    "is_collab": False,
    "collab_with": 0,
    "audio_title": 0,
    "audio_url": 0,
})
print(df_media)

                full_name             username                   pk  \
0              Naukri.com         naukridotcom  3667611642747161357   
1              Naukri.com         naukridotcom  3686539081059221736   
2              Naukri.com         naukridotcom  3685846541056038231   
3              Naukri.com         naukridotcom  3685157666671193369   
4           Shreya Sarkar        curly_humour_  3684453184153370728   
5              Naukri.com         naukridotcom  3684405653052365569   
6   Interview Preparation  interview_prep_here  3683833305603246423   
7            Vrinda Gupta    vrinda_softskills  3683738030081976351   
8              Naukri.com         naukridotcom  3683139070117958784   
9           Ankita shende       simplyankita._  3683113613559700201   
10           Isha Jaiswal       ca.ishajaiswal  3683084276176218570   
11             Naukri.com         naukridotcom  3683080734226204264   
12                 Swiggy          swiggyindia  3686644799885225653   
13    

In [19]:
### STORY DATA ###

import datetime
import pytz
import pandas as pd

# Prerequisites from earlier cells: `instagram_df`, `api_key`, and the
# `requests` module (this cell uses it but does not import it itself).
ist = pytz.timezone("Asia/Kolkata")  # timezone used to render story timestamps below
story_type_map = {1: "photo", 2: "video"}  # numeric media_type code -> label stored in the table

def get_user_stories(profile_ids, api_key, timeout=30):
    """Request the current stories for one or more profile IDs.

    Args:
        profile_ids: Value sent as ``ids`` in the request body (the calling
            loop in this cell passes a one-element list).
        api_key: RapidAPI key sent in the ``x-rapidapi-key`` header.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        The decoded JSON payload.

    Raises:
        requests.exceptions.RequestException: on network failure, timeout, or
            a 4xx/5xx status (via ``raise_for_status``).
        ValueError: if the response body is not valid JSON.
    """
    stories_url = "https://starapi1.p.rapidapi.com/instagram/user/get_stories"
    payload = {"ids": profile_ids}
    headers = {
        "x-rapidapi-key": api_key,
        "x-rapidapi-host": "starapi1.p.rapidapi.com",
        "Content-Type": "application/json",
    }
    response = requests.post(
        stories_url, json=payload, headers=headers, timeout=timeout
    )
    response.raise_for_status()
    return response.json()

rows = []
for username in instagram_df.index:
    pid = instagram_df.loc[username, "profile_id"]

    # get_user_stories raises on HTTP/network/JSON errors; one failing profile
    # should not abort the collection for the remaining ones.
    try:
        data = get_user_stories([pid], api_key)
    except (requests.exceptions.RequestException, ValueError) as exc:
        print(f"Error fetching stories for {username}: {exc}")
        continue
    if not isinstance(data, dict):
        print(f"Unexpected stories payload for {username}: {data}")
        continue

    # Null-safe walk: any level may be missing or null when there is no story.
    reels = ((data.get("response") or {}).get("body") or {}).get("reels") or {}
    reel = reels.get(str(pid), {})
    if not reel:
        continue

    owner = reel.get("user") or {}
    full_name = owner.get("full_name", "")
    uname = owner.get("username", "")

    # Reel-level fields are shared by every story item of this profile.
    exp_ts = reel.get("expiring_at", 0) or 0
    exp_dt = datetime.datetime.fromtimestamp(exp_ts, ist).strftime("%Y-%m-%d %H:%M")
    paid = "Yes" if reel.get("is_paid_partnership") else "No"
    is_reel = reel.get("is_reel_media", False)

    for item in reel.get("items") or []:
        post_ts = item.get("taken_at", 0) or 0
        post_dt = datetime.datetime.fromtimestamp(post_ts, ist).strftime("%Y-%m-%d %H:%M")

        rows.append({
            "full_name": full_name,
            "username": uname,
            "story_id": item.get("id", ""),
            "media_type": story_type_map.get(item.get("media_type"), ""),
            "post_datetime_ist": post_dt,
            "expire_datetime_ist": exp_dt,
            "is_paid_partnership": paid,
            "is_reel_media": is_reel,
            "raw_data": item
        })

df_stories = pd.DataFrame(rows).fillna({
    "full_name": "",
    "username": "",
    "story_id": "",
    "media_type": "",
    "post_datetime_ist": "",
    "expire_datetime_ist": "",
    "is_paid_partnership": "No",
    "is_reel_media": False,
    "raw_data": {}
})

print(df_stories)

    full_name      username                        story_id media_type  \
0  Naukri.com  naukridotcom  3686033313303993983_2247292209      photo   
1  Naukri.com  naukridotcom  3686540191182533715_2247292209      video   
2      Swiggy   swiggyindia  3686038321152116341_2045652032      video   
3      Swiggy   swiggyindia  3686566615115509166_2045652032      photo   
4      Swiggy   swiggyindia  3686566911040406557_2045652032      video   

  post_datetime_ist expire_datetime_ist is_paid_partnership  is_reel_media  \
0  2025-07-27 20:47    2025-07-29 13:34                  No          False   
1  2025-07-28 13:34    2025-07-29 13:34                  No          False   
2  2025-07-27 20:57    2025-07-29 14:27                  No          False   
3  2025-07-28 14:26    2025-07-29 14:27                  No          False   
4  2025-07-28 14:27    2025-07-29 14:27                  No          False   

                                            raw_data  
0  {'archive_story_deletion_ts'

In [17]:
import requests
import pandas as pd
import datetime
import pytz
import os  # For handling API key (in a Colab-friendly way)
from google.colab import userdata  # For retrieving secrets in Colab

# --- Constants and Configuration ---
# Accounts to pull; replace with your own list.
usernames = ["naukridotcom", "swiggyindia", "zomato", "instagram"]

# The RapidAPI key is kept out of the notebook: store it in Colab's "Secrets"
# panel under the name INSTAGRAM_API_KEY and it is read from there at run time.
api_key = userdata.get("INSTAGRAM_API_KEY")
if not api_key:
    raise ValueError(
        "Instagram API key not found in Colab's secrets. "
        "Go to the Secrets tab and add a secret named 'INSTAGRAM_API_KEY'."
    )

# API endpoints (profile lookup by username, media listing by profile id)
url_profile = "https://starapi1.p.rapidapi.com/instagram/user/get_web_profile_info"
url_media = "https://starapi1.p.rapidapi.com/instagram/user/get_media"

# Timezone used for every timestamp written by this cell
ist = pytz.timezone("Asia/Kolkata")
# Label stored for each numeric media_type code
media_type_map = {1: "post", 2: "reel", 8: "carousel"}


# --- Helper Functions ---
def get_instagram_profile_data(username, api_key):
    """Look up one Instagram profile by username.

    Returns the user dict from the payload, extended with ``profile_id`` (copy
    of ``id``), ``username`` (the handle that was requested) and
    ``timestamp_ist`` (fetch time in IST). Returns ``None`` and prints a
    message on request errors or when the payload lacks the expected keys.
    """
    request_headers = {
        "x-rapidapi-key": api_key,
        "x-rapidapi-host": "starapi1.p.rapidapi.com",
        "Content-Type": "application/json",
    }
    try:
        reply = requests.post(
            url_profile, json={"username": username}, headers=request_headers
        )
        reply.raise_for_status()  # 4xx/5xx become HTTPError
        body = reply.json()

        # Evaluates left to right and stops at the first falsy operand; on
        # success the expression's value is the user dict itself.
        profile = (
            body
            and body["status"] == "done"
            and body["response"]["body"]["data"]["user"]
        )
        if not profile:
            print(
                f"Error fetching profile data for {username}: {body}"
            )  # full payload, for diagnosis
            return None

        profile["profile_id"] = profile.get("id")  # kept for later media lookups
        profile["username"] = username  # guarantees the index column exists
        profile["timestamp_ist"] = datetime.datetime.now(ist)
        return profile
    except requests.exceptions.RequestException as e:
        print(f"Request error for profile {username}: {e}")
        return None
    except (KeyError, TypeError) as e:  # payload did not have the expected structure
        print(f"Data parsing error for profile {username}: {e}")
        return None


def get_user_media(profile_id, api_key, count=50):
    """Request up to ``count`` recent media items for a profile ID.

    Returns the decoded JSON payload, or ``None`` (after printing the reason)
    when the request or the decoding fails.
    """
    request_headers = {
        "x-rapidapi-key": api_key,
        "x-rapidapi-host": "starapi1.p.rapidapi.com",
        "Content-Type": "application/json",
    }
    request_body = {"id": profile_id, "count": count}

    try:
        reply = requests.post(url_media, json=request_body, headers=request_headers)
        reply.raise_for_status()  # 4xx/5xx become HTTPError
        return reply.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching media for profile {profile_id}: {e}")
        return None
    except (KeyError, TypeError) as e:  # defensive: unexpected parsing problems
        print(f"Data parsing error for media {profile_id}: {e}")
        return None


def process_media_item(item, og_username):
    """Flatten one media item from the API into a row dict.

    Args:
        item: Media item dict taken from the ``items`` list of a media payload.
        og_username: Username whose feed was queried. It can differ from the
            item's own ``user.username`` (e.g. for collaboration posts).

    Returns:
        ``None`` for a falsy ``item``; otherwise a dict with the columns built
        below. Missing or null fields fall back to ``""``, ``0`` or ``False``.
    """
    if not item:
        return None  # Handle potentially missing items

    # Throughout, `x.get(key) or default` is used instead of `x.get(key, default)`
    # so that a key present with a null value is treated like a missing key.
    ts = item.get("taken_at", 0) or 0
    dt_ist = datetime.datetime.fromtimestamp(ts, ist)
    taken_str = dt_ist.strftime("%Y-%m-%d %H:%M")

    code = item.get("code", "") or ""
    link = f"https://www.instagram.com/p/{code}/"

    coauthors = item.get("coauthor_producers") or []
    is_collab = bool(coauthors)
    collab_with = ", ".join(c.get("username", "") for c in coauthors) if is_collab else ""

    music_meta = item.get("music_metadata") or {}
    mi = music_meta.get("music_info") or {}
    audio_title = item.get("title") or mi.get("title") or ""
    audio_url = (
        item.get("progressive_download_url")
        or item.get("fast_start_progressive_download_url")
        or music_meta.get("cover_artwork_uri")
        or ""
    )

    play_count = item.get("play_count") or 0
    # Leading apostrophe keeps the id as text.
    # NOTE(review): presumably to stop spreadsheet tools from rounding the long id - confirm.
    pk = "'" + str(item.get("pk", 0))

    # "user" / "caption" can be present but null (e.g. a post with no caption);
    # item.get(key, {}) would then return None and .get would raise.
    owner = item.get("user") or {}
    caption = item.get("caption") or {}
    if not isinstance(owner, dict):
        owner = {}
    if not isinstance(caption, dict):
        caption = {}

    row = {
        "full_name": owner.get("full_name") or "",
        "username": owner.get("username") or "",
        "og_username": og_username,
        "pk": pk,
        "id": item.get("id") or 0,
        "link": link,
        "media_type": media_type_map.get(item.get("media_type"), ""),
        "is_video": item.get("is_video") or False,
        "carousel_media_count": item.get("carousel_media_count") or 0,
        "caption": caption.get("text") or "",
        "taken_at_ist": taken_str,
        "like_count": item.get("like_count") or 0,
        "comment_count": item.get("comment_count") or 0,
        "reshare_count": item.get("reshare_count") or 0,
        "share_count_disabled": item.get("share_count_disabled") or False,
        "play_count": play_count,
        "is_collab": is_collab,
        "collab_with": collab_with,
        "audio_title": audio_title,
        "audio_url": audio_url,
    }
    return row


def create_profile_dataframe(profile_data):
    """Build a username-indexed DataFrame from fetched profile records.

    Args:
        profile_data: Mapping of ``username -> profile record dict``. Falsy
            records are skipped; the input dicts are copied, not mutated.

    Returns:
        An empty DataFrame when there is nothing to tabulate; otherwise a frame
        indexed by ``username`` with:

        * ``edge_followed_by.count`` / ``edge_follow.count`` - the ``count``
          value from the corresponding nested dict column, ``0`` where it is
          unavailable (non-dict cell or missing source column).
        * ``is_verified`` / ``is_private`` cast to ``bool`` when present.
        * ``timestamp_ist`` converted with ``pd.to_datetime`` when present.
    """
    if not profile_data:
        print("No profile data to create DataFrame.")
        return pd.DataFrame()  # Return an empty DataFrame

    # Copy each record and fall back to the mapping key when the record itself
    # carries no "username", so set_index below cannot fail on a missing column.
    records = []
    for key, record in profile_data.items():
        if not record:
            continue
        row = dict(record)
        row.setdefault("username", key)
        records.append(row)

    df = pd.DataFrame(records)
    if df.empty:
        return df

    df = df.set_index("username")  # Use username as index

    # DataFrame.get(col, {}) would hand back a plain dict for a missing column
    # (which has no .apply), so the column's presence is checked explicitly.
    for source_col in ("edge_followed_by", "edge_follow"):
        if source_col in df.columns:
            counts = df[source_col].apply(
                lambda cell: cell.get("count") if isinstance(cell, dict) else None
            )
        else:
            counts = None
        df[f"{source_col}.count"] = counts

    for flag_col in ("is_verified", "is_private"):
        if flag_col in df.columns:
            df[flag_col] = df[flag_col].astype(bool)

    if "timestamp_ist" in df.columns:
        df["timestamp_ist"] = pd.to_datetime(df["timestamp_ist"])

    # Unavailable counts are reported as 0 rather than left missing.
    df = df.fillna(
        {
            "edge_followed_by.count": 0,
            "edge_follow.count": 0,
        }
    )
    return df


def create_media_dataframe(all_media_rows):
    """Turn the collected media row dicts into a DataFrame with no missing values.

    Returns an empty DataFrame (after printing a notice) when there are no
    rows; otherwise every known column has its NaN/None cells replaced by the
    column's default.
    """
    if not all_media_rows:
        print("No media data to create DataFrame.")
        return pd.DataFrame()

    # Per-column replacement for missing cells; columns absent from the frame
    # are simply ignored by fillna.
    column_defaults = {
        "full_name": "",
        "username": "",
        "og_username": "",
        "pk": "",
        "id": 0,
        "link": "",
        "media_type": "",
        "is_video": False,
        "carousel_media_count": 0,
        "caption": "",
        "taken_at_ist": "",
        "like_count": 0,
        "comment_count": 0,
        "reshare_count": 0,
        "share_count_disabled": False,
        "play_count": 0,
        "is_collab": False,
        "collab_with": "",
        "audio_title": "",
        "audio_url": "",
    }
    return pd.DataFrame(all_media_rows).fillna(column_defaults)


# --- Main Function ---
def main():
    """Fetch profile and media data for every configured username.

    Prints both tables and returns ``(df_profile, df_media)``; either frame
    may be empty when nothing could be fetched.
    """
    all_profile_data = {}
    all_media_rows = []

    for username in usernames:
        # 1. Profile lookup; without it there is no id to query media with.
        profile_data = get_instagram_profile_data(username, api_key)
        if not profile_data:
            continue
        all_profile_data[username] = profile_data

        # 2. Media lookup for that profile.
        profile_id = profile_data.get("profile_id")
        if not profile_id:
            continue
        media_data = get_user_media(profile_id, api_key)
        if not media_data:
            continue

        items = media_data.get("response", {}).get("body", {}).get("items", [])
        # The queried username is recorded on every row as og_username.
        processed = (process_media_item(item, username) for item in items)
        all_media_rows.extend(row for row in processed if row)

    # 3. Tabulate.
    df_profile = create_profile_dataframe(all_profile_data)
    df_media = create_media_dataframe(all_media_rows)

    # Group rows by queried account, newest media first within each account.
    if not df_media.empty:
        df_media = df_media.sort_values(
            by=["og_username", "taken_at_ist"], ascending=[True, False]
        )

    # The two frames are kept separate (no profile/media merge is performed).
    print("Profile DataFrame:")
    print(df_profile)
    print("\nMedia DataFrame:")
    print(df_media)
    return df_profile, df_media


# --- Execution ---
if __name__ == "__main__":
    # Run the full fetch; main() returns the profile table and the media table.
    profile_df, media_df = main()

    # --- Storage: write each non-empty table to a CSV in the working directory ---
    # Empty frames are skipped so no header-only files are produced.
    if not profile_df.empty:
        profile_df.to_csv("instagram_profiles.csv")
        print("Profile data saved to instagram_profiles.csv")
    if not media_df.empty:
        media_df.to_csv("instagram_media.csv")  # media table only; it is not merged with profiles
        print("Media data saved to instagram_media.csv")

Profile DataFrame:
             ai_agent_type                                          bio_links  \
username                                                                        
naukridotcom          None  [{'link_type': 'external', 'lynx_url': 'https:...   
swiggyindia           None                                                 []   
zomato                None                                                 []   
instagram             None  [{'link_type': 'external', 'lynx_url': 'https:...   

                                                      biography  \
username                                                          
naukridotcom  ✨Ambition starts here - job search, resume tip...   
swiggyindia   khaana khau raat bhar 🥰\ncrazy collabs karu ha...   
zomato                   Serving meals to those making history.   
instagram                   Discover what's new on Instagram 🔎✨   

                                        biography_with_entities  \
username                

In [1]:
##### FINAL QUERY #####

import requests
import pandas as pd
import datetime
import pytz
import os  # For handling API key (in a Colab-friendly way)
from google.colab import userdata  # For retrieving secrets in Colab

# --- Constants and Configuration ---
# Accounts to pull; replace with your own list.
usernames = ["naukridotcom", "swiggyindia", "zomato", "instagram"]

# The RapidAPI key is kept out of the notebook: store it in Colab's "Secrets"
# panel under the name INSTAGRAM_API_KEY and it is read from there at run time.
api_key = userdata.get("INSTAGRAM_API_KEY")
if not api_key:
    raise ValueError(
        "Instagram API key not found in Colab's secrets. "
        "Go to the Secrets tab and add a secret named 'INSTAGRAM_API_KEY'."
    )

# API endpoints (profile lookup by username, media listing by profile id)
url_profile = "https://starapi1.p.rapidapi.com/instagram/user/get_web_profile_info"
url_media = "https://starapi1.p.rapidapi.com/instagram/user/get_media"

# Timezone used for every timestamp written by this cell
ist = pytz.timezone("Asia/Kolkata")
# Label stored for each numeric media_type code
media_type_map = {1: "post", 2: "reel", 8: "carousel"}


# --- Helper Functions ---
def get_instagram_profile_data(username, api_key):
    """Fetch Instagram profile data for a given username.

    On success the user dict from the payload is returned, extended with:

    * ``profile_id``          - copy of the user's ``id`` value.
    * ``username``            - the handle that was requested.
    * ``processed_bio_links`` - list of ``{"title": ..., "url": ...}`` dicts.
    * ``timestamp_ist``       - fetch time in IST.

    Returns ``None`` (after printing the reason) on request errors or when the
    payload does not have the expected structure.
    """
    payload = {"username": username}
    headers = {
        "x-rapidapi-key": api_key,
        "x-rapidapi-host": "starapi1.p.rapidapi.com",
        "Content-Type": "application/json",
    }
    try:
        response = requests.post(url_profile, json=payload, headers=headers)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
        data = response.json()

        if (
            data
            and data["status"] == "done"
            and data["response"]["body"]["data"]["user"]
        ):
            user_data = data["response"]["body"]["data"]["user"]
            user_data["profile_id"] = user_data.get(
                "id"
            )  # Add 'profile_id' for later use
            user_data["username"] = username  # Ensure username is present

            # Process bio links. The profile output printed by this notebook
            # shows link records under "bio_links", so that is the primary
            # source; the previous source ("biography_with_entities" ->
            # "entities") is kept as a fallback when "bio_links" is absent.
            # NOTE(review): assumes bio_links entries carry "title"/"url" keys -
            # confirm against a live payload.
            raw_links = user_data.get("bio_links")
            if raw_links is None:
                # `or {}` / `or []`: either level may be present but null,
                # which would otherwise raise an uncaught AttributeError.
                entities_holder = user_data.get("biography_with_entities") or {}
                raw_links = entities_holder.get("entities") or []

            user_data["processed_bio_links"] = [
                {"title": link_data.get("title", ""), "url": link_data.get("url", "")}
                for link_data in raw_links
                if isinstance(link_data, dict)
            ]
            user_data["timestamp_ist"] = datetime.datetime.now(ist)
            return user_data
        else:
            print(
                f"Error fetching profile data for {username}: {data}"
            )  # Detailed error
            return None
    except requests.exceptions.RequestException as e:
        print(f"Request error for profile {username}: {e}")
        return None
    except (KeyError, TypeError) as e:  # Catch potential data structure errors
        print(f"Data parsing error for profile {username}: {e}")
        return None


def get_user_media(profile_id, api_key, count=50):
    """Request up to ``count`` recent media items for a profile ID.

    Returns the decoded JSON payload, or ``None`` (after printing the reason)
    when the request or the decoding fails.
    """
    request_headers = {
        "x-rapidapi-key": api_key,
        "x-rapidapi-host": "starapi1.p.rapidapi.com",
        "Content-Type": "application/json",
    }
    request_body = {"id": profile_id, "count": count}

    try:
        reply = requests.post(url_media, json=request_body, headers=request_headers)
        reply.raise_for_status()  # 4xx/5xx become HTTPError
        return reply.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching media for profile {profile_id}: {e}")
        return None
    except (KeyError, TypeError) as e:  # defensive: unexpected parsing problems
        print(f"Data parsing error for media {profile_id}: {e}")
        return None


def process_media_item(item, og_username, profile_data):
    """Flatten one media item from the API into a row dict.

    Args:
        item: Media item dict taken from the ``items`` list of a media payload.
        og_username: Username whose feed was queried. It can differ from the
            item's own ``user.username`` (e.g. for collaboration posts).
        profile_data: Profile record of the queried account; only its
            ``edge_followed_by.count`` is read. May be falsy.

    Returns:
        ``None`` for a falsy ``item``; otherwise a dict with the columns built
        below. Missing or null fields fall back to ``""``, ``0`` or ``False``.
    """
    if not item:
        return None  # Handle potentially missing items

    # Throughout, `x.get(key) or default` is used instead of `x.get(key, default)`
    # so that a key present with a null value is treated like a missing key.
    ts = item.get("taken_at", 0) or 0
    dt_ist = datetime.datetime.fromtimestamp(ts, ist)
    taken_str = dt_ist.strftime("%Y-%m-%d %H:%M")

    code = item.get("code", "") or ""
    link = f"https://www.instagram.com/p/{code}/"

    coauthors = item.get("coauthor_producers") or []
    is_collab = bool(coauthors)
    collab_with = ", ".join(c.get("username", "") for c in coauthors) if is_collab else ""

    music_meta = item.get("music_metadata") or {}
    mi = music_meta.get("music_info") or {}
    audio_title = item.get("title") or mi.get("title") or ""
    audio_url = (
        item.get("progressive_download_url")
        or item.get("fast_start_progressive_download_url")
        or music_meta.get("cover_artwork_uri")
        or ""
    )

    play_count = item.get("play_count") or 0
    # Leading apostrophe keeps the id as text.
    # NOTE(review): presumably to stop spreadsheet tools from rounding the long id - confirm.
    pk = "'" + str(item.get("pk", 0))

    # Time this row was produced, plus the follower count of the queried
    # account at that moment (0 when no profile record is available).
    sync_datetime = datetime.datetime.now(ist).strftime("%Y-%m-%d %H:%M:%S")
    follower_count = (
        (profile_data.get("edge_followed_by") or {}).get("count", 0)
        if profile_data
        else 0
    )

    # "user" / "caption" can be present but null (e.g. a post with no caption);
    # item.get(key, {}) would then return None and .get would raise.
    owner = item.get("user") or {}
    caption = item.get("caption") or {}
    if not isinstance(owner, dict):
        owner = {}
    if not isinstance(caption, dict):
        caption = {}

    row = {
        "full_name": owner.get("full_name") or "",
        "username": owner.get("username") or "",
        "og_username": og_username,
        "pk": pk,
        "id": item.get("id") or 0,
        "link": link,
        "media_type": media_type_map.get(item.get("media_type"), ""),
        "is_video": item.get("is_video") or False,
        "carousel_media_count": item.get("carousel_media_count") or 0,
        "caption": caption.get("text") or "",
        "taken_at_ist": taken_str,
        "like_count": item.get("like_count") or 0,
        "comment_count": item.get("comment_count") or 0,
        "reshare_count": item.get("reshare_count") or 0,
        "share_count_disabled": item.get("share_count_disabled") or False,
        "play_count": play_count,
        "is_collab": is_collab,
        "collab_with": collab_with,
        "audio_title": audio_title,
        "audio_url": audio_url,
        "sync_datetime": sync_datetime,
        "follower_count": follower_count,
    }
    return row


def create_profile_dataframe(profile_data):
    """Build a username-indexed DataFrame from per-user profile dicts.

    Args:
        profile_data: Mapping whose values are profile dicts (one per
            account). Only the values are used.

    Returns:
        A DataFrame with one row per profile. When the ``username`` column is
        present it becomes the index. Two derived columns,
        ``edge_followed_by.count`` and ``edge_follow.count``, are always
        present and default to 0 when the source edge dict (or its ``count``)
        is missing. An empty DataFrame is returned for empty input.
    """
    if not profile_data:
        print("No profile data to create DataFrame.")
        return pd.DataFrame()  # Return an empty DataFrame

    df = pd.DataFrame(list(profile_data.values()))
    if df.empty:
        return df

    if "username" in df.columns:
        df = df.set_index("username")  # Use username as index

    def _edge_count(edge):
        # Edge values are expected to be dicts like {"count": N}; anything
        # else (NaN, None, ...) is treated as "no count available".
        return edge.get("count") if isinstance(edge, dict) else None

    # Derive the flat count columns. A missing source column must not crash:
    # DataFrame.get(col, {}) would hand back a plain dict with no .apply().
    for edge_col in ("edge_followed_by", "edge_follow"):
        count_col = f"{edge_col}.count"
        if edge_col in df.columns:
            df[count_col] = df[edge_col].apply(_edge_count)
        else:
            df[count_col] = 0

    # Normalise dtypes only for the columns the payload actually carried.
    for flag_col in ("is_verified", "is_private"):
        if flag_col in df.columns:
            df[flag_col] = df[flag_col].astype(bool)
    if "timestamp_ist" in df.columns:
        df["timestamp_ist"] = pd.to_datetime(df["timestamp_ist"])

    return df.fillna(
        {
            "edge_followed_by.count": 0,
            "edge_follow.count": 0,
        }
    )


def create_media_dataframe(all_media_rows):
    """Turn a list of media row dicts into a DataFrame with NaNs defaulted.

    Missing values are replaced per column: text columns become "", numeric
    columns become 0 and boolean columns become False. An empty input prints
    a notice and yields an empty DataFrame.
    """
    if all_media_rows:
        text_columns = (
            "full_name",
            "username",
            "og_username",
            "pk",
            "link",
            "media_type",
            "caption",
            "taken_at_ist",
            "collab_with",
            "audio_title",
            "audio_url",
            "sync_datetime",
        )
        numeric_columns = (
            "id",
            "carousel_media_count",
            "like_count",
            "comment_count",
            "reshare_count",
            "play_count",
            "follower_count",
        )
        boolean_columns = ("is_video", "share_count_disabled", "is_collab")

        # One fill value per column, grouped by the kind of default it takes.
        column_defaults = {
            **dict.fromkeys(text_columns, ""),
            **dict.fromkeys(numeric_columns, 0),
            **dict.fromkeys(boolean_columns, False),
        }
        return pd.DataFrame(all_media_rows).fillna(column_defaults)

    print("No media data to create DataFrame.")
    return pd.DataFrame()


# --- Main Function ---
def main():
    """Collect profile and media data for every configured username.

    For each entry in ``usernames`` the profile is fetched first; accounts
    whose profile, profile id or media payload is unavailable are skipped.
    Returns the tuple ``(df_profile, df_media)`` after printing both frames.
    """
    all_profile_data = {}
    all_media_rows = []

    for username in usernames:
        # 1. Profile: without it there is nothing else to fetch for this user.
        profile_data = get_instagram_profile_data(username, api_key)
        if not profile_data:
            continue
        all_profile_data[username] = profile_data

        # 2. Media: needs the numeric profile id returned with the profile.
        profile_id = profile_data.get("profile_id")
        if not profile_id:
            continue

        media_data = get_user_media(profile_id, api_key)
        if not media_data:
            continue

        body = media_data.get("response", {}).get("body", {})
        for item in body.get("items", []):
            # The requested username is passed along as og_username.
            media_row = process_media_item(item, username, profile_data)
            if media_row:
                all_media_rows.append(media_row)

    # 3. Build the two DataFrames.
    df_profile = create_profile_dataframe(all_profile_data)
    df_media = create_media_dataframe(all_media_rows)

    # Order media by account (ascending), newest post first within an account.
    if not df_media.empty:
        df_media = df_media.sort_values(
            by=["og_username", "taken_at_ist"], ascending=[True, False]
        )

    # NOTE: merging df_media with df_profile on 'username' (left join) was
    # considered here but is intentionally not performed; the two frames are
    # reported and returned separately.
    print("Profile DataFrame:")
    print(df_profile)
    print("\nMedia DataFrame:")
    print(df_media)
    return df_profile, df_media


# --- Execution ---
if __name__ == "__main__":
    profile_df, media_df = main()

    # Persist each non-empty frame to its CSV file and report what was saved.
    for frame, csv_path, saved_message in (
        (profile_df, "instagram_profiles.csv", "Profile data saved to instagram_profiles.csv"),
        (media_df, "instagram_media.csv", "Media data saved to instagram_media.csv"),
    ):
        if not frame.empty:
            frame.to_csv(csv_path)
            print(saved_message)

Profile DataFrame:
             ai_agent_type                                          bio_links  \
username                                                                        
naukridotcom          None  [{'link_type': 'external', 'lynx_url': 'https:...   
swiggyindia           None                                                 []   
zomato                None                                                 []   
instagram             None  [{'link_type': 'external', 'lynx_url': 'https:...   

                                                      biography  \
username                                                          
naukridotcom  ✨Ambition starts here - job search, resume tip...   
swiggyindia   khaana khau raat bhar 🥰\ncrazy collabs karu ha...   
zomato                   Serving meals to those making history.   
instagram                   Discover what's new on Instagram 🔎✨   

                                        biography_with_entities  \
username                

In [3]:
import os
import requests
import pandas as pd
import datetime
import pytz
from google.colab import userdata

# --- Configuration ---
# Accounts to fetch; each entry is sent as the "username" of a profile request.
usernames = ["naukridotcom", "swiggyindia", "zomato", "instagram"]
# API key is read from Colab Secrets instead of being hardcoded in the notebook.
api_key = userdata.get("INSTAGRAM_API_KEY")
if not api_key:
    # Fail fast: every request below sends this key in its headers.
    raise ValueError("Set INSTAGRAM_API_KEY in Colab Secrets")

# Endpoints used for the profile, media and stories requests.
url_profile = "https://starapi1.p.rapidapi.com/instagram/user/get_web_profile_info"
url_media   = "https://starapi1.p.rapidapi.com/instagram/user/get_media"
url_stories = "https://starapi1.p.rapidapi.com/instagram/user/get_stories"

# Timezone used when formatting timestamps (the *_ist fields).
ist = pytz.timezone("Asia/Kolkata")
# Numeric media_type codes -> label stored in the "media_type" column.
# Codes absent from these maps are rendered as "" by the processing functions.
media_type_map = {1: "post", 2: "reel", 8: "carousel"}
story_type_map = {1: "photo", 2: "video"}


def get_instagram_profile_data(username, timeout=30):
    """Fetch the profile dict for ``username`` from the profile endpoint.

    Args:
        username: Account name sent as the ``username`` field of the payload.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout ``requests`` can block indefinitely.

    Returns:
        The ``user`` dict from the response, augmented with ``profile_id``
        (copy of ``id``), ``timestamp_ist`` (fetch time in IST) and
        ``username`` (overwritten with the requested name).

    Raises:
        requests.HTTPError: on a non-2xx response.
        ValueError: when the response carries no user object.
    """
    payload = {"username": username}
    headers = {
        "x-rapidapi-key": api_key,
        "x-rapidapi-host": "starapi1.p.rapidapi.com",
        "Content-Type": "application/json",
    }
    r = requests.post(url_profile, json=payload, headers=headers, timeout=timeout)
    r.raise_for_status()
    data = r.json()

    # A 200 response can still lack the user (error payload, unknown account);
    # report that explicitly instead of failing on item assignment to None.
    try:
        user = data["response"]["body"]["data"]["user"]
    except (KeyError, TypeError):
        user = None
    if not user:
        raise ValueError(f"No profile returned for {username}: {str(data)[:200]}")

    user["profile_id"] = user["id"]
    user["timestamp_ist"] = datetime.datetime.now(ist)
    user["username"] = username
    return user


def get_user_media(profile_id, count=50, timeout=30):
    """Fetch the media payload for a profile id.

    Args:
        profile_id: Sent as the ``id`` field of the request payload.
        count: Sent as the ``count`` field of the request payload.
        timeout: Seconds to wait for the HTTP request; prevents the call
            from hanging forever on a stalled connection.

    Returns:
        The decoded JSON response.

    Raises:
        requests.HTTPError: on a non-2xx response.
    """
    payload = {"id": profile_id, "count": count}
    headers = {
        "x-rapidapi-key": api_key,
        "x-rapidapi-host": "starapi1.p.rapidapi.com",
        "Content-Type": "application/json",
    }
    r = requests.post(url_media, json=payload, headers=headers, timeout=timeout)
    r.raise_for_status()
    return r.json()


def get_user_stories(profile_id, timeout=30):
    """Fetch the stories payload for a single profile id.

    Args:
        profile_id: Sent as the only element of the ``ids`` list.
        timeout: Seconds to wait for the HTTP request; prevents the call
            from hanging forever on a stalled connection.

    Returns:
        The decoded JSON response.

    Raises:
        requests.HTTPError: on a non-2xx response.
    """
    payload = {"ids": [profile_id]}
    headers = {
        "x-rapidapi-key": api_key,
        "x-rapidapi-host": "starapi1.p.rapidapi.com",
        "Content-Type": "application/json",
    }
    r = requests.post(url_stories, json=payload, headers=headers, timeout=timeout)
    r.raise_for_status()
    return r.json()


def process_media_item(item, og_username, profile_data):
    """Flatten one media item into a row dict for the combined media table.

    Args:
        item: Media item dict from the media response.
        og_username: Username the item was requested for.
        profile_data: Accepted for interface compatibility; not used here.

    Returns:
        A dict with the shared post/story columns. Story-only fields
        (``story_id``, ``expire_datetime_ist``) are empty strings and
        ``is_story`` is False. The untouched item is kept under ``raw_data``.
    """
    ts = item.get("taken_at", 0) or 0
    post_dt = datetime.datetime.fromtimestamp(ts, ist).strftime("%Y-%m-%d %H:%M")
    code = item.get("code", "") or ""
    link = f"https://www.instagram.com/p/{code}/"

    # "user" and "caption" may be absent or null; a null caption would make a
    # chained .get() raise AttributeError, so normalise both first.
    user = item.get("user") or {}
    caption = item.get("caption")
    if isinstance(caption, dict):
        caption_text = caption.get("text") or ""
    else:
        caption_text = caption or ""

    row = {
        "full_name":             user.get("full_name",""),
        "username":              user.get("username",""),
        "og_username":           og_username,
        "story_id":              "",
        # Leading apostrophe keeps the value a literal string (presumably so
        # spreadsheet tools do not reformat the long number; confirm).
        "pk":                    f"'{item.get('pk',0)}",
        "id":                    item.get("id",0),
        "link":                  link,
        "media_type":            media_type_map.get(item.get("media_type"),""),
        "is_video":              item.get("is_video",False),
        "carousel_media_count":  item.get("carousel_media_count",0),
        "caption":               caption_text,
        "post_datetime_ist":     post_dt,
        "expire_datetime_ist":   "",
        "is_paid_partnership":   "Yes" if item.get("is_paid_partnership") else "No",
        "is_reel_media":         item.get("is_reel_media",False),
        "is_story":              False,
        "raw_data":              item,
    }
    return row


def process_story_item(item, reel_meta, og_username):
    """Flatten one story item into a row dict for the combined media table.

    Args:
        item: Story item dict (one entry of the reel's ``items``).
        reel_meta: The reel dict the item belongs to; supplies the user,
            expiry and partnership fields.
        og_username: Username the stories were requested for.

    Returns:
        A dict with the shared post/story columns. Post-only fields (``pk``,
        ``id``, ``link``) are empty strings and ``is_story`` is True.
        ``expire_datetime_ist`` is "" when the reel carries no expiry.
    """
    ts = item.get("taken_at", 0) or 0
    post_dt = datetime.datetime.fromtimestamp(ts, ist).strftime("%Y-%m-%d %H:%M")

    # A missing expiry must stay empty ("" is the table's "no value" default)
    # rather than being rendered as the 1970 epoch.
    exp_ts = reel_meta.get("expiring_at",0) or 0
    exp_dt = (
        datetime.datetime.fromtimestamp(exp_ts, ist).strftime("%Y-%m-%d %H:%M")
        if exp_ts
        else ""
    )

    user = reel_meta.get("user") or {}
    # Captions may arrive as a {"text": ...} dict (as media captions do) or a
    # plain string/null; always store plain text.
    caption = item.get("caption")
    if isinstance(caption, dict):
        caption = caption.get("text")

    row = {
        "full_name":             user.get("full_name",""),
        "username":              user.get("username",""),
        "og_username":           og_username,
        "story_id":              item.get("id",""),
        "pk":                    "",
        "id":                    "",
        "link":                  "",
        "media_type":            story_type_map.get(item.get("media_type"),""),
        "is_video":              item.get("media_type")==2,
        "carousel_media_count":  0,
        "caption":               caption or "",
        "post_datetime_ist":     post_dt,
        "expire_datetime_ist":   exp_dt,
        "is_paid_partnership":   "Yes" if reel_meta.get("is_paid_partnership") else "No",
        "is_reel_media":         reel_meta.get("is_reel_media",False),
        "is_story":              True,
        "raw_data":              item,
    }
    return row


def main():
    """Fetch profiles, posts and stories for all usernames and tabulate them.

    Any request or payload error propagates to the caller. Prints and returns
    ``(df_profile, df_media)``, where ``df_media`` holds post rows followed by
    story rows for each account, in the order of ``usernames``.
    """
    profiles_by_user = {}
    combined_rows = []

    for uname in usernames:
        prof = get_instagram_profile_data(uname)
        profiles_by_user[uname] = prof
        pid = prof["profile_id"]

        # Posts for this account.
        media_items = get_user_media(pid)["response"]["body"]["items"]
        combined_rows.extend(
            process_media_item(entry, uname, prof) for entry in media_items
        )

        # Stories: the response's reels are keyed by the profile id as a string.
        reel = get_user_stories(pid)["response"]["body"]["reels"].get(str(pid), {})
        combined_rows.extend(
            process_story_item(entry, reel, uname) for entry in reel.get("items", [])
        )

    # Per-column replacements for missing values in the combined table.
    fill_defaults = {
        "full_name": "",
        "username": "",
        "og_username": "",
        "story_id": "",
        "pk": "",
        "id": 0,
        "link": "",
        "media_type": "",
        "is_video": False,
        "carousel_media_count": 0,
        "caption": "",
        "post_datetime_ist": "",
        "expire_datetime_ist": "",
        "is_paid_partnership": "No",
        "is_reel_media": False,
        "is_story": False,
    }
    df_media = pd.DataFrame(combined_rows).fillna(fill_defaults)
    df_profile = pd.DataFrame(profiles_by_user.values()).set_index("username")

    print("Profiles:")
    print(df_profile)
    print("\nCombined Media+Stories:")
    print(df_media)
    return df_profile, df_media


# Run the fetch when the cell/script is executed directly; the two resulting
# frames stay available as module-level names for later inspection.
if __name__=="__main__":
    profile_df, media_df = main()

Profiles:
             ai_agent_type                                          bio_links  \
username                                                                        
naukridotcom          None  [{'link_type': 'external', 'lynx_url': 'https:...   
swiggyindia           None                                                 []   
zomato                None                                                 []   
instagram             None  [{'link_type': 'external', 'lynx_url': 'https:...   

                                                      biography  \
username                                                          
naukridotcom  ✨Ambition starts here - job search, resume tip...   
swiggyindia   khaana khau raat bhar 🥰\ncrazy collabs karu ha...   
zomato                   Serving meals to those making history.   
instagram                   Discover what's new on Instagram 🔎✨   

                                        biography_with_entities  \
username                         

In [6]:
import os
import requests
import pandas as pd
import datetime
import pytz
import time
from google.colab import userdata, drive
from typing import Dict, List, Optional
import sqlite3

# Mount Google Drive
# The database file and the CSV exports below live under /content/drive.
drive.mount('/content/drive')

# --- Configuration ---
# Accounts to fetch; each entry is sent as the "username" of a profile request.
usernames = ["naukridotcom", "swiggyindia", "zomato", "instagram"]
# API key is read from Colab Secrets instead of being hardcoded in the notebook.
api_key = userdata.get("INSTAGRAM_API_KEY")
if not api_key:
    # Fail fast: every request below sends this key in its headers.
    raise ValueError("Set INSTAGRAM_API_KEY in Colab Secrets")

# Endpoints used for the profile, media and stories requests.
url_profile = "https://starapi1.p.rapidapi.com/instagram/user/get_web_profile_info"
url_media = "https://starapi1.p.rapidapi.com/instagram/user/get_media"
url_stories = "https://starapi1.p.rapidapi.com/instagram/user/get_stories"

# Timezone used when formatting and stamping timestamps.
ist = pytz.timezone("Asia/Kolkata")
# Numeric media_type codes -> label stored in the "media_type" column.
# Codes absent from these maps are rendered as "" by the processing functions.
media_type_map = {1: "post", 2: "reel", 8: "carousel"}
story_type_map = {1: "photo", 2: "video"}

# Database setup
# Default SQLite file used by InstagramDataManager.
DB_FILE = "/content/drive/MyDrive/instagram_data.db"

class InstagramDataManager:
    """SQLite-backed store for Instagram profile, post and story rows.

    Two tables are used: ``profiles`` (one row per username, replaced on each
    save) and ``media`` (posts and stories together, distinguished by
    ``is_story``). Every method opens its own connection and always closes
    it, even when a statement raises.
    """

    def __init__(self, db_file: str = DB_FILE):
        """Remember the database path and make sure the tables exist."""
        self.db_file = db_file
        self.init_database()

    @staticmethod
    def _now_text() -> str:
        """Current IST time as text, ready to bind as a SQL parameter.

        Produces the same ``isoformat(" ")`` text the sqlite3 default
        datetime adapter would write, without relying on that adapter
        (deprecated since Python 3.12).
        """
        return datetime.datetime.now(ist).isoformat(" ")

    def init_database(self):
        """Initialize SQLite database with required tables."""
        conn = sqlite3.connect(self.db_file)
        try:
            cursor = conn.cursor()

            # Create profiles table
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS profiles (
                    username TEXT PRIMARY KEY,
                    full_name TEXT,
                    biography TEXT,
                    follower_count INTEGER,
                    following_count INTEGER,
                    media_count INTEGER,
                    is_verified BOOLEAN,
                    is_private BOOLEAN,
                    profile_pic_url TEXT,
                    last_updated TIMESTAMP
                )
            ''')

            # Create media table (includes both posts and stories)
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS media (
                    id TEXT PRIMARY KEY,
                    story_id TEXT,
                    pk TEXT,
                    username TEXT,
                    og_username TEXT,
                    full_name TEXT,
                    link TEXT,
                    media_type TEXT,
                    is_video BOOLEAN,
                    carousel_media_count INTEGER,
                    caption TEXT,
                    post_datetime_ist TIMESTAMP,
                    expire_datetime_ist TEXT,
                    is_paid_partnership TEXT,
                    is_reel_media BOOLEAN,
                    is_story BOOLEAN,
                    first_fetched TIMESTAMP,
                    last_updated TIMESTAMP,
                    raw_data TEXT
                )
            ''')

            conn.commit()
        finally:
            conn.close()

    def get_existing_media_ids(self, og_username: str) -> set:
        """Return the set of non-empty ``id`` values stored for a username."""
        conn = sqlite3.connect(self.db_file)
        try:
            cursor = conn.cursor()
            cursor.execute("SELECT id FROM media WHERE og_username = ? AND id != ''", (og_username,))
            return {row[0] for row in cursor.fetchall()}
        finally:
            conn.close()

    def get_existing_story_ids(self, og_username: str) -> set:
        """Return the set of non-empty ``story_id`` values stored for a username."""
        conn = sqlite3.connect(self.db_file)
        try:
            cursor = conn.cursor()
            cursor.execute("SELECT story_id FROM media WHERE og_username = ? AND story_id != ''", (og_username,))
            return {row[0] for row in cursor.fetchall()}
        finally:
            conn.close()

    def save_profile_data(self, profile_data: Dict):
        """Insert or replace the ``profiles`` row for one profile dict.

        Counts are read from the ``count`` key of the ``edge_followed_by``,
        ``edge_follow`` and ``edge_owner_to_timeline_media`` dicts and fall
        back to 0 when an edge dict is missing or null.
        """
        def _edge_count(key: str):
            # "or {}" also covers an edge that is present but null.
            return (profile_data.get(key) or {}).get('count', 0)

        conn = sqlite3.connect(self.db_file)
        try:
            cursor = conn.cursor()
            cursor.execute('''
                INSERT OR REPLACE INTO profiles
                (username, full_name, biography, follower_count, following_count,
                 media_count, is_verified, is_private, profile_pic_url, last_updated)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ''', (
                profile_data.get('username'),
                profile_data.get('full_name', ''),
                profile_data.get('biography', ''),
                _edge_count('edge_followed_by'),
                _edge_count('edge_follow'),
                _edge_count('edge_owner_to_timeline_media'),
                profile_data.get('is_verified', False),
                profile_data.get('is_private', False),
                profile_data.get('profile_pic_url', ''),
                self._now_text()
            ))
            conn.commit()
        finally:
            conn.close()

    def save_media_data(self, media_rows: List[Dict]):
        """Insert new post/story rows and refresh already-stored posts.

        A story is looked up by ``story_id`` and stored under the primary key
        ``story_<story_id>``; a post is looked up and stored by ``id``.
        Existing posts get their caption, ``raw_data`` and ``last_updated``
        refreshed; existing stories are left untouched. Nothing is committed
        if any statement fails.
        """
        if not media_rows:
            return

        conn = sqlite3.connect(self.db_file)
        try:
            cursor = conn.cursor()
            current_time = self._now_text()

            for row in media_rows:
                # Generate unique key for both posts and stories
                if row['is_story']:
                    unique_id = f"story_{row['story_id']}"
                    cursor.execute("SELECT first_fetched FROM media WHERE story_id = ?", (row['story_id'],))
                else:
                    unique_id = str(row['id'])
                    cursor.execute("SELECT first_fetched FROM media WHERE id = ?", (row['id'],))

                existing = cursor.fetchone()

                if existing:
                    # Update existing media (stories typically don't need updates, but posts do)
                    if not row['is_story']:
                        cursor.execute('''
                            UPDATE media SET
                            caption = ?, last_updated = ?, raw_data = ?
                            WHERE id = ?
                        ''', (
                            row['caption'], current_time, str(row.get('raw_data', '')),
                            row['id']
                        ))
                else:
                    # Insert new media/story
                    cursor.execute('''
                        INSERT INTO media
                        (id, story_id, pk, username, og_username, full_name, link, media_type,
                         is_video, carousel_media_count, caption, post_datetime_ist,
                         expire_datetime_ist, is_paid_partnership, is_reel_media, is_story,
                         first_fetched, last_updated, raw_data)
                        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                    ''', (
                        str(row['id']) if row['id'] else unique_id,
                        row['story_id'], row['pk'], row['username'], row['og_username'],
                        row['full_name'], row['link'], row['media_type'], row['is_video'],
                        row['carousel_media_count'], row['caption'], row['post_datetime_ist'],
                        row['expire_datetime_ist'], row['is_paid_partnership'],
                        row['is_reel_media'], row['is_story'], current_time, current_time,
                        str(row.get('raw_data', ''))
                    ))

            conn.commit()
        finally:
            # Closing without a commit discards a partially written batch.
            conn.close()

    def get_media_dataframe(self, limit: Optional[int] = None) -> pd.DataFrame:
        """Return the ``media`` table as a DataFrame.

        Rows are ordered by ``og_username`` ascending, then
        ``post_datetime_ist`` descending. A truthy ``limit`` caps the number
        of rows; ``None`` or 0 returns everything.
        """
        query = '''
            SELECT * FROM media
            ORDER BY og_username ASC, post_datetime_ist DESC
        '''
        params = None
        if limit:
            # Bind the limit instead of formatting it into the SQL text.
            query += " LIMIT ?"
            params = (int(limit),)

        conn = sqlite3.connect(self.db_file)
        try:
            return pd.read_sql_query(query, conn, params=params)
        finally:
            conn.close()

    def get_profile_dataframe(self) -> pd.DataFrame:
        """Return the ``profiles`` table as a DataFrame ordered by username."""
        conn = sqlite3.connect(self.db_file)
        try:
            return pd.read_sql_query("SELECT * FROM profiles ORDER BY username", conn)
        finally:
            conn.close()

    def cleanup_expired_stories(self):
        """Delete story rows whose ``expire_datetime_ist`` is in the past.

        The stored expiry and the current time are compared as
        "YYYY-MM-DD HH:MM" text. Stories with an empty expiry are kept.
        """
        conn = sqlite3.connect(self.db_file)
        try:
            cursor = conn.cursor()

            current_time = datetime.datetime.now(ist).strftime("%Y-%m-%d %H:%M")
            cursor.execute('''
                DELETE FROM media
                WHERE is_story = 1 AND expire_datetime_ist != '' AND expire_datetime_ist < ?
            ''', (current_time,))

            deleted_count = cursor.rowcount
            conn.commit()
        finally:
            conn.close()

        if deleted_count > 0:
            print(f"Cleaned up {deleted_count} expired stories")

# --- API Functions ---
def get_instagram_profile_data(username, timeout=30):
    """Fetch the profile dict for ``username``; return None on any failure.

    Args:
        username: Account name sent as the ``username`` field of the payload.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout ``requests`` can block indefinitely.

    Returns:
        The ``user`` dict from the response, augmented with ``profile_id``
        (copy of ``id``), ``timestamp_ist`` (fetch time in IST) and
        ``username`` (overwritten with the requested name); or ``None`` after
        printing the error, so callers can skip the account.
    """
    payload = {"username": username}
    headers = {
        "x-rapidapi-key": api_key,
        "x-rapidapi-host": "starapi1.p.rapidapi.com",
        "Content-Type": "application/json",
    }
    try:
        r = requests.post(url_profile, json=payload, headers=headers, timeout=timeout)
        r.raise_for_status()
        data = r.json()
        user = data["response"]["body"]["data"]["user"]
        if not user:
            # Give the log a readable reason instead of a TypeError about
            # item assignment on None.
            raise ValueError("response contains no user object")
        user["profile_id"] = user["id"]
        user["timestamp_ist"] = datetime.datetime.now(ist)
        user["username"] = username
        return user
    except Exception as e:
        # Deliberate best-effort: one failing account must not stop the run.
        print(f"Error fetching profile for {username}: {e}")
        return None

def get_user_media(profile_id, count=50, timeout=30):
    """Fetch the media payload for a profile id; return None on any failure.

    Args:
        profile_id: Sent as the ``id`` field of the request payload.
        count: Sent as the ``count`` field of the request payload.
        timeout: Seconds to wait for the HTTP request; prevents the call
            from hanging forever on a stalled connection.

    Returns:
        The decoded JSON response, or ``None`` after printing the error.
    """
    payload = {"id": profile_id, "count": count}
    headers = {
        "x-rapidapi-key": api_key,
        "x-rapidapi-host": "starapi1.p.rapidapi.com",
        "Content-Type": "application/json",
    }
    try:
        r = requests.post(url_media, json=payload, headers=headers, timeout=timeout)
        r.raise_for_status()
        return r.json()
    except Exception as e:
        # Deliberate best-effort: callers treat None as "no media available".
        print(f"Error fetching media for profile {profile_id}: {e}")
        return None

def get_user_stories(profile_id, timeout=30):
    """Fetch the stories payload for a profile id; return None on any failure.

    Args:
        profile_id: Sent as the only element of the ``ids`` list.
        timeout: Seconds to wait for the HTTP request; prevents the call
            from hanging forever on a stalled connection.

    Returns:
        The decoded JSON response, or ``None`` after printing the error.
    """
    payload = {"ids": [profile_id]}
    headers = {
        "x-rapidapi-key": api_key,
        "x-rapidapi-host": "starapi1.p.rapidapi.com",
        "Content-Type": "application/json",
    }
    try:
        r = requests.post(url_stories, json=payload, headers=headers, timeout=timeout)
        r.raise_for_status()
        return r.json()
    except Exception as e:
        # Deliberate best-effort: callers treat None as "no stories available".
        print(f"Error fetching stories for profile {profile_id}: {e}")
        return None

def process_media_item(item, og_username, profile_data):
    """Flatten one media item into a row dict for the ``media`` table.

    Args:
        item: Media item dict from the media response.
        og_username: Username the item was requested for.
        profile_data: Accepted for interface compatibility; not used here.

    Returns:
        A dict with the shared post/story columns. Story-only fields
        (``story_id``, ``expire_datetime_ist``) are empty strings and
        ``is_story`` is False. ``id`` is always a string. The untouched item
        is kept under ``raw_data``.
    """
    ts = item.get("taken_at", 0) or 0
    post_dt = datetime.datetime.fromtimestamp(ts, ist).strftime("%Y-%m-%d %H:%M")
    code = item.get("code", "") or ""
    link = f"https://www.instagram.com/p/{code}/"

    # "user" and "caption" may be absent or null; a null caption would make a
    # chained .get() raise AttributeError, so normalise both first.
    user = item.get("user") or {}
    caption = item.get("caption")
    if isinstance(caption, dict):
        caption_text = caption.get("text") or ""
    else:
        caption_text = caption or ""

    row = {
        "full_name": user.get("full_name",""),
        "username": user.get("username",""),
        "og_username": og_username,
        "story_id": "",
        # Leading apostrophe keeps the value a literal string (presumably so
        # spreadsheet tools do not reformat the long number; confirm).
        "pk": f"'{item.get('pk',0)}",
        "id": str(item.get("id",0)),
        "link": link,
        "media_type": media_type_map.get(item.get("media_type"),""),
        "is_video": item.get("is_video",False),
        "carousel_media_count": item.get("carousel_media_count",0),
        "caption": caption_text,
        "post_datetime_ist": post_dt,
        "expire_datetime_ist": "",
        "is_paid_partnership": "Yes" if item.get("is_paid_partnership") else "No",
        "is_reel_media": item.get("is_reel_media",False),
        "is_story": False,
        "raw_data": item,
    }
    return row

def process_story_item(item, reel_meta, og_username):
    """Flatten one story item into a row dict for the ``media`` table.

    Args:
        item: Story item dict (one entry of the reel's ``items``).
        reel_meta: The reel dict the item belongs to; supplies the user,
            expiry and partnership fields.
        og_username: Username the stories were requested for.

    Returns:
        A dict with the shared post/story columns. Post-only fields (``pk``,
        ``id``, ``link``) are empty strings and ``is_story`` is True.
        ``expire_datetime_ist`` is "" when the reel carries no expiry.
    """
    ts = item.get("taken_at", 0) or 0
    post_dt = datetime.datetime.fromtimestamp(ts, ist).strftime("%Y-%m-%d %H:%M")

    # A missing expiry must stay empty: rendering it as the 1970 epoch would
    # make the expired-story cleanup delete the row immediately, while an
    # empty value is explicitly skipped by that cleanup query.
    exp_ts = reel_meta.get("expiring_at",0) or 0
    exp_dt = (
        datetime.datetime.fromtimestamp(exp_ts, ist).strftime("%Y-%m-%d %H:%M")
        if exp_ts
        else ""
    )

    user = reel_meta.get("user") or {}
    # Captions may arrive as a {"text": ...} dict (as media captions do) or a
    # plain string/null; a dict cannot be bound as a SQLite TEXT parameter.
    caption = item.get("caption")
    if isinstance(caption, dict):
        caption = caption.get("text")

    row = {
        "full_name": user.get("full_name",""),
        "username": user.get("username",""),
        "og_username": og_username,
        "story_id": str(item.get("id","")),
        "pk": "",
        "id": "",
        "link": "",
        "media_type": story_type_map.get(item.get("media_type"),""),
        "is_video": item.get("media_type")==2,
        "carousel_media_count": 0,
        "caption": caption or "",
        "post_datetime_ist": post_dt,
        "expire_datetime_ist": exp_dt,
        "is_paid_partnership": "Yes" if reel_meta.get("is_paid_partnership") else "No",
        "is_reel_media": reel_meta.get("is_reel_media",False),
        "is_story": True,
        "raw_data": item,
    }
    return row

def initial_data_fetch(data_manager: InstagramDataManager):
    """Fetch and store the first batch of posts and stories per username.

    For every configured username the profile is saved, then up to two media
    requests of 50 items are made, followed by one stories request. Accounts
    whose profile cannot be fetched are skipped. Malformed or error payloads
    are treated as "no items" rather than aborting the whole run.

    NOTE(review): both media requests are issued with identical arguments
    (no pagination cursor is passed), so the second request may return the
    same page. Rows are therefore de-duplicated by ``id`` and the loop stops
    as soon as a batch adds nothing new; reaching 100 distinct posts needs
    cursor support in ``get_user_media``.
    """
    print("Starting initial data fetch (100 posts + stories per username)...")

    def _response_body(payload):
        # Safely reach payload["response"]["body"]; {} when any level is
        # missing or not a dict (e.g. an API error payload).
        if not isinstance(payload, dict):
            return {}
        response = payload.get("response")
        body = response.get("body") if isinstance(response, dict) else None
        return body if isinstance(body, dict) else {}

    for username in usernames:
        print(f"Fetching data for {username}...")

        profile_data = get_instagram_profile_data(username)
        if not profile_data:
            continue

        data_manager.save_profile_data(profile_data)
        profile_id = profile_data["profile_id"]

        rows = []
        seen_post_ids = set()

        # Fetch media data (100 posts in batches)
        for _ in range(2):
            media_data = get_user_media(profile_id, count=50)
            if not media_data:
                break

            items = _response_body(media_data).get("items") or []
            added_in_batch = 0
            for item in items:
                processed_item = process_media_item(item, username, profile_data)
                if processed_item["id"] in seen_post_ids:
                    continue  # same post returned again by a repeated page
                seen_post_ids.add(processed_item["id"])
                rows.append(processed_item)
                added_in_batch += 1

            # Short page: nothing more to fetch. No new ids: the API handed
            # back the same page, so another request would be wasted.
            if len(items) < 50 or added_in_batch == 0:
                break

            time.sleep(1)  # Rate limiting

        # Fetch stories
        story_data = get_user_stories(profile_id)
        if story_data:
            reels = _response_body(story_data).get("reels") or {}
            reel = reels.get(str(profile_id)) or {}
            for item in reel.get("items") or []:
                rows.append(process_story_item(item, reel, username))

        # Save all data
        data_manager.save_media_data(rows)
        posts_count = sum(1 for r in rows if not r['is_story'])
        stories_count = sum(1 for r in rows if r['is_story'])
        print(f"Saved {posts_count} posts and {stories_count} stories for {username}")

        time.sleep(2)  # Rate limiting

    print("Initial data fetch completed!")

def update_data_fetch(data_manager: InstagramDataManager):
    """Refresh stored data: add up to 20 new posts and any new stories.

    Expired stories are removed first. Then, per username, the profile is
    re-saved and the latest media page is fetched: posts already in the
    database are re-submitted so their stored fields get refreshed, and at
    most 20 posts not yet stored are added. Stories not yet stored are added.
    Malformed or error payloads are treated as "no items" rather than
    aborting the whole run.
    """
    print("Starting update (20 new posts + current stories)...")

    max_new_posts = 20

    def _response_body(payload):
        # Safely reach payload["response"]["body"]; {} when any level is
        # missing or not a dict (e.g. an API error payload).
        if not isinstance(payload, dict):
            return {}
        response = payload.get("response")
        body = response.get("body") if isinstance(response, dict) else None
        return body if isinstance(body, dict) else {}

    data_manager.cleanup_expired_stories()

    for username in usernames:
        print(f"Updating data for {username}...")

        profile_data = get_instagram_profile_data(username)
        if not profile_data:
            continue

        data_manager.save_profile_data(profile_data)
        profile_id = profile_data["profile_id"]

        existing_media_ids = data_manager.get_existing_media_ids(username)
        existing_story_ids = data_manager.get_existing_story_ids(username)

        rows = []
        new_posts_count = 0

        # Fetch latest media
        media_data = get_user_media(profile_id, count=50)
        if media_data:
            for item in _response_body(media_data).get("items") or []:
                processed_item = process_media_item(item, username, profile_data)

                if processed_item['id'] in existing_media_ids:
                    # Known post: pass it on so the stored row is refreshed.
                    rows.append(processed_item)
                elif new_posts_count < max_new_posts:
                    rows.append(processed_item)
                    new_posts_count += 1
                # New posts beyond the cap are left for a later update.

        # Fetch current stories
        story_data = get_user_stories(profile_id)
        new_stories_count = 0
        if story_data:
            reels = _response_body(story_data).get("reels") or {}
            reel = reels.get(str(profile_id)) or {}
            for item in reel.get("items") or []:
                processed_story = process_story_item(item, reel, username)

                if processed_story['story_id'] not in existing_story_ids:
                    rows.append(processed_story)
                    new_stories_count += 1

        data_manager.save_media_data(rows)
        print(f"Added {new_posts_count} new posts and {new_stories_count} new stories for {username}")

        time.sleep(2)  # Rate limiting

    print("Update completed!")

def export_to_csv(data_manager: InstagramDataManager):
    """Write the stored tables to CSV files on the mounted drive.

    Produces a profiles file, a combined media file and, split on the
    ``is_story`` flag, separate posts and stories files. Empty tables are
    skipped. Any error is printed rather than raised.
    """
    try:
        # Profiles
        profiles = data_manager.get_profile_dataframe()
        if not profiles.empty:
            profiles.to_csv("/content/drive/MyDrive/instagram_profiles.csv", index=False)
            print(f"Exported {len(profiles)} profiles to instagram_profiles.csv")

        # Combined media; the per-kind files only make sense when it has rows.
        media = data_manager.get_media_dataframe()
        if media.empty:
            return

        media.to_csv("/content/drive/MyDrive/instagram_media.csv", index=False)
        print(f"Exported {len(media)} media items to instagram_media.csv")

        # Posts only
        posts = media[media['is_story'] == False]
        if not posts.empty:
            posts.to_csv("/content/drive/MyDrive/instagram_posts.csv", index=False)
            print(f"Exported {len(posts)} posts to instagram_posts.csv")

        # Stories only
        stories = media[media['is_story'] == True]
        if not stories.empty:
            stories.to_csv("/content/drive/MyDrive/instagram_stories.csv", index=False)
            print(f"Exported {len(stories)} stories to instagram_stories.csv")

    except Exception as e:
        print(f"Error exporting to CSV: {e}")

def view_current_data():
    """Print row counts for the stored profiles, posts and stories.

    Opens the default database, reports the totals and then, when any media
    is stored, a per-username breakdown of posts and stories.
    """
    store = InstagramDataManager()
    profiles = store.get_profile_dataframe()
    media = store.get_media_dataframe()

    print(f"Profiles: {len(profiles)}")
    print(f"Total media items: {len(media)}")

    if media.empty:
        return

    posts = media[media['is_story'] == False]
    stories = media[media['is_story'] == True]

    print(f"Posts: {len(posts)}")
    print(f"Stories: {len(stories)}")

    print("\nContent count by username:")
    for username in media['og_username'].unique():
        posts_for_user = len(posts[posts['og_username'] == username])
        stories_for_user = len(stories[stories['og_username'] == username])
        print(f"{username}: {posts_for_user} posts, {stories_for_user} stories")

def run_single_update():
    """Run one fetch cycle: initial load or update, then export and report.

    An empty media table means nothing has been stored yet, which triggers
    the initial fetch; otherwise an incremental update runs. Afterwards the
    data is exported to CSV, expired stories are removed and the current
    counts are printed.
    """
    manager = InstagramDataManager()

    # Decide between first run and incremental update.
    stored_media = manager.get_media_dataframe()
    if not stored_media.empty:
        print(f"Found {len(stored_media)} existing media items. Running update...")
        update_data_fetch(manager)
    else:
        print("No existing data found. Running initial fetch...")
        initial_data_fetch(manager)

    export_to_csv(manager)              # write CSV snapshots
    manager.cleanup_expired_stories()   # drop stories past their expiry
    view_current_data()                 # print summary counts

# Run a single fetch/export cycle when the cell/script is executed directly.
if __name__ == "__main__":
    run_single_update()

Mounted at /content/drive
No existing data found. Running initial fetch...
Starting initial data fetch (100 posts + stories per username)...
Fetching data for naukridotcom...
Saved 12 posts and 2 stories for naukridotcom
Fetching data for swiggyindia...
Saved 12 posts and 0 stories for swiggyindia
Fetching data for zomato...
Saved 12 posts and 0 stories for zomato
Fetching data for instagram...
Saved 12 posts and 0 stories for instagram
Initial data fetch completed!
Exported 4 profiles to instagram_profiles.csv
Exported 50 media items to instagram_media.csv
Exported 48 posts to instagram_posts.csv
Exported 2 stories to instagram_stories.csv
Profiles: 4
Total media items: 50
Posts: 48
Stories: 2

Content count by username:
instagram: 12 posts, 0 stories
naukridotcom: 12 posts, 2 stories
swiggyindia: 12 posts, 0 stories
zomato: 12 posts, 0 stories
