In [1]:
from pytrends.request import TrendReq
import pandas as pd

# Connect to Google Trends
pytrends = TrendReq(hl='en-US', tz=330)  # India timezone

keywords = ["Facebook", "Instagram", "Snapchat", "Twitter"]
timeframe = "2015-01-01 2025-01-01"

all_data = []

for kw in keywords:
    print(f"Fetching Google Trends data for: {kw}")
    pytrends.build_payload([kw], timeframe=timeframe, geo="IN")
    df = pytrends.interest_over_time()
    if not df.empty:
        df['Platform'] = kw
        all_data.append(df)

# Combine data for all platforms
trends_df = pd.concat(all_data)
trends_df.reset_index(inplace=True)

# Save file
output_path = "../data/raw/google_trends_india.csv"
trends_df.to_csv(output_path, index=False)

trends_df.head()


Fetching Google Trends data for: Facebook
Fetching Google Trends data for: Instagram
Fetching Google Trends data for: Snapchat
Fetching Google Trends data for: Twitter


Unnamed: 0,date,Facebook,isPartial,Platform,Instagram,Snapchat,Twitter
0,2015-01-01,100.0,False,Facebook,,,
1,2015-02-01,94.0,False,Facebook,,,
2,2015-03-01,89.0,False,Facebook,,,
3,2015-04-01,88.0,False,Facebook,,,
4,2015-05-01,91.0,False,Facebook,,,


In [2]:
from google_play_scraper import Sort, reviews
import pandas as pd
from tqdm import tqdm

apps = {
    "Facebook": "com.facebook.katana",
    "Instagram": "com.instagram.android",
    "Snapchat": "com.snapchat.android",
    "Twitter": "com.twitter.android"
}

total_reviews = []

for app_name, app_id in apps.items():
    print(f"Collecting reviews for → {app_name}")
    
    count = 0
    batch = 2000  # Fetch 2000 per batch
    max_reviews = 50000  # 50k per app
    
    while count < max_reviews:
        rvws, _ = reviews(
            app_id,
            lang="en",
            country="in",
            sort=Sort.NEWEST,
            count=batch,
            filter_score_with=None
        )
        if not rvws:
            break

        df = pd.DataFrame(rvws)
        df["Platform"] = app_name
        total_reviews.append(df)
        
        count += len(df)
        print(f"{count} collected...")

reviews_df = pd.concat(total_reviews, ignore_index=True)
reviews_df.to_csv("../data/raw/reviews_raw.csv", index=False)

print("Total collected:", len(reviews_df))


Collecting reviews for → Facebook
2000 collected...
4000 collected...
6000 collected...
8000 collected...
10000 collected...
12000 collected...
14000 collected...
16000 collected...
18000 collected...
20000 collected...
22000 collected...
24000 collected...
26000 collected...
28000 collected...
30000 collected...
32000 collected...
34000 collected...
36000 collected...
38000 collected...
40000 collected...
42000 collected...
44000 collected...
46000 collected...
48000 collected...
50000 collected...
Collecting reviews for → Instagram
2000 collected...
4000 collected...
6000 collected...
8000 collected...
10000 collected...
12000 collected...
14000 collected...
16000 collected...
18000 collected...
20000 collected...
22000 collected...
24000 collected...
26000 collected...
28000 collected...
30000 collected...
32000 collected...
34000 collected...
36000 collected...
38000 collected...
40000 collected...
42000 collected...
44000 collected...
46000 collected...
48000 collected...
50000 co

In [5]:
import pandas as pd

df = pd.read_csv("../data/processed/reviews_sentiment.csv")
df_small = df.sample(10000, random_state=42)  # consistent sampling
df_small.to_csv("../data/processed/reviews_sentiment_small.csv", index=False)

print("Sample saved: 10k rows")


Sample saved: 10k rows
