In [1]:
import pandas as pd
import datetime
from pytrends.pytrends.request import TrendReq


In [3]:
def merge_by_rising_and_top(df_rising, df_top):
    """Place the first two columns of ``df_top`` side by side with ``df_rising``.

    The shorter frame is padded with all-NaN rows so that both frames have the
    same number of rows. The first two columns of ``df_top`` are then inserted
    into a copy of ``df_rising`` at column positions 1 and 3, so for two-column
    inputs the resulting column order is
    ``rising[0], top[0], rising[1], top[1]``.

    Rows are paired by position, not by index label.

    Args:
        df_rising: Frame that receives the inserted columns. It needs at least
            two columns, otherwise insert position 3 is out of range.
        df_top: Frame whose first two columns are inserted. Column labels that
            already exist in ``df_rising`` are allowed.

    Returns:
        A new DataFrame with ``max(len(df_rising), len(df_top))`` rows. Neither
        input frame is modified. If ``df_rising`` had to be padded, the result
        has a fresh 0..n-1 index; otherwise it keeps ``df_rising``'s index.
    """
    row_count = max(len(df_rising), len(df_top))
    positions = pd.RangeIndex(row_count)

    # Work on a copy so the caller's frame is never mutated by ``insert``.
    merged = df_rising.copy()
    if len(merged) < row_count:
        # ``reindex`` pads with NaN rows in one step; this replaces the
        # row-by-row ``DataFrame.append`` calls (removed in pandas 2.0).
        merged = merged.reset_index(drop=True).reindex(index=positions)

    # Only the values of the padded top frame are used, so its index can
    # always be normalised to 0..n-1.
    top_padded = df_top.reset_index(drop=True).reindex(index=positions)

    # Select by position so duplicated column labels in ``df_top`` still yield
    # a single one-dimensional column each.
    merged.insert(1, df_top.columns[0], top_padded.iloc[:, 0].values, allow_duplicates=True)
    merged.insert(3, df_top.columns[1], top_padded.iloc[:, 1].values, allow_duplicates=True)

    return merged
    

In [7]:
def acquire_top20_google_trends_per_day(geo, start_date, end_date, head_size):
    """Collect per-day related topics and queries from Google Trends for one region.

    For every day from ``start_date`` to ``end_date`` (inclusive) a single-day
    payload is built, the "rising" and "top" results are merged side by side
    with ``merge_by_rising_and_top``, a ``Date`` column is prepended, and the
    first ``head_size`` rows are kept. All days are then concatenated and a
    ``geo`` column is inserted at column position 5.

    Args:
        geo: Region code passed to ``build_payload`` and written to the ``geo``
            column (the notebook uses values such as ``'US'`` and ``'US-NY'``).
        start_date: First day to fetch. Must support ``str()``, ``<=`` and
            ``+ datetime.timedelta`` (the notebook passes ``datetime.date``).
        end_date: Last day to fetch, inclusive.
        head_size: Maximum number of rows kept per day for each result frame.

    Returns:
        A ``(topics_df, queries_df)`` tuple -- topics first, queries second.
        When the date range is empty (``start_date > end_date``) two empty
        DataFrames are returned instead of raising on the ``geo`` insert.
    """
    pytrend = TrendReq()

    # Per-day frames are collected in lists and concatenated once at the end;
    # this replaces repeated ``DataFrame.append`` calls, which were quadratic
    # and no longer exist in pandas 2.0.
    daily_queries = []
    daily_topics = []

    current_date = start_date
    while current_date <= end_date:
        current_date_str = str(current_date)
        # Same date twice: a one-day timeframe.
        timeframe = f"{current_date_str} {current_date_str}"
        # NOTE(review): the single-space keyword looks like a "match anything"
        # placeholder for the project's pytrends fork -- confirm.
        pytrend.build_payload(kw_list=[' '], geo=geo, timeframe=timeframe)

        df_queries_by_rising = pytrend.related_top_queries_by_rising()
        df_queries_by_top = pytrend.related_top_queries_by_top()
        df_queries_tmp = merge_by_rising_and_top(df_queries_by_rising, df_queries_by_top)
        df_queries_tmp.insert(loc=0, column='Date', value=current_date_str)

        df_topics_by_rising = pytrend.related_top_topics_by_rising()
        df_topics_by_top = pytrend.related_top_topics_by_top()
        df_topics_tmp = merge_by_rising_and_top(df_topics_by_rising, df_topics_by_top)
        df_topics_tmp.insert(loc=0, column='Date', value=current_date_str)

        daily_queries.append(df_queries_tmp.head(head_size))
        daily_topics.append(df_topics_tmp.head(head_size))

        current_date += datetime.timedelta(days=1)

    if not daily_queries:
        # Empty date range: nothing was fetched, and inserting at position 5
        # into a column-less frame would raise.
        return pd.DataFrame(), pd.DataFrame()

    # Row indices are intentionally not reset, matching the previous
    # append-based accumulation.
    queries_df = pd.concat(daily_queries)
    topics_df = pd.concat(daily_topics)

    topics_df.insert(loc=5, column='geo', value=geo)
    queries_df.insert(loc=5, column='geo', value=geo)

    return topics_df, queries_df
    

In [11]:
def collect_top20_google_trends_per_day(geos, start_date, end_date, path, head_size=15):
    """Fetch daily Google Trends topics and queries per region and save them as CSV.

    For each region in ``geos`` two files are written into ``path``:
    ``google-trends-search-topics-<geo>.csv`` and
    ``google-trends-search-queries-<geo>.csv``, both without the index column.

    Args:
        geos: Iterable of region codes; one pair of CSV files is produced per
            entry.
        start_date: First day to fetch, forwarded to
            ``acquire_top20_google_trends_per_day``.
        end_date: Last day to fetch (inclusive), forwarded likewise.
        path: Directory the CSV files are written to. It is not created here,
            so it must already exist.
        head_size: Maximum number of rows kept per day. Defaults to 15, the
            value that was previously hard-coded.

    Returns:
        None.
    """
    for geo in geos:
        topics_df, queries_df = acquire_top20_google_trends_per_day(
            geo=geo,
            start_date=start_date,
            end_date=end_date,
            head_size=head_size,
        )
        topics_df.to_csv(f"{path}/google-trends-search-topics-{geo}.csv", index=False)
        queries_df.to_csv(f"{path}/google-trends-search-queries-{geo}.csv", index=False)

In [9]:
# Directory the per-region Google Trends CSV files are written to
# (relative path, resolved against the current working directory).
data_path = "../data/google-trends"

In [12]:
# Download everything from 1 May 2019 up to the day this cell is run, for the
# whole US and for New York State; the end date therefore changes per run.
collect_top20_google_trends_per_day(
    geos=['US', 'US-NY'],
    start_date=datetime.date(2019, 5, 1),
    end_date=datetime.date.today(),
    path=data_path,
)