In [1]:
import pandas as pd
import numpy as np

from pytrends.request import TrendReq
from pytrends.exceptions import ResponseError

import time
import os

In [2]:
!ls ./Data

bbc_news.csv                          description-dbscan-clusters.csv
bbc_news_new.csv                      description-kmeans-clusters.csv
combined-agg-complete-clusters.csv    description-optics-clusters.csv
combined-agg-ward-clusters.csv        title-agg-complete-clusters.csv
combined-dbscan-clusters.csv          title-agg-ward-clusters.csv
combined-kmeans-clusters.csv          title-dbscan-clusters.csv
combined-optics-clusters.csv          title-kmeans-clusters.csv
description-agg-complete-clusters.csv title-optics-clusters.csv
description-agg-ward-clusters.csv     [34mtopics[m[m


In [3]:
file_names = list(filter(lambda x: x.endswith('-clusters.csv') and x not in os.listdir('./Data/topics/'), os.listdir('./Data')))
file_names

['description-agg-complete-clusters.csv',
 'title-agg-complete-clusters.csv',
 'description-dbscan-clusters.csv',
 'combined-optics-clusters.csv',
 'description-kmeans-clusters.csv',
 'title-optics-clusters.csv',
 'description-agg-ward-clusters.csv',
 'title-kmeans-clusters.csv',
 'title-dbscan-clusters.csv',
 'combined-kmeans-clusters.csv',
 'combined-dbscan-clusters.csv',
 'combined-agg-ward-clusters.csv',
 'title-agg-ward-clusters.csv',
 'description-optics-clusters.csv']

In [4]:
datas = {k: None for k in file_names}
datas

{'description-agg-complete-clusters.csv': None,
 'title-agg-complete-clusters.csv': None,
 'description-dbscan-clusters.csv': None,
 'combined-optics-clusters.csv': None,
 'description-kmeans-clusters.csv': None,
 'title-optics-clusters.csv': None,
 'description-agg-ward-clusters.csv': None,
 'title-kmeans-clusters.csv': None,
 'title-dbscan-clusters.csv': None,
 'combined-kmeans-clusters.csv': None,
 'combined-dbscan-clusters.csv': None,
 'combined-agg-ward-clusters.csv': None,
 'title-agg-ward-clusters.csv': None,
 'description-optics-clusters.csv': None}

In [5]:
for k in file_names:
    datas[k] = pd.read_csv(f'./Data/{k}')
datas

{'description-agg-complete-clusters.csv':             word     tfidf  label
 0         forget  0.572116      8
 1        worried  0.584743      8
 2        extreme  0.533802      8
 3        helping  1.000000      8
 4           done  0.602631      8
 ...          ...       ...    ...
 1372  vulnerable  0.593579      9
 1373      launch  0.658241      9
 1374        book  0.812209      9
 1375     decider  0.479084      9
 1376        wild  0.742275      9
 
 [1377 rows x 3 columns],
 'title-agg-complete-clusters.csv':            word     tfidf  label
 0        punish  0.547101      6
 1        global  0.709860      6
 2         arena  0.614520      6
 3       highest  0.577350      6
 4         grant  0.568683      6
 ...         ...       ...    ...
 1561       year  0.918845      7
 1562     esteem  0.506666      7
 1563  childhood  0.597876      7
 1564    slavery  0.521664      7
 1565       epic  0.791599      7
 
 [1566 rows x 3 columns],
 'description-dbscan-clusters.csv':     

In [6]:
trends = TrendReq(hl='en-US', tz=360)
trends

<pytrends.request.TrendReq at 0x7ff6d1b30cd0>

In [7]:
def infer_topic(data):
    topics_data = pd.DataFrame()
    for kw in data['word']:
        print('Starting: ', kw)
        
        backoff = 1.0001
        while True:
            try:
                trends.build_payload(kw_list=[kw])
                related_topics = trends.related_topics()[kw]['top'][['topic_title', 'topic_type']][:3].T
                break
            except ResponseError as e:
                print('Retrying:', e)
                time.sleep(1 * backoff)
                if backoff < 100:
                    backoff = pow(backoff, 2)
            except Exception as e:
                print('Bad situation...')
                time.sleep(10)
            print('Using backoff: ', backoff)
        n = min(len(related_topics.loc['topic_title', :]), len(related_topics.loc['topic_type', :]))
        tmp = pd.DataFrame(pd.concat([related_topics.loc['topic_title', :], related_topics.loc['topic_type', :]])).T
        tmp.columns = [f'topic_title_{c}' for c in tmp.columns[:n]] + [f'topic_type_{c}' for c in tmp.columns[n:]]

        topics_data = topics_data.append(tmp)
        
        print('Finished: ', kw)
        print('*' * 20)
        time.sleep(1)
    return topics_data

In [8]:
topics = {k: None for k in file_names}
topics

{'description-agg-complete-clusters.csv': None,
 'title-agg-complete-clusters.csv': None,
 'description-dbscan-clusters.csv': None,
 'combined-optics-clusters.csv': None,
 'description-kmeans-clusters.csv': None,
 'title-optics-clusters.csv': None,
 'description-agg-ward-clusters.csv': None,
 'title-kmeans-clusters.csv': None,
 'title-dbscan-clusters.csv': None,
 'combined-kmeans-clusters.csv': None,
 'combined-dbscan-clusters.csv': None,
 'combined-agg-ward-clusters.csv': None,
 'title-agg-ward-clusters.csv': None,
 'description-optics-clusters.csv': None}

In [9]:
for k, d in datas.items():
    topics[k] = infer_topic(d)
    topics[k].to_csv(f'./Data/topics/{k}', index=False)

Starting:  forget
Retrying: The request failed: Google returned a response with code 429.
Using backoff:  1.00020001
Retrying: The request failed: Google returned a response with code 429.
Using backoff:  1.000400060004
Retrying: The request failed: Google returned a response with code 429.
Using backoff:  1.0008002800560067
Retrying: The request failed: Google returned a response with code 429.


KeyboardInterrupt: 

In [None]:
!mkdir ./Data/topics

In [None]:
for k, v in topics.items():
    v.to_csv(f'./Data/topics/{k}')