# Trending.py Ver 0.1

network of words built around an input word based on their trending status on Google Trends via the unoffical open source API pytrends

## Imports

In [2]:
import pandas as pd                        
from pytrends.request import TrendReq
import plotly.express as px
from scipy.signal import savgol_filter
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import math
from random import shuffle

## Set Up Connection to Google Trends

In [3]:
# set up connction to google trends
# build a payload for a keyword, geo can be set via two letters for country (DE = Germany), or left blank ('') for worldwide
def trend(keyword, geo, timeframe):
    pytrend = TrendReq(hl='de')
    pytrend.build_payload(kw_list=[keyword],geo=geo , timeframe =  timeframe)
    
    # return interest over time
    trend.interest = pytrend.interest_over_time()
    
    # return related queries
    related = pytrend.related_queries()
    trend.related = related[keyword]
    trend.related_top = trend.related['top']
    trend.related_rising = trend.related['rising']
    
    # return categories this word belongs to -- this seems to not always 100% recreated google trend results/use carefully
    trend.related_topics = pytrend.related_topics()[keyword]
    trend.related_topics_top = trend.related_topics['top']  # topics are in english by default
    trend.related_topics_rising = trend.related_topics['rising']
    

## Get Data

### Define search parameters

In [122]:
searchterm = 'Data Science'
language_region = 'DE'
period = 'today 5-y'
secondlevelcut = 10

### First level search related queries

In [125]:
#LEVEL 1 search, get related topics or queries to the initial searchterm
trend(searchterm,language_region, period)
#call different objects that you are interestet in, every object should return a dict or an dataframe
df_trend_related_search_top = trend.related_top
#drop value column
df_trend_related_search_top = df_trend_related_search_top.drop(columns={"value"})
df_trend_related_search_top

Unnamed: 0,query
0,python data science
1,master data science
2,data science jobs
3,data science studium
4,data scientist
5,data science gehalt
6,towards data science
7,r for data science


### Visualize found data

In [11]:
def visualize_interest(df):
    fig = px.line(df, x=df.index, y=savgol_filter(df.iloc[:,0],13,2), title=f'Interest in {df.columns[0]}')
    fig['layout']['yaxis1'].update(title='', range=[0, 100], autorange=False)
    fig.show()

def visualize_related_queries(df):
    fig = make_subplots(rows=1, cols=2)

    fig.append_trace(go.Bar(
        x=df['top']['query'],
        y=df['top']['value'],
        name='top'
    ), row=1, col=1)

    fig.append_trace(go.Bar(
        x=df['rising']['query'],
        y=df['rising']['value'],
        name='rising'
    ), row=1, col=2)

    fig.update_layout(height=600, width=1000, title_text="related queries")
    fig.show()
    
def visualize_topics(df):
    fig = make_subplots(rows=1, cols=2)

    fig.append_trace(go.Bar(
        x=df['top']['topic_title'],
        y=df['top']['value'],
        name='top'
    ), row=1, col=1)

    fig.append_trace(go.Bar(
        x=df['rising']['topic_title'],
        y=df['rising']['value'],
        name='rising'
    ), row=1, col=2)

    fig.update_layout(height=800, width=1000, title_text="related topics")
    fig.show()  

In [135]:
visualize_interest(trend.interest)
visualize_related_queries(trend.related)
visualize_topics(trend.related_topics)

NameError: name 'visualize_interest' is not defined

### Second level search related queries

In [126]:
#LEVEL 2 search, get related topics or queries from topics/queries related to the initial searchterm
df_head = pd.DataFrame(df_trend_related_search_top)
df_head = df_head.set_index(['query'])
dict_out = {}

#iterate over df and search every keyword
for i in range(0,len(df_trend_related_search_top),1):
    #rint(df_trend_related_search_top['query'][i])
    searchstring = df_trend_related_search_top['query'][i]
    
    #start the search at Level 2
    trend(searchstring,language_region, period)
    #call different objects that you are interestet in, every object should return a dict or an dataframe
    df_out_search_top = trend.related_top
   
    #if values found
    if (df_out_search_top is not None):
        values = df_out_search_top['query'].tolist()
        dict_out[searchstring] = values

#print(dict_out)

#df which contains the top related search terms to the start term
df_head = df_head.transpose()

#df which contains the top related search terms to the top related search terms
df_top_related = pd.DataFrame.from_dict(dict_out, orient='index')
df_top_related = df_top_related.transpose()

#output matrix
df_result = pd.concat([df_head,df_top_related])
df_result = df_result[:secondlevelcut]
df_result

Unnamed: 0,python data science,master data science,data science jobs,data science studium,data scientist,data science gehalt,towards data science,r for data science
0,,,,,data scientist gehalt,,,


### First level search related topics

In [127]:
#LEVEL 1 search
trend(searchterm,language_region, period)

#call different objects that you are interestet in, every object should return a dict or an dataframe
df_trend_related_topics_top = trend.related_topics_top
#drop value column
df_trend_related_topics_top = df_trend_related_topics_top.drop(columns={"value"})

#Fix for the problem: Double entrysin column topic_title. One is "Thema" the other is e.g. "Behörde" 
#FIRST STEP: search for duplicates with different topic_type and drop the one !='Thema'
#initialize df_doubles with the dataframe 
df_doubles = df_trend_related_topics_top
#find duplicates
df_doubles = df_doubles[(df_doubles.duplicated(subset=['topic_title'], keep= False) == True)&(df_doubles['topic_type']!= 'Thema')]
#add column double and sign drop or not
df_trend_related_topics_top['double'] = df_trend_related_topics_top.index.isin(df_doubles.index)
#drop rows based on column "double"
df_trend_related_topics_top= df_trend_related_topics_top[df_trend_related_topics_top['double'] != True]
#drop column double
df_trend_related_topics_top = df_trend_related_topics_top.drop(['double'], axis=1)
#reset index
df_trend_related_topics_top = df_trend_related_topics_top.reset_index(drop=True)

#SECOND STEP: find doubles both with topic_type='Thema' and keep the first
df_trend_related_topics_top = df_trend_related_topics_top.drop_duplicates(subset=['topic_title'], keep='first')
#reset index
df_trend_related_topics_top = df_trend_related_topics_top.reset_index(drop=True)

df_trend_related_topics_top

Unnamed: 0,formattedValue,hasData,link,topic_mid,topic_title,topic_type
0,100,True,/trends/explore?q=/m/026sq&date=today+5-y&geo=DE,/m/026sq,Daten,Thema
1,96,True,/trends/explore?q=/m/06mq7&date=today+5-y&geo=DE,/m/06mq7,Wissenschaft,Thema
2,95,True,/trends/explore?q=/m/0jt3_q3&date=today+5-y&ge...,/m/0jt3_q3,Data Science,Thema
3,13,True,/trends/explore?q=/m/016t_3&date=today+5-y&geo=DE,/m/016t_3,Master,Abschluss
4,10,True,/trends/explore?q=/m/07tf8&date=today+5-y&geo=DE,/m/07tf8,Universität,Schulform
5,10,True,/trends/explore?q=/m/05z1_&date=today+5-y&geo=DE,/m/05z1_,Python,Programmiersprache
6,6,True,/trends/explore?q=/m/0dkw5&date=today+5-y&geo=DE,/m/0dkw5,Maschine,Thema
7,6,True,/trends/explore?q=/m/01hyh_&date=today+5-y&geo=DE,/m/01hyh_,Maschinelles Lernen,Thema
8,5,True,/trends/explore?q=/m/07hs4p&date=today+5-y&geo=DE,/m/07hs4p,Kurs,Lehrveranstaltung
9,5,True,/trends/explore?q=/m/02gcn9&date=today+5-y&geo=DE,/m/02gcn9,Analytisches Informationssystem,Thema


### Second level search related topics

In [128]:
df_head_topics = pd.DataFrame(df_trend_related_topics_top)
df_head_topics = df_head_topics.set_index(['topic_title'])
dict_topics_out = {}

#iterate over df and search every keyword
for i in range(0,len(df_trend_related_topics_top),1):
    #print(df_trend_related_search_top['query'][i])
    searchstring = df_trend_related_topics_top['topic_mid'][i]
    
    #start the search at Level 2
    trend(searchstring,language_region, period)
    #call different objects that you are interestet in, every object should return a dict or an dataframe
    df_out_topics_top = trend.related_topics_top
   
    #if values found
    if (df_out_topics_top is not None):
        topic_values = df_out_topics_top['topic_title'].tolist()
        dict_topics_out[df_trend_related_topics_top['topic_title'][i]] = topic_values
        
        
#df which contains the top related search terms to the start term
df_head_topics = df_head_topics[['topic_mid']] #drop every column except value and topic_mid
df_head_topics = df_head_topics.transpose()
#trop topic_mid
df_head_topics = df_head_topics.drop('topic_mid')

#df which contains the top related search terms to the top related search terms
df_top_topics_related = pd.DataFrame.from_dict(dict_topics_out, orient='index')
df_top_topics_related = df_top_topics_related.transpose()

#output matrix
df_result_topics = pd.concat([df_head_topics, df_top_topics_related])
#result_topics = pd.concat([df_head_topics, df_top_topics_related], join="inner")
df_result_topics = df_result_topics[:secondlevelcut]
df_result_topics

Unnamed: 0,Daten,Wissenschaft,Data Science,Master,Universität,Python,Maschine,Maschinelles Lernen,Kurs,Analytisches Informationssystem,...,Statistik,R,Künstliche Intelligenz,Big Data,Informatik,Geschäft,Lernen,Analyse,Bachelor,Fernstudium
0,Technische Daten,Wissenschaftliche Arbeit,Daten,Universität,Universität zu Köln,Liste,Maschinelles Lernen,Maschine,Volkshochschule,Daten,...,Statistik,Daten,Intelligenz,Daten,Universität,Betriebswirtschaftslehre,Maschine,SWOT-Analyse,Master,Studium
1,Technik,Universität,Wissenschaft,Bachelor,Webmail,Zeichenkette,Nespresso,Lernen,FitX,SAP,...,Coronavirus disease 2019,RStudio,Maschine,Analytisches Informationssystem,Studium,Business Development,Maschinelles Lernen,Daten,Universität,IU Internationale Hochschule
2,Microsoft Excel,Arbeit,Python,Studium,Goethe-Universität Frankfurt am Main,Datei,Waschmaschine,Daten,Lernen,Datenanalyse,...,Fortnite,Graphische Darstellung,Maschinelles Lernen,Künstliche Intelligenz,Informatiker,International,Sprache,Beispiel,Der Bachelor,Master
3,Datei,Daten,Arbeitsentgelt,Betriebswirtschaftslehre,Mainz,Feld,Nespresso,Python,Sprache,Business Objects,...,Stats Royale für Clash Royale,RStudio,Roboter,Definition,Computer,Showgeschäft,Deep Learning,Lyrik,Studium,Ausbildung
4,Data Science,Data Science,Master,Psychologie,Bonn,Zuordnungstabelle,Waschen,Künstliche Intelligenz,Erste Hilfe,Cloud Computing,...,Comunio,Column,WOMBO,Analyse,Wissenschaft,Internationales Management,good,Interpretation,Wissenschaft,Psychologie
5,Personenbezogene Daten,Science-Fiction,Universität,Wissenschaft,Universität Duisburg-Essen,pandas,Lernen,Lernen,Barista,Google Analytics,...,World of Tanks,Funktion,Programmpaket,Intelligenz,Master,Management,Python,Datenanalyse,Betriebswirtschaftslehre,Universität
6,Big Data,Fiktion,Maschinelles Lernen,Bewerbung,Johannes Gutenberg-Universität Mainz,NumPy,Kaffee,Intelligenz,Training,Predictive Analytics,...,Influenza,ggplot2,Börsengehandelter Fonds,Cloud Computing,Informatikstudium,Business Intelligence,Daten,Kurzgeschichte,Bachelor of Science,Hagen
7,Wissenschaft,Master,Maschine,Management,Rheinische Friedrich-Wilhelms-Universität Bonn,Daten,Espressomaschine,Algorithmus,Industrie- und Handelskammer,Geschäft,...,R6Stats,Tabelle,ASUS,Datenanalyse,Bachelor,Intelligenz,Kurs,Statistik,Bachelor of Arts,Bachelor
8,Datenbank,Bachelor,Kurs,Fernstudium,Bibliothek,Variable,Espresso,Deep Learning,McFit,Big Data,...,Todesfall,Datensatz,Technologie,Börsengehandelter Fonds,Arbeitsentgelt,Geschäftsleiter,Bestärkendes Lernen,Cartoon,Arbeitsentgelt,sozial
9,Löschen von Daten,Biologie,Datenanalyse,Arbeitsentgelt,Universität Hamburg,Funktion,Nähmaschine,Data Science,Udemy,Business Analytics,...,Tom Clancy's Rainbow Six: Siege,Datei,Asus,Geschäft,Mathematik,Geschäftsplan,Verstärkung,Inhaltsangabe,Psychologie,Soziale Arbeit


## Ranking

### Ranking function

#### Define different reanking methods

In [114]:
# Last Date
def byLastDate(df):
    df = df.iloc[-1:]
    sorted_df = df.sort_values(df.last_valid_index(), ascending=False, axis=1)
    sorted_df = sorted_df.transpose()  
    sorted_df = sorted_df.reset_index()
    return sorted_df

# certain Date
def byDate(df,t):
    df = df[df.index == t]
    sorted_df = df.sort_values(df.last_valid_index(), axis=1)
    return sorted_df

# multiple Dates
def byMultiDates(df,*args):
    df_0 = pd.DataFrame()
    for arg in args:
        df_0 = df_0.append(df[df.index == arg])
        sorted_df = df_0.sort_values(df_0.last_valid_index(), axis=1)
    return sorted_df


#mean 
def byMean(df):
    df = df.transpose()
    df['mean'] = df.mean(axis=1)
    df = df['mean'].reset_index()
    sorted_df = df.sort_values(by='mean', ascending=False).reset_index(drop=True)
    top = round(sorted_df.loc[0]['mean'],4)
    sorted_df['normalized'] = sorted_df.apply(lambda x: (x['mean']/top)*100, axis=1).round(6)
    return sorted_df


#last date non zero

#pre last date

#### Rank searchtmerms from tuplets

The function getValues(data) needs as its argument a list with search strings. A dataframe with the normalized trend values over time will be returned.

In [45]:
def getValues(data):
    if (type(data) == list):
        tmp = data
        lists = []
        df_values = pd.DataFrame()
        while (len(tmp) > 0):            
            for i in range(0, 4, 1):
                part = tmp[0:5]
                lists.append(part)
                if (len(tmp) > 0):
                    tmp = tmp[4:]
                else:
                    break
                break
        for sublist in lists:
#            print(sublist)
            #search each sublist
            pytrend1 = TrendReq(hl='de')
            pytrend1.build_payload(kw_list=sublist,geo=language_region , timeframe = period)    
            df_out = pytrend1.interest_over_time()
            
            #drop column
            df_out = df_out.drop(['isPartial'], axis=1)
            
            #fix bug -->mean
            #df_out = df_out.mean(axis=0).reset_index().set_index('index').rename(columns={0: 'value'}).transpose()
            
            #initial df_values in first step
            if(len(df_values) <= 0):
                df_values = pd.concat([df_values, df_out], axis=1)
                
            else:
                #ATTENTION: avoid zero division
                if ((df_out.iloc[0,0] != 0)&(df_values.iloc[0,-1] != 0) ):
                    #normFac= float(df_values.iloc[0,-1] / df_out.iloc[0,0])
                    normFac= float(df_values.iloc[:,-1].mean() / df_out.iloc[:,0].mean())
#                     print(df_values.iloc[:,-1].mean())
#                     print(df_out.iloc[:,0].mean())
                    #problem: Beginn einer Kettenreaktion sobald ein Faktor 0 wird
                    #print(df_out)
#                     print('Last mean value previous set: '+str(df_values.iloc[:,-1].mean()))
#                     print('First mean value actual set: '+str(df_out.iloc[:,0].mean()))
#                     print('Normalization factor: '+str(normFac))
                    
                else:
                    #use mean() for normalization factor
                    #normalize
                    df_values = df_values.replace([0],0.000001)
                    df_values = df_values.fillna(0.000001)
                    df_out = df_out.replace([0],0.000001)
                    df_out = df_out.fillna(0.000001)
                    #use this value instead of zero or NULL -->Problematic! 
                    #if (df_out.iloc[0,0] == 0):                        
                        #df_out.iloc[0,0] = 0.1
                    #else: 
                    #    df_out.iloc[0,-1]= 0.1
                    
                    #normFac= float(df_values.iloc[0,-1] / df_out.iloc[0,0])
                    normFac= float(df_values.iloc[:,-1].mean() / df_out.iloc[:,0].mean())
                    #normFac = 0.0000000000000000001
                    #print(df_out)
#                     print('Last mean value previous set: '+str(df_values.iloc[:,-1].mean()))
#                     print('First mean value actual set: '+str(df_out.iloc[:,0].mean()))
#                     print('Normalization factor: '+str(normFac))
                
                #normalize df_out
                for column in df_out:
                    df_out[column]= df_out[column] * normFac
                df_out = df_out.round(0)
                

                df_out.drop(columns=df_out.columns[0], axis=1, inplace=True)
                df_values = pd.concat([df_values, df_out], axis=1)
            
        
        
    elif (type(data) == str):
        trend(data,language_region, period)
        df_values = trend.interest
       
                
            
    
    else:
        print('Error.')
    return df_values

#### Transform function

This function transform a given dataframe into a list by adding column header and ALL values.

In [14]:
def dftolist(df):
    list_out = []
    
    #add column header to list
    list_out = df.columns.tolist()
    
    #iterate over columns
    for column in df:
        #all column values (as a list) to temp_list
        temp_list = df[column].values
        #reassign temp_list and add each item except None to the new templist 
        temp_list = [i for i in temp_list if i is not None]
        #reassign temp_list and add each item except nan to the new templist 
        temp_list = [x for x in temp_list if str(x) != 'nan']

        #add each entry from temp_list to the output list
        for item in temp_list:
            list_out.append(item)    
    
    #remove double entrys
    list_out = list(set(list_out))
    return list_out

#### Rank all the matrix values (search terms)

In [106]:
# f=sorting function, df=data, k=related topics=1 or queries=2
def df_to_ranking(f, df, k):    
    #all entrys in search_list
    search_list = dftolist(df)
    #find all values
    df_values = getValues(search_list)
    #transpose matrix for easier handling
    #df_values = df_values.transpose()  
    df_valuelist = df_values
    # pick desired sorting algorithm
    sorted_data = f(df_valuelist)
    if k ==1:
        sorted_data['relationship'] = sorted_data.apply(lambda x: getRelationship(x['index'], 'topic'), axis=1)
    elif k == 2:
        sorted_data['relationship'] = sorted_data.apply(lambda x: getRelationship(x['index'], 'term'), axis=1)
        
    return sorted_data

#### Get relationship

In [17]:
#get the ralationship of a given term/topic
def getRelationship(searchterm, searchtype):
    if (searchtype == 'term'):
        #dict with all first&second level SEARCHTERMS
        search_on=dict_out        
    elif(searchtype == 'topic'):
        #dict with all first&second level TOPICS
        search_on=dict_topics_out        
    else:
        print('Error. Can`t get relationship')
        return 'Error. Can`t get relationship'
        
    #first level
    first_level = []
    first_level = list(search_on.keys())
    #first_level
    #print('First Level: '+str((searchterm in first_level)))

    #second level values
    second_level = []
    dict_list = list(search_on.values())
    for l in dict_list:
        for item in l:
            second_level.append(item)
    #second_level
    #print('Second Level: '+str((searchterm in second_level)))
    
    
    #return
    if((searchterm in first_level) & (searchterm in second_level)):
        back = 'both levels'
    elif ((searchterm in first_level)):
        back = 'first level'
    elif((searchterm in second_level)):
        back = 'second level'
    else: 
        back = 'Error'
    
    return back

### Start ranking and get output

In [129]:
df_searchterm_ranking = df_to_ranking(byLastDate, df_result, 2)
#df_searchtopic_ranking = df_to_ranking(df_result_topics)

#normalize df_searchterm_ranking
#top = round(df_searchterm_ranking.loc[0]['mean'],4)
#df_searchterm_ranking['normalized'] = df_searchterm_ranking.apply(lambda x: (x['mean']/top)*100, axis=1).round(6)

#normalize df_searchtopic_ranking
#top = round(df_searchtopic_ranking.loc[0]['mean'],4)
#df_searchtopic_ranking['normalized'] = df_searchtopic_ranking.apply(lambda x: (x['mean']/top)*100, axis=1).round(6)

df_searchterm_ranking.head(20)
#df_searchtopic_ranking.head(20)

date,index,2021-12-05 00:00:00,relationship
0,data scientist,59.0,first level
1,master data science,20.0,Error
2,python data science,19.0,Error
3,data scientist gehalt,6.0,second level
4,data science jobs,1e-06,Error
5,data science studium,1e-06,Error
6,towards data science,1e-06,Error
7,data science gehalt,1e-06,Error
8,r for data science,1e-06,Error


In [130]:
df_searchterm_ranking = df_to_ranking(byMean, df_result,2)
df_searchterm_ranking.head(20)

Unnamed: 0,index,mean,normalized,relationship
0,data scientist,36.153846,100.000128,first level
1,master data science,10.673077,29.521315,Error
2,python data science,8.811539,24.372372,Error
3,data science jobs,5.734616,15.861724,Error
4,data scientist gehalt,5.315385,14.702148,second level
5,data science studium,3.269231,9.042567,Error
6,data science gehalt,2.184616,6.042563,Error
7,towards data science,1.823078,5.042562,Error
8,r for data science,1.800001,4.978732,Error


In [131]:
df_searchtopic_ranking = df_to_ranking(byLastDate, df_result_topics, 1)

In [132]:
df_searchtopic_ranking.head(20)

date,index,2021-12-05 00:00:00,relationship
0,Kurs,9229563.0,both levels
1,Mainz,5769275.0,second level
2,Daten,3356205.0,both levels
3,Funktion,2013723.0,second level
4,Medizin,1678102.0,second level
5,Bachelor,914305.0,both levels
6,Geschäft,576928.0,both levels
7,Column,261230.0,second level
8,Technologie,167810.0,second level
9,Künstliche Intelligenz,130615.0,both levels


In [133]:
df_searchtopic_ranking = df_to_ranking(byMean, df_result_topics, 1)
df_searchtopic_ranking.head(20)

Unnamed: 0,index,mean,normalized,relationship
0,Mainz,8074322.0,100.0,second level
1,Kurs,6281267.0,77.79311,both levels
2,Daten,4474725.0,55.419198,both levels
3,Bachelor,2107423.0,26.100305,both levels
4,Funktion,1944663.0,24.084529,second level
5,Medizin,1864630.0,23.093331,second level
6,Geschäft,651040.7,8.0631,both levels
7,Column,394356.8,4.884086,second level
8,Google Analytics,289795.4,3.589098,second level
9,Technologie,194917.8,2.414045,second level


### Visualize results

In [112]:
def visualize_ranking(df):
    data = df
    fig = px.bar(data, x='index', y='mean')
    fig.show()

In [134]:
visualize_ranking(df_searchterm_ranking.head(20))