# Google Trends 
Processes Google Trends data and combines the separate CSV exports into a single CSV file.

### 1. multiTimeline csv files
Each CSV file contains monthly Google search trends for one cause of death (the `DiabetesCancerStrokeSuicide` file covers several) plus a control field.

In [1]:
import pandas as pd, numpy as np

In [2]:
# multiTimeline csv files to combine, keyed by file name.
# The boolean is consumed by the combining loop as `sum_bool`:
#   True  -> the file's data columns are summed into a single column named
#            after the file (e.g. "Accident")
#   False -> the file's data columns are kept as separate columns
# Insertion order determines the column order of the combined output.
multiTimeline_files = {
    "multiTimeline_Accident.csv": True, 
    "multiTimeline_Alzheimers.csv": True,
    "multiTimeline_CLRD.csv": True, 
    "multiTimeline_DiabetesCancerStrokeSuicide.csv": False,
    "multiTimeline_Flu.csv": True, 
    "multiTimeline_Heart.csv": True, 
    "multiTimeline_Kidney.csv": True
}

In [4]:
# scale the dataframe such that the max value of the control is 100
def scale_data_control(df, exclude):
    """Rescale a dataframe so that its control column peaks at 100.

    The first column whose name contains "Control" is used as the control.
    Every column whose name does not contain ``exclude`` (the control column
    included) is multiplied by ``100 / max(control)`` and truncated to int.
    If the control already peaks at 100 the dataframe is returned untouched
    (no scaling and no int cast).

    Args:
        df: dataframe holding one control column plus the data columns.
            It is modified in place.
        exclude: substring identifying columns to leave unscaled
            (e.g. "Month" or "Region").

    Returns:
        The same dataframe object that was passed in.

    Raises:
        IndexError: if no column name contains "Control".
        ValueError: if the control column has no positive maximum (all zero,
            empty or all missing), since no scaling factor exists then.
    """
    control = [x for x in df.columns.values if "Control" in x][0]
    control_max = df[control].max()

    # a zero/NaN maximum would otherwise turn into inf/NaN values and an
    # opaque failure in the int cast below
    if not control_max > 0:
        raise ValueError(
            f"cannot scale: control column {control!r} has no positive maximum"
        )

    # skip straight to return statement if no scaling is needed
    if control_max != 100:
        scale_col = [x for x in df.columns.values if exclude not in x]
        # multiply before dividing: dividing by (max / 100) can land a hair
        # below the exact integer (e.g. 7 / 0.07), which the int truncation
        # would then round down by one
        df[scale_col] = (df[scale_col] * 100 / control_max).astype(int)
    return df
    
# some dataframes require all the non-month and non-control columns to be summed
def sum_cols(df, file):
    """Collapse all columns of ``df`` into one row-wise sum column.

    The new column is named after the second token of the file name once
    "." and "_" are treated as separators, e.g.
    "multiTimeline_Accident.csv" -> "Accident".

    Args:
        df: dataframe whose columns should all be summed per row.
            It is left unmodified.
        file: name of the csv file the data came from.

    Returns:
        A new single-column dataframe with the same index as ``df``.

    Raises:
        IndexError: if the file name has fewer than two tokens.
    """
    # determine name to save column under from file name
    sum_name = file.replace(".", " ").replace("_", " ").split()[1]

    # build the result as a new frame rather than adding a column to the
    # caller's dataframe, which is often a slice of another frame
    return df.sum(axis=1).to_frame(name=sum_name)

In [5]:
# combine all separate csv files into one
# start from a clean slate so that re-running this cell rebuilds the output
# instead of joining onto a dataframe left over from a previous run
df_by_month = None
for file, sum_bool in multiTimeline_files.items():
    # read file to pandas df
    df = pd.read_csv(file)

    # on the first iteration, seed the output dataframe with the month column
    if df_by_month is None:
        df_by_month = df[["Month"]]

    # scale the data
    df = scale_data_control(df, "Month")

    # remove month and control columns
    df = df[[x for x in df.columns.values if not (("Month" in x) or ("Control" in x))]]

    # sum columns of data
    if sum_bool:
        df = sum_cols(df, file)

    # save to final output dataframe (rows are aligned on the index)
    df_by_month = df_by_month.join(df)

Done


In [6]:
# select the average for each cause of death over a year
def average_by_year(year, df):
    """Return ``[year, avg_1, avg_2, ...]`` for the columns of ``df``.

    Each average is the column mean truncated to an int, listed in the
    dataframe's column order.
    """
    averages = [year]
    for column in df.columns.values:
        column_mean = df[column].mean()
        averages.append(int(column_mean))
    return averages

In [7]:
# derive the year from the part of each month label before the first "-"
df_by_month['Year'] = [month.split("-")[0] for month in df_by_month['Month']]

# names of the data columns (everything except the two date columns)
keys = df_by_month.drop(columns=["Month","Year"]).columns.values

# build one output row per year, in order of first appearance:
# the year followed by the truncated average of every data column
yearly_rows = []
for year in df_by_month['Year'].unique():
    rows_of_year = df_by_month[df_by_month["Year"] == year]
    yearly_rows.append(average_by_year(year, rows_of_year[keys]))

# assemble the rows into the final dataframe
final_df = pd.DataFrame(yearly_rows, columns=["Year", *keys])

# write the result (including the index column) to csv
final_df.to_csv("combinedMultiTimeline.csv")

In [8]:
# display the combined yearly averages as the cell output
final_df

Unnamed: 0,Year,Accident,Alzheimers,CLRD,Stroke,Diabetes,Cancer,Suicide,Flu,Heart,Kidney
0,2004,38,5,2,7,23,63,17,23,11,2
1,2005,38,5,3,7,21,62,16,34,10,2
2,2006,37,4,3,7,21,58,13,30,9,2
3,2007,41,4,3,6,21,56,13,21,9,2
4,2008,43,5,3,6,20,57,14,23,9,2
5,2009,50,5,3,6,20,58,14,125,9,2
6,2010,53,4,3,6,19,55,15,21,9,2
7,2011,54,5,4,7,18,55,14,23,9,2
8,2012,57,4,4,7,18,55,16,25,10,2
9,2013,62,4,4,7,18,55,15,36,9,2


### 2. geoMap csv files

In [9]:
# geoMap csv files to combine, keyed by file name.
# The boolean is consumed by the combining loop as `sum_bool`:
#   True  -> the file's data columns are summed into a single column named
#            after the file (e.g. "Accident")
#   False -> the file's data columns are kept as separate columns
# Insertion order determines the column order of the combined output.
geoMap_files = {
    "geoMap_Accident.csv": True, 
    "geoMap_Alzheimers.csv": True,
    "geoMap_CLRD.csv": True, 
    "geoMap_DiabetesCancerStrokeSuicide.csv": False,
    "geoMap_Flu.csv": True, 
    "geoMap_Heart.csv": True, 
    "geoMap_Kidney.csv": True
}

In [10]:
# combine all separate csv files into one
# start from a clean slate so that re-running this cell rebuilds the output
# instead of joining onto a dataframe left over from a previous run
df_by_region = None
for file, sum_bool in geoMap_files.items():
    # read file to pandas df
    df = pd.read_csv(file)

    # on the first iteration, seed the output dataframe with the region column
    if df_by_region is None:
        df_by_region = df[["Region"]]

    # convert data to int and scale the data
    # cells are expected to look like "12%" or "<1%" (missing -> "0"); going
    # through str/float also copes with columns pandas already parsed as numbers
    df = df.fillna("0")
    for col in [x for x in df.columns.values if "Region" not in x]:
        df[col] = df[col].apply(
            lambda x: int(float(str(x).replace("%", "").replace("<1", "0")))
        )
    df = scale_data_control(df, "Region")

    # remove region and control columns
    df = df[[x for x in df.columns.values if not (("Region" in x) or ("Control" in x))]]

    # sum columns of data
    if sum_bool:
        df = sum_cols(df, file)

    # save to final output dataframe (rows are aligned on the index)
    df_by_region = df_by_region.join(df)

# save output dataframe to csv
df_by_region.to_csv("combinedGeoMap.csv")

Done
