In [30]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from matplotlib import pyplot as plt
import random
import json
pd.set_option('display.max_colwidth', None)


### Loading Raw Data

In [19]:
def json_to_dict(json_fp):
    """Read a JSON file and return its parsed content.

    The file is decoded explicitly as UTF-8 (the encoding JSON interchange
    uses) instead of the platform default, so files containing non-ASCII
    text load identically on every operating system.

    Args:
        json_fp: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file (a dict when the
        top-level JSON value is an object).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(json_fp, 'r', encoding='utf-8') as json_file:
        data_dict = json.load(json_file)
    return data_dict

In [20]:
# Original query tables
top_queries = pd.read_csv("./data/top_quer_all.csv")
rising_queries = pd.read_csv("./data/rising_quer_all.csv")

# Grouped query tables
grouped_rising_queries = pd.read_csv("./cleaned_data/grouped_rising_queries.csv")
grouped_top_queries = pd.read_csv("./cleaned_data/grouped_top_queries.csv")

# Group dictionaries, parsed from JSON
top_group_dict = json_to_dict("./cleaned_data/top_group_dict.json")
rising_group_dict = json_to_dict("./cleaned_data/rising_group_dict.json")

### Visualizing Dataframes

### Important Functions

In [6]:
import pandas as pd

def compare_dataframes(df1, df2):
    """Collect the cell pairs that differ between two equally shaped DataFrames.

    Cells are paired by column label and by row *position*, so the frames may
    carry any row index (for example one left over from filtering); only the
    row order matters. A pair is skipped when either cell renders as "nan"
    through ``str()`` -- this covers float NaN and also the literal string
    "nan".

    Args:
        df1: First DataFrame; its columns drive the iteration order.
        df2: Second DataFrame; must have the same shape as ``df1`` and
            contain every column label of ``df1``.

    Returns:
        A tuple ``(diff_df1_values, diff_df2_values)`` of two parallel lists:
        for each differing pair, the value from ``df1`` and the value from
        ``df2``. Pairs are ordered column by column, top to bottom.

    Raises:
        ValueError: If the two frames have different shapes.
        KeyError: If ``df2`` lacks one of ``df1``'s column labels.
    """
    if df1.shape != df2.shape:
        raise ValueError("Different shapes")

    diff_df1_values = []
    diff_df2_values = []

    for col in df1.columns:
        left_column = df1[col]
        right_column = df2[col]

        # Positional access: range(n) yields positions, not index labels, so
        # .loc would raise KeyError (or pair the wrong rows) on any frame
        # whose index is not exactly 0..n-1.
        for pos in range(len(left_column)):
            value1 = left_column.iat[pos]
            value2 = right_column.iat[pos]

            # Ignore the pair when either side is missing.
            if str(value1) == "nan" or str(value2) == "nan":
                continue

            if value1 != value2:
                diff_df1_values.append(value1)
                diff_df2_values.append(value2)

    return diff_df1_values, diff_df2_values

# Blanks out (sets to NaN) every cell whose text exactly matches one of search_terms.
def tool_removal(df, search_terms):
    """Replace cells equal to a search term with NaN, modifying ``df`` in place.

    A cell matches when its ``str()`` form equals a term exactly as given or
    equals the term's ``str.capitalize()`` form (first letter upper-case, the
    rest lower-case). Other casings and partial matches are left untouched.

    Rows are selected with a boolean mask, so the frame may carry any row
    index; it does not need to be the default 0..n-1 range.

    Args:
        df: DataFrame to clean. It is mutated in place.
        search_terms: Iterable of strings to remove.

    Returns:
        None.
    """
    # Both accepted spellings of every term, in a set for O(1) membership tests.
    targets = set(search_terms)
    targets.update(term.capitalize() for term in search_terms)

    for col in df.columns:
        matches = df[col].map(str).isin(targets)
        # Skip untouched columns so that their dtype is never altered.
        if matches.any():
            df.loc[matches, col] = np.nan

# Prints 5 randomly chosen (new value, old value) pairs from two parallel lists.
def random_viewer(nv, ov):
    """Print five randomly selected pairs from two parallel sequences.

    Each draw picks one position at random (with replacement) and prints the
    entries of ``nv`` and ``ov`` at that position, followed by a blank line.

    Args:
        nv: Sequence of new values.
        ov: Sequence of old values, indexed in parallel with ``nv``.

    Returns:
        None. Nothing is printed when ``nv`` is empty.
    """
    if len(nv) == 0:
        return

    for _ in range(5):
        # randrange excludes the upper bound; randint(0, len(nv)) could yield
        # len(nv) itself and raise IndexError.
        rand_ind = random.randrange(len(nv))
        # str() lets non-string values (numbers, NaN) print instead of
        # raising TypeError during concatenation.
        print("New value: " + str(nv[rand_ind]))
        print("Old value: " + str(ov[rand_ind]))
        print("")

# Reports how many distinct cell values each of the two DataFrames contains.
def count_uniques(ndf, odf):
    """Print the number of distinct cell values in the new and old DataFrames.

    Each frame's cells are flattened into one array and de-duplicated with
    ``np.unique``; the resulting counts are printed, new frame first.

    Args:
        ndf: The "new" DataFrame.
        odf: The "old" DataFrame.
    """
    new_count = np.unique(np.ravel(ndf.values)).size
    print(f"New length: {new_count}")

    old_count = np.unique(np.ravel(odf.values)).size
    print(f"Old length: {old_count}")

### Removing *Search Tools*

In [8]:
tool_list = ["google", "facebook", "twitter", "bing"]

# tool_removal edits each frame in place, so no return value is captured.
for grouped_frame in (grouped_top_queries, grouped_rising_queries):
    tool_removal(grouped_frame, tool_list)

### Storing Changed Values

In [9]:
# The original tables are compared without their first two columns
# (presumably so their shape lines up with the grouped frames -- confirm).
rising_original = rising_queries.iloc[:, 2:]
top_original = top_queries.iloc[:, 2:]

rising_new_vals, rising_old_vals = compare_dataframes(
    grouped_rising_queries, rising_original
)
top_new_vals, top_old_vals = compare_dataframes(
    grouped_top_queries, top_original
)

### Counting Unique Values

In [33]:
# Every cell is converted with str() before counting (NaN becomes the string
# "nan"), presumably so np.unique only has to compare values of one type.
# DataFrame.applymap is deprecated since pandas 2.1; mapping each column
# element-wise gives the same result on old and new pandas versions.
count_uniques(
    grouped_rising_queries.apply(lambda column: column.map(str)),
    rising_queries.iloc[:, 2:].apply(lambda column: column.map(str)),
)
print("")
count_uniques(
    grouped_top_queries.apply(lambda column: column.map(str)),
    top_queries.iloc[:, 2:].apply(lambda column: column.map(str)),
)

New length: 448
Old length: 581

New length: 315
Old length: 401


### Group Dictionary

In [58]:
# One row per group: the dictionary key in the first column, its value in the second.
rising_group_df = pd.DataFrame(
    list(rising_group_dict.items()),
    columns=["Grouped Term", "Related Queries"],
)

In [57]:
# One row per group: the dictionary key in the first column, its value in the second.
top_group_df = pd.DataFrame(
    list(top_group_dict.items()),
    columns=["Grouped Term", "Related Queries"],
)

In [60]:
# Write both group tables to CSV, leaving out the row index.
for group_df, csv_path in (
    (rising_group_df, "./cleaned_data/rising_group_dictionary.csv"),
    (top_group_df, "./cleaned_data/top_group_dictionary.csv"),
):
    group_df.to_csv(csv_path, index=False)