In [2]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from matplotlib import pyplot as plt

In [3]:
# Read the two query tables from CSV; both are handed to group_info further down.
rising_queries = pd.read_csv("./data/rising_quer_all.csv")
top_queries = pd.read_csv("./data/top_quer_all.csv")

In [6]:
# Preview every row, columns from position 2 onward: the same slice later passed to group_info.
rising_queries.iloc[:, 2:]

Unnamed: 0,RisingQueries1,RisingQueries2,RisingQueries3,RisingQueries4,RisingQueries5,RisingQueries6,RisingQueries7,RisingQueries8,RisingQueries9,RisingQueries10,...,RisingQueries16,RisingQueries17,RisingQueries18,RisingQueries19,RisingQueries20,RisingQueries21,RisingQueries22,RisingQueries23,RisingQueries24,RisingQueries25
0,third wave feminism,,,,,,,,,,...,,,,,,,,,,
1,third wave feminism,,,,,,,,,,...,,,,,,,,,,
2,vatican feminism,ladies against feminism,,,,,,,,,...,,,,,,,,,,
3,liberal feminism,,,,,,,,,,...,,,,,,,,,,
4,third wave feminism,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
826,waves of feminism,second wave feminism,intersectional feminism,what is feminism,feminism definition,,,,,,...,,,,,,,,,,
827,what is feminism,,,,,,,,,,...,,,,,,,,,,
828,radical feminism,feminist approach,,,,,,,,,...,,,,,,,,,,
829,google scholar,second wave feminism,,,,,,,,,...,,,,,,,,,,


In [7]:
import pandas as pd
from fuzzywuzzy import fuzz
from collections import defaultdict

def group_similar_terms(df, threshold=85):
    """Replace every query in ``df`` with the name of its fuzzy-match group.

    Cells are visited column by column, top to bottom. Each cell is scored
    with ``fuzz.token_sort_ratio`` against the names of the groups created so
    far, oldest first, and takes the first name scoring at least
    ``threshold``. A cell that matches nothing starts a new group named after
    its own text.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose cells are the query terms to normalise.
    threshold : int, default 85
        Minimum ``token_sort_ratio`` score for a cell to join a group.

    Returns
    -------
    pandas.DataFrame
        Same columns and number of rows as ``df``, built from plain lists (so
        it carries a default index). Non-missing cells hold their group's
        name; missing cells are passed through unchanged instead of being
        turned into the literal text ``nan`` and fuzzy-scored.
    """
    group_names = []  # group names in creation order, which is also the search order
    resolved = {}     # query text -> group name, so repeated queries skip the scan

    def resolve(query):
        text = f'{query}'
        if text in resolved:
            return resolved[text]

        # Stop at the FIRST matching group. Letting the scan run on (as an
        # inner-only break would) lets a later, unrelated match overwrite it.
        match = next(
            (name for name in group_names
             if fuzz.token_sort_ratio(query, name) >= threshold),
            None,
        )
        if match is None:
            # Nothing is close enough: the query becomes its own group.
            match = text
            group_names.append(match)

        resolved[text] = match
        return match

    grouped_data = {
        col: [query if pd.isna(query) else resolve(query) for query in df[col]]
        for col in df.columns
    }
    return pd.DataFrame(grouped_data)


def produce_groups(df, threshold=85):
    """Cluster the query terms of ``df`` into fuzzy-match groups.

    Cells are visited row by row, left to right. A cell joins the first
    existing group (in creation order) that already contains a term scoring at
    least ``threshold`` under ``fuzz.token_sort_ratio``; otherwise it starts a
    new group keyed ``Group_(<query>)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose cells are the query terms to cluster.
    threshold : int, default 85
        Minimum ``token_sort_ratio`` score for a term to join a group.

    Returns
    -------
    dict
        ``{group name: [terms...]}`` in group-creation order. Each list keeps
        every assigned occurrence, duplicates included. Missing cells are
        skipped, so no ``Group_(nan)`` bucket is produced.
    """
    members = {}   # group name -> every assigned query (the return value)
    distinct = {}  # group name -> distinct terms (dict used as an ordered set)

    def first_match(query):
        # Scoring each distinct term once is enough: re-scoring duplicates
        # cannot change which group matches first.
        for group, terms in distinct.items():
            if any(fuzz.token_sort_ratio(query, term) >= threshold for term in terms):
                return group
        return None

    # Plain iteration rather than DataFrame.apply: the loop exists only for
    # its side effects on the two dictionaries above.
    for _, row in df.iterrows():
        for query in row:
            if pd.isna(query):
                continue  # an empty cell is not a query term

            group = first_match(query)
            if group is None:
                group = f'Group_({query})'

            members.setdefault(group, []).append(query)
            distinct.setdefault(group, {})[query] = None

    return members


def dict_cleaner(dict):
    """Return a copy of the mapping with duplicate entries removed from each list.

    Parameters
    ----------
    dict : mapping of key -> iterable of hashable values
        The group dictionary to clean. (The parameter name shadows the
        builtin; it is kept so existing callers keep working.)

    Returns
    -------
    dict
        Same keys, each mapped to a list of its distinct values in
        first-seen order. Going through ``set`` would yield an arbitrary
        order that can change between runs, making saved output
        irreproducible.
    """
    unique_dict = {}
    for key, values in dict.items():
        # A dict keeps insertion order, so its keys form an ordered "set".
        unique_dict[key] = list({value: None for value in values})
    return unique_dict


def group_info(df, threshold=85):
    """Run both grouping passes over ``df`` and return their results together.

    Returns a 2-tuple: the DataFrame produced by ``group_similar_terms`` and
    the dictionary produced by ``produce_groups`` after it has been passed
    through ``dict_cleaner``. ``threshold`` is forwarded to both passes.
    """
    return (
        group_similar_terms(df, threshold),
        dict_cleaner(produce_groups(df, threshold)),
    )


# Group both tables with a similarity threshold of 85. Only the columns from
# position 2 onward are passed in; the first two columns are left out.
# NOTE(review): confirm that position 2 is where the query columns start in both CSVs.
grouped_rising_queries, rising_group_dict = group_info(rising_queries.iloc[:, 2:], 85)
grouped_top_queries, top_group_dict = group_info(top_queries.iloc[:, 2:], 85)

In [8]:
def filtered_unique_counts(df, min_count=100, max_count=2000):
    """Count how often each value occurs anywhere in ``df`` and keep a count band.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose cells are pooled together before counting.
    min_count : int, default 100
        Exclusive lower bound on the number of occurrences.
    max_count : int, default 2000
        Exclusive upper bound on the number of occurrences.

    Returns
    -------
    pandas.Series
        Occurrence counts indexed by value, restricted to values seen more
        than ``min_count`` and fewer than ``max_count`` times.
    """
    # Series.value_counts replaces the deprecated top-level pd.value_counts.
    counts = pd.Series(df.values.flatten()).value_counts()
    return counts[(counts > min_count) & (counts < max_count)]

### Saving Grouped Data to CSV

In [12]:
# Write both grouped tables to CSV without the index column.
# NOTE(review): this assumes ./cleaned_data/ already exists — confirm, or create it before this cell runs.
grouped_rising_queries.to_csv("./cleaned_data/grouped_rising_queries.csv", index = False)
grouped_top_queries.to_csv("./cleaned_data/grouped_top_queries.csv", index = False)

### Saving Group Dictionary to JSON

In [16]:
import json

def save_dict_to_json(dictionary, filename):
    """Write ``dictionary`` to ``filename`` as JSON text.

    Parameters
    ----------
    dictionary : dict
        JSON-serialisable mapping to store.
    filename : str or path-like
        Destination file; it is overwritten if it already exists.

    Raises
    ------
    TypeError
        If ``dictionary`` contains a value ``json`` cannot serialise. The
        destination file is left untouched in that case.
    """
    # Serialise before opening the file: opening in 'w' mode truncates it, so
    # a failure part-way through would otherwise leave an empty or partial file.
    payload = json.dumps(dictionary)
    with open(filename, 'w', encoding='utf-8') as json_file:
        json_file.write(payload)


In [18]:
# Re-apply dict_cleaner to both group dictionaries before saving.
# group_info already passes its dictionary through dict_cleaner, so this is a second pass over the same data.
rising_group_dict = dict_cleaner(rising_group_dict)
top_group_dict = dict_cleaner(top_group_dict)

In [19]:

# Persist both group dictionaries as JSON files next to the grouped CSVs.
save_dict_to_json(rising_group_dict, './cleaned_data/rising_group_dict.json')
save_dict_to_json(top_group_dict, './cleaned_data/top_group_dict.json')