In [11]:
import pandas as pd

In [12]:
# Load the raw query table from CSV into a DataFrame.
all_queries = pd.read_csv("./data/all_queries_8countries.csv")

In [18]:
from collections import defaultdict
from functools import lru_cache

import pandas as pd
import spacy

# Words that are ignored when counting how many words two queries share.
common_words_to_exclude = ["feminism", "what"]

# Load spaCy's English model (en_core_web_sm); calculate_similarity uses it
# to parse both texts it compares.
nlp = spacy.load("en_core_web_sm")

@lru_cache(maxsize=None)
def _parse_text(text):
    """Parse *text* with the shared spaCy pipeline and memoise the result.

    Callers compare the same strings against each other many times, so each
    distinct string is parsed once instead of once per comparison. The cache
    is unbounded; its size is the number of distinct strings scored.
    """
    return nlp(text)


def calculate_similarity(query, term):
    """Return spaCy's similarity score between *query* and *term*.

    Both arguments are converted with ``str()`` before parsing, so
    non-string cell values (for example the NaN floats pandas uses for
    missing cells) are scored as text instead of making ``nlp`` raise.

    Args:
        query: First text to compare.
        term: Second text to compare.

    Returns:
        The value of ``Doc.similarity`` for the two parsed texts.
    """
    query_doc = _parse_text(str(query))
    term_doc = _parse_text(str(term))
    return query_doc.similarity(term_doc)

def _significant_words(value):
    """Return the lower-cased whitespace tokens of *value*, minus excluded words."""
    return set(str(value).lower().split()).difference(common_words_to_exclude)


def _find_matching_group(query, groups, threshold):
    """Return the key of the first group holding a term that matches *query*.

    A term matches when it shares at least two non-excluded words with the
    query, or when ``calculate_similarity`` scores the pair at or above
    *threshold*. Groups and the terms inside them are scanned in insertion
    order. Returns ``None`` when no term matches.
    """
    if not groups:
        # Nothing to compare against yet.
        return None

    query_text = str(query)
    # The query's word set is identical for every comparison, so build it once.
    query_words = _significant_words(query_text)

    for group, terms in groups.items():
        for term in terms:
            term_text = str(term)
            # Cheap set test first. Either criterion alone is sufficient, so
            # the order of the two checks does not change which group wins,
            # but it avoids the model call whenever the words already match.
            if len(query_words & _significant_words(term_text)) >= 2:
                return group
            if calculate_similarity(query_text, term_text) >= threshold:
                return group
    return None


def group_similar_terms(df, threshold=0.90):
    """Reorder every column of *df* so that similar queries sit together.

    Each column is grouped independently. A query joins the first existing
    group that contains a matching term (see ``_find_matching_group``);
    otherwise it starts a new group keyed ``Group_(<query>)``.

    Args:
        df: DataFrame whose columns hold the queries to group.
        threshold: Minimum similarity score for two queries to match.

    Returns:
        A new DataFrame with the same columns and a default index, where
        each column lists its values group by group in group-creation order.
    """
    grouped_data = {}

    for col in df.columns:
        groups = {}
        for query in df[col]:
            group = _find_matching_group(query, groups, threshold)
            if group is None:
                group = f'Group_({query})'
            # setdefault (rather than plain assignment) so that a key
            # collision extends the existing group instead of discarding its
            # members, which would leave the columns with unequal lengths.
            groups.setdefault(group, []).append(query)

        grouped_data[col] = [term for terms in groups.values() for term in terms]

    return pd.DataFrame(grouped_data)


def produce_groups(df, threshold=0.90):
    """Group the queries of all columns of *df* into one shared set of groups.

    Columns are processed in order and share a single group dictionary, so a
    query can join a group created by an earlier column. Matching follows
    the same rules as ``group_similar_terms``.

    Args:
        df: DataFrame whose columns hold the queries to group.
        threshold: Minimum similarity score for two queries to match.

    Returns:
        A dict mapping each group key (``Group_(<first query>)``) to the list
        of its distinct member queries, in first-seen order.
    """
    groups = {}

    for col in df.columns:
        for query in df[col]:
            group = _find_matching_group(query, groups, threshold)
            if group is None:
                group = f'Group_({query})'
            groups.setdefault(group, []).append(query)

    # dict.fromkeys removes duplicates while keeping insertion order, so the
    # result is the same on every run (set() ordering is not).
    return {group: list(dict.fromkeys(terms)) for group, terms in groups.items()}

def group_info(df, threshold=0.90):
    """Run both grouping passes over *df* and return their results as a pair.

    Args:
        df: DataFrame whose columns hold the queries to group.
        threshold: Similarity threshold forwarded to both passes.

    Returns:
        A ``(grouped_dataframe, groups)`` tuple: the result of
        ``group_similar_terms`` followed by the result of ``produce_groups``.
    """
    return group_similar_terms(df, threshold), produce_groups(df, threshold)

# Example usage: group the queries held in the columns at positions 3, 4 and 5
# of all_queries, using a similarity threshold of 0.90.
grouped_all_queries, all_group_dict = group_info(all_queries.iloc[:, 3:6], 0.90)


KeyboardInterrupt: 

In [None]:
def filtered_unique_counts(df, min_count=100, max_count=2000):
    """Count every value in *df* and keep those within a frequency band.

    All cells of the frame are pooled together before counting; missing
    values are not counted.

    Args:
        df: DataFrame whose cell values are counted.
        min_count: Exclusive lower bound on the number of occurrences.
        max_count: Exclusive upper bound on the number of occurrences.

    Returns:
        A Series indexed by value, holding the occurrence counts that are
        strictly between *min_count* and *max_count*.
    """
    # Series.value_counts replaces the deprecated top-level pd.value_counts.
    counts = pd.Series(df.values.flatten()).value_counts()
    return counts[(counts > min_count) & (counts < max_count)]

In [None]:
# Write the regrouped query table to CSV, leaving out the index column.
grouped_all_queries.to_csv("./cleaned_data/grouped_all_queries2.csv", index = False)

In [None]:
import json

def save_dict_to_json(dictionary, filename):
    """Serialise *dictionary* as JSON into the file at *filename*.

    The file is opened in write mode, so any existing content is replaced.

    Args:
        dictionary: JSON-serialisable object to store.
        filename: Path of the file to write.
    """
    with open(filename, mode='w') as handle:
        json.dump(dictionary, fp=handle)

In [None]:
def dict_cleaner(dict):
    """Return a copy of the mapping with duplicates removed from each value list.

    The first occurrence of every value is kept and the original order is
    preserved, so the result is identical from run to run.

    Args:
        dict: Mapping from key to an iterable of hashable values.

    Returns:
        A new dict with the same keys, each mapped to a list of its distinct
        values in first-seen order.
    """
    # NOTE: the parameter shadows the built-in ``dict``. The name is kept so
    # keyword callers keep working; the built-in is not used in this body.
    unique_dict = {}
    for key, values in dict.items():
        # Dict keys keep insertion order, unlike set(), whose order is arbitrary.
        unique_dict[key] = list({value: None for value in values})
    return unique_dict

# Replace the group dictionary with the version returned by dict_cleaner.
all_group_dict = dict_cleaner(all_group_dict)

In [None]:
# Store the group dictionary as a JSON file.
save_dict_to_json(all_group_dict, './cleaned_data/all_group_dict2.json')

In [None]:
# Build a two-column table from the group dictionary: each (key, value) pair
# becomes one row, with the key under "Grouped Term" and the value under
# "Related Queries".
all_group_df = pd.DataFrame(all_group_dict.items(), columns=["Grouped Term", "Related Queries"])

In [None]:
# Write the group table to CSV, leaving out the index column.
all_group_df.to_csv("./cleaned_data/all_group_dictionary.csv", index = False)