In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [None]:
# Read the raw query table from disk.
all_queries = pd.read_csv(filepath_or_buffer="./data/all_queries_8countries.csv")

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Words dropped from every query before the TF-IDF vocabulary is fitted.
common_words_to_exclude = ["feminism", "what"]

def preprocess_text(text):
    """Lower-case ``text`` and remove the words in ``common_words_to_exclude``.

    Tokens are produced by splitting on whitespace only, so a token with
    attached punctuation (``"feminism?"``) does not match an excluded word.

    Parameters
    ----------
    text : str or any
        One query. Non-string values (for example the float NaN or ``None``
        that pandas uses for a missing cell) are treated as empty text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; ``""`` when nothing
        survives or when ``text`` is not a string.
    """
    if not isinstance(text, str):
        # A missing cell has no .lower(); return empty text instead of raising
        # AttributeError in the middle of a Series.apply.
        return ""
    tokens = [word for word in text.lower().split() if word not in common_words_to_exclude]
    return " ".join(tokens)

def calculate_cosine_similarity(query, term, tfidf_vectorizer):
    """Return the cosine similarity between ``query`` and ``term``.

    Each string is vectorised on its own with ``tfidf_vectorizer.transform``
    (this function never fits the vectorizer, so it must be fitted by the
    caller), and the single entry of the resulting 1x1 similarity matrix is
    returned.
    """
    # Vectorise the query first, then the term, each as a one-document batch.
    query_vec, term_vec = (tfidf_vectorizer.transform([text]) for text in (query, term))
    pairwise = cosine_similarity(query_vec, term_vec)
    return pairwise[0][0]

def group_similar_terms(df, threshold=0.5):
    """Reorder each column of ``df`` so that similar queries sit next to each other.

    Columns are processed independently. For every column a TF-IDF vocabulary
    is fitted on the preprocessed text, then each query is placed into the
    first existing group that contains a term whose cosine similarity to the
    query is at least ``threshold``; if no group qualifies, a new group named
    ``Group_(<query>)`` is opened.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose cells are query strings.
    threshold : float, default 0.5
        Minimum cosine similarity for a query to join an existing group.

    Returns
    -------
    pandas.DataFrame
        Same column names and the same number of rows as ``df``. Within each
        column the values are listed group by group, so rows no longer line
        up across columns or with the original index.
    """
    term_groups = {col: {} for col in df.columns}

    # One vectorizer object, refitted for every column.
    tfidf_vectorizer = TfidfVectorizer()

    for col in df.columns:
        # Only the fitted vocabulary / IDF weights are needed here; the
        # document-term matrix itself was never used.
        tfidf_vectorizer.fit(df[col].apply(preprocess_text))
        col_groups = term_groups[col]

        for query in df[col]:
            # First group (in creation order) holding any sufficiently similar term.
            found_group = next(
                (
                    group
                    for group, terms in col_groups.items()
                    if any(
                        calculate_cosine_similarity(query, term, tfidf_vectorizer) >= threshold
                        for term in terms
                    )
                ),
                None,
            )

            if found_group is None:
                found_group = f'Group_({query})'

            # A query whose TF-IDF vector is all zeros (only excluded or
            # out-of-vocabulary words) has similarity 0 even to itself, so a
            # repeat of it lands here with a group name that already exists.
            # Append instead of assigning a fresh list, otherwise the earlier
            # occurrences are lost and the columns end up with unequal lengths.
            col_groups.setdefault(found_group, []).append(query)

    grouped_data = {
        col: [term for terms in term_groups[col].values() for term in terms]
        for col in df.columns
    }
    return pd.DataFrame(grouped_data)

def produce_groups(df, threshold=0.5):
    """Cluster the queries of all columns of ``df`` into one shared set of groups.

    Unlike a per-column grouping, the groups here are shared: a query from a
    later column can join a group opened by an earlier column. The TF-IDF
    vectorizer is refitted on each column, so similarities are always computed
    with the vocabulary of the column currently being processed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose cells are query strings.
    threshold : float, default 0.5
        Minimum cosine similarity for a query to join an existing group.

    Returns
    -------
    dict[str, list]
        Maps each group name (``Group_(<first query>)``) to its distinct
        member queries, in the order they were first seen.
    """
    # Each group is a dict used as an insertion-ordered set (values are unused).
    # Repeated queries are therefore stored, and compared against, only once,
    # and the final member order is the same on every run (a plain set gives a
    # different order from run to run under string hash randomisation).
    term_groups = defaultdict(dict)

    tfidf_vectorizer = TfidfVectorizer()

    for col in df.columns:
        # Only the fitted vocabulary / IDF weights are needed; the
        # document-term matrix itself was never used.
        tfidf_vectorizer.fit(df[col].apply(preprocess_text))

        for query in df[col]:
            # First group (in creation order) holding any sufficiently similar term.
            found_group = next(
                (
                    group
                    for group, terms in term_groups.items()
                    if any(
                        calculate_cosine_similarity(query, term, tfidf_vectorizer) >= threshold
                        for term in terms
                    )
                ),
                None,
            )

            if found_group is None:
                found_group = f'Group_({query})'

            term_groups[found_group][query] = None

    return {group: list(terms) for group, terms in term_groups.items()}

def group_info(df, threshold=0.5):
    """Run both grouping passes on ``df`` with the same ``threshold``.

    Returns a 2-tuple: the result of ``group_similar_terms`` followed by the
    result of ``produce_groups``.
    """
    return group_similar_terms(df, threshold), produce_groups(df, threshold)

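# Example usage. The export cells further down read `grouped_all_queries` and
# `all_group_dict`, so this call has to be run (uncommented) before them:
# grouped_all_queries, all_group_dict = group_info(all_queries.iloc[:, 3:], 0.5)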


In [None]:
def filtered_unique_counts(df, min_count=100, max_count=2000):
    """Count every distinct cell value in ``df`` and keep the mid-frequency ones.

    All cells are pooled together regardless of column before counting.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose cell values are counted.
    min_count : int, default 100
        Exclusive lower bound on the number of occurrences.
    max_count : int, default 2000
        Exclusive upper bound on the number of occurrences.

    Returns
    -------
    pandas.Series
        Occurrence counts indexed by value, restricted to values with
        ``min_count < count < max_count``.
    """
    # Series.value_counts replaces the deprecated top-level pd.value_counts.
    counts = pd.Series(df.to_numpy().ravel()).value_counts()
    return counts[(counts > min_count) & (counts < max_count)]

In [None]:
# Write the grouped query table to CSV, leaving out the index column.
grouped_all_queries.to_csv(path_or_buf="./cleaned_data/grouped_all_queries3.csv", index=False)

In [None]:
import json

def save_dict_to_json(dictionary, filename):
    """Write ``dictionary`` to ``filename`` as JSON, replacing any existing file.

    Parameters
    ----------
    dictionary : dict
        JSON-serialisable mapping.
    filename : str or path-like
        Destination path.

    Raises
    ------
    TypeError, ValueError
        If ``dictionary`` cannot be serialised. In that case the destination
        file is left untouched.
    """
    # Serialise before opening: opening with 'w' truncates immediately, so a
    # failure halfway through json.dump would destroy the previous file and
    # leave partial JSON behind.
    serialized = json.dumps(dictionary)
    # Explicit encoding so the result does not depend on the platform default.
    with open(filename, 'w', encoding='utf-8') as json_file:
        json_file.write(serialized)

In [None]:
# NOTE(review): `dict_cleaner` is not defined in any cell shown here -- confirm it is
# defined or imported before this cell runs. The result overwrites `all_group_dict`,
# so re-running this cell feeds the already-cleaned dictionary back into `dict_cleaner`.
all_group_dict = dict_cleaner(all_group_dict)

In [None]:
# Persist the group dictionary as JSON.
save_dict_to_json(dictionary=all_group_dict, filename='./cleaned_data/all_group_dict3.json')

In [None]:
# One row per dictionary entry: the key goes to "Grouped Term", the value to "Related Queries".
all_group_df = pd.DataFrame(
    data=list(all_group_dict.items()),
    columns=["Grouped Term", "Related Queries"],
)

In [None]:
# Write the group table to CSV, leaving out the index column.
all_group_df.to_csv(path_or_buf="./cleaned_data/all_group_dictionary3.csv", index=False)