# Feature Engineering

## Imports

In [None]:
import pandas as pd
import numpy as np
import asyncio
import nest_asyncio
import aiohttp
import time
import os
import re
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.utils import resample
import umap
import scipy.sparse
from typing import List, Dict, Tuple, Any, Set, Optional
import inflect
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import matplotlib.ticker as mtick
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import json
import pickle
import hdbscan
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio

## Loading Data

In [None]:
# Read the input dataset from the parquet file; all later cells operate on `df`.
df = pd.read_parquet(path='arxiv_cleaned.parquet', engine='pyarrow')

## Non-Text Features

### Number of Authors

In [None]:
# The author count is the length of each row's 'authors_parsed' value.
df['number_of_authors'] = df['authors_parsed'].map(len)

# The raw column is no longer needed once the count has been derived.
df = df.drop('authors_parsed', axis=1)

### Flatten Domain and Area

In [None]:
# Split the 'Domain' column into individual domains and explode into rows
df_domains = df['Domain'].str.split(';').explode().str.strip()

# Rows without a domain explode to NaN and stray separators leave '' behind;
# neither is a usable column name, so drop them before collecting the labels.
df_domains = df_domains[df_domains.notna() & df_domains.ne('')]

# Get all unique domains
unique_domains = df_domains.unique()

# Normalise every row to ';label1;label2;' (lower-case, labels stripped) so that a
# label is matched as a whole token. A plain substring/regex search would also fire
# when one label is contained in another and would interpret regex metacharacters.
padded_domains = (
    ';'
    + df['Domain'].str.lower().str.replace(r'\s*;\s*', ';', regex=True).str.strip()
    + ';'
)

# Create binary columns for each unique domain (rows with a missing 'Domain' are False)
for domain in unique_domains:
    df[domain] = padded_domains.str.contains(f';{domain.lower()};', regex=False, na=False)

In [None]:
# Split the 'Area' column into individual areas and explode into rows
df_areas = df['Area'].str.split(';').explode().str.strip()

# Rows without an area explode to NaN and stray separators leave '' behind;
# neither is a usable column name, so drop them before collecting the labels.
df_areas = df_areas[df_areas.notna() & df_areas.ne('')]

# Get all unique areas
unique_areas = df_areas.unique()

# Normalise every row to ';label1;label2;' (lower-case, labels stripped) so that an
# area is matched as a whole token. A substring/regex search would also flag rows
# whose area merely *contains* the label and would interpret regex metacharacters.
padded_areas = (
    ';'
    + df['Area'].str.lower().str.replace(r'\s*;\s*', ';', regex=True).str.strip()
    + ';'
)

# Build all binary columns first and concatenate once (more memory efficient than
# inserting many columns one by one). Rows with a missing 'Area' are False.
binary_columns = {
    area: padded_areas.str.contains(f';{area.lower()};', regex=False, na=False)
    for area in unique_areas
}
# Pin the index so the frame lines up with `df` even when there are no areas at all.
binary_df = pd.DataFrame(binary_columns, index=df.index)

# NOTE(review): an area label identical to an existing column name (for example a
# domain column) would produce duplicate column names here - confirm against the data.
# Concatenate the original DataFrame with the new binary columns
df = pd.concat([df, binary_df], axis=1)

## Text-Derived Features

### Abstract

#### Pre-Processing

In [None]:
def preprocess_text(text):
    """Return a normalised copy of ``text``.

    The text is lower-cased, every run of digits is deleted, every character
    that is neither a word character nor whitespace is deleted, and leading
    and trailing whitespace is trimmed.
    """
    lowered = text.lower()
    without_digits = re.sub(r'\d+', '', lowered)
    without_punctuation = re.sub(r'[^\w\s]', '', without_digits)
    return without_punctuation.strip()

# Missing abstracts become empty strings so preprocess_text always receives a str.
df['abstract'] = df['abstract'].fillna('').map(preprocess_text)

Lemmatization

In [None]:
def lemmatize_text(text: str) -> str:
    """Lower-case and tokenize ``text``, lemmatize each token, and re-join with spaces.

    NaN values and non-string inputs yield an empty string.
    """
    if pd.isna(text) or not isinstance(text, str):
        return ""

    lemmatize = WordNetLemmatizer().lemmatize
    return ' '.join(lemmatize(token) for token in word_tokenize(text.lower()))

# Replace every abstract with its lemmatized form.
df['abstract'] = df['abstract'].map(lemmatize_text)


Standardize Abbreviations

In [None]:
print("\n--- Standardizing Abbreviations ---")
# Maps "<phrase> <abbreviation>" to the bare phrase.
# Entries are applied one after another in insertion order, so whenever one key is a
# suffix of another key the LONGER key must come first; otherwise the shorter key
# rewrites part of the longer phrase and the longer key never matches.
# NOTE(review): the keys look like stop-word-free n-grams, and some use plural forms.
# The abstracts they are applied to have been lemmatized but (in the cells shown)
# not stop-word filtered, so some keys may never match - confirm against the data.
abbreviation_map = {
    # CS / General ML
    'machine learning ml': 'machine learning',
    'learning ml': 'machine learning',
    'ml models': 'machine learning model',
    'reinforcement learning rl': 'reinforcement learning',
    'artificial intelligence ai': 'artificial intelligence',
    'internet things iot': 'internet things',
    'convolutional neural network cnn': 'convolutional neural network',
    'deep learning dl': 'deep learning',
    'deep neural network dnn': 'deep neural network',
    'neural networks cnns': 'neural networks',
    'language models llms': 'language model',
    'language model lm': 'language model',
    # Longer key first: the shorter key below is a suffix of this one.
    'natural language processing nlp': 'natural language processing',
    'language processing nlp': 'natural language processing',
    'federated learning fl': 'federated learning',
    'principal component analysis pca': 'principal component analysis',
    'structural equation modelling sem': 'structural equation modelling',
    'extended reality xr': 'extended reality',
    # Physics
    'black hole bh': 'black hole',
    'dark matter dm': 'dark matter',
    'density functional theory dft': 'density functional theory',
    'molecular dynamics md': 'molecular dynamics',
    # EESS
    'computed tomography ct': 'computed tomography',
    'magnetic resonance imaging mri': 'magnetic resonance imaging',
    'electroencephalography eeg': 'electroencephalography',
    'base station bs': 'base station',
    'channel state information csi': 'channel state information',
    'multipleinput multipleoutput mimo': 'multipleinput multipleoutput',
    'signaltonoise ratio snr': 'signaltonoise ratio',
    'automatic speech recognition asr': 'automatic speech recognition',
    'massive machinetype communications mmtc': 'massive machinetype communications',
    'reconfigurable intelligent surface ris': 'reconfigurable intelligent surface',
    'user equipment ue': 'user equipment',
    # QB
    # molecular dynamics md (already listed)
    # magnetic resonance imaging mri (already listed)
    # electroencephalography eeg (already listed)
    # artificial intelligence ai (already listed)
    # Econ
    # structural equation modelling sem (already listed)
    # QF
    'agentbased models abms': 'agentbased models',
    'agentbased model gabm': 'agentbased model',
    'environmental social governance esg': 'environmental social governance',
    'value risk var': 'value risk',
}

def standardize_abbreviations(text, abbreviation_map):
    """Replace every mapped abbreviation phrase in ``text`` with its full form.

    Keys are matched as whole phrases: a key only matches when it is not
    directly preceded or followed by a word character, so ``'ml models'`` does
    not fire inside ``'html models'``. Keys are applied in the mapping's
    iteration order, each on the result of the previous replacement.

    Args:
        text: The text to process. NaN and non-string values are returned unchanged.
        abbreviation_map: Mapping of phrase-to-replace -> replacement.

    Returns:
        A ``(text, replacement_count)`` tuple, where ``replacement_count`` is the
        total number of individual occurrences that were replaced.
    """
    if pd.isna(text) or not isinstance(text, str):
        return text, 0  # Return as-is if NaN or not a string, with 0 replacements

    replacement_count = 0
    for abbr, full_form in abbreviation_map.items():
        # Cheap substring test first: most keys are absent from most texts, and an
        # empty key would otherwise match at every position.
        if not abbr or abbr not in text:
            continue
        pattern = rf'(?<!\w){re.escape(abbr)}(?!\w)'
        # A callable replacement keeps backslashes in `full_form` literal.
        text, hits = re.subn(pattern, lambda _match, _full=full_form: _full, text)
        replacement_count += hits
    return text, replacement_count

# Apply the function to the "abstract" column and track replacements.
# Collecting the (text, count) pairs in a list first keeps this working on an empty
# frame, where unpacking `zip(*...)` into two names would raise.
standardized_pairs = [
    standardize_abbreviations(abstract, abbreviation_map) for abstract in df['abstract']
]
df['abstract'] = [new_text for new_text, _ in standardized_pairs]
adjustment_counts = tuple(count for _, count in standardized_pairs)

# Print total adjustments
total_adjustments = sum(adjustment_counts)
print(f"Total number of adjustments made: {total_adjustments}")

Creating a checkpoint so the dataframe can be reloaded from this point

In [None]:
# Checkpoint: the save call is kept commented out; un-comment it to (re)write the file.
#df.to_parquet("checkpoint_lemmatized.parquet", engine="pyarrow", index=False)

# Replace `df` with the frame stored in the checkpoint file.
df = pd.read_parquet(path="checkpoint_lemmatized.parquet", engine="pyarrow")

#### Keyword Extraction (per domain and month)

In [None]:
def extract_monthly_top_keywords(
    dataframe: pd.DataFrame,
    category_column: str,
    date_column: str = 'first_date',
    text_column: str = 'abstract',
    top_n: int = 30,
    boost_bigrams: float = 1.0,
    boost_trigrams: float = 2.0,
    max_df: float = 0.8,
    min_df: float = 1,
    ngram_range: Tuple[int, int] = (2, 3)
) -> Tuple[List[str], Dict[Any, List[Tuple[str, float]]]]:
    """
    Analyzes text data within a specific category of a DataFrame, grouped by month,
    to extract the top N keywords using TF-IDF, with optional boosting for n-grams.

    A separate TfidfVectorizer is fitted for every month, so scores are relative
    to that month's documents only. The input DataFrame is never modified: any
    datetime/boolean conversion is done on local copies of the two columns.

    Args:
        dataframe (pd.DataFrame): The input DataFrame containing the data.
        category_column (str): The name of the boolean column used to filter the
                               DataFrame for the relevant category.
        date_column (str): The name of the column containing datetime objects
                           (or values convertible with ``pd.to_datetime``).
        text_column (str): The name of the column containing the text data (e.g., abstracts).
        top_n (int): The number of top keywords to extract for each month.
        boost_bigrams (float): Factor to boost the TF-IDF score of bigrams.
        boost_trigrams (float): Factor to boost the TF-IDF score of trigrams.
        max_df (float): max_df parameter for TfidfVectorizer. Ignore terms
                        that appear in more than this fraction of documents.
        min_df (float): min_df parameter for TfidfVectorizer. Passed through
                        unchanged; an int is an absolute document count, a float
                        a fraction of documents.
        ngram_range (Tuple[int, int]): ngram_range parameter for TfidfVectorizer.

    Returns:
        Tuple[List[str], Dict[Any, List[Tuple[str, float]]]]:
            - A sorted list of unique top keywords found across all months.
            - A dictionary where keys are the year-month periods and values are
              lists of (keyword, score) tuples for that month's top keywords
              (an empty list for months that could not be processed).

    Raises:
        TypeError: If ``dataframe`` is not a DataFrame, or the date/category
                   column cannot be converted to datetime/boolean.
        ValueError: If a named column is missing or ``top_n`` is not a positive int.
    """

    # --- Input Validation ---
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("Input 'dataframe' must be a pandas DataFrame.")
    if category_column not in dataframe.columns:
        raise ValueError(f"Column '{category_column}' not found in DataFrame.")
    if date_column not in dataframe.columns:
        raise ValueError(f"Column '{date_column}' not found in DataFrame.")
    if text_column not in dataframe.columns:
        raise ValueError(f"Column '{text_column}' not found in DataFrame.")

    # Convert on local Series only, so the caller's DataFrame keeps its dtypes.
    date_values = dataframe[date_column]
    if not pd.api.types.is_datetime64_any_dtype(date_values):
        try:
            date_values = pd.to_datetime(date_values)
            print(f"Warning: Column '{date_column}' converted to datetime objects.")
        except Exception as e:
            raise TypeError(
                f"Column '{date_column}' must be of datetime type or convertible to it. Error: {e}"
            ) from e

    category_mask = dataframe[category_column]
    if not pd.api.types.is_bool_dtype(category_mask):
        # Attempt conversion if possible (e.g., 0/1)
        try:
            category_mask = category_mask.astype(bool)
            print(f"Warning: Column '{category_column}' converted to boolean type.")
        except Exception as e:
            raise TypeError(
                f"Column '{category_column}' must be of boolean type or convertible to it. Error: {e}"
            ) from e

    if not isinstance(top_n, int) or top_n <= 0:
        raise ValueError("'top_n' must be a positive integer.")

    # --- Data Preparation ---
    # Only the text and the month are needed, so avoid copying every other column.
    df_filtered = dataframe.loc[category_mask, [text_column]].copy()
    df_filtered['year_month'] = date_values[category_mask].dt.to_period('M')

    # Handle potential NaNs in text column before processing
    df_filtered = df_filtered.dropna(subset=[text_column])
    df_filtered[text_column] = df_filtered[text_column].astype(str)  # Ensure text is string

    monthly_top_keywords = {}  # {year_month: [(keyword, score), ...]}

    # A term with one space is a bigram, with two spaces a trigram; anything else is unboosted.
    boost_by_space_count = {1: boost_bigrams, 2: boost_trigrams}

    # --- Group by Month and Analyze ---
    # groupby sorts its keys, so months are processed chronologically.
    for year_month, texts_this_month in df_filtered.groupby('year_month')[text_column]:

        # Skip if no valid (non-empty) texts for this month
        if texts_this_month.empty or texts_this_month.str.strip().eq('').all():
            print(f"No valid text data found for month {year_month}. Skipping.")
            monthly_top_keywords[year_month] = []
            continue

        try:
            # Fit a fresh TF-IDF vectorizer on THIS MONTH'S texts only
            vectorizer = TfidfVectorizer(
                stop_words='english',
                max_df=max_df,
                min_df=min_df,
                ngram_range=ngram_range
            )
            tfidf_matrix = vectorizer.fit_transform(texts_this_month)
        except ValueError as e:
            # Handle cases where TF-IDF might fail (e.g., all terms are stop words after filtering)
            print(f"Could not process month {year_month} with TF-IDF: {e}")
            monthly_top_keywords[year_month] = []
            continue

        # Terms learned from this month's data and their summed TF-IDF scores.
        feature_names = vectorizer.get_feature_names_out()
        # Flatten the 1 x n_terms result once instead of indexing it per term.
        term_totals = np.asarray(tfidf_matrix.sum(axis=0)).ravel()

        # Boost n-grams and keep only terms with a positive boosted score
        boosted_scores = []
        for term, score in zip(feature_names, term_totals):
            boosted_score = score * boost_by_space_count.get(term.count(' '), 1.0)
            if boosted_score > 0:
                boosted_scores.append((term, boosted_score))

        # Highest boosted score first; keep the top N for the month
        boosted_scores.sort(key=lambda pair: pair[1], reverse=True)
        monthly_top_keywords[year_month] = boosted_scores[:top_n]

    # --- Collect Unique Keywords ---
    unique_keywords_list = sorted({
        keyword
        for keywords in monthly_top_keywords.values()
        for keyword, _ in keywords
    })

    return unique_keywords_list, monthly_top_keywords

In [None]:
# Boolean domain columns for which monthly keywords are extracted, in processing order.
category_columns_to_process: List[str] = [
    "Physics",
    "Computer Science",
    "Statistics",
    "Mathematics",
    "Electrical Engineering and Systems Science",
    "Quantitative Biology",
    "Economics",
    "Quantitative Finance",
]

# Per-domain overrides for the extraction call; options not listed fall back to the
# defaults applied inside the loop below.
domain_specific_params: Dict[str, Dict[str, Any]] = {
    # Many records in Physics domain
    "Physics": {"min_df": 0.0005, "max_df": 0.6, "top_n": 100},
    # Many records in Computer Science domain
    "Computer Science": {"min_df": 0.0005, "max_df": 0.6, "top_n": 60},
    "Statistics": {"min_df": 0.04, "max_df": 0.6},
    "Mathematics": {"min_df": 0.01, "max_df": 0.6},
    # Missing keywords mid 2015 - beginning of 2017 even with very unrestrictive parameters.
    # https://info.arxiv.org/new/eess_announce.html  Electrical Engineering and Systems Science
    # archive (eess) was introduced 18 September 2017
    "Electrical Engineering and Systems Science": {"min_df": 0.04, "max_df": 0.6},
    "Quantitative Biology": {"min_df": 0.04, "max_df": 0.6},
    "Economics": {"min_df": 0.06, "max_df": 0.6},
    "Quantitative Finance": {"min_df": 0.06, "max_df": 0.6},
}

# Result container: domain name -> sorted list of its unique top keywords.
domain_keywords: Dict[str, List[str]] = {}

print("Starting keyword extraction across multiple categories...")

for category_col in category_columns_to_process:
    print(f"\nProcessing category: '{category_col}'...")

    if category_col not in df.columns:
        # The category column is absent from the frame.
        print(f"Warning: Column '{category_col}' not found in DataFrame. Skipping.")
    elif not df[category_col].any():
        # The column exists but no row is flagged for this category.
        print(f"No entries found for category '{category_col}'. Skipping.")
    else:
        try:
            # Resolve this category's options, falling back to the shared defaults.
            params = domain_specific_params.get(category_col, {})
            top_n, min_df, max_df, ngram_range = (
                params.get(option, fallback)
                for option, fallback in (
                    ("top_n", 30),
                    ("min_df", 1),
                    ("max_df", 0.8),
                    ("ngram_range", (2, 3)),
                )
            )

            unique_keywords_list, monthly_keywords_dict = extract_monthly_top_keywords(
                dataframe=df,
                category_column=category_col,
                date_column='first_date',
                text_column='abstract',
                top_n=top_n,
                min_df=min_df,
                max_df=max_df,
                ngram_range=ngram_range
            )

            # Keep only the unique keyword list; the per-month detail is not stored here.
            domain_keywords[category_col] = unique_keywords_list

            print(f"Finished processing '{category_col}'. Added {len(unique_keywords_list)} unique keywords.")
            if not unique_keywords_list:
                print(f"(No keywords met the TF-IDF criteria for '{category_col}')")

        except (ValueError, TypeError) as e:
            print(f"Error processing category '{category_col}': {e}. Skipping this category.")
        except Exception as e:
            # Best effort: one failing category must not abort the remaining ones.
            print(f"An unexpected error occurred while processing category '{category_col}': {e}. Skipping this category.")

print("\n--- Keyword Extraction Complete ---")

# Summary: number of keywords collected per domain
for domain, keywords in domain_keywords.items():
    print(f"Domain: {domain}, Total Keywords: {len(keywords)}")

Creating Checkpoint

In [None]:
# Checkpoint a: save the extraction results as JSON

# Per-domain keyword lists.
with open('domain_keywords.json', mode='w') as handle:
    json.dump(obj=domain_keywords, fp=handle, indent=4)

# The label arrays are converted to plain lists so they can be serialized.
# NOTE(review): `unique_keywords_list` holds only the value left by the most recent
# extraction call (the last category processed), not a union over all categories -
# confirm that this is intended.
export_data = dict(
    unique_domains=unique_domains.tolist(),
    unique_areas=unique_areas.tolist(),
    unique_keywords_list=unique_keywords_list,
)

with open("unique_data.json", mode="w") as handle:
    json.dump(obj=export_data, fp=handle, indent=4)

In [None]:
# Checkpoint b: restore the dataframe and the saved extraction results

df = pd.read_parquet(path="checkpoint_lemmatized.parquet", engine="pyarrow")

with open('domain_keywords.json', mode='r') as handle:
    domain_keywords = json.load(handle)

with open("unique_data.json", mode="r") as handle:
    imported_data = json.load(handle)

# The label collections go back to NumPy arrays; the keyword list stays a plain list.
unique_domains, unique_areas = (
    np.array(imported_data[key]) for key in ("unique_domains", "unique_areas")
)
unique_keywords_list = imported_data["unique_keywords_list"]

#### Keyword Cleaning

In [None]:
# Flatten {domain: [keyword, ...]} into one row per (domain, keyword) pair.
df_data = []

for domain, keyword_list in domain_keywords.items():
    if not isinstance(keyword_list, list):
        print(f"  - Warning: Keywords for domain '{domain}' is not a list. Skipping this domain for DataFrame.")
        continue
    if not keyword_list:
        # Domains without keywords contribute no rows at all.
        print(f"  - No keywords for domain '{domain}'. This domain will have no entries in DataFrame.")
        continue
    # One record per keyword, associating it with its domain
    df_data.extend({'domain': domain, 'keyword': keyword} for keyword in keyword_list)

# Naming the columns explicitly keeps 'domain' and 'keyword' present even when no
# rows were collected, so later column lookups do not fail on an empty result.
domain_keywords_df = pd.DataFrame(df_data, columns=['domain', 'keyword'])

In [None]:
# --- Remove Generic Phrases ---
print("\n--- Removing Generic Phrases ---")
# Phrases that carry no topical information; keywords equal to one of these
# (compared in lower case) are dropped. Duplicates in the list are harmless.
# Every entry needs its own trailing comma: adjacent string literals without a
# comma are silently concatenated into a single, never-matching phrase.
generic_phrases_to_remove = [
    'et al', 'results indicate', 'results suggest', 'results demonstrate',
    'paper propose', 'paper present', 'work propose', 'introduce novel', 'propose new',
    'experimental results', 'numerical results', 'simulation results',
    'demonstrate effectiveness', 'effectiveness proposed', 'proposed method',
    'proposed approach', 'proposed model', 'proposed algorithm', 'proposed scheme',
    'recent years', 'widely used', 'commonly used', 'wide range', 'large number',
    'based on', 'compared to', 'address problem', 'paper deals', 'paper investigates',
    'paper study', 'study investigates', 'study shows', 'study aims', 'aim paper',
    'demonstrate proposed', 'results proposed', 'findings indicate', 'findings suggest',
    'existing methods', 'existing approaches', 'existing literature', 'previous work',
    'previous studies', 'recent work', 'recent studies', 'recent developments',
    'provide evidence', 'paper provide', 'study provides', 'present paper',
    'main result', 'key role', 'important role', 'crucial role', 'significant role',
    'better understanding', 'deeper understanding', 'valuable insights', 'shed light',
    'future research', 'possible future research', 'directions future research',
    'case study', 'numerical examples', 'experimental data', 'real data', 'real world',
    'publicly available', 'code available','good agreement', 'high accuracy', 'better performance',
    'superior performance', 'computational cost', 'computational complexity', 'computationally efficient',
    'state art', 'stateoftheart methods','stateoftheart performance','results obtained', 'results confirm',
    'results reveal', 'address challenges', 'address issue', 'address gap',
    'apply method', 'apply results', 'approach based', 'method based', 'framework based',
    'consider problem', 'develop novel', 'develop framework',
    'empirical evidence', 'empirical application', 'empirical analysis',
    'establish existence', 'evaluate performance', 'examine potential',
    'explain sustainability', 'explore key', 'extract important',
    'findings emphasize', 'focus specifically', 'gain insights',
    'highlight importance', 'illustrate method', 'implementable pricebased',
    'improve performance', 'increase probability', 'investigate effect',
    'make use', 'obtain new', 'outperforms existing',
    'play crucial role', 'plays important role', 'present comprehensive',
    'provide new', 'purpose paper', 'purpose research',
    'showcase potential', 'solve problem', 'study aimed', 'study demonstrates',
    'study examines', 'suggests evaluate', 'theoretical analysis', 'theoretical results',
    'understand relationship', 'use cases', 'using data', 'using numerical', 'et al phys',
    'data used', 'question answering', 'study propose', 'use case', 'present new',
    'evidence suggests', 'model used', 'plays crucial', 'plays crucial role', 'present novel',
    'propose novel', 'paves way', 'significant challenge', 'outperforms state-of-the-art', 'introduces novel',
    'prove existence', 'new results', 'previous results', 'new examples', 'new method',
    'novel approach', 'similar results', 'open problem', 'new proof', 'recent results',
    'closely related', 'previously known', 'recently introduced', 'new approach', 'present new',
    'gives rise', 'model based', 'extensive experiments', 'upper bound', 'recently proposed',
    'present study', 'standard model', 'work present', 'pave way', 'present results', 'present work',
    'provide valuable insights', 'results provide', 'results pave way', 'standard model sm', 'taken account',
    'taking account', 'best practices', 'challenging task', 'demonstrate effectiveness proposed', 'demonstrate proposed method',
    'effectiveness proposed approach', 'effectiveness proposed model', 'effectiveness proposed method',
    'novel method', 'findings reveal', 'experimental results demonstrate', 'experimental results proposed', 'experiments conducted', 'experiments demonstrate',
    'extensive experimental results', 'extensive experiments conducted', 'extensive experiments demonstrate', 'extensive experiments realworld', 'paper consider',
    'paper explores', 'paper introduce novel', 'paper introduces novel', 'paper present novel', 'paper presents comprehensive',
    'paper presents new', 'paper presents novel', 'paper propose new', 'paper propose novel', 'paper proposes novel',
    'result suggest', 'result demonstrate', 'result indicate', 'result obtained', 'result pave way', 'result provide', 'result suggest',
    'address challenge', 'address challenge propose', 'address issue paper', 'address issue propose', 'address limitation propose',
    'existing approach', 'existing method', 'experimental research data', 'experimental result', 'experimental result demonstrate',
    'experimental result proposed', 'extensive experiment', 'paper address problem', 'paper describes', 'paper introduce',
    'paper introduce new', 'paper investigate', 'paper present new', 'paper study problem', 'present novel approach',
    'research performance', 'result demonstrate', 'result demonstrate proposed', 'result obtained', 'result proposed',
    'result proposed method', 'data set', 'data set different', 'data point', 'data based',
    'experimental result', 'data collection', 'introduce new', 'paper aim', 'paper analysis',
    'paper analyze', 'paper deal', 'paper introduces', 'paper investigate', 'paper model approximate',
    'paper provides', 'recent advance', 'recent development', 'recent year', 'recently used',
    'significantly outperforms', 'special case', 'open question', 'stateoftheart method', 'provide example',
    'provide explicit', 'provide insight', 'provide new proof', 'recent advancement', 'recent paper',
    'recent result', 'result hold', 'result numerical example', 'result obtained', 'result paper',
    'study conducted', 'study investigate', 'study investigated', 'study present', 'study reported',

    # Synonym / Incomplete expressions section
    'deep neural', 'large language', 'neural networks', 'data sets', 'machine learning algorithms',
    'models llms', 'convolutional neural', 'van der', 'der waals', 'deep neural', 'language models',
    'intelligence ai', 'deep learning models', 'learning models', 'resonance imaging', 'magnetic resonance',
    'network cnn', 'neural network cnn', 'learning dl', 'markov chain monte', 'chain monte carlo',
    'monte carlo mcmc', 'chain monte', 'carlo mcmc', 'natural language', 'quantum manybody',
    'manybody systems', 'galactic nuclei agn', 'galactic nuclei', 'active galactic', 'nuclei agn',
    'quantum information', 'information processing', 'carlo simulations', 'md simulations', 'blood glucose values',
    'computer vision tasks', 'datasets demonstrate', 'deep learningbased', 'deep reinforcement', 'learning model',
]

def remove_keywords_with_prefixes(df, column, prefixes):
    """
    Removes rows from the DataFrame where the specified column's values start with any of the given prefixes.

    The comparison is case-insensitive and treats each prefix as literal text
    (regex metacharacters are escaped). Rows whose value is missing are kept.

    Args:
        df (pd.DataFrame): The DataFrame to process.
        column (str): The column to check for prefixes.
        prefixes (list): A list of prefixes to check. An empty list removes nothing.

    Returns:
        pd.DataFrame: A new DataFrame with the filtered rows.
    """
    literal_prefixes = [re.escape(str(prefix).lower()) for prefix in prefixes]
    if not literal_prefixes:
        # An empty alternation would match every value and drop all rows.
        return df.copy()

    pattern = f"^(?:{'|'.join(literal_prefixes)})"
    # na=False keeps the mask strictly boolean so that `~` works when values are missing.
    has_prefix = df[column].str.lower().str.match(pattern, na=False)
    return df[~has_prefix]


# Step 1: drop keywords that are exactly one of the generic phrases (lower-case comparison).
count_before_generic_removal = len(domain_keywords_df)
lowercase_keywords = domain_keywords_df['keyword'].str.lower()
mask = ~lowercase_keywords.isin(generic_phrases_to_remove)
refined_df = domain_keywords_df.loc[mask].copy()
print(f"Keyword count after removing generic phrases: {len(refined_df)} (Removed {count_before_generic_removal - len(refined_df)})")

# Step 2: drop keywords that start with one of the listed prefixes.
count_before_prefix_removal = len(refined_df)
prefixes_to_remove = [
    'result', 'paper', 'provide', 'recent', 'address',
    'model ', 'method ', 'significantly',
    'consider ', 'considers ', 'considered ',
]
refined_df = remove_keywords_with_prefixes(refined_df, 'keyword', prefixes_to_remove)
print(f"Keyword count after removing prefixes: {len(refined_df)} (Removed {count_before_prefix_removal - len(refined_df)})")

# --- Display final counts ---
print("\nFinal keyword counts per domain:")
print(refined_df['domain'].value_counts())

#### Keyword Visualization

##### Overall in categories

In [None]:
# Hex colours from which the word-cloud colours are drawn.
custom_palette_hex = [
    "#89043d",
    "#8a817c",
    "#bcb8b1",
    "#f4f3ee",
    "#e0afa0",
]


def palette_color_func(word, **kwargs):
    """Return a randomly chosen palette colour; ``word`` and the keyword arguments are ignored."""
    return random.choice(custom_palette_hex)

# Maps each domain to {keyword: total number of occurrences in that domain's abstracts}.
domain_keyword_frequencies: Dict[str, Dict[str, int]] = {}

# Loop through unique domains in the refined keyword DataFrame
for domain_name in refined_df['domain'].unique():
    print(f"\nProcessing: '{domain_name}'...")

    # Get refined keywords for this domain
    refined_keywords_list = refined_df[refined_df['domain'] == domain_name]['keyword'].tolist()

    # Filter original DataFrame for abstracts of this domain
    # (df is expected to hold one boolean indicator column per domain name).
    category_mask = df[domain_name] == True
    # Fill missing abstracts BEFORE casting to str: astype(str) would first turn
    # NaN into the literal text 'nan', making the subsequent fillna a no-op.
    category_abstracts = df.loc[category_mask, "abstract"].fillna('').astype(str)

    if category_abstracts.empty or not refined_keywords_list:
        print(f"  - Skipping '{domain_name}' (no abstracts or keywords).")
        continue

    # Calculate frequencies
    frequencies = {}
    lower_abstracts_series = category_abstracts.str.lower()
    for keyword in refined_keywords_list:
        # Series.str.count always interprets its pattern as a regular expression,
        # so the keyword is escaped to be matched literally. Note this counts
        # plain substring occurrences (no word-boundary check).
        escaped_keyword = re.escape(keyword.lower())
        count = lower_abstracts_series.str.count(escaped_keyword).sum()
        if count > 0:
            # Cast to a built-in int so the mapping stays JSON-serialisable.
            frequencies[keyword] = int(count)

    # Stored even when empty, so every processed domain has an entry.
    domain_keyword_frequencies[domain_name] = frequencies

    if not frequencies:
        print(f"  - Skipping '{domain_name}' (no keywords found in abstracts).")
        continue

    print(f"  - Generating word cloud for '{domain_name}'...")

    # Generate and display Word Cloud
    wordcloud_generator = WordCloud(
        width=1200,
        height=600,
        background_color='black',
        collocations=False,
        min_font_size=10,
        #colormap='magma',
        color_func=palette_color_func,
        random_state=None
    ).generate_from_frequencies(frequencies)

    plt.figure(figsize=(16, 8))
    plt.imshow(wordcloud_generator, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'Domain: {domain_name}', fontsize=18)
    plt.tight_layout(pad=0)
    plt.show()

In [None]:
# Persist the per-domain keyword frequencies as pretty-printed JSON
# so they can be reloaded without recomputing the counts.
with open('domain_keyword_frequencies.json', 'w') as json_file:
    json_file.write(json.dumps(domain_keyword_frequencies, indent=4))

# To restore the mapping from disk instead of recomputing it:
# with open('domain_keyword_frequencies.json', 'r') as json_file:
#     domain_keyword_frequencies = json.load(json_file)

In [None]:
# Remove unnecessary local variables
# Each name is deleted independently: with a single chained `del`, the first
# undefined name raises NameError and every name after it is left in memory.
# NOTE: `custom_palette_hex` is intentionally NOT deleted, because
# `palette_color_func` reads it at call time and is still used by later
# word-cloud cells.
_missing_names = []
for _name in (
    'category_abstracts', 'category_mask', 'count',
    'count_before_generic_removal', 'count_before_prefix_removal',
    'df_data', 'domain', 'escaped_keyword',
    'frequencies', 'generic_phrases_to_remove', 'imported_data', 'json_file',
    'keyword', 'keywords', 'lower_abstracts_series', 'mask', 'prefixes_to_remove',
    'refined_keywords_list', 'wordcloud_generator',
):
    try:
        del globals()[_name]
    except KeyError:
        _missing_names.append(_name)

if _missing_names:
    print(f"Variables not found (skipped): {', '.join(_missing_names)}")
else:
    print("All specified variables deleted successfully.")

# Do not leave the bookkeeping names behind either.
del _missing_names, _name

##### Category specific - DEPRECATED - 

In [None]:
# Filter out keywords that are frequent (>= 20 occurrences) in more than one domain.

# 1. Flatten {domain: {keyword: frequency}} into one row per (domain, keyword) pair.
#    Explicit columns keep the frame well-formed (and the filter below working)
#    even when there are no frequency entries at all.
freq_df = pd.DataFrame(
    [
        {'domain': dom, 'keyword': kw, 'frequency': fq}
        for dom, kw_fq_dict in domain_keyword_frequencies.items()
        for kw, fq in kw_fq_dict.items()
    ],
    columns=['domain', 'keyword', 'frequency'],
)

# 2. Identify keywords that are frequent (>= 20) in more than one domain
domain_counts = freq_df[freq_df['frequency'] >= 20].groupby('keyword')['domain'].nunique()
keywords_to_exclude = domain_counts[domain_counts > 1].index

# 3. Filter the original refined_df to exclude these keywords
domain_specific_keywords_df = refined_df[~refined_df['keyword'].isin(keywords_to_exclude)].reset_index(drop=True)

In [None]:
# Draw one word cloud per domain using only the domain-specific keywords,
# sized by the frequencies computed earlier in `domain_keyword_frequencies`.
for domain_name in domain_specific_keywords_df['domain'].unique():
    print(f"\nProcessing: '{domain_name}'...")

    # Get the set of domain-specific keywords for this domain
    specific_keywords_set = set(domain_specific_keywords_df[
        domain_specific_keywords_df['domain'] == domain_name
    ]['keyword'])

    # Get the original frequencies for ONLY these specific keywords
    # Check if the domain exists in the original frequency dictionary
    if domain_name not in domain_keyword_frequencies:
        print(f"  - Skipping '{domain_name}' (no original frequencies found).")
        continue

    # Filter the original frequencies for the current domain's specific keywords
    frequencies_for_cloud = {
        keyword: freq
        for keyword, freq in domain_keyword_frequencies[domain_name].items()
        if keyword in specific_keywords_set and freq > 0
    }

    # WordCloud.generate_from_frequencies raises on an empty mapping, so skip
    # domains where nothing is left after filtering (mirrors the guard used
    # when the overall per-domain clouds are drawn).
    if not frequencies_for_cloud:
        print(f"  - Skipping '{domain_name}' (no domain-specific keywords with frequencies).")
        continue

    print(f"  - Generating word cloud for '{domain_name}' with {len(frequencies_for_cloud)} specific keywords...")

    # Generate and display Word Cloud using the filtered frequencies
    wordcloud_generator = WordCloud(
        width=1200,
        height=600,
        background_color='black',
        collocations=False,
        min_font_size=10,
        color_func=palette_color_func,
        random_state=None
    ).generate_from_frequencies(frequencies_for_cloud)

    # Display Plot
    plt.figure(figsize=(16, 8))
    plt.imshow(wordcloud_generator, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'Domain: {domain_name}', fontsize=18)
    plt.tight_layout(pad=0)
    plt.show()

## Features and Final Dataframe for Clustering

In [None]:
# Maximum number of highest-frequency keywords kept per domain when the
# keyword feature vocabulary is built.
TOP_N_KEYWORDS_PER_DOMAIN = 400
# Seed value; not referenced in the nearby cells — presumably consumed by
# later randomised steps (TODO confirm).
RANDOM_STATE = 42

In [None]:
# ----- Keyword Selection -----

# domain -> its highest-frequency keywords (at most TOP_N_KEYWORDS_PER_DOMAIN each)
domain_top_keywords_map: Dict[str, List[str]] = {}
# Union of every domain's top keywords; becomes the vectorizer vocabulary
final_keyword_list_set = set()

# Only proceed when the frequency dictionary is defined and non-empty
if 'domain_keyword_frequencies' in locals() and domain_keyword_frequencies:
    for domain_name, keyword_freqs in domain_keyword_frequencies.items():
        print(f"  Processing domain: {domain_name}")

        if keyword_freqs:
            # Keep only numeric frequencies, then rank them from most to least frequent
            valid_items = [
                (kw, fq)
                for kw, fq in keyword_freqs.items()
                if isinstance(fq, (int, float))
            ]
            sorted_keywords = sorted(valid_items, key=lambda pair: pair[1], reverse=True)

            # The first N entries of the ranking are this domain's top keywords
            top_n_for_domain = [pair[0] for pair in sorted_keywords[:TOP_N_KEYWORDS_PER_DOMAIN]]
            print(f"    Selected {len(top_n_for_domain)} keywords for '{domain_name}'.")

            # Record them per domain and in the global union
            domain_top_keywords_map[domain_name] = top_n_for_domain
            final_keyword_list_set.update(top_n_for_domain)
        else:
            print(f"    No keywords/frequencies found for domain '{domain_name}'.")
            domain_top_keywords_map[domain_name] = []  # Domain keeps an (empty) entry

    # Sorted so that the feature column order is reproducible between runs
    final_keyword_list = sorted(final_keyword_list_set)
    print("\nCreated map 'domain_top_keywords_map' with top keywords per domain.")
    print(f"Created global list 'final_keyword_list' with {len(final_keyword_list)} unique keywords across all domains (for vectorizer).")
else:
    print("Error: 'domain_keyword_frequencies' dictionary not found or empty. Cannot select top keywords.")
    final_keyword_list = []  # Keeps downstream cells runnable

In [None]:
# ----- Create Binary Keyword Features -----
# Adds one `kw_<keyword>` column per entry of final_keyword_list to df, holding
# 1 when the keyword (as a word n-gram) occurs in the abstract and 0 otherwise.
# `keyword_cols_created` lists the added columns; it is left empty when nothing
# could be created.

if not final_keyword_list:
    print("Warning: final_keyword_list is empty. No keyword features will be created.")
    keyword_cols_created = []
else:
    print(f"\n--- Creating Binary Features for {len(final_keyword_list)} Keywords ---")

    # Fill missing abstracts BEFORE casting to str: astype(str) would first turn
    # NaN into the literal text 'nan', making the subsequent fillna a no-op.
    lower_abstracts = df['abstract'].fillna('').astype(str).str.lower()

    # --- Configure TfidfVectorizer ---
    # The n-gram range must reach the longest multi-word keyword, otherwise
    # longer keywords could never be matched. (final_keyword_list is known to
    # be non-empty in this branch, so max() is safe.)
    max_ngram_length = max(len(kw.split()) for kw in final_keyword_list)

    print(f"  Configuring TfidfVectorizer (max_ngram={max_ngram_length})...")
    # binary=True with use_idf=False and norm=None reduces TF-IDF to plain
    # presence/absence indicators restricted to the fixed vocabulary.
    vectorizer = TfidfVectorizer(
        vocabulary=final_keyword_list,
        lowercase=True,
        binary=True,
        use_idf=False,
        norm=None,
        ngram_range=(1, max_ngram_length)
    )

    print("  Applying vectorizer to abstracts...")
    X_keywords_sparse = vectorizer.fit_transform(lower_abstracts)
    print(f"  Vectorizer finished. Output shape (sparse): {X_keywords_sparse.shape}")

    # --- Create DataFrame from Sparse Matrix ---
    feature_names = vectorizer.get_feature_names_out()
    keyword_cols_created = [f'kw_{name}' for name in feature_names] # These are the columns added
    print(f"  Creating DataFrame from sparse matrix for {len(keyword_cols_created)} keyword features...")
    try:
        df_keywords = pd.DataFrame.sparse.from_spmatrix(
            X_keywords_sparse,
            index=df.index,
            columns=keyword_cols_created
        )
        print("  Keyword DataFrame created.")

        # --- Concatenate with original DataFrame ---
        # Drop stale copies first so re-running the cell does not duplicate columns.
        cols_to_drop_from_df = [col for col in keyword_cols_created if col in df.columns]
        if cols_to_drop_from_df:
            print(f"  Dropping existing columns from df before concat: {cols_to_drop_from_df}")
            df = df.drop(columns=cols_to_drop_from_df)

        df = pd.concat([df, df_keywords], axis=1)
        # Convert sparse keyword columns to int (0/1) AFTER concat if needed by downstream steps
        # This might increase memory usage significantly.
        # df[keyword_cols_created] = df[keyword_cols_created].astype(int)
        print(f"Concatenated keyword features. New df shape: {df.shape}")

    except MemoryError:
        print("\nError: MemoryError encountered while creating keyword DataFrame.")
        print("Consider reducing TOP_N_KEYWORDS_PER_DOMAIN or using algorithms accepting sparse input.")
        keyword_cols_created = [] # Prevent errors later
    except Exception as e:
        # Best-effort by design: report the problem and continue without keyword features.
        print(f"\nAn unexpected error occurred creating keyword DataFrame: {e}")
        keyword_cols_created = []

In [None]:
# ----- Final Feature Lists -----

# --- Metadata Features ---
# Author count followed by one entry per unique area name.
metadata_features = ['number_of_authors', *unique_areas]

# --- Keyword Features ---
# Already available as `keyword_cols_created`.

In [None]:
# Densify every sparse column before the DataFrame is written to Parquet.
# The dtype is checked with isinstance(..., pd.SparseDtype) because
# pd.api.types.is_sparse is deprecated in recent pandas releases.
for col_name in df.columns:
    if isinstance(df[col_name].dtype, pd.SparseDtype):
        df[col_name] = df[col_name].sparse.to_dense()

df.to_parquet("checkpoint_with_keywords.parquet", engine="pyarrow", index=False)

# Notebook state needed to resume from this checkpoint. NumPy arrays and the
# keyword set are converted to plain lists before pickling.
variables_to_export = {
    "domain_top_keywords_map": domain_top_keywords_map,
    "final_keyword_list": final_keyword_list,
    "final_keyword_list_set": list(final_keyword_list_set),
    "keyword_cols_created": keyword_cols_created,
    "metadata_features": metadata_features,
    "unique_areas": unique_areas.tolist(),
    "unique_domains": unique_domains.tolist(),
    "unique_keywords_list": unique_keywords_list,
}

with open("checkpoint_variables.pkl", "wb") as f:
    pickle.dump(variables_to_export, f)