### Open file

In [1]:
import pandas as pd
from nltk.corpus import stopwords
import re
import pandas as pd
import re
import unicodedata
import ftfy  
from langdetect import detect
import numpy as np 

# Load the metadata sample file with an explicit dtype for every listed column

metadata = pd.read_csv(
    'purposes_metadata_final.csv',
    dtype={
        # columns read as pandas 'string' (publish_time is parsed to datetime later)
        **dict.fromkeys(
            [
                'cord_uid', 'sha', 'source_x', 'title', 'doi', 'pmcid',
                'license', 'abstract', 'publish_time', 'authors', 'journal',
                'mag_id', 'who_covidence_id', 'arxiv_id', 'pdf_json_files',
                'pmc_json_files', 'url', 's2_id', 'JournalName_DOI', 'tags',
            ],
            'string',
        ),
        # nullable integer dtype, so missing ids are allowed
        'pubmed_id': 'Int64',
        # plain int64: these columns cannot hold missing values
        'referenced_by_count': 'int64',
        'TagCount': 'int64',
    },
    encoding='utf-8',
    low_memory=False,
)

print(metadata.shape[0])

9318


### Clean and normalise

In [2]:

# Keep only the first row seen for each cord_uid
metadata = metadata.drop_duplicates(subset='cord_uid')
# Row count used as the baseline in the stats printed at the end of this cell
# (note: it is taken after de-duplication)
original_rows = metadata.shape[0]

# Fix encoding
def fix_encoding(text):
    """Run ``ftfy.fix_text`` on ``text``.

    Missing values and strings that are empty or whitespace-only are
    returned unchanged without calling ftfy.
    """
    nothing_to_fix = pd.isna(text) or not text.strip()
    return text if nothing_to_fix else ftfy.fix_text(text)

# Repair the encoding of each free-text column in turn
for text_col in ('title', 'abstract', 'authors', 'journal'):
    metadata[text_col] = metadata[text_col].apply(fix_encoding)

# Language detection
def is_english(text):
    """Return True when ``text`` is detected as English.

    Missing, empty or whitespace-only values return True, so a row is never
    rejected only because a field is absent. Any error raised by ``detect``
    is treated as "not English" and returns False.

    NOTE(review): langdetect is documented as non-deterministic unless its
    detector seed is fixed -- confirm whether that matters for this filter.
    """
    if pd.isna(text) or text.strip() == "":
        return True
    try:
        return detect(text) == 'en'
    except Exception:
        # Best-effort: detection failures count as non-English. Catching
        # Exception rather than using a bare except lets KeyboardInterrupt and
        # SystemExit propagate, so a long run can still be interrupted.
        return False

# Flag each row's title and abstract, then keep rows where both flags are True
for source_col, flag_col in (('title', 'title_is_en'), ('abstract', 'abstract_is_en')):
    metadata[flag_col] = metadata[source_col].apply(is_english)
metadata = metadata[metadata[['title_is_en', 'abstract_is_en']].all(axis=1)]
english_rows = metadata.shape[0]

# Clean text with whitelist
def clean_text(text):
    """Strip every character outside a small ASCII whitelist from ``text``.

    Missing values are returned unchanged. Otherwise the text is
    NFC-normalised, ``$\\alpha$`` is rewritten as the word ``alpha``, all
    characters outside the whitelist are deleted, and the result is stripped
    of leading/trailing whitespace.

    Non-ASCII letters (for example accented characters) are not on the
    whitelist and are removed, not transliterated.
    """
    if pd.isna(text):
        return text
    text = unicodedata.normalize('NFC', text)
    # Handle common LaTeX (optional)
    text = re.sub(r'\$\s*\\alpha\s*\$', 'alpha', text, flags=re.IGNORECASE)
    # Whitelist: a-z, A-Z, 0-9, whitespace, period, comma, colon, semicolon,
    # slash, parentheses and hyphen. The hyphen is escaped and placed last so
    # it is a literal character and cannot form an accidental range.
    text = re.sub(r'[^a-zA-Z0-9\s.,:;/()\-]', '', text)
    # Lowercasing is intentionally not done here; it happens in the later
    # preprocessing step.
    return text.strip()

# Apply the character whitelist to each free-text column
for text_col in ('title', 'abstract', 'authors', 'journal'):
    metadata[text_col] = metadata[text_col].apply(clean_text)

# Standardize datetime (values that cannot be parsed become NaT)
# NOTE(review): if publish_time mixes full dates with year-only values, some
# pandas versions infer one format and coerce the rest to NaT -- check the NaT count.
metadata['publish_time'] = pd.to_datetime(metadata['publish_time'], errors='coerce')

# Replace missing abstracts with title.
# Cleaning can reduce an abstract to an empty string, which is not NA and would
# be skipped by a plain fillna (and would come back as NaN once the saved CSV is
# re-read), so blank abstracts are treated as missing too.
abstract_is_blank = metadata['abstract'].fillna('').str.strip().eq('')
metadata['abstract'] = metadata['abstract'].mask(abstract_is_blank, metadata['title'])

# Drop temp columns
metadata = metadata.drop(columns=['title_is_en', 'abstract_is_en'])

# Log stats
print(f"Original rows: {original_rows}")
print(f"Rows after English filter: {english_rows}")
print(f"Rows after cleaning: {len(metadata)}")

# Save
metadata.to_csv('metadata_cut_clean.csv', index=False, encoding='utf-8')

Original rows: 9318
Rows after English filter: 9293
Rows after cleaning: 9293


### Preprocess

In [3]:
# Normalise a text value: lowercase it and collapse whitespace runs
def preprocess_text(text):
    """Lowercase ``text`` and join its whitespace-separated tokens with single spaces.

    Values that are not ``str`` (numbers, NaN, None, ...) are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    tokens = text.lower().split()
    return ' '.join(tokens)

# preprocess metadata file
# Re-read the cleaned file written by the cleaning step
file_path = "metadata_cut_clean.csv"
metadata = pd.read_csv(file_path)

# Lowercase and collapse whitespace in every object-dtype column
# NOTE(review): this touches all object columns, which may include identifier,
# URL and file-path columns -- confirm lowercasing those is intended.
for col in metadata.select_dtypes(include='object').columns:
    metadata[col] = metadata[col].apply(preprocess_text)

# Remove a leading "abstract" label from the abstract column.
# The label is removed only when "abstract" is a whole word at the very start
# (not followed by a letter, digit, underscore or hyphen), together with any
# separator characters after it. Abstracts that merely begin with a longer word
# such as "abstracts" or "abstraction" are left untouched, and missing
# abstracts (NaN) stay NaN instead of raising.
metadata['abstract'] = metadata['abstract'].str.replace(
    r'^abstract(?![\w-])[\s:.]*', '', regex=True
)

# Save the cleaned data to a new CSV file
output_file_path = "preprocessed_metadata.csv"
metadata.to_csv(output_file_path, index=False)

# preprocess topics / queries file
# Load the original CSV file
file_path = "topics-rnd3.csv"
df = pd.read_csv(file_path)

# Apply preprocessing to all text columns
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].apply(preprocess_text)

# Save the cleaned data to a new CSV file
output_file_path = "preprocessed_topics.csv"
df.to_csv(output_file_path, index=False)

In [3]:
# # lowercase text

# metadata['title'] = metadata['title'].str.lower()
# metadata['abstract'] = metadata['abstract'].str.lower()



# # optional - remove stopwords

# from nltk.corpus import stopwords
# # import nltk
# # nltk.download('stopwords')
# stop_words = set(stopwords.words('english'))
# metadata['abstract_no_stop'] = metadata['abstract'].apply(
#     lambda x: ' '.join(word for word in x.split() if word not in stop_words)
# )