In [1]:
import os
import json
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter

# Make sure the NLTK resources used below are available locally:
#   - "stopwords": the English stop-word list loaded into `stop_words`
#   - "punkt" / "punkt_tab": tokenizer data needed by `word_tokenize`.
#     Recent NLTK releases look up "punkt_tab" instead of "punkt", so both
#     are requested to keep the script working across NLTK versions.
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("punkt_tab")

# Paths
PROCESSED_DIR = "data/processed"
INPUT_FILE = os.path.join(PROCESSED_DIR, "abstracts.json")
OUTPUT_FILE = os.path.join(PROCESSED_DIR, "cleaned_abstracts.json")

# Load data (the code below iterates `papers` and reads each item's
# "abstract" key)
with open(INPUT_FILE, "r", encoding="utf-8") as f:
    papers = json.load(f)

# Built once as a set so the per-token membership test in clean_text is O(1).
stop_words = set(stopwords.words("english"))

def clean_text(text):
    """Normalize raw text into a list of content-word tokens.

    The text is lowercased, every character other than ``a``-``z`` or
    whitespace is replaced by a space, the result is tokenized with
    ``word_tokenize``, and tokens that are English stopwords or shorter
    than three characters are dropped.

    Args:
        text: Raw text to clean. ``None`` or an empty string is accepted.

    Returns:
        The surviving tokens, in their original order (possibly empty).
    """
    # A missing/empty abstract yields no tokens instead of an AttributeError.
    if not text:
        return []
    # Lowercase
    text = text.lower()
    # Replace special characters/numbers with a space rather than deleting
    # them: deleting would fuse words joined only by punctuation or digits
    # ("human-computer" -> "humancomputer", "end.next" -> "endnext").
    text = re.sub(r"[^a-z\s]", " ", text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords + short tokens
    return [t for t in tokens if t not in stop_words and len(t) > 2]

# Apply cleaning
for paper in papers:
    # A record with a missing or null abstract gets an empty token list
    # instead of aborting the whole run with a KeyError/AttributeError.
    paper["tokens"] = clean_text(paper.get("abstract") or "")

# Save cleaned file (each record now carries an extra "tokens" list)
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(papers, f, indent=2)

print(f"✅ Cleaned {len(papers)} papers and saved to {OUTPUT_FILE}")
# Guard the preview so an empty input file does not raise IndexError
# after the output has already been written.
sample_tokens = papers[0]["tokens"][:30] if papers else []
print("Sample tokens:", sample_tokens)

# ---- Word frequency stats ----
# Flatten the per-paper token lists into one corpus-wide list.
all_tokens = [token for paper in papers for token in paper["tokens"]]
freq_dist = Counter(all_tokens)

print("\n🔎 Top 20 most common words:")
for word, count in freq_dist.most_common(20):
    print(f"{word}: {count}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...


✅ Cleaned 10 papers and saved to data/processed\cleaned_abstracts.json
Sample tokens: ['physical', 'artificial', 'intelligence', 'prove', 'one', 'important', 'challenges', 'artificial', 'intelligence', 'governance', 'physical', 'artificial', 'intelligence', 'would', 'define', 'responsible', 'intelligent', 'application', 'society']

🔎 Top 20 most common words:
artificial: 26
intelligence: 24
market: 8
manipulation: 5
learning: 4
theory: 4
proceedings: 4
proposed: 3
human: 3
conference: 3
uncertainty: 3
held: 3
physical: 2
application: 2
classification: 2
detection: 2
development: 2
directions: 2
discussed: 2
performing: 2


[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
