### Elsevier Article Retrieval API documentation: https://dev.elsevier.com/documentation/ArticleRetrievalAPI.wadl

In [1]:
import os
import json
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from gensim.models import Word2Vec
from compound_keywords import compound_keywords

#### NLTK Text Pre-processing:

In [2]:
def preprocess_text(text):
    """Turn raw article text into tokenised, normalised sentences.

    Steps, in order:
      1. Lowercase the text.
      2. For every ``(original, compound)`` pair in ``compound_keywords``,
         replace the space-separated form of ``original`` with ``compound``
         so the phrase survives tokenisation as a single token.
      3. Split into sentences, then into word tokens (NLTK tokenizers).
      4. Drop stop words, then lemmatise and Porter-stem what remains.

    Args:
        text (str): Full text of one article.

    Returns:
        list[list[str]]: One list of normalised tokens per sentence. Sentences
        whose tokens are all filtered out are kept as empty lists. Punctuation
        tokens are not removed.
    """
    # NLTK English stop words plus boilerplate section/caption words
    stop_words = set(stopwords.words('english'))
    remove_terms = {'introduction', 'literature', 'review', 'figure', 'doi', 'fig', 'table', 'conclusion', 'discussion', 'acknowledgement'}
    stop_words.update(remove_terms)

    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()

    # Lowercase, then fuse known multi-word phrases into single compound tokens
    text = text.lower()
    for original, compound in compound_keywords.items():
        text = text.replace(original.replace('_', ' '), compound)

    # Compound tokens must reach the model verbatim: Porter stemming would
    # otherwise rewrite their tail (e.g. a trailing 'y' becomes 'i'), so the
    # vocabulary entry would no longer match the keyword used in queries.
    compound_tokens = set(compound_keywords.values())

    sentences = []
    for sentence in sent_tokenize(text):
        filtered_words = []
        for token in word_tokenize(sentence):
            if token in compound_tokens:
                normalised = token
                candidates = (token,)
            else:
                lemma = lemmatizer.lemmatize(token)
                normalised = stemmer.stem(lemma)
                # Check the surface form and lemma as well as the stem: the
                # stop-word list holds unstemmed words, so testing only the
                # stem ('introduct', 'figur', 'thi', ...) lets them through.
                candidates = (token, lemma, normalised)
            if stop_words.isdisjoint(candidates):
                filtered_words.append(normalised)
        sentences.append(filtered_words)

    return sentences

def process_json_files(directory):
    """Build the training corpus from every ``*.json`` article in a directory.

    Each file must contain the article text at
    ``data['full-text-retrieval-response']['originalText']``; that text is
    passed to ``preprocess_text`` and the resulting sentences are concatenated.

    Args:
        directory (str): Folder containing the downloaded article JSON files.

    Returns:
        list[list[str]]: Tokenised sentences from all articles, in file-name
        order.

    Raises:
        OSError: If the directory or a file cannot be read.
        json.JSONDecodeError: If a ``.json`` file is not valid JSON.
        KeyError: If a file lacks the expected full-text keys.
    """
    all_sentences = []

    # os.listdir returns entries in arbitrary order; sort so the corpus (and
    # therefore anything trained on it) is built in a reproducible order.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.json'):
            continue

        file_path = os.path.join(directory, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        # The file handle is already closed here, before the slow NLP step.
        text = data['full-text-retrieval-response']['originalText']
        all_sentences.extend(preprocess_text(text))

    return all_sentences

In [3]:
# Directory holding the downloaded article JSON files.
# Set the ARTICLES_DIR environment variable to point at another location;
# when it is unset, the original local path below is used unchanged.
directory = os.environ.get(
    'ARTICLES_DIR',
    'C:/Users/wenha/OneDrive - University College London/Desktop/first_paper_code/downloaded_articles',
)

# Debug: uncomment to inspect the tokenised corpus before training
# processed_sentences = process_json_files(directory)
# print(processed_sentences)

#### Train and Save Word2Vec Model:

In [4]:
# Train Word2Vec on the tokenised sentences from every article in `directory`.
# Hyperparameters are gathered in one place so they are easy to find and tune.
w2v_params = dict(
    vector_size=100,  # dimensionality of each word vector
    window=5,         # context window size
    min_count=2,      # minimum occurrence for a token to enter the vocabulary
    workers=4,        # number of training threads
)
model = Word2Vec(sentences=process_json_files(directory), **w2v_params)

PermissionError: [Errno 13] Permission denied: 'C:/Users/wenha/OneDrive - University College London/Desktop/first_paper_code'

In [5]:
# Persist the trained model so later sessions can reload it without retraining.
# The path is relative, so the file is written to the kernel's current working
# directory.
model_save_path = "word2vec_model.model"
model.save(model_save_path)

#### Import Word2Vec Model:

In [6]:
# Re-imported here so this cell can be run on its own in a fresh kernel,
# skipping the preprocessing and training cells above.
from gensim.models import Word2Vec

# Loads the file written by the save cell (same relative file name).
# NOTE(review): gensim model files are pickle-based, so only load files from a
# trusted source.
model = Word2Vec.load("word2vec_model.model")
print("Model loaded successfully.")

Model loaded successfully.


In [7]:
# Look up the ten nearest neighbours of each query term in the embedding space.
# Both lookups run before anything is printed, so a missing vocabulary entry
# fails the cell up front. (The variable names do not match the query terms;
# they are kept unchanged so any other cell that reads them still works.)
similar_words_fabric = model.wv.most_similar('mental_health', topn=10)
similar_words_textile = model.wv.most_similar('clinic', topn=10)

# One (heading, neighbours) entry per query; each neighbour is a (word, score) pair.
neighbour_report = [
    ("Words similar to 'mental_health':", similar_words_fabric),
    ("\nWords similar to 'clinic':", similar_words_textile),
]

for heading, neighbours in neighbour_report:
    print(heading)
    for word, similarity in neighbours:
        print(f"{word}: {similarity}")

Words similar to 'mental_health':
well-b: 0.8400946259498596
dementia: 0.7953222393989563
disord: 0.7934803366661072
wellb: 0.7872560620307922
health_and_comfort: 0.7718464136123657
building-rel: 0.7596529722213745
organiz: 0.7552937865257263
mental: 0.7549329996109009
distress: 0.7459940314292908
lifestyl: 0.7452057600021362

Words similar to 'clinic':
medic: 0.8250412940979004
immunolog: 0.781755805015564
veterinari: 0.7530952095985413
dental: 0.7232619524002075
disord: 0.7215447425842285
neurolog: 0.7215095162391663
patholog: 0.7203361988067627
toxicolog: 0.719054639339447
adolesc: 0.7169308066368103
dementia: 0.7058877944946289


In [8]:
# Cosine similarity between the vectors of the two query terms.
# Both terms must be in the model vocabulary (i.e. in the same normalised form
# that preprocessing produced), otherwise the lookup fails.
similarity_1 = model.wv.similarity('mental_health', 'clinic')
print(similarity_1)

0.7024076
