#### NLTK Text Pre-processing:

In [1]:
import os
import json
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from gensim.models import Word2Vec
from _cpwords import compound_keywords

In [2]:
def preprocess_text(text):
    """Lowercase, tokenize and filter raw text into per-sentence token lists.

    Processing order:
      1. Lowercase the whole text.
      2. Apply every ``original -> compound`` entry of ``compound_keywords``
         as a plain substring replacement.
      3. Split into sentences, then into word tokens (NLTK tokenizers).
      4. Per token: drop it if it is a stop word, lemmatize it, strip
         punctuation, then drop it if the result is a stop word, consists
         only of digits, or is not between 2 and 20 characters long.

    Args:
        text (str): Raw document text.

    Returns:
        list[list[str]]: One list of kept tokens per detected sentence. A
        sentence in which no token survives yields an empty list.
    """
    # English stop words plus corpus-specific boilerplate terms.
    stop_words = set(stopwords.words('english'))
    # 'introduct' is a stemmed form; stemming is not applied in this function,
    # so the unstemmed 'introduction' is listed as well to actually match.
    remove_terms = {'introduct', 'introduction', 'literature', 'review', 'figure', 'doi', 'fig',
                    'table', 'conclusion', 'altimg', 'gif', 'png', 'discussion',
                    'acknowledgment', 'appendix', 'http', 'copyright'}
    stop_words.update(remove_terms)

    lemmatizer = WordNetLemmatizer()

    # Lowercase first so the compound-keyword substitutions and the stop-word
    # lookups operate on normalized text.
    text = text.lower()
    # NOTE(review): str.replace matches substrings anywhere, including inside
    # longer words -- confirm the keys of compound_keywords make that safe.
    for original, compound in compound_keywords.items():
        text = text.replace(original, compound)

    sentences = []
    for sentence in sent_tokenize(text):
        filtered_words = []
        for token in word_tokenize(sentence):
            # Check the raw token BEFORE lemmatizing: the WordNet lemmatizer
            # rewrites some stop words into non-stop-word lemmas (for example
            # "was" -> "wa", "has" -> "ha"), which would otherwise slip
            # through the post-lemmatization check below.
            if token.lower() in stop_words:
                continue

            word = lemmatizer.lemmatize(token)

            # Keep only word characters (letters, digits, underscore),
            # whitespace and hyphens; everything else is stripped.
            word = re.sub(r'[^\w\s\-]', '', word)

            # Second stop-word check catches tokens that only become a stop
            # term after lemmatization/cleanup (e.g. "tables" -> "table").
            if word.lower() not in stop_words and not word.isdigit() and 1 < len(word) <= 20:
                filtered_words.append(word)

        sentences.append(filtered_words)

    return sentences

def process_json_files(directory):
    """Build a tokenized corpus from every ``.json`` file in ``directory``.

    Each file is parsed as JSON and its text is read from
    ``data['full-text-retrieval-response']['originalText']``. The text is
    passed through ``preprocess_text`` and the resulting sentences are
    concatenated across files.

    Files are visited in sorted name order so that the corpus order is the
    same on every run and platform. Files that do not contain a string at the
    expected location are skipped with a printed message instead of aborting
    the whole run.

    Args:
        directory (str): Path of the folder containing the JSON files.

    Returns:
        list[list[str]]: Token lists for all sentences of all processed files.

    Raises:
        OSError: If ``directory`` or a file in it cannot be read.
        json.JSONDecodeError: If a ``.json`` file is not valid JSON.
    """
    all_sentences = []

    # os.listdir returns entries in arbitrary order; sort for a stable corpus.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.json'):
            continue

        file_path = os.path.join(directory, filename)
        # Only parsing needs the open handle; close it before the (slow)
        # text preprocessing starts.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        try:
            text = data['full-text-retrieval-response']['originalText']
        except (KeyError, TypeError):
            print(f"Skipping {file_path}: missing 'full-text-retrieval-response' / 'originalText'")
            continue

        if not isinstance(text, str):
            print(f"Skipping {file_path}: 'originalText' is not a string")
            continue

        all_sentences.extend(preprocess_text(text))

    return all_sentences

#### Train and Save Word2Vec Model:

In [4]:
directory = 'papers_json'

# Training hyperparameters, collected in one place so they are easy to tune.
# NOTE: with more than one worker thread gensim does not guarantee
# bit-identical models between runs.
word2vec_params = dict(
    vector_size=300,  # dimensionality of each word vector
    window=20,        # max distance between a target word and its context words
    min_count=2,      # words seen fewer times than this are left out of the vocabulary
    workers=4,        # training threads
)

# Tokenize every paper under `directory` and train the embedding on the result.
model = Word2Vec(sentences=process_json_files(directory), **word2vec_params)

In [None]:
# Persist the trained model to disk. This cell needs `model` from the training
# cell to exist in the current kernel; on a fresh kernel run that cell first.
model_save_path = "word2vec_model.model"
model.save(model_save_path)

NameError: name 'model' is not defined

#### Model Examination:

In [None]:
import os
import json
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from gensim.models import Word2Vec
from _cpwords import compound_keywords

# Reload the previously saved model from the working directory so the
# examination cells can run without retraining. The file name is the same
# literal the save cell writes to.
model = Word2Vec.load("word2vec_model.model")
print("Model loaded successfully.")

Model loaded successfully.


In [6]:
def print_similar_words(keyed_vectors, query, topn):
    """Print and return the ``topn`` nearest neighbours of ``query``.

    Args:
        keyed_vectors: Word-vector store exposing ``most_similar`` and
            membership tests (here ``model.wv``).
        query (str): Word to look up.
        topn (int): Number of neighbours to request.

    Returns:
        list[tuple[str, float]]: ``(word, similarity)`` pairs as returned by
        ``most_similar``, or an empty list when ``query`` is not in the
        vocabulary.
    """
    # most_similar raises KeyError for unknown words; report it instead so one
    # missing query does not abort the rest of the cell.
    if query not in keyed_vectors:
        print(f"'{query}' is not in the model vocabulary.")
        return []

    neighbours = keyed_vectors.most_similar(query, topn=topn)
    print(f"Words similar to '{query}':")
    for word, similarity in neighbours:
        print(f"{word}: {similarity}")
    return neighbours


# Find similar words
similar_words_rl = print_similar_words(model.wv, 'rl', topn=100)
print()  # blank line between the two listings
similar_words_carbon = print_similar_words(model.wv, 'carbon', topn=20)

Words similar to 'rl':
drl: 0.8500821590423584
dqn: 0.7308868169784546
ddpg: 0.7207165360450745
single-agent: 0.7065060138702393
cmarl: 0.7046096920967102
ppo: 0.6917793154716492
maddpg: 0.6907556056976318
model-free: 0.6855440735816956
learns: 0.6803081631660461
cmarl-exne: 0.671204149723053
learned: 0.6618658304214478
ddqn: 0.6617934703826904
marl: 0.648317813873291
sac: 0.6459187865257263
learn: 0.644686758518219
q-network: 0.6402889490127563
rl-based: 0.6401153802871704
pre-trained: 0.6353899240493774
irl: 0.6348980665206909
dyna-pinn: 0.6346455216407776
agent: 0.6309409141540527
policy: 0.626416802406311
gail: 0.6236981749534607
bdq: 0.6234402656555176
q-learning: 0.622119128704071
ma-cwsc: 0.616926372051239
rl-mpc: 0.6125118732452393
exploration: 0.597885251045227
tl: 0.5975538492202759
trained: 0.5969592332839966
relbot: 0.5934699177742004
drl-based: 0.5888583660125732
sarl: 0.5883486866950989
dyna-style: 0.5852611660957336
unsl: 0.5841846466064453
pre-training: 0.57950669527053

In [7]:
# Similarity score between the vectors of two words, as computed by
# `model.wv.similarity` (gensim documents this as cosine similarity).
# Both words must be present in the model's vocabulary.
similarity_1 = model.wv.similarity('control', 'hour')
print(similarity_1)

0.08287287
