In [3]:
from gensim.models import Word2Vec
import pandas as pd

In [2]:
data_train_path = "../data/cleaned_data/cleaned_data_train.csv"
data_test_path = "../data/cleaned_data/cleaned_data_test.csv"
data_dev_path = "../data/cleaned_data/cleaned_data_dev.csv"

In [8]:
def prepare_data(file_path):
    # Load your dataset
    data = pd.read_csv(file_path)
    
    # Initialize variables for storing sequences
    sentences = []
    labels = []
    sentence = []
    label_seq = []
    
    for _, row in data.iterrows():
        # Check for sentence end
        if row['token'] != '.':  # Replace '.' with your sentence delimiter, if any
            sentence.append(row['token'])
            label_seq.append(row['label'])
        else:
            # Add the completed sentence and labels to the lists
            sentences.append(sentence)
            labels.append(label_seq)
            sentence = []
            label_seq = []

    # Handle any leftover sequence (if the dataset doesn't end with a sentence delimiter)
    if sentence and label_seq:
        sentences.append(sentence)
        labels.append(label_seq)

    return sentences, labels

In [23]:
train_sentences, train_labels = prepare_data(data_train_path)
test_sentences, test_labels = prepare_data(data_test_path)
dev_sentences, dev_labels = prepare_data(data_dev_path)

In [24]:
len(train_sentences), len(train_labels)

(923, 923)

In [25]:
train_sentences[0]

['In',
 'this',
 'article',
 'we',
 'discuss',
 'several',
 'metrics',
 'of',
 'coherence',
 'defined',
 'using',
 'centering',
 'theory',
 'and',
 'investigate',
 'the',
 'usefulness',
 'of',
 'such',
 'metrics',
 'for',
 'information',
 'ordering',
 'in',
 'automatic',
 'text',
 'generation']

In [26]:
# Combine train_sentences, test_sentences, and dev_sentences
all_sentences = train_sentences + test_sentences + dev_sentences
all_labels = train_labels + test_labels

In [27]:
len(all_sentences), len(all_labels)

(1172, 1058)

In [33]:
model = Word2Vec(all_sentences, vector_size=300, window=5, min_count=2, sg=0, workers=4)

In [34]:
model.wv['NLP']

array([ 3.11733633e-02,  1.24633089e-01,  9.28443205e-03,  7.17265680e-02,
       -5.76661527e-03, -1.38257161e-01,  1.06778659e-01,  3.03798378e-01,
        1.09113166e-02, -1.83319277e-03, -2.17040032e-02, -1.34624526e-01,
       -1.17992202e-03,  4.41281162e-02, -1.23243921e-01, -9.81178060e-02,
        9.23287794e-02,  4.15359903e-03,  2.82037798e-02, -3.43210101e-02,
       -4.59621474e-02, -4.29066159e-02,  1.24193765e-01,  4.04658280e-02,
        1.09530270e-01, -2.78591886e-02, -1.37272507e-01,  4.18445915e-02,
       -1.00793928e-01, -1.16419174e-01,  4.32174280e-02, -4.30817492e-02,
        5.76726794e-02, -1.55678811e-02,  2.03437563e-02,  1.27075380e-02,
        2.61044912e-02, -1.39524594e-01,  1.33019220e-02, -2.99425069e-02,
       -6.23868667e-02,  5.37047759e-02,  3.11366003e-03, -1.00717045e-01,
        4.57360446e-02,  1.40916333e-01,  1.42081897e-03,  4.22880091e-02,
       -3.81673947e-02,  8.95741805e-02,  3.74806784e-02,  2.93084532e-02,
       -1.04871035e-01,  

In [36]:
# save model
model.save('../models/trained_models/word2vec_model_embedding.h5')

In [37]:
# load model
model = Word2Vec.load('../models/trained_models/word2vec_model_embedding.h5')

In [44]:
model.wv.most_similar("machine")

[('for', 0.9997607469558716),
 ('and', 0.9997566938400269),
 ('than', 0.9997555017471313),
 (',', 0.9997542500495911),
 ('using', 0.9997527599334717),
 ('(', 0.999751627445221),
 ('the', 0.9997514486312866),
 (')', 0.9997497797012329),
 ('has', 0.9997488260269165),
 ('have', 0.9997475147247314)]

## FastText

In [45]:
from gensim.models import FastText

In [46]:
# Training a FastText model with the given parameters
fast_text_model = FastText(sentences=all_sentences, vector_size=300, window=5, min_count=2, sg=0, workers=4)


In [49]:
fast_text_model.wv.most_similar("machine")

[('machines', 0.9999983906745911),
 ('Machine', 0.9999964237213135),
 ('matching', 0.9999943971633911),
 ('approaching', 0.9999943375587463),
 ('achieving', 0.999994158744812),
 ('improving', 0.9999939203262329),
 ('contrasting', 0.9999939203262329),
 ('modeling', 0.9999939203262329),
 ('comparing', 0.9999939203262329),
 ('mapping', 0.9999938607215881)]

In [50]:
fast_text_model.save("../models/trained_models/fasttext_model_embedding.h5")

In [51]:
# remove stopwords from each sentence
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def remove_stopwords(words):
    return [word for word in words if word not in stop_words]

In [52]:
cleaned_sentences = [remove_stopwords(sentence) for sentence in all_sentences]

In [58]:
word2vec_model = Word2Vec(cleaned_sentences, vector_size=300, window=5, min_count=2, sg=0, workers=4)

In [59]:
word2vec_model.wv.most_similar("machine")

[(',', 0.9996338486671448),
 ('models', 0.999614953994751),
 ('-', 0.9995870590209961),
 ('learning', 0.9995536208152771),
 ('data', 0.9995476603507996),
 ('translation', 0.9995428919792175),
 ('used', 0.999542772769928),
 ('model', 0.9995414614677429),
 ('information', 0.9995402097702026),
 ('two', 0.9995371699333191)]

In [60]:
word2vec_model.save('../models/trained_models/word2vec_model_embedding2.h5')

In [61]:
# Training a FastText model with the given parameters
fast_text_model = FastText(sentences=cleaned_sentences, vector_size=300, window=5, min_count=2, sg=0, workers=4)

In [62]:
fast_text_model.wv.most_similar("machine")

[('machines', 0.9999980926513672),
 ('Machine', 0.9999961853027344),
 ('approaching', 0.9999948143959045),
 ('matching', 0.9999947547912598),
 ('Representations', 0.9999945759773254),
 ('contrasting', 0.9999945759773254),
 ('achieving', 0.9999945163726807),
 ('prominent', 0.9999944567680359),
 ('translations', 0.9999944567680359),
 ('representations', 0.9999944567680359)]

In [65]:
import os

# Ensure the directory exists
os.makedirs("../models/trained_models", exist_ok=True)

try:
	# Save the FastText model
	fast_text_model.save("../models/trained_models/fasttext_model_embedding2.h5")
	print("Model saved successfully.")
except OSError as e:
	print(f"Error saving the model: {e}")

Error saving the model: 600000000 requested and 0 written
