In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
%cd /content/drive/MyDrive/Colab Notebooks/NLP Assignment 2/

/content/drive/MyDrive/Colab Notebooks/NLP Assignment 2


In [7]:
!pip install -r requirements.txt

Collecting pytrec-eval (from -r requirements.txt (line 6))
  Downloading pytrec_eval-0.5.tar.gz (15 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pytrec-eval
  Building wheel for pytrec-eval (setup.py) ... [?25l[?25hdone
  Created wheel for pytrec-eval: filename=pytrec_eval-0.5-cp310-cp310-linux_x86_64.whl size=308202 sha256=bd8caa05b6c363744c225e0c180fc745b2964ade215d9c7695fafa780d9c13af
  Stored in directory: /root/.cache/pip/wheels/51/3a/cd/dcc1ddfc763987d5cb237165d8ac249aa98a23ab90f67317a8
Successfully built pytrec-eval
Installing collected packages: pytrec-eval
Successfully installed pytrec-eval-0.5


In [11]:
import numpy as np
import os
import os.path as osp
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
import pickle
import pytrec_eval
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from gensim.models import Word2Vec
import nltk
from nltk.corpus import brown
from nltk.tokenize import word_tokenize
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor

nltk.download('brown')

class TfidfTextContextSimilarWord:
    def __init__(self, text: list):
        self.text = text
        self.X = sparse.csr_matrix(np.array([]))
        self.vocabulary = []

    def tune_min_df(self, min_df: list, val_list_of_words: list, evaluator):
        best_result = -1
        best_min_df = None
        scores = np.zeros((len(min_df)))
        vocab_sizes = []

        selection_parameter = 'ndcg average'
        for i, min_df_ in enumerate(min_df):
            self.fit(min_df_)
            predictions = self.predict(val_list_of_words)
            predictions = prepare_dict_for_evaluation(predictions)

            vocab_sizes.append(len(self.vocabulary))

            m_s = evaluator.evaluate(predictions)[selection_parameter]
            if m_s > best_result:
                best_result = m_s
                best_min_df = min_df_
            scores[i] = m_s
        return best_min_df, scores, vocab_sizes

    def fit(self, min_df: float):
        vectorizer = TfidfVectorizer(min_df=min_df, stop_words='english', ngram_range=(1, 1))
        X = vectorizer.fit_transform(self.text)

        self.X = X.T
        self.X = sparse.csr_matrix(self.X)
        self.vocabulary = list(vectorizer.get_feature_names_out())

    def predict(self, list_of_words):
        tfidf_similar = {}
        for i in list_of_words:
            ix = [i_ for i_, w in enumerate(self.vocabulary) if w == i]
            if len(ix) == 1:
                ix_ = ix[0]
                w1 = self.X[ix_]
                sim_scores = [(w, p) for w, p in zip(self.vocabulary,
                                                     list(enumerate(cosine_similarity(w1, self.X)))[0][1])]
                sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
                sim_scores = sim_scores[1:11]

                dict_ = {}
                for elem in sim_scores:
                    dict_[elem[0]] = elem[1]
                    tfidf_similar[i] = dict_
            else:
                pass
        return tfidf_similar

    def save_model(self, model_loc: str):
        if os.path.exists(model_loc):
            save_sparse_csr(osp.join(model_loc, 'tfidf_word_context.pkl'), self.X)
            with open(osp.join(model_loc, 'tfidf_vocabulary.txt'), "w") as file:
                file.write("\n".join(self.vocabulary))  # Write each word in a new line
        else:
            try:
                os.makedirs(model_loc, exist_ok=True)
                save_sparse_csr(osp.join(model_loc, 'tfidf_word_context.pkl'), self.X)
                with open(osp.join(model_loc, 'tfidf_vocabulary.txt'), "w") as file:
                    file.write("\n".join(self.vocabulary))  # Write each word in a new line
            except TypeError:
                print('Cant save the model !')

    def load_model(self, model_loc: str):
        if os.path.exists(model_loc):
            self.X = load_sparse_csr(osp.join(model_loc, 'tfidf_word_context.pkl'))
            with open(osp.join(model_loc, 'tfidf_vocabulary.txt'), "r") as file:
                self.vocabulary = file.read().splitlines()
        else:
            raise Exception('Cant load the model !')

class Word2VecSimilarWord:
    def __init__(self, text):
        self.text = text
        self.model = Word2Vec()

    def tune_window_n_size(self, windows: list, vector_sizes: list, val_list_of_words: list,
                           evaluator):
        best_result = -1
        best_window = None
        best_vector_size = None

        scores = np.zeros((len(windows), len(vector_sizes)))
        selection_parameter = 'ndcg average'
        for i, window in enumerate(windows):
            for j, vector_size in enumerate(vector_sizes):
                self.fit(window, vector_size)
                predictions = self.predict(val_list_of_words)
                predictions = prepare_dict_for_evaluation(predictions)

                m_s = evaluator.evaluate(predictions)[selection_parameter]
                if m_s > best_result:
                    best_result = m_s
                    best_window = window
                    best_vector_size = vector_size
                scores[i][j] = m_s
        return best_window, best_vector_size, scores

    def fit(self, window: int, vector_size: int):
        try:
            self.model = Word2Vec(sentences=self.text, window=window, min_count=1, vector_size=vector_size,
                                  epochs=10)
        except DeprecationWarning:
            self.model = Word2Vec(sentences=self.text, window=window, min_count=1, size=vector_size,
                                  epochs=10)

    def save_model(self, model_loc: str):
        if os.path.exists(model_loc):
            self.model.save(osp.join(model_loc, "word2vec.model"))
        else:
            try:
                os.makedirs(model_loc, exist_ok=True)
                self.model.save(osp.join(model_loc, "word2vec.model"))
            except TypeError:
                print('Cant save the model !')


    def load_model(self, model_loc: str):
        if osp.exists(model_loc):
            self.model = Word2Vec.load(osp.join(model_loc, "word2vec.model"))
        else:
            raise Exception('Cant load the model !')

    def predict(self, list_of_words: list):
        try:
            vocab = list(self.model.wv.index_to_key)
        except DeprecationWarning:
            vocab = list(self.model.wv.vocab.keys())
        word2vec_similar = {}
        for i in list_of_words:
            if i in vocab:
                sims = self.model.wv.most_similar(i, topn=10)
                dict_ = {}
                for elem in sims:
                    dict_[elem[0]] = elem[1]
                word2vec_similar[i] = dict_
            else:
                pass
        return word2vec_similar

class DataLoader:
    def __init__(self):
        self.corpus_allowed = ['romance', 'news']

    def fetch_data(self, corpus: str = ''):
        if corpus in self.corpus_allowed:
            if corpus == 'romance':
                tokenized_sentences = brown.sents(categories='romance')
                sentences = [' '.join(sent) for sent in brown.sents(categories='romance')]
                return tokenized_sentences, sentences
            elif corpus == 'news':
                tokenized_sentences = brown.sents(categories='news')
                sentences = [' '.join(sent) for sent in brown.sents(categories='news')]
                return tokenized_sentences, sentences
        else:
            raise Exception('Corpus not in allowed corpuses')

class SimLexLoader:
    def __init__(self, file_loc: str = "Dataset/SimLex-999.txt"):
        self.file_loc = file_loc

    def load(self):
        result1 = [x.split('\t')[0] for x in open(self.file_loc).readlines()]
        result2 = [x.split('\t')[1] for x in open(self.file_loc).readlines()]
        result3 = [x.split('\t')[3] for x in open(self.file_loc).readlines()]

        result1.remove('word1')
        result2.remove('word2')
        result3.remove('SimLex999')

        self.df = pd.DataFrame(list(zip(result1, result2, result3)),
                       columns =['word1', 'word2', 'SimLex999'])
        return self.df

    @staticmethod
    def return_one_level_similar_words(temp, i_sim, result_dict):
        i_sim_old = i_sim
        for j in list(i_sim.keys()):
            if j in temp:
                j_dict = result_dict.get(j)
                for k in list(j_dict.keys()):
                    if k not in list(i_sim.keys()):
                        i_sim.update({k: j_dict[k]})
        return i_sim, i_sim_old

    def get_top_words(self):
        result_dict = {}

        for word1, word2, simi in zip(self.df['word1'], self.df['word2'], self.df['SimLex999']):
            if word1 not in result_dict.keys():
                result_dict[word1] = {word2: simi}
            else:
                if word2 not in result_dict.get(word1):
                    result_dict[word1].update({word2: simi})

        temp = result_dict.keys()

        for i in tqdm(temp, desc='Combining outputs: '):
            i_sim = result_dict.get(i)
            i_sim_old = {}
            while len(i_sim) < 10 and list(i_sim.keys()) != list(i_sim_old.keys()):
                i_sim, i_sim_old = self.return_one_level_similar_words(temp, i_sim, result_dict)
            result_dict[i] = i_sim

        final_dict = dict()

        for key, val in result_dict.items():
            val = {k: float(v) for k, v in val.items()}
            sorted_val = {k: v for k, v in sorted(val.items(), key=lambda item: item[1], reverse=True)}
            final_dict[key] = sorted_val

        return final_dict

def save_sparse_csr(filename, array):
    array = array.todense()
    with open(filename, 'wb') as f:
        pickle.dump(array, f)

def load_sparse_csr(filename):
    with open(filename, 'rb') as f:
        x = pickle.load(f)
    return x

def prepare_dict_for_evaluation(res):
    res_for_eval = {}
    for k in res.keys():
        dict_ = {}
        sorted_gt = [(k, v) for k, v in sorted(res.get(k).items(), key=lambda item: item[1])]
        for idx, l in enumerate(sorted_gt[:10]):
            dict_[l[0]] = 1
        res_for_eval[k] = dict_
    return res_for_eval

class PyTrecEvaluator:
    def __init__(self, res_golden):
        self.evaluator = pytrec_eval.RelevanceEvaluator(res_golden, {'ndcg'})

    def evaluate(self, result_evaluation: dict):
        result = self.evaluator.evaluate(result_evaluation)

        metrics = {}
        for measure in sorted(list(result[list(result.keys())[0]].keys())):
            metrics[f'{measure} average'] = \
                pytrec_eval.compute_aggregated_measure(
                    measure, [query_measures[measure] for query_measures in result.values()])
        return metrics

def hyper_parameter_CM(array, index: list, columns: list, title: str,
                       xlabel: str, ylabel: str, img_loc: str):
    df_cm = pd.DataFrame(array, index=index, columns=columns)
    plt.figure(figsize=(10, 7))
    sns.heatmap(df_cm, annot=True, cmap='gist_earth_r')
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.savefig(img_loc)
    plt.close()

def line_plots(arrays: list, labels: list, xticks: list, xlabel: str,
               ylabel: str, title: str, img_loc: str):
    if len(arrays) == 2:
        fig, ax1 = plt.subplots()

        ax2 = ax1.twinx()
        ax1.plot(arrays[0], 'g-', label=labels[0])
        ax2.plot(arrays[1], 'b-', label=labels[0])
        ax1.legend()
        ax2.legend()
        ax1.set_xlabel(xlabel)
        ax1.set_ylabel(labels[0], color='g')
        ax2.set_ylabel(labels[1], color='b')
        plt.title(title)
    else:
        for array, label in zip(arrays, labels):
            plt.plot(array, label=label)
        plt.legend()
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.title(title)
    plt.xticks(range(len(arrays[0])), xticks)
    plt.savefig(img_loc, bbox_inches='tight')
    plt.close()

class Experiment:
    def __init__(self, model_loc: str, output_loc: str, corpus: str, file_loc: str):
        self.model_loc = model_loc
        self.output_loc = output_loc
        self.corpus = corpus
        self.file_loc = file_loc

    def load_simlex_data(self):
        simlex_loader = SimLexLoader(file_loc=self.file_loc)
        df = simlex_loader.load()
        print('#' * 8 + ' Simlex - loaded dataframe' + '#' * 8)
        print(df.head())
        self.result_golden = simlex_loader.get_top_words()
        self.test_words = list(self.result_golden.keys())

    def initiate_evaluator(self):
        self.result_golden_for_evaluator = prepare_dict_for_evaluation(self.result_golden)
        self.evaluator = PyTrecEvaluator(res_golden=self.result_golden_for_evaluator)

    def load_data(self):
        loader = DataLoader()
        self.tokenized_sentences, self.sentences = loader.fetch_data(self.corpus)

    def tfidf(self):
        print('#' * 8 + ' Tfidf - model' + '#' * 8)
        min_df = [0.0005, 0.001, 0.005, 0.01]
        tfidf = TfidfTextContextSimilarWord(self.sentences)
        best_min_df, scores, vocab_sizes = tfidf.tune_min_df(min_df=min_df,
                                                              val_list_of_words=self.test_words,
                                                              evaluator=self.evaluator)
        print('Tfidf best min_df: ', best_min_df)
        print('Tfidf scores: ', scores)
        print('Tfidf vocab sizes: ', vocab_sizes)
        hyper_parameter_CM(scores, min_df, ['ndcg'], 'Tfidf hyper-parameter tuning',
                           'min_df', 'ndcg', osp.join(self.output_loc, 'tfidf_hyper_param.png'))
        tfidf.save_model(osp.join(self.model_loc, 'tfidf_best_model'))

    def word2vec(self):
        print('#' * 8 + ' Word2Vec - model' + '#' * 8)
        windows = [2, 3, 5, 7]
        vector_sizes = [50, 100, 150, 200]
        word2vec = Word2VecSimilarWord(self.tokenized_sentences)
        best_window, best_vector_size, scores = word2vec.tune_window_n_size(windows=windows,
                                                                            vector_sizes=vector_sizes,
                                                                            val_list_of_words=self.test_words,
                                                                            evaluator=self.evaluator)
        print('Word2Vec best window: ', best_window)
        print('Word2Vec best vector size: ', best_vector_size)
        print('Word2Vec scores: ', scores)
        hyper_parameter_CM(scores, windows, vector_sizes, 'Word2Vec hyper-parameter tuning',
                           'window', 'vector size',
                           osp.join(self.output_loc, 'word2vec_hyper_param.png'))
        word2vec.save_model(osp.join(self.model_loc, 'word2vec_best_model'))

    def run(self):
        self.load_simlex_data()
        self.initiate_evaluator()
        self.load_data()
        self.tfidf()
        self.word2vec()

if __name__ == '__main__':
    model_loc = 'model'
    output_loc = 'output'
    corpus = 'romance'
    file_loc = 'Dataset/SimLex-999.txt'
    experiment = Experiment(model_loc=model_loc, output_loc=output_loc, corpus=corpus, file_loc=file_loc)
    experiment.run()


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


######## Simlex - loaded dataframe########
   word1        word2 SimLex999
0    old          new      1.58
1  smart  intelligent       9.2
2   hard    difficult      8.77
3  happy     cheerful      9.55
4   hard         easy      0.95


Combining outputs: 100%|██████████| 616/616 [00:00<00:00, 41787.69it/s]


######## Tfidf - model########
Tfidf best min_df:  0.0005
Tfidf scores:  [0.00723051 0.0054856  0.00315596 0.00530336]
Tfidf vocab sizes:  [2021, 1119, 155, 54]
######## Word2Vec - model########
Word2Vec best window:  5
Word2Vec best vector size:  150
Word2Vec scores:  [[0.00346362 0.00293392 0.00240323 0.00267763]
 [0.00276972 0.00334003 0.00341921 0.00242605]
 [0.00346204 0.00160937 0.00566724 0.00208077]
 [0.001667   0.00145449 0.00285823 0.0044895 ]]
