In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fasttext-crawl-300d-2m/crawl-300d-2M.vec
/kaggle/input/glove6b/glove.6B.100d.txt
/kaggle/input/glove6b/glove.6B.200d.txt
/kaggle/input/glove6b/glove.6B.300d.txt
/kaggle/input/glove6b/glove.6B.50d.txt
/kaggle/input/word2vec-google/GoogleNews-vectors-negative300.bin
/kaggle/input/conceptnet-numberbatch-vectors/numberbatch-en.txt
/kaggle/input/conceptnet-numberbatch-vectors/numberbatch-en-17.06.txt/numberbatch-en-17.06.txt


# 1. Importing Data & Modules

In [2]:
from annoy import AnnoyIndex


In [None]:
# Will import data when I am doing NER, POS, etc. tasks on it

# 2. Data Preparation

In [None]:
# Will perform visualization after identifying the tasks and the dataset

# 3. CBOW Model

# 4. Pretrained Embeddings

In [27]:
class PreTrainedEmbeddings(object):
    def __init__(self, word_to_index, word_vectors):
        """
        Args:
        word_to_index (dict): mapping from word to integers
        word_vectors (list of numpy arrays)
        """
        self.word_to_index = word_to_index
        self.word_vectors = word_vectors
        self.index_to_word = {v: k for k, v in self.word_to_index.items()}
        self.index = AnnoyIndex(len(word_vectors[0]),metric='euclidean')
        for _, i in self.word_to_index.items():
            self.index.add_item(i, self.word_vectors[i])
        self.index.build(50)
        
        
    @classmethod
    def from_embeddings_file(cls, embedding_file):
        """
        Instantiate from pretrained vector file.
        
        Vector file should be of the format:
            word0 x0_0 x0_1 x0_2 x0_3 ... x0_N
            word1 x1_0 x1_1 x1_2 x1_3 ... x1_N
        
        Args:
            embedding_file (str): location of the file
        
        Returns:
            instance of PretrainedEmbeddings
        
        """
        word_to_index = {}
        word_vectors = []
        with open(embedding_file) as fp:
            Initial_padding = 1
            # positions_to_check = [2,4]
            for position, line in enumerate(fp):
                if position >= Initial_padding :
                    #print(line)
                    line = line.split(" ")
                    word = line[0]
                    #print(word)
                    #print(type(word))
                    #print(float(line[3]))
                    #print(type(float(line[3])))
                    #print([x for x in line[1:-1]])
                    vec = np.array([float(x) for x in line[1:-1]])
                    word_to_index[word] = len(word_to_index)
                    word_vectors.append(vec)
                    #print(len(vec))
             
        return cls(word_to_index, word_vectors)
    
    def get_embedding(self, word):
        """
        Args:
        word (str)
        Returns:
            an embedding (numpy.ndarray)
        """
        return self.word_vectors[self.word_to_index[word]]
    
    
    def get_closest_to_vector(self, vector, n=1):
        """
        Given a vector, return its n nearest neighbors

        Args:
            vector (np.ndarray): should match the size of the vectors
            in the Annoy index
            n (int): the number of neighbors to return
        Returns:
            [str, str, ...]: words nearest to the given vector
            The words are not ordered by distance
        """
        nn_indices = self.index.get_nns_by_vector(vector, n)
        return [self.index_to_word[neighbor] for neighbor in nn_indices]
    
    
    def compute_and_print_analogy(self, word1, word2, word3):
        """Prints the solutions to analogies using word embeddings
        Analogies are word1 is to word2 as word3 is to __
        This method will print: word1 : word2 :: word3 : word4
        Args:
            word1 (str)
            word2 (str)
            word3 (str)
        """
        vec1 = self.get_embedding(word1)
        vec2 = self.get_embedding(word2)
        vec3 = self.get_embedding(word3)
        # Simple hypothesis: Analogy is a spatial relationship
        spatial_relationship = np.dot(vec2, vec1)
        vec4 = vec3 + spatial_relationship
        closest_words = self.get_closest_to_vector(vec4, n=4)
        existing_words = set([word1, word2, word3])
        closest_words = [word for word in closest_words if word not in existing_words]
        if len(closest_words) == 0:
            print("Could not find nearest neighbors for the vector!")
            return
        
        for word4 in closest_words:
            print("{} : {} :: {} : {}".format(word1, word2, word3,word4))
            
    

In [23]:
# Have to remove the '\n' in this data file and an initial padding of 1
fasttext_embeddings_300d = PreTrainedEmbeddings.from_embeddings_file('../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec')

300
300


In [28]:
# Don't have to remove the '\n' in this data file and no initial padding
glove_embeddings_100d = PreTrainedEmbeddings.from_embeddings_file('../input/glove6b/glove.6B.100d.txt')

99
99


In [29]:
# Don't have to remove the '\n' in this data file and no initial padding
glove_embeddings_200d = PreTrainedEmbeddings.from_embeddings_file('../input/glove6b/glove.6B.200d.txt')

199
199


In [30]:
# Don't have to remove the '\n' in this data file and no initial padding
glove_embeddings_300d = PreTrainedEmbeddings.from_embeddings_file('../input/glove6b/glove.6B.300d.txt')

299
299


In [31]:
# Have to figure out how to work on bin format
# word2vec_embeddings_300d = PreTrainedEmbeddings.from_embeddings_file('../input/word2vec-google/GoogleNews-vectors-negative300.bin')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x94 in position 19: invalid start byte

In [26]:
# Don't have to remove the '\n' in this data file and an initial padding of 1
numberbatch_embeddings_300d = PreTrainedEmbeddings.from_embeddings_file('../input/conceptnet-numberbatch-vectors/numberbatch-en.txt')

300
300


# 5. Application Analysis

#### A. Word Similarity Task

#### B.Word Analogy Task

#### C. NER task