<a href="https://colab.research.google.com/github/StevenPeutz/Masterthesis-Disinformation-NLP/blob/master/CODE/2_DimensionReduction_Word2Vec_FastText.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dimension reduction for pretrained embeddings
- FastText (300 -> 50)
- Word2Vec (100 -> 50)
- (for GloVe the 50-dimensional version is used, so no reduction is required)


<br>
This is done for the following reasons:

*   Comparison of architectures is considered fairer if all pretrained embeddings contain the same number of dimensions.
*   RAM reduction (local: 16 GB, Google Colab: 32 GB).
*   Reduction in storage space of embedding files (github limits).
*   This allows all embedding and model combinations to be tested within a single 'overview' environment. Of course this comes at the cost of classification performance. Therefore all embedding and model combinations are also run in separate environments where RAM will not be a limiting factor.  
<br>
<br>
The technique used for dimension reduction is PCA.



In [None]:
# Mount Google Drive into the Colab filesystem; every embedding path used in
# this notebook lives underneath this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
from gensim.models import KeyedVectors
from sklearn.decomposition import PCA

In [None]:
import gzip
import io

# Reducing FastText (wiki-news-300d-1M.vec)

In [None]:
# Disabled experiment, kept for reference only: the triple-quoted string below is
# never executed as code. It sketches loading the gzip-compressed FastText file by
# handing an open gzip handle to KeyedVectors.load_word2vec_format.
# Because the string is the last expression in the cell, the notebook echoes it
# back as the cell output.
"""
model_path = '/content/drive/MyDrive/MYDATA/Embeddings_PreTrained/FastText/wiki-news-300d-1M.vec.gz'
#model = KeyedVectors.load_word2vec_format(model_path, binary=False)
with gzip.open(model_path, 'rb') as f:
    model = KeyedVectors.load_word2vec_format(f, binary=False)
"""

"\nmodel_path = '/content/drive/MyDrive/MYDATA/Embeddings_PreTrained/FastText/wiki-news-300d-1M.vec.gz'\n#model = KeyedVectors.load_word2vec_format(model_path, binary=False)\nwith gzip.open(model_path, 'rb') as f:\n    model = KeyedVectors.load_word2vec_format(f, binary=False)\n"

In [None]:
# Load the pretrained FastText vectors (plain-text word2vec format) from Drive.
#
# NOTE(review): the path below ends in a stray '.', whereas the section heading
# names the file 'wiki-news-300d-1M.vec'. The original spelling is tried first so
# any setup where that exact file name exists keeps working unchanged; only if it
# is missing do we retry without the trailing dot. Confirm the real file name on
# Drive and drop whichever spelling is wrong.
model_path = '/content/drive/MyDrive/MYDATA/Embeddings_PreTrained/FastText/wiki-news-300d-1M.vec.'
try:
    model = KeyedVectors.load_word2vec_format(model_path, binary=False)
except FileNotFoundError:
    model_path = model_path.rstrip('.')
    model = KeyedVectors.load_word2vec_format(model_path, binary=False)

In [None]:
# Extract the raw embedding matrix from the loaded FastText model
# (presumably one row per vocabulary entry; rows are later paired with the
# model's vocabulary list by position).
word_vectors = model.vectors

# Reduce the dimensionality of the vectors to 50 using PCA.
# random_state is fixed because, depending on the scikit-learn version and the
# matrix shape, svd_solver='auto' may select the randomized solver; without a
# seed the reduced embeddings would then differ from run to run.
pca = PCA(n_components=50, random_state=42)
word_vectors_50d = pca.fit_transform(word_vectors)

In [None]:
# Save the reduced vectors to a file in text format
# Disabled alternative, kept for reference only: the triple-quoted string below
# is not executed. It would write the reduced vectors to an uncompressed text
# file, one "<word> <component> <component> ..." line per vocabulary entry.
"""
with open("/content/drive/MyDrive/MYDATA/Embeddings_PreTrained/FastText/PCA_reduced-vectors.txt", "w") as f:
    for i, word in enumerate(model.index2word):
        vector_str = " ".join([str(x) for x in word_vectors_50d[i]])
        f.write(f"{word} {vector_str}\n")
"""

In [None]:
# Save the reduced FastText vectors to a gzip-compressed text file, one
# "<word> <component> <component> ..." line per vocabulary entry (no header line).

# gensim >= 4.0 replaced `index2word` with `index_to_key` (the old attribute
# raises AttributeError there); support both so the cell runs on either version.
vocab = model.index_to_key if hasattr(model, "index_to_key") else model.index2word

# `model` and `word_vectors_50d` are reused for Word2Vec further down, so make
# sure the matrix in memory really belongs to the model whose vocabulary is
# being written (guards against out-of-order cell execution).
assert len(vocab) == len(word_vectors_50d), "reduced vectors do not match the loaded model's vocabulary"

# Explicit UTF-8 so non-ASCII tokens are written identically on every platform.
with gzip.open("/content/drive/MyDrive/MYDATA/Embeddings_PreTrained/FastText/ft_PCA_reduced-vectors.gz", "wt", encoding="utf-8") as f:
    for word, vector in zip(vocab, word_vectors_50d):
        vector_str = " ".join(str(x) for x in vector)
        f.write(f"{word} {vector_str}\n")

Note that the gzip.open() function is used instead of the open() function, and the file extension is changed to .gz. The "wt" argument specifies that the file is opened in text mode, allowing you to use the same write() function as before. When you want to read the compressed file back into a Python script, you can use the gzip.open() function again, this time with the "rt" argument to open the file in text mode.

# Reducing Word2Vec (w2v.bin)

In [None]:
# Load the pretrained Word2Vec vectors from Drive. The file is read in the
# binary word2vec format (binary=True), unlike the plain-text FastText file.
model_path = '/content/drive/MyDrive/MYDATA/Embeddings_PreTrained/word2vec/w2v.bin'
model = KeyedVectors.load_word2vec_format(fname=model_path, binary=True)

In [None]:
# Extract the raw embedding matrix from the loaded Word2Vec model
# (presumably one row per vocabulary entry; rows are later paired with the
# model's vocabulary list by position).
word_vectors = model.vectors

# Reduce the dimensionality of the vectors to 50 using PCA.
# random_state is fixed because, depending on the scikit-learn version and the
# matrix shape, svd_solver='auto' may select the randomized solver; without a
# seed the reduced embeddings would then differ from run to run.
pca = PCA(n_components=50, random_state=42)
word_vectors_50d = pca.fit_transform(word_vectors)

In [None]:
# Save the reduced Word2Vec vectors to a gzip-compressed text file, one
# "<word> <component> <component> ..." line per vocabulary entry (no header line).

# gensim >= 4.0 replaced `index2word` with `index_to_key` (the old attribute
# raises AttributeError there); support both so the cell runs on either version.
vocab = model.index_to_key if hasattr(model, "index_to_key") else model.index2word

# `model` and `word_vectors_50d` were previously used for FastText, so make sure
# the matrix in memory really belongs to the model whose vocabulary is being
# written (guards against out-of-order cell execution).
assert len(vocab) == len(word_vectors_50d), "reduced vectors do not match the loaded model's vocabulary"

# Explicit UTF-8 so non-ASCII tokens are written identically on every platform.
with gzip.open("/content/drive/MyDrive/MYDATA/Embeddings_PreTrained/word2vec/w2v_PCA_reduced-vectors.gz", "wt", encoding="utf-8") as f:
    for word, vector in zip(vocab, word_vectors_50d):
        vector_str = " ".join(str(x) for x in vector)
        f.write(f"{word} {vector_str}\n")