## Mount drive and set current directory

In [None]:
# Attach Google Drive to the Colab VM so files under /content/drive are reachable.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
import os
import pandas as pd
import numpy as np
# Move into the project folder on the mounted Drive, echoing the working
# directory before and after so the switch is visible in the cell output.
project_dir = '/content/drive/My Drive/1006'
print(os.getcwd())
os.chdir(project_dir)
print(os.getcwd())

/content
/content/drive/My Drive/1006


## Load train and test sets from files

In [None]:
# Filename prefix that selects which dataset the later cells read and write
# (it is prepended to names such as 'clean_train.csv').
# Keep exactly one assignment active; uncomment another line to switch dataset.
dataset_name = 'eo_'
#dataset_name = 'news_'
#dataset_name = 'twitter_'

In [None]:
# Read the cleaned train/test splits; the first CSV column is used as the row index.
data_train = pd.read_csv(dataset_name + 'clean_train.csv', index_col=0)
data_test = pd.read_csv(dataset_name + 'clean_test.csv', index_col=0)

# The Twitter files name their text column 'Tweet'; expose it as 'text' so the
# later cells can address data_train['text'] for every dataset.
if dataset_name == 'twitter_':
  data_train = data_train.rename(columns={'Tweet': 'text'})
  data_test = data_test.rename(columns={'Tweet': 'text'})

## Count Vectors and TF-IDF Vectors

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import VarianceThreshold

# Unigram-to-trigram counts followed by a variance filter that removes
# columns whose training-set variance falls below the threshold.
stp_wrds = ['a', 'an', 'the', 'of', 'and', 'but', 'or', 'of', 'to']
pipe = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words=stp_wrds, ngram_range=(1, 3))),
    ('selector', VarianceThreshold(threshold=0.028)),
])

# Fit on the training text only; the test text goes through the already-fitted steps.
cvec_train = pipe.fit_transform(data_train['text'], y=data_train['label'])
cvec_test = pipe.transform(data_test['text'])
(cvec_train.shape, cvec_test.shape)

((8437, 8249), (2110, 8249))

In [None]:
# Densify each count matrix, append the matching labels and write it to CSV.
# The frame keeps the name `temp` because a later cell deletes that name.
for filename, matrix, source in (('cvec_train.csv', cvec_train, data_train),
                                 ('cvec_test.csv', cvec_test, data_test)):
  temp = pd.DataFrame(matrix.toarray())
  temp['label'] = source['label'].tolist()
  temp.to_csv(dataset_name + filename)

In [None]:
# Same layout as the count-vector pipeline, but with TF-IDF weighting and a
# much smaller variance cut-off.
stp_wrds = ['a', 'an', 'the', 'of', 'and', 'but', 'or', 'of', 'to']
pipe = Pipeline(steps=[
    ('vect', TfidfVectorizer(stop_words=stp_wrds, ngram_range=(1, 3))),
    ('selector', VarianceThreshold(threshold=0.00001)),
])

# Fit on the training text only; the test text goes through the already-fitted steps.
tfidf_train = pipe.fit_transform(data_train['text'], y=data_train['label'])
tfidf_test = pipe.transform(data_test['text'])
(tfidf_train.shape, tfidf_test.shape)

((8437, 8072), (2110, 8072))

In [None]:
# Densify each TF-IDF matrix, append the matching labels and write it to CSV.
# The frame keeps the name `temp` because the next cell deletes that name.
for filename, matrix, source in (('tfidf_train.csv', tfidf_train, data_train),
                                 ('tfidf_test.csv', tfidf_test, data_test)):
  temp = pd.DataFrame(matrix.toarray())
  temp['label'] = source['label'].tolist()
  temp.to_csv(dataset_name + filename)

In [None]:
# Drop the reference to the last dense export frame so its memory can be reclaimed.
del temp

## Sentence/Doc-Level Embeddings

In [None]:
# Disabled alternative: loading BERT through the `bert_embedding` package.
# Left commented out; none of the cells visible in this notebook use it.
#from bert_embedding import BertEmbedding
#bert_embedding = BertEmbedding(model='bert_12_768_12', dataset_name='book_corpus_wiki_en_uncased')

### gensim Doc2Vec (trained from scratch; dropped)

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Each document must be handed to gensim as a list of word tokens. Passing the
# raw string makes gensim iterate it character by character, so the model
# would learn "words" that are single letters. Split on whitespace first.
tagged_data = [
    TaggedDocument(words=d.split(), tags=[i])
    for i, d in enumerate(data_train['text'])
]

# Train 300-dimensional document vectors and save the model in the working directory.
model = Doc2Vec(tagged_data, vector_size=300, window=3, min_count=1, epochs=15)
model.save("gensimDoc2Vec_"+dataset_name+".model")

### BERT-based

In [None]:
!pip install sentence_transformers

Collecting sentence_transformers
[?25l  Downloading https://files.pythonhosted.org/packages/6a/e2/84d6acfcee2d83164149778a33b6bdd1a74e1bcb59b2b2cd1b861359b339/sentence-transformers-0.4.1.2.tar.gz (64kB)
[K     |████████████████████████████████| 71kB 8.1MB/s 
[?25hCollecting transformers<5.0.0,>=3.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/f9/54/5ca07ec9569d2f232f3166de5457b63943882f7950ddfcc887732fc7fb23/transformers-4.3.3-py3-none-any.whl (1.9MB)
[K     |████████████████████████████████| 1.9MB 29.6MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/f5/99/e0808cb947ba10f575839c43e8fafc9cc44e4a7a2c8f79c60db48220a577/sentencepiece-0.1.95-cp37-cp37m-manylinux2014_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 49.6MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K    

In [None]:
from sentence_transformers import SentenceTransformer
# Names of the pretrained models loaded in the following cells. This is a bare
# string expression, not a comment, so the notebook echoes it as the cell output.
'''
roberta-base-nli-stsb-mean-tokens
bert-base-nli-stsb-mean-tokens
distilroberta-base-paraphrase-v1
'''

'\nroberta-base-nli-stsb-mean-tokens\nbert-base-nli-stsb-mean-tokens\ndistilroberta-base-paraphrase-v1\n'

In [None]:
# Embed every training text with the 'roberta-base-nli-stsb-mean-tokens' model
# and show the shape of the resulting matrix.
sbert_model = SentenceTransformer('roberta-base-nli-stsb-mean-tokens')
train_texts = data_train['text'].tolist()
sentence_embeddings = sbert_model.encode(train_texts)
np.shape(sentence_embeddings)

100%|██████████| 461M/461M [00:24<00:00, 18.9MB/s]


(8437, 768)

In [None]:
# Write the current sentence embeddings (one row per training text) to CSV.
pd.DataFrame(data=sentence_embeddings).to_csv(
    path_or_buf=dataset_name + "roberta_train.csv"
)

In [None]:
# Embed every training text with the 'bert-base-nli-stsb-mean-tokens' model
# and show the shape of the resulting matrix.
sbert_model = SentenceTransformer('bert-base-nli-stsb-mean-tokens')
train_texts = data_train['text'].tolist()
sentence_embeddings = sbert_model.encode(train_texts)
np.shape(sentence_embeddings)

100%|██████████| 405M/405M [00:17<00:00, 23.5MB/s]


(8437, 768)

In [None]:
# Write the current sentence embeddings (one row per training text) to CSV.
pd.DataFrame(data=sentence_embeddings).to_csv(
    path_or_buf=dataset_name + "bert_train.csv"
)

In [None]:
# Embed every training text with the 'distilroberta-base-paraphrase-v1' model
# and show the shape of the resulting matrix.
sbert_model = SentenceTransformer('distilroberta-base-paraphrase-v1')
train_texts = data_train['text'].tolist()
sentence_embeddings = sbert_model.encode(train_texts)
np.shape(sentence_embeddings)

100%|██████████| 306M/306M [00:12<00:00, 25.4MB/s]


(8437, 768)

In [None]:
# Write the current sentence embeddings (one row per training text) to CSV.
pd.DataFrame(data=sentence_embeddings).to_csv(
    path_or_buf=dataset_name + "distil_train.csv"
)

### GloVe Embeddings

In [None]:
# Disabled: the GloVe 840B variant is wrapped in a string literal, so running
# this cell as written executes none of the statements inside it.
"""
sbert_model = SentenceTransformer('average_word_embeddings_glove.840B.300d')
sentence_embeddings = sbert_model.encode(data_train['text'].tolist())
sentence_embeddings.shape

pd.DataFrame(sentence_embeddings).to_csv(dataset_name+"glove840B_train.csv")
"""

100%|██████████| 2.43G/2.43G [03:43<00:00, 10.9MB/s]


(68811, 300)

In [None]:
# Embed every training text with the 'average_word_embeddings_glove.6B.300d'
# model and show the shape of the resulting matrix.
sbert_model = SentenceTransformer('average_word_embeddings_glove.6B.300d')
train_texts = data_train['text'].tolist()
sentence_embeddings = sbert_model.encode(train_texts)
np.shape(sentence_embeddings)

100%|██████████| 441M/441M [00:16<00:00, 26.9MB/s]


(8437, 300)

In [None]:
# Write the current sentence embeddings (one row per training text) to CSV.
pd.DataFrame(data=sentence_embeddings).to_csv(
    path_or_buf=dataset_name + "glove6B_train.csv"
)

### InferSent (too long; dropped)

In [None]:
# Disabled setup: the shell commands that fetch the InferSent weights and the
# GloVe vectors are wrapped in a string literal, so running this cell as
# written downloads nothing. Remove the quotes to fetch the files again.
'''
! mkdir encoder
! curl -Lo encoder/infersent2.pkl https://dl.fbaipublicfiles.com/infersent/infersent2.pkl
! mkdir GloVe
! curl -Lo GloVe/glove.840B.300d.zip http://nlp.stanford.edu/data/glove.840B.300d.zip
! unzip GloVe/glove.840B.300d.zip -d GloVe/
'''

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  146M  100  146M    0     0  48.4M      0  0:00:03  0:00:03 --:--:-- 48.4M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0   315    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0   352    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 2075M  100 2075M    0     0  2095k      0  0:16:54  0:16:54 --:--:-- 1905k
Archive:  GloVe/glove.840B.300d.zip
  inflating: GloVe/glove.840B.300d.txt  


In [None]:
from models import InferSent
import torch

# Build the InferSent encoder: weights are read from encoder/, word vectors from GloVe/.
V = 2
MODEL_PATH = 'encoder/infersent%s.pkl' % V
W2V_PATH = 'GloVe/glove.840B.300d.txt'
params_model = dict(bsize=64, word_emb_dim=300, enc_lstm_dim=2048,
                    pool_type='max', dpout_model=0.0, version=V)

model = InferSent(params_model)
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
model.load_state_dict(torch.load(MODEL_PATH))
model.set_w2v_path(W2V_PATH)

In [None]:
# Build the encoder vocabulary from the text of both splits.
all_sentences = data_train['text'].tolist() + data_test['text'].tolist()
model.build_vocab(all_sentences, tokenize=True)

Found 28258(/28636) words with w2v vectors
Vocab size : 28258


In [None]:
sentence_list = data_train['text'].tolist()
print(len(sentence_list))
print()
# NOTE(review): assumes `models.InferSent` is the facebookresearch implementation,
# whose encode() iterates over its argument as a collection of sentences.
# Passing one bare string made it treat every character as a "sentence", and
# the trailing [0] then kept only the first character's vector. Encoding the
# whole list in one batched call yields one vector per text and removes the
# per-row print that flooded the cell output.
sentence_embeddings = model.encode(sentence_list, bsize=64, tokenize=True)

In [None]:
# Stack the InferSent vectors into one array and write them to CSV.
infersent_matrix = np.array(sentence_embeddings)
pd.DataFrame(data=infersent_matrix).to_csv(
    path_or_buf=dataset_name + "infersent_train.csv"
)

### Universal Sentence Encoder

In [None]:
!pip3 install --upgrade tensorflow-gpu
# Install TF-Hub.
!pip3 install tensorflow-hub

Collecting tensorflow-gpu
[?25l  Downloading https://files.pythonhosted.org/packages/85/cc/a27e73cf8b23f2ce4bdd2b7089a42a7819ce6dd7366dceba406ddc5daa9c/tensorflow_gpu-2.4.1-cp37-cp37m-manylinux2010_x86_64.whl (394.3MB)
[K     |████████████████████████████████| 394.3MB 44kB/s 
Installing collected packages: tensorflow-gpu
Successfully installed tensorflow-gpu-2.4.1


In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np

In [None]:
# Load the Universal Sentence Encoder (version 4) from TF-Hub and confirm it is ready.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(module_url)
print("module %s loaded" % (module_url,))

INFO:absl:Using /tmp/tfhub_modules to cache modules.
INFO:absl:Downloading TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder/4'.
INFO:absl:Downloading https://tfhub.dev/google/universal-sentence-encoder/4: 820.00MB
INFO:absl:Downloaded https://tfhub.dev/google/universal-sentence-encoder/4, Total size: 987.47MB
INFO:absl:Downloaded TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder/4'.


module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [None]:
# Plain Python list of the training texts; its length is shown as the cell result.
sentence_list = list(data_train['text'])
len(sentence_list)

8437

In [None]:
# Encode one sentence per model call and keep each returned vector as a NumPy array.
sentence_embeddings = [
    np.array(model([sentence])[0]) for sentence in sentence_list
]
np.array(sentence_embeddings).shape

(8437, 512)

In [None]:
# Stack the Universal Sentence Encoder vectors into one array and write them to CSV.
use_matrix = np.array(sentence_embeddings)
pd.DataFrame(data=use_matrix).to_csv(
    path_or_buf=dataset_name + "universal_train.csv"
)

## Dimension Reduction

In [None]:
# Reload the exported count and TF-IDF tables (feature columns plus 'label')
# from disk, replacing the sparse matrices of the same names.
cvec_train, tfidf_train = (
    pd.read_csv(dataset_name + filename, index_col=0)
    for filename in ('cvec_train.csv', 'tfidf_train.csv')
)

### PCA, UMAP, NMF

In [None]:
from sklearn.decomposition import PCA
# Project both feature tables onto 16 principal components and export them.
# With this many columns scikit-learn's automatic solver choice can be the
# randomized SVD, so the seed is fixed to make the exported CSVs reproducible.
PCA_SEED = 42
for prefix, frame in (('cvec', cvec_train), ('tfidf', tfidf_train)):
  features = frame.drop(columns=['label'])  # the label must not leak into the projection
  reduced = PCA(n_components=16, random_state=PCA_SEED).fit_transform(features)
  pd.DataFrame(reduced).to_csv(dataset_name + prefix + '_pca16_train.csv')

In [None]:
import umap
# Embed both feature tables into 16 UMAP dimensions and export them.
# UMAP is stochastic, so the seed is fixed to make the exported CSVs
# reproducible from one run to the next.
UMAP_SEED = 42
for prefix, frame in (('cvec', cvec_train), ('tfidf', tfidf_train)):
  features = frame.drop(columns=['label'])  # the label must not leak into the embedding
  reduced = umap.UMAP(n_components=16, random_state=UMAP_SEED).fit_transform(features)
  pd.DataFrame(reduced).to_csv(dataset_name + prefix + '_umap16_train.csv')

In [None]:
from sklearn.decomposition import NMF
# Factorise both feature tables into 16 non-negative components and export them.
# NOTE(review): random_state is passed because scikit-learn documents it as the
# way to get the same NMF result across calls; confirm it matters for 'nndsvd'.
NMF_SEED = 42
for prefix, frame in (('cvec', cvec_train), ('tfidf', tfidf_train)):
  features = frame.drop(columns=['label'])  # the label must not leak into the factors
  reduced = NMF(n_components=16, init='nndsvd', random_state=NMF_SEED).fit_transform(features)
  pd.DataFrame(reduced).to_csv(dataset_name + prefix + '_nmf16_train.csv')

* BERT
* DistilBERT
* RoBERTa
* Universal Sentence Encoder
* Glove6B
* pca16-cvec
* pca16-tfidf
* umap16-cvec
* umap16-tfidf
* nmf16-cvec
* nmf16-tfidf
* tsne16-cvec
* tsne16-tfidf