In [2]:
import gensim
import gensim.downloader as api
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn import decomposition
import altair as alt

import os
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

from main import preprocess

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

all_data = pd.concat([train, test])

In [4]:
wnl = WordNetLemmatizer()
stops = set(stopwords.words('english'))
def clean(x):
    a = pos_tag(word_tokenize(x.lower()))
    wnpos = lambda e: ('a' if e[0].lower() == 'j' else e[0].lower()) if e[0].lower() in ['n', 'r', 'v'] else 'n'
    b = [wnl.lemmatize(y[0], wnpos(y[1])) for y in a]
    return [y for y in b if y not in stops]

sentences = all_data.text.apply(clean).tolist()

In [4]:
print(sentences[:3])

[['feel', 'awful', 'job', 'get', 'position', 'succeed', 'happen'], ['im', 'alone', 'feel', 'awful'], ['ive', 'probably', 'mention', 'really', 'feel', 'proud', 'actually', 'keep', 'new', 'year', 'resolution', 'monthly', 'weekly', 'goal']]


In [5]:
def tagged_document(list_of_list_of_words):
   for i, list_of_words in enumerate(list_of_list_of_words):
      yield gensim.models.doc2vec.TaggedDocument(list_of_words, [i])

def expand_vector(x):
   d = {'id': x.id}
   for i in range(len(x.text)):
      d[f'vec_{i}'] = x.text[i]
   return pd.Series(d)

In [6]:
# train doc2vec model using provided data
tweet_train_data = list(tagged_document(sentences))
tweet_model = gensim.models.doc2vec.Doc2Vec(vector_size=40, min_count=2, epochs=30)
tweet_model.build_vocab(tweet_train_data)
tweet_model.train(tweet_train_data, total_examples=tweet_model.corpus_count, epochs=tweet_model.epochs)
tweet_embeddings = pd.DataFrame(columns=['id', 'text'])
for idx, row in all_data.iterrows():
    tweet_embeddings.loc[len(tweet_embeddings)] = pd.Series({'id': row.id, 'text': list(tweet_model.infer_vector(clean(row.text)))})

tweet_embeddings.apply(expand_vector, axis=1).to_csv('tweet_embeddings.csv', index=False)

In [7]:
# tain doc2vec model using text8
# text8_train_data = list(tagged_document([d for d in api.load('wiki-english-20171001')]))
# text8_model = gensim.models.doc2vec.Doc2Vec(vector_size=40, min_count=2, epochs=30)
# text8_model.build_vocab(text8_train_data)
# text8_model.train(text8_train_data, total_examples=text8_model.corpus_count, epochs=text8_model.epochs)
# text8_embeddings = pd.DataFrame(columns=['id', 'text'])
# for idx, row in all_data.iterrows():
#     text8_embeddings.loc[len(text8_embeddings)] = pd.Series({'id': row.id, 'text': list(text8_model.infer_vector(clean(row.text)))})

# text8_embeddings.apply(expand_vector, axis=1).to_csv('text8_embeddings.csv', index=False)

In [6]:
# naive document model using glove-200
naive_glove_embeddings = pd.DataFrame(columns=['id', 'text'])
model = api.load('glove-twitter-200')
for idx, row in all_data.iterrows():
    c = clean(row.text)
    naive_glove_embeddings.loc[len(naive_glove_embeddings)] = pd.Series({'id': row.id, 'text': list(np.sum([model[w] if w in model else np.zeros(200) for w in c], axis=0) / len(c))})

naive_glove_embeddings.drop_duplicates(subset=['id'], keep='first').apply(expand_vector, axis=1).to_csv('naive_glove_embeddings.csv', index=False)

In [11]:
# PCA document model use glove-50
pca_glove_embeddings = pd.DataFrame(columns=['id', 'text'])
model = api.load('glove-twitter-50')
# for idx, row in all_data.iterrows():
    # pca_glove_embeddings.loc[len(pca_glove_embeddings)] = pd.Series({'id': row.id, 'text': np.nan_to_num(np.concatenate([model[w] if w in model else np.zeros(50) for w in clean(row.text)]).flat)})
df, _, __= preprocess(train, remove_stopwords=True, lemmatize=True)
del df['id']
del df['emotions']

def blow_up(x):
    d = dict()
    for i,y in x.iteritems():
        v = model[i] if i in model else np.zeros(50)
        for j in range(len(v)):
            d[f'{i}_{j}'] = v[j]
    return pd.Series(d)

pca_glove_embeddings = df.apply(blow_up, axis=1)

# pca_glove_embeddings = pca_glove_embeddings.apply(expand_vector, axis=1)
print(pca_glove_embeddings)
vector_matrix = np.matrix(pca_glove_embeddings.loc[:, ~pca_glove_embeddings.columns.isin(['id'])])

pca = decomposition.PCA(1200)
transformed_matrix = pca.fit_transform(vector_matrix)

df = pd.DataFrame(transformed_matrix)
df['id'] = train.id

      _befriend_0  _befriend_1  _befriend_2  _befriend_3  _befriend_4  \
0             0.0          0.0          0.0          0.0          0.0   
1             0.0          0.0          0.0          0.0          0.0   
2             0.0          0.0          0.0          0.0          0.0   
3             0.0          0.0          0.0          0.0          0.0   
4             0.0          0.0          0.0          0.0          0.0   
...           ...          ...          ...          ...          ...   
1195          0.0          0.0          0.0          0.0          0.0   
1196          0.0          0.0          0.0          0.0          0.0   
1197          0.0          0.0          0.0          0.0          0.0   
1198          0.0          0.0          0.0          0.0          0.0   
1199          0.0          0.0          0.0          0.0          0.0   

      _befriend_5  _befriend_6  _befriend_7  _befriend_8  _befriend_9  ...  \
0             0.0          0.0          0.0  

  self.explained_variance_ratio_ = self.explained_variance_ / total_var.sum()
  self.explained_variance_ratio_ = self.explained_variance_ / total_var.sum()
  self.explained_variance_ratio_ = self.explained_variance_ / total_var.sum()
  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var


In [12]:

df.to_csv('pca_glove_embeddings.csv', index=False)