In [None]:
import gc

import numpy as np
import pandas as pd
import spacy
import textacy
import utils
import utils_clean
import utils_text
from gensim.models import KeyedVectors
from keras.preprocessing import sequence, text
from nltk.corpus import stopwords
from tqdm import tqdm

In [None]:
# Load the train/test splits in 'BasicClean' mode and stack them into a single
# frame with a fresh 0..n-1 index, so features can be computed in one pass.
# NOTE(review): `src` is not defined in any visible cell of this notebook --
# confirm where it is supposed to come from before running on a fresh kernel.
train, test = utils.load_data(src, mode='BasicClean')
data = pd.concat([train, test]).reset_index(drop=True)

# Raw comment strings, train rows first and test rows after.
sentences = [*train.comment_text.tolist(), *test.comment_text.tolist()]

In [None]:
# Load the textacy corpus from disk. It was presumably built earlier with
#     textacy.Corpus(spacy.load('en'), texts=sentences)
# (call kept here for reference only).
corpus = textacy.Corpus.load('../data/features/feature_Textacy_Corpus')

# Lazily yield, per document, its unigram and named-entity terms as strings.
# This is a generator, so it can be consumed only once (by fit_transform below).
terms_list = (
    doc.to_terms_list(ngrams=1, named_entities=True, as_strings=True)
    for doc in corpus
)

# TF-IDF weighted, normalized document-term matrix over at most 100000 terms.
vectorizer = textacy.vsm.Vectorizer(
    weighting='tfidf',
    normalize=True,
    smooth_idf=True,
    min_df=2,
    max_df=0.95,
    max_n_terms=100000,
)

doc_term_matrix = vectorizer.fit_transform(terms_list)

# Mapping from matrix column index to the term string it represents.
id2term = vectorizer.id_to_term

In [None]:
# Per-term statistics derived from the document-term matrix; each result is
# indexed by matrix column, i.e. by the integer keys of `id2term`.
# NOTE(review): doc_term_matrix is tf-idf weighted and normalized, so
# `term_freq` presumably aggregates weights rather than raw counts -- confirm.
term_freq = textacy.vsm.get_term_freqs(doc_term_matrix)
doc_freq = textacy.vsm.get_doc_freqs(doc_term_matrix)
info_content = textacy.vsm.get_information_content(doc_term_matrix)

In [None]:
import time
from textacy import keyterms

t = time.time()

# Re-key the per-column statistics by term string so they can be looked up
# by text later on: {term: statistic for that term's matrix column}.
docfreq_dict = {term: doc_freq[idx] for idx, term in id2term.items()}
termfreq_dict = {term: term_freq[idx] for idx, term in id2term.items()}
info_dict = {term: info_content[idx] for idx, term in id2term.items()}

# For every document, store element [0][1] of the semantic-network key-term
# result (presumably the score of the top-ranked term -- confirm against the
# textacy docs), keyed by the document's full text. Documents for which the
# call yields nothing (IndexError) or raises ValueError fall back to 0.0.
keyterms_dict = {}
for doc in corpus:
    try:
        keyterms_dict[doc.text] = keyterms.key_terms_from_semantic_network(doc)[0][1]
    except (IndexError, ValueError):
        keyterms_dict[doc.text] = 0.0

# Disabled: an SGRank-based variant used to be sketched here, building
# `keyterms_sgrank_dict` from keyterms.sgrank(doc)[0][1] with the same
# IndexError/ValueError -> 0.0 fallback.

print('Time it took to create dictionaries:', time.time() - t)

In [None]:
def apply_dict(x, dict_to_apply):
    """Map every item of ``x`` through ``dict_to_apply`` using its string form.

    Args:
        x: Iterable of items; each one is converted with ``str()`` before lookup.
        dict_to_apply: Mapping from string keys to values.

    Returns:
        A numpy array with one entry per item of ``x``, in order. Items whose
        string form is not a key of ``dict_to_apply`` contribute ``0``.
    """
    def _lookup(item):
        # Missing keys are not an error: they simply score 0.
        try:
            return dict_to_apply[str(item)]
        except KeyError:
            return 0

    return np.array([_lookup(item) for item in x])

In [None]:
df_features = pd.DataFrame()

# Parse every comment into a textacy Doc, then keep the list of its 3-grams.
# The original code wrote this column to `df_features` but read it back from
# `data['comment_text_doc']`, which is never created, so it raised KeyError.
# All lookups below now go through the series actually computed here.
comment_docs = data['comment_text'].apply(lambda x: textacy.doc.Doc(x, lang='en'))
comment_ngrams = comment_docs.apply(lambda x: list(textacy.extract.ngrams(x, 3)))
df_features['comment_text_doc'] = comment_ngrams

# NOTE(review): the lookup tables are keyed by the vectorizer's terms (built
# with ngrams=1 plus named entities), whereas the keys looked up here are the
# string forms of 3-grams; many lookups presumably fall back to 0 -- confirm
# that this is intended.
term_stats = (
    ('docfreq', docfreq_dict),
    ('termfreq', termfreq_dict),
    ('infocontent', info_dict),
)

# Column order matches the original cell: all *_max columns, then all *_mean.
for suffix, reducer in (('max', np.max), ('mean', np.mean)):
    for stat_name, lookup in term_stats:
        # Comments without any 3-gram get 0 instead of reducing an empty array.
        df_features['q1_{}_{}'.format(stat_name, suffix)] = comment_ngrams.apply(
            lambda x: reducer(apply_dict(x, lookup)) if len(x) > 0 else 0
        )
    # The key-term score is a single value per comment, so the "max" and
    # "mean" columns are identical by construction (as in the original).
    df_features['q1_keyterms_{}'.format(suffix)] = data['comment_text'].map(keyterms_dict)

# Disabled SGRank features (need `keyterms_sgrank_dict`, which is not built):
#df_features['q1_sgrank_mean'] = data['comment_text'].map(keyterms_sgrank_dict)
#df_features['q1_sgrank_max'] = data['comment_text'].map(keyterms_sgrank_dict)

df_features

In [None]:
# Persist the feature frame as a pickle for downstream use.
# NOTE(review): df_features still carries the 'comment_text_doc' column of
# n-gram objects produced by textacy.extract.ngrams -- confirm these pickle
# cleanly, or drop that column before saving.
df_features.to_pickle('../data/features/data_TextacyFeatures.pkl')