In [None]:
import numpy as np
import pandas as pd
import itertools as it
import pickle
import glob
import os
import string
import gc
import re
import time
import nltk
import spacy
import textacy
import en_core_web_md
import sematch
import gensim
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline

from nltk import word_tokenize
from nltk.corpus import wordnet as wn
from tqdm import tqdm, tqdm_notebook

from cleaning_utils import *

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Directory that holds the pre-cleaned question CSVs.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/scripts/features/'

# trdf must hold the TRAIN questions and tedf the TEST questions: later cells
# write trdf-derived features to 'train_*' files and tedf-derived ones to 'test_*'.
# The file names used to be swapped here (trdf <- df_test_*, tedf <- df_train_*),
# which contradicted the commented-out spaCy variant kept below; the column
# slices are unchanged and follow that variant.
# NOTE(review): presumably [:, :-1] strips the label column and [:, 4:] skips
# id/index columns - confirm against the CSV headers.
trdf = pd.read_csv(src + 'df_train_lemmatfullcleanSTEMMED.csv').iloc[:, :-1]
tedf = pd.read_csv(src + 'df_test_lemmatfullcleanSTEMMED.csv').iloc[:, 4:]

# Alternative input: the spaCy-lemmatised files instead of the stemmed ones.
#trdf =  pd.read_csv(src + 'df_train_spacylemmat_fullclean.csv').iloc[:, :-1]
#tedf =  pd.read_csv(src + 'df_test_spacylemmat_fullclean.csv').iloc[:, 4:]

# Replace missing values with the literal string 'NULL' so every question is a str.
# Reassigning (instead of inplace=True) avoids mutating a slice of the frame
# returned by read_csv.
trdf = trdf.fillna('NULL')
tedf = tedf.fillna('NULL')

# Every question from both data sets, in the order:
# train q1, train q2, test q1, test q2.
questions = (
    trdf.question1.tolist() + trdf.question2.tolist()
    + tedf.question1.tolist() + tedf.question2.tolist()
)

# Train-only questions (q1 followed by q2).
tr_questions = trdf.question1.tolist() + trdf.question2.tolist()

In [None]:
# Parse all questions into one textacy corpus using the en_core_web_md pipeline.
corpus = textacy.Corpus(en_core_web_md.load(), texts=questions)

# Lazily produce one term list per document (ngrams=3, named entities included,
# terms rendered as strings) so the lists are consumed as the matrix is built.
terms_per_doc = (
    parsed.to_terms_list(ngrams=3, named_entities=True, as_strings=True)
    for parsed in corpus
)

# Weighting / filtering options for the document-term matrix:
# terms must appear in at least 2 documents and in at most 95% of them.
vsm_options = dict(
    weighting='tfidf',
    normalize=True,
    smooth_idf=True,
    min_df=2,
    max_df=0.95,
)
doc_term_matrix, id2term = textacy.vsm.doc_term_matrix(terms_per_doc, **vsm_options)

print(corpus, doc_term_matrix.shape)

In [None]:
# Per-term statistics derived from the document-term matrix; each result is
# later indexed by the integer term ids found in id2term.
# NOTE(review): doc_term_matrix was built with weighting='tfidf', so these are
# computed from weighted values rather than raw counts - confirm that is intended.
info_content, doc_freq, term_freq = (
    stat_fn(doc_term_matrix)
    for stat_fn in (
        textacy.vsm.get_information_content,
        textacy.vsm.get_doc_freqs,
        textacy.vsm.get_term_freqs,
    )
)

In [None]:
# Row-wise similarity between question1 and question2 for every row of trdf.
tr_feats = pd.DataFrame()

# The two string-based metrics take the raw question strings directly.
for feat_name, metric in (
    ('textacy_jarowinkler', textacy.similarity.jaro_winkler),
    ('textacy_token_sort_ratio', textacy.similarity.token_sort_ratio),
):
    tr_feats[feat_name] = trdf.apply(
        lambda row, metric=metric: metric(row['question1'], row['question2']),
        axis=1,
    )

# word2vec similarity works on parsed documents, so both questions are
# wrapped in a textacy.Doc first.
tr_feats['textacy_word2vec'] = trdf.apply(
    lambda row: textacy.similarity.word2vec(
        textacy.Doc(row['question1'], lang='en'),
        textacy.Doc(row['question2'], lang='en'),
    ),
    axis=1,
)

tr_feats.to_csv('train_textacy_similarity_feats.csv', index=False)

In [None]:
# Row-wise similarity between question1 and question2 for every row of tedf.
te_feats = pd.DataFrame()

# The two string-based metrics take the raw question strings directly.
for feat_name, metric in (
    ('textacy_jarowinkler', textacy.similarity.jaro_winkler),
    ('textacy_token_sort_ratio', textacy.similarity.token_sort_ratio),
):
    te_feats[feat_name] = tedf.apply(
        lambda row, metric=metric: metric(row['question1'], row['question2']),
        axis=1,
    )

# word2vec similarity works on parsed documents, so both questions are
# wrapped in a textacy.Doc first.
te_feats['textacy_word2vec'] = tedf.apply(
    lambda row: textacy.similarity.word2vec(
        textacy.Doc(row['question1'], lang='en'),
        textacy.Doc(row['question2'], lang='en'),
    ),
    axis=1,
)

te_feats.to_csv('test_textacy_similarity_feats.csv', index=False)

In [None]:
t = time.time()

# Term string -> statistic, translating the integer term ids of id2term into
# the values computed from the document-term matrix.
docfreq_dict = {term: doc_freq[term_id] for term_id, term in id2term.items()}
termfreq_dict = {term: term_freq[term_id] for term_id, term in id2term.items()}
info_dict = {term: info_content[term_id] for term_id, term in id2term.items()}

# Document text -> score of the first key term returned by the semantic-network
# ranking; an IndexError (e.g. no key terms returned) falls back to 0.0.
keyterms_dict = {}
for doc in corpus:
    try:
        top_score = textacy.keyterms.key_terms_from_semantic_network(doc)[0][1]
    except IndexError:
        top_score = 0.0
    keyterms_dict[doc.text] = top_score

# Same idea using the SGRank key-term extractor.
keyterms_sgrank_dict = {}
for doc in corpus:
    try:
        top_score = textacy.keyterms.sgrank(doc)[0][1]
    except IndexError:
        top_score = 0.0
    keyterms_sgrank_dict[doc.text] = top_score

print('Time it took to create dictionaries:', time.time() - t)

In [None]:
def apply_dict(x, dict_to_apply):
    """Translate every element of ``x`` through ``dict_to_apply``.

    Each element is converted with ``str()`` and used as the lookup key;
    keys that are missing from the mapping contribute ``0``.

    Parameters
    ----------
    x : iterable
        Items to look up (anything ``str()`` can render).
    dict_to_apply : mapping
        Mapping from string keys to values.

    Returns
    -------
    numpy.ndarray
        The looked-up values, in the same order as ``x``.
    """
    def _lookup(item):
        # Keep item access (not .get) so mappings with custom missing-key
        # behaviour are honoured before falling back to 0.
        try:
            return dict_to_apply[str(item)]
        except KeyError:
            return 0

    return np.array([_lookup(item) for item in x])

In [None]:
# Reload the spaCy-lemmatised train questions (last CSV column dropped) and
# work on a copy so trdf itself stays untouched.
trdf = pd.read_csv(src + 'df_train_spacylemmat_fullclean.csv').iloc[:, :-1]
trdf.fillna('NULL', inplace=True)
df = trdf.copy()

t = time.time()

# Term-level lookup tables and the name fragment each contributes to a column.
term_lookups = (
    ('docfreq', docfreq_dict),
    ('termfreq', termfreq_dict),
    ('infocontent', info_dict),
)

# Columns are created in the order: question1_doc, q1_*, question2_doc, q2_*.
# The positional slice further down relies on that order.
for question_col, prefix in (('question1', 'q1'), ('question2', 'q2')):
    ngram_col = question_col + '_doc'

    # Parse the question and keep its 3-grams as a list.
    df[ngram_col] = df[question_col].apply(
        lambda text: list(textacy.extract.ngrams(textacy.doc.Doc(text, lang='en'), 3))
    )

    # Maximum statistic over the question's 3-grams; 0 when there are none.
    for stat_name, lookup in term_lookups:
        df[prefix + '_' + stat_name + '_max'] = df[ngram_col].apply(
            lambda grams, lookup=lookup: np.max(apply_dict(grams, lookup)) if len(grams) > 0 else 0
        )

    # Document-level key-term scores, matched on the exact question text.
    # NOTE(review): the key-term dictionaries are keyed by the text of the corpus
    # documents; questions whose text is not an exact key map to NaN - confirm
    # this file's text matches the corpus input.
    df[prefix + '_keyterms_max'] = df[question_col].map(keyterms_dict)
    df[prefix + '_sgrank_max'] = df[question_col].map(keyterms_sgrank_dict)

# Keep columns from position 7 onward, then remove the remaining n-gram column.
# NOTE(review): position 7 presumably skips the input columns plus question1_doc;
# this depends on the CSV's column count - confirm.
df2 = df.iloc[:, 7:]
df2 = df2.drop(['question2_doc'], axis=1)
df2.to_csv('train_textacymax_features.csv', index=False)

print('Time it took:', time.time() - t)

In [None]:
# Reload the spaCy-lemmatised train questions (last CSV column dropped) and
# work on a copy so trdf itself stays untouched.
trdf = pd.read_csv(src + 'df_train_spacylemmat_fullclean.csv').iloc[:, :-1]
trdf.fillna('NULL', inplace=True)
df = trdf.copy()

t = time.time()

# Term-level lookup tables and the name fragment each contributes to a column.
term_lookups = (
    ('docfreq', docfreq_dict),
    ('termfreq', termfreq_dict),
    ('infocontent', info_dict),
)

# Columns are created in the order: question1_doc, q1_*, question2_doc, q2_*.
# The positional slice further down relies on that order.
for question_col, prefix in (('question1', 'q1'), ('question2', 'q2')):
    ngram_col = question_col + '_doc'

    # Parse the question and keep its 3-grams as a list.
    df[ngram_col] = df[question_col].apply(
        lambda text: list(textacy.extract.ngrams(textacy.doc.Doc(text, lang='en'), 3))
    )

    # Mean statistic over the question's 3-grams; 0 when there are none.
    for stat_name, lookup in term_lookups:
        df[prefix + '_' + stat_name + '_mean'] = df[ngram_col].apply(
            lambda grams, lookup=lookup: np.mean(apply_dict(grams, lookup)) if len(grams) > 0 else 0
        )

    # Document-level key-term scores, matched on the exact question text.
    # NOTE(review): these '_mean' columns are direct lookups in keyterms_dict /
    # keyterms_sgrank_dict (no averaging happens here), so they carry the same
    # values as a '_max' lookup would - confirm that is intended.
    df[prefix + '_keyterms_mean'] = df[question_col].map(keyterms_dict)
    df[prefix + '_sgrank_mean'] = df[question_col].map(keyterms_sgrank_dict)

# Keep columns from position 7 onward, then remove the remaining n-gram column.
# NOTE(review): position 7 presumably skips the input columns plus question1_doc;
# this depends on the CSV's column count - confirm.
df2 = df.iloc[:, 7:]
df2 = df2.drop(['question2_doc'], axis=1)
df2.to_csv('train_textacymean_features.csv', index=False)

print('Time it took:', time.time() - t)