In [1]:
! pip install numpy pandas scikit-learn matplotlib xgboost gensim fuzzywuzzy nltk python-Levenshtein



In [2]:
import re
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import xgboost as xgb

from nltk import word_tokenize, download
from nltk.corpus import stopwords
from tqdm import tqdm_notebook
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
from sklearn.model_selection import cross_validate, ShuffleSplit
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from scipy import sparse
from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
from gensim.models import Word2Vec, KeyedVectors
from fuzzywuzzy import fuzz

# NLTK resources used further down: `word_tokenize` and `stopwords.words('english')`
# (presumably backed by the 'punkt' and 'stopwords' packages respectively).
for nltk_resource in ('punkt', 'stopwords'):
    download(nltk_resource)


# Training question pairs; the four columns are renamed to short working names.
train_csv_path = '../assist_material/datasets/extracted/q2b/train.csv'
train_df = pd.read_csv(train_csv_path, sep=',')
train_df.columns = ['id', 'q1', 'q2', 'is_dup']

# Pre-trained word2vec embeddings in the binary word2vec format.
word2vec_path = './word2Vec/GoogleNews-vectors-negative300.bin.gz'
norm_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
# NOTE(review): init_sims(replace=True) is expected to normalise the vectors in place;
# confirm it is still supported by the installed gensim version.
norm_model.init_sims(replace=True)


[nltk_data] Downloading package punkt to /home/vangelis/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/vangelis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


KeyboardInterrupt: 

In [None]:
def norm_wmd(q1, q2):
    """
    Word Mover's Distance (WMD) between two questions, computed with the module-level ``norm_model``.

    WMD uses word embeddings to measure the distance between two texts, so it can produce a meaningful value
    even when the questions share no common word. The assumption is that similar words have similar vectors.

    :param q1: First question; converted with ``str()``, lower-cased and split on whitespace.
    :param q2: Second question; pre-processed exactly like ``q1``.
    :return: The value returned by ``norm_model.wmdistance`` for the two stop-word-filtered token lists.
    """
    # A set gives O(1) membership tests; the original scanned the stop-word list once per token.
    stop_words = set(stopwords.words('english'))
    tokens_q1 = [w for w in str(q1).lower().split() if w not in stop_words]
    tokens_q2 = [w for w in str(q2).lower().split() if w not in stop_words]
    return norm_model.wmdistance(tokens_q1, tokens_q2)


def sent2vec(s):
    """
    Embed a sentence as the L2-normalised sum of its word vectors taken from the module-level ``norm_model``.

    The text is converted with ``str()``, lower-cased and tokenised with ``word_tokenize``. English stop words
    and non-alphabetic tokens are dropped, and tokens that ``norm_model`` does not know are skipped.

    :param s: The sentence to embed (any object; it is converted with ``str()``).
    :return: The summed word vectors divided by their Euclidean norm. When no token survives the filtering
             or none is found in the model, a scalar NaN is returned.
    """
    # A set gives O(1) membership tests instead of a list scan per token.
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in word_tokenize(str(s).lower()) if w not in stop_words and w.isalpha()]

    vectors = []
    for token in tokens:
        try:
            vectors.append(norm_model[token])
        except KeyError:
            # Out-of-vocabulary token: skip it. Any other error is a real problem and must not be swallowed.
            continue

    if not vectors:
        # Nothing to sum. The original code reached the same scalar NaN through a 0/0 division
        # (emitting a RuntimeWarning); return it explicitly instead.
        return np.float64('nan')

    v = np.array(vectors).sum(axis=0)
    return v / np.sqrt((v ** 2).sum())


def feature_engineering(df):
    """
    Add length, word-overlap and fuzzy-matching features for every question pair, directly on ``df``.

    Each question is converted with ``str()`` before any feature is computed. Columns are created in this
    order: ``len_q1``, ``len_q2``, ``diff_len``, ``len_char_q1``, ``len_char_q2``, ``len_word_q1``,
    ``len_word_q2``, ``common_words`` and then one ``fuzz_*`` column per fuzzywuzzy scorer.

    :param df: The dataframe that contains the questions in its ``q1`` and ``q2`` columns (modified in place)
    :return: None
    """
    def char_count(value):
        return len(str(value))

    def distinct_char_count(value):
        # Number of different non-space characters (case-sensitive).
        return len(set(str(value).replace(' ', '')))

    def word_count(value):
        return len(str(value).split())

    def text_pair(row):
        return str(row['q1']), str(row['q2'])

    def shared_word_count(row):
        # Case-insensitive overlap of the whitespace-separated words of both questions.
        first, second = text_pair(row)
        return len(set(first.lower().split()) & set(second.lower().split()))

    df['len_q1'] = df['q1'].apply(char_count)
    df['len_q2'] = df['q2'].apply(char_count)
    df['diff_len'] = df['len_q1'] - df['len_q2']
    df['len_char_q1'] = df['q1'].apply(distinct_char_count)
    df['len_char_q2'] = df['q2'].apply(distinct_char_count)
    df['len_word_q1'] = df['q1'].apply(word_count)
    df['len_word_q2'] = df['q2'].apply(word_count)
    df['common_words'] = df.apply(shared_word_count, axis=1)

    # One output column per fuzzywuzzy scorer, each applied row by row to the (q1, q2) text pair.
    fuzzy_scorers = (
        ('fuzz_ratio', fuzz.ratio),
        ('fuzz_partial_ratio', fuzz.partial_ratio),
        ('fuzz_partial_token_set_ratio', fuzz.partial_token_set_ratio),
        ('fuzz_partial_token_sort_ratio', fuzz.partial_token_sort_ratio),
        ('fuzz_token_set_ratio', fuzz.token_set_ratio),
        ('fuzz_token_sort_ratio', fuzz.token_sort_ratio),
    )
    for column, scorer in fuzzy_scorers:
        df[column] = df.apply(lambda row, scorer=scorer: scorer(*text_pair(row)), axis=1)


def _embed_questions(questions, vector_size):
    """Stack the ``sent2vec`` embedding of every question into one array with NaN/inf replaced by finite numbers."""
    vectors = np.zeros((len(questions), vector_size))
    for i, question in enumerate(tqdm_notebook(questions)):
        # A scalar NaN from sent2vec is broadcast over the whole row here.
        vectors[i, :] = sent2vec(question)
    return np.nan_to_num(vectors)


def distances(df, vector_size=300):
    """
    Add embedding-based distance and shape features for every question pair, directly on ``df``.

    Both questions of each row are embedded with ``sent2vec``; non-finite entries are replaced via
    ``np.nan_to_num`` once, and the cleaned vectors feed every feature. Columns added, in order:
    ``cosine_distance``, ``cityblock_distance``, ``jaccard_distance``, ``canberra_distance``,
    ``euclidean_distance``, ``minkowski_distance`` (order 3), ``braycurtis_distance``, ``skew_q1vec``,
    ``skew_q2vec``, ``kur_q1vec`` and ``kur_q2vec``.

    :param df: The dataframe that contains the questions in its ``q1`` and ``q2`` columns (modified in place)
    :param vector_size: Length of the vectors produced by ``sent2vec``; defaults to 300, the value that was
                        previously hard-coded. It must match the embedding model in use.
    :return: None
    """
    # Clean each matrix a single time; the original re-ran np.nan_to_num on both matrices for every feature.
    q1_vectors = _embed_questions(df.q1.values, vector_size)
    q2_vectors = _embed_questions(df.q2.values, vector_size)

    pairwise_metrics = (
        ('cosine_distance', cosine),
        ('cityblock_distance', cityblock),
        ('jaccard_distance', jaccard),
        ('canberra_distance', canberra),
        ('euclidean_distance', euclidean),
        ('minkowski_distance', lambda x, y: minkowski(x, y, 3)),
        ('braycurtis_distance', braycurtis),
    )
    for column, metric in pairwise_metrics:
        df[column] = [metric(x, y) for x, y in zip(q1_vectors, q2_vectors)]

    # Distribution-shape statistics of each individual sentence vector.
    df['skew_q1vec'] = [skew(x) for x in q1_vectors]
    df['skew_q2vec'] = [skew(x) for x in q2_vectors]
    df['kur_q1vec'] = [kurtosis(x) for x in q1_vectors]
    df['kur_q2vec'] = [kurtosis(x) for x in q2_vectors]