In [0]:
# === Load data from GCS
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# GCS coordinates; bucket_name is read by download_from_GCS / upload_to_GCS.
project_id = 'amazing-firefly-153908'
bucket_name = 'nlp_final'

# Obtain authorization
from google.colab import auth
auth.authenticate_user()

# Download data from GCS to tmp
from googleapiclient.discovery import build
# Client for the 'storage' v1 API, shared by the download/upload helpers.
gcs_service = build('storage', 'v1')
from apiclient.http import MediaIoBaseDownload

In [0]:
def download_from_GCS(file_name):
  """Copy the bucket object named `file_name` into a local file of the same name.

  Uses the module-level `gcs_service` client and `bucket_name`, and prints
  a confirmation line once the last chunk has been written.

  Args:
    file_name: Object name in the bucket; also the local path written to.
  """
  with open(file_name, 'wb') as local_file:
    downloader = MediaIoBaseDownload(
        local_file,
        gcs_service.objects().get_media(bucket=bucket_name,
                                        object=file_name))

    # Pull chunks until the downloader reports completion; the progress
    # object returned alongside the flag is deliberately ignored.
    finished = False
    while not finished:
      _progress, finished = downloader.next_chunk()
  print(file_name + ' downloaded')

In [0]:
from googleapiclient.http import MediaFileUpload
def upload_to_GCS(file_name, mimetype='text/csv'):
  """Upload the local file `file_name` to the bucket under the same object name.

  Uses the module-level `gcs_service` client and `bucket_name`, and prints
  a confirmation line when the upload has finished.

  Args:
    file_name: Local path of the file; also used as the object name.
    mimetype: Content type handed to MediaFileUpload. Defaults to
      'text/csv', the value that used to be hard-coded, so existing
      callers are unaffected; pass e.g. 'application/octet-stream' for
      non-CSV payloads such as pickles.
  """
  media = MediaFileUpload(file_name,
                          mimetype=mimetype,
                          resumable=True)

  request = gcs_service.objects().insert(bucket=bucket_name,
                                         name=file_name,
                                         media_body=media)

  response = None
  while response is None:
    # Keep sending chunks until next_chunk() hands back a non-None
    # response; the progress object is ignored.
    _, response = request.next_chunk()
  print(file_name + ' uploaded')

In [0]:
import numpy as np
import pandas as pd

from tqdm.auto import tqdm
tqdm.pandas()

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine, cityblock, euclidean

import time

import pickle

In [0]:
def get_time():
  """Return the current time shifted to UTC+8 as a zero-padded 'HH:MM:SS' string.

  The eight-hour offset is added to the epoch seconds *before* the
  conversion, so the hour always wraps correctly into 00-23 (the previous
  `hour > 24` check let '24:..' through at 16:00 UTC). Every field is
  zero-padded, e.g. '22:45:09' rather than '22:45:9'.

  Returns:
    str: the formatted time.
  """
  shifted = time.gmtime(time.time() + 8 * 3600)
  return time.strftime('%H:%M:%S', shifted)

In [0]:
def read_as_data(store_path):
  """Load a CSV into a DataFrame, report missing values, and blank them out.

  Prints the per-column count of nulls, then returns the frame with every
  null replaced by a single space.

  Args:
    store_path: Path of the CSV file to read.

  Returns:
    pandas.DataFrame with no null cells.
  """
  frame = pd.read_csv(store_path)
  null_counts = frame.isnull().sum()
  print(null_counts)
  return frame.fillna(" ")

In [0]:
def store_featured_data(data, suffix, file_name):
  """Write `data` to a CSV whose name is derived from `file_name`.

  The last eight characters of `file_name` are dropped (for an input such
  as 'x_train_tk_cvt.csv' that removes '_cvt.csv'), then `suffix` and
  '.csv' are appended. The frame is saved without its index.

  Args:
    data: DataFrame to save.
    suffix: Text inserted before the '.csv' extension.
    file_name: Name of the source file the new name is based on.

  Returns:
    str: the name of the file that was written.
  """
  stem = file_name[:-8]
  new_file_name = "".join((stem, suffix, '.csv'))
  data.to_csv(new_file_name, index=False)
  return new_file_name

In [0]:
#stop_words
from nltk.corpus import stopwords
import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords

# Fetch the NLTK resources for this session. 'stopwords' backs the
# stopwords.words('english') calls below; 'punkt' is presumably what
# word_tokenize needs (TODO confirm). 'wordnet' and the perceptron tagger are
# not referenced by any code visible in this notebook.
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [0]:
import gensim

In [0]:
!wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

--2019-08-21 13:46:38--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.16.139
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.16.139|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘GoogleNews-vectors-negative300.bin.gz’


2019-08-21 13:48:16 (16.3 MB/s) - ‘GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]



In [0]:
# Load the GoogleNews word2vec vectors (binary format) from the archive
# fetched by the wget cell.
# NOTE(review): `model` is reassigned from model.pickle in the next cell, so
# only one of the two loads is needed per run; confirm which is intended.
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)


In [0]:
download_from_GCS('model.pickle')
# Load the saved model. The context manager closes the file handle, which the
# previous bare open() never did.
# NOTE: pickle.load can execute arbitrary code; only unpickle files that come
# from a bucket you control.
with open("model.pickle", "rb") as model_file:
  model = pickle.load(model_file)

model.pickle downloaded


In [0]:
download_from_GCS('norm_model.pickle')
# Load the saved norm_model. The context manager closes the file handle, which
# the previous bare open() never did.
# NOTE: pickle.load can execute arbitrary code; only unpickle files that come
# from a bucket you control.
with open("norm_model.pickle", "rb") as model_file:
  norm_model = pickle.load(model_file)

norm_model.pickle downloaded


In [0]:
# Smoke test of Word Mover's Distance on two toy token lists.
# NOTE(review): init_sims(replace=True) is applied to `model` itself, and a
# later cell pickles that same object. The recorded feature table further down
# shows identical `wmd` and `norm_wmd` values on every displayed row; confirm
# whether `wmd` was meant to use an un-normalized model.
model.init_sims(replace=True) # normalizes vectors
distance = model.wmdistance("string 1".split(), "string 2".split())  # Compute WMD as normal.

In [0]:
distance

0.1968112707310915

In [0]:
# Persist the word-vector model. The context manager closes (and flushes) the
# file so it is complete on disk before any later upload.
with open("model.pickle", "wb") as model_file:
  pickle.dump(model, model_file)

In [0]:
# Upload the pickled model through the shared helper instead of a hand-copied
# upload loop; the helper prints the same '<file_name> uploaded' line.
file_name = "model.pickle"
upload_to_GCS(file_name)

model.pickle uploaded


In [0]:
# Save the w2v norm_model, then upload it. The context manager closes (and
# flushes) the file before the upload reads it, and the shared helper replaces
# the hand-copied upload loop.
file_name = "norm_model.pickle"
with open(file_name, "wb") as model_file:
  pickle.dump(norm_model, model_file)
upload_to_GCS(file_name)

norm_model.pickle uploaded


In [0]:
print(distance)

0.1968112707310915


In [0]:
from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
from nltk import word_tokenize
stop_words = stopwords.words('english')

In [0]:
# Holds the English stop-word set once it has been loaded (empty until then).
_STOP_WORDS_CACHE = []


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them once.

    The list used to be re-read on every wmd()/norm_wmd() call and searched
    linearly; caching it as a set keeps the same membership results while
    doing the load a single time.
    """
    if not _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE.append(frozenset(stopwords.words('english')))
    return _STOP_WORDS_CACHE[0]


def _content_tokens(sentence):
    """Lower-case `sentence`, split it on whitespace and drop stop words."""
    stop = _english_stop_words()
    return [w for w in str(sentence).lower().split() if w not in stop]


def wmd(s1, s2):
    """Word Mover's Distance between two sentences using the global `model`.

    Both inputs are passed through str(), lower-cased, whitespace-split and
    stripped of English stop words before the distance is computed.

    Args:
        s1: First sentence (any object; str() is applied).
        s2: Second sentence (any object; str() is applied).

    Returns:
        Whatever `model.wmdistance` returns for the two token lists.
    """
    return model.wmdistance(_content_tokens(s1), _content_tokens(s2))


def norm_wmd(s1, s2):
    """Same as wmd(), but computed with the global `norm_model`.

    Args:
        s1: First sentence (any object; str() is applied).
        s2: Second sentence (any object; str() is applied).

    Returns:
        Whatever `norm_model.wmdistance` returns for the two token lists.
    """
    return norm_model.wmdistance(_content_tokens(s1), _content_tokens(s2))


def sent2vec(s):
    """Turn a sentence into a unit-length sum of its word vectors.

    The text is lower-cased and tokenized with word_tokenize; tokens found in
    the global `stop_words` or containing non-alphabetic characters are
    dropped. The vectors of the remaining tokens are looked up in the global
    `model`, summed, and divided by the L2 norm of the sum.

    Args:
        s: Sentence (any object; str() is applied).

    Returns:
        The normalized vector. When no token yields a vector the sum is 0 and
        the division produces NaN; callers clean that up with np.nan_to_num.
    """
    words = str(s).lower()
    words = word_tokenize(words)
    words = [w for w in words if w not in stop_words and w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(model[w])
        except KeyError:
            # Token has no entry in the embedding vocabulary; skip it. Only
            # KeyError is caught so that real failures (and Ctrl-C) are no
            # longer silently swallowed as the old bare `except:` did.
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    return v / np.sqrt((v ** 2).sum())

In [0]:
# Alternative source for norm_model (kept disabled; it is loaded from
# norm_model.pickle in an earlier cell instead):
# norm_model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)
# Normalize norm_model's vectors (replace=True); norm_wmd() reads this global.
norm_model.init_sims(replace=True)

In [0]:
from tqdm._tqdm_notebook import tqdm_notebook

tqdm_notebook.pandas()

# train w2v

In [0]:
def ft_w2v(file_name):
  """Compute word2vec-based pair features for a question CSV and upload them.

  Reads `file_name` with read_as_data(), adds the columns wmd, norm_wmd,
  cos_w2v, L1_w2v, canberra_w2v, L2_w2v, minkowski_w2v, braycurtis_w2v,
  skew_q1vec, skew_q2vec, kur_q1vec and kur_q2vec (in that order), writes the
  frame with store_featured_data() and uploads that file with upload_to_GCS().

  Args:
    file_name: Local CSV path; the frame must provide 'question1' and
      'question2' columns.

  Note:
    The output file name is built from the module-level `suffix`, which must
    be set before this function is called.
  """
  data = read_as_data(file_name)
  print(file_name + ' read as data')

  print('getting w2v features started at ' + get_time())
  data['wmd'] = data.progress_apply(lambda x: wmd(x['question1'], x['question2']), axis=1)
  data['norm_wmd'] = data.progress_apply(lambda x: norm_wmd(x['question1'], x['question2']), axis=1)

  print('vectoring started at ' + get_time())
  # One row per question, 300 columns per row (assumes 300-d embeddings, as the
  # "negative300" vectors file name suggests).
  question1_vectors = np.zeros((data.shape[0], 300))
  # tqdm wraps the values rather than the enumerate object so it knows the
  # total and can draw a real progress bar.
  for i, q in enumerate(tqdm(data.question1.values)):
    question1_vectors[i, :] = sent2vec(q)

  question2_vectors = np.zeros((data.shape[0], 300))
  for i, q in enumerate(tqdm(data.question2.values)):
    question2_vectors[i, :] = sent2vec(q)

  # sent2vec yields NaN when a question has no usable words; clean both
  # matrices once here instead of re-running nan_to_num for every feature.
  question1_vectors = np.nan_to_num(question1_vectors)
  question2_vectors = np.nan_to_num(question2_vectors)

  # (column name, distance function) pairs, in output column order.
  pair_metrics = (
      ('cos_w2v', cosine),
      ('L1_w2v', cityblock),
      ('canberra_w2v', canberra),
      ('L2_w2v', euclidean),
      ('minkowski_w2v', lambda x, y: minkowski(x, y, 3)),
      ('braycurtis_w2v', braycurtis),
  )
  for column, metric in pair_metrics:
    # The log line now names the feature actually computed (the braycurtis
    # step used to announce itself as "jaccard_w2v").
    print('getting ' + column + ' feature started at ' + get_time())
    data[column] = [metric(x, y) for (x, y) in zip(question1_vectors,
                                                   question2_vectors)]

  print('getting skew_q1vec feature started at ' + get_time())
  data['skew_q1vec'] = [skew(x) for x in question1_vectors]
  data['skew_q2vec'] = [skew(x) for x in question2_vectors]
  data['kur_q1vec'] = [kurtosis(x) for x in question1_vectors]
  data['kur_q2vec'] = [kurtosis(x) for x in question2_vectors]

  print('getting w2v features ended at ' + get_time())

  new_file_name = store_featured_data(data, suffix, file_name)

  upload_to_GCS(new_file_name)

In [0]:
# ft_w2v() reads the module-level `suffix` when naming its output file, so it
# has to be set before the call.
suffix = '_ft_w2v'
ft_w2v('x_train_tk_cvt.csv')

id            0
question1    16
question2    10
dtype: int64
x_train_tk_cvt.csv read as data
getting w2v features started at 22:27:35


HBox(children=(IntProgress(value=0, max=404290), HTML(value='')))




HBox(children=(IntProgress(value=0, max=404290), HTML(value='')))


vectoring started at 22:41:26


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


getting cos_w2v feature started at 22:44:17
getting L1_w2v feature started at 22:44:40
getting canberra_w2v feature started at 22:44:46
getting L2_w2v feature started at 22:45:0
getting minkowski_w2v feature started at 22:45:9
getting jaccard_w2v feature started at 22:45:30
getting skew_q1vec feature started at 22:45:38
getting w2v features ended at 22:50:12
x_train_tk_ft_w2v.csv uploaded


In [0]:
train_ft_w2v = pd.read_csv('/tmp/x_train_tk_ft_w2v.csv')
train_ft_w2v.head()

Unnamed: 0,id,question1,question2,wmd,norm_wmd,cos_w2v,L1_w2v,canberra_w2v,L2_w2v,minkowski_w2v,braycurtis_w2v,skew_q1vec,skew_q2vec,kur_q1vec,kur_q2vec
0,0,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,0.198042,0.198042,0.037908,3.774843,75.949313,0.275348,0.125323,0.137314,0.008893,-0.099771,0.108845,0.344742
1,1,what is the story of kohinoor koh i noor diamond,what would happen if the indian government sto...,0.87794,0.87794,0.376926,12.33788,168.641909,0.868246,0.389172,0.498692,0.006689,0.118289,0.185005,0.206283
2,2,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...,0.694896,0.694896,0.215223,8.840496,135.849172,0.656084,0.305829,0.332821,0.247069,0.15255,0.0429,-0.489377
3,3,why am i mentally very lonely how can i solve it,find the remainder when math 23 24 math is div...,1.261312,1.261312,0.635212,15.691485,189.60097,1.127131,0.501912,0.677309,0.013645,0.068055,-0.230252,-0.102497
4,4,which one dissolve in water quickly sugar salt...,which fish would survive in salt water,0.998063,0.998063,0.33355,11.23584,149.646926,0.816762,0.366127,0.43538,-0.082484,0.074702,-0.237995,-0.360966


In [0]:
train_ft_w2v = train_ft_w2v.drop(['id', 'question1', 'question2'], axis=1)
train_ft_w2v.head()

Unnamed: 0,wmd,norm_wmd,cos_w2v,L1_w2v,canberra_w2v,L2_w2v,minkowski_w2v,braycurtis_w2v,skew_q1vec,skew_q2vec,kur_q1vec,kur_q2vec
0,0.198042,0.198042,0.037908,3.774843,75.949313,0.275348,0.125323,0.137314,0.008893,-0.099771,0.108845,0.344742
1,0.87794,0.87794,0.376926,12.33788,168.641909,0.868246,0.389172,0.498692,0.006689,0.118289,0.185005,0.206283
2,0.694896,0.694896,0.215223,8.840496,135.849172,0.656084,0.305829,0.332821,0.247069,0.15255,0.0429,-0.489377
3,1.261312,1.261312,0.635212,15.691485,189.60097,1.127131,0.501912,0.677309,0.013645,0.068055,-0.230252,-0.102497
4,0.998063,0.998063,0.33355,11.23584,149.646926,0.816762,0.366127,0.43538,-0.082484,0.074702,-0.237995,-0.360966


In [0]:
# Append the w2v feature columns to the full training feature table.
# NOTE(review): the only assignment of train_ft_all visible in this notebook
# is in a later cell (read from /tmp/train_ft_all.csv), so this cell appears to
# rely on out-of-order execution. It is also not idempotent: re-running it
# appends the w2v columns again. Confirm the intended order.
train_ft_all = pd.concat([train_ft_all, train_ft_w2v], axis=1)
train_ft_all.head()

Unnamed: 0,len_word_dff_org,len_word_dff_rt_org,len_char_dff_org,len_char_dff_rt_org,common_words_org,common_rate_org,jaccard_similarity_org,tfidf_L1_org,tfidf_L2_org,tfidf_cosine_org,len_word_dff_stm,len_word_dff_rt_stm,len_char_dff_stm,len_char_dff_rt_stm,common_words_stm,common_rate_stm,jaccard_similarity_stm,tfidf_L1_stm,tfidf_L2_stm,tfidf_cosine_stm,len_word_dff_lmtz,len_word_dff_rt_lmtz,len_char_dff_lmtz,len_char_dff_rt_lmtz,common_words_lmtz,common_rate_lmtz,jaccard_similarity_lmtz,tfidf_L1_lmtz,tfidf_L2_lmtz,tfidf_cosine_lmtz,encode_L1,encode_L2,encode_cos,encode_cos_log,common_words_stwd,common_rate_stwd,jaccard_similarity_stwd,tfidf_L1_stwd,tfidf_L2_stwd,cos_glv,L1_glv,L2_glv,same_start_word,diff_sen_neg,diff_sen_neu,diff_sen_pos,diff_sen_com,fuzz_qratio,fuzz_WRatio,fuzz_partial_ratio,fuzz_partial_token_set_ratio,fuzz_partial_token_sort_ratio,fuzz_token_set_ratio,fuzz_token_sort_ratio,repeat,tfidf_cosine_stwd,wmd,norm_wmd,cos_w2v,L1_w2v,canberra_w2v,L2_w2v,minkowski_w2v,braycurtis_w2v,skew_q1vec,skew_q2vec,kur_q1vec,kur_q2vec
0,1.098612,0.143101,2.079442,0.134819,2.484907,0.916667,0.613104,0.308918,0.042975,0.978513,1.098612,0.143101,2.079442,0.137471,2.484907,0.916667,0.613104,0.336295,0.050317,0.974841,1.098612,0.143101,2.079442,0.137471,2.484907,0.916667,0.613104,0.336295,0.050317,0.974841,2.133778,0.269064,0.963802,0.674883,1.791759,0.833333,0.538997,0.205387,0.036422,0.031762,3.456409,0.25204,1,0.0,0.022,0.022,0.0,93,95,100,100,89,100,93,0,0.981789,0.198042,0.198042,0.037908,3.774843,75.949313,0.275348,0.125323,0.137314,0.008893,-0.099771,0.108845,0.344742
1,1.791759,0.336472,3.496508,0.458575,2.079442,0.56,0.328504,1.218322,0.499939,0.75003,1.791759,0.336472,3.367296,0.424157,2.079442,0.56,0.328504,1.209643,0.486068,0.756966,1.791759,0.336472,3.367296,0.424157,2.079442,0.56,0.328504,1.209611,0.486033,0.756984,3.548461,0.724755,0.737365,0.55237,1.609438,0.5,0.287682,1.101979,0.475765,0.266555,8.548248,0.730144,1,0.0,0.075,0.075,0.0,65,86,73,100,75,86,63,0,0.762118,0.87794,0.87794,0.376926,12.33788,168.641909,0.868246,0.389172,0.498692,0.006689,0.118289,0.185005,0.206283
2,1.609438,0.287682,2.397895,0.169899,1.609438,0.32,0.174353,1.667734,1.490253,0.254874,1.609438,0.287682,2.397895,0.189242,1.791759,0.4,0.223144,1.512309,1.215205,0.392397,1.609438,0.287682,2.397895,0.192904,1.791759,0.4,0.223144,1.512309,1.215203,0.392398,2.765091,0.4568,0.895667,0.639571,1.098612,0.333333,0.182322,1.430197,1.44716,0.1189,6.461987,0.487647,1,0.0,0.016,0.016,0.045,54,63,53,100,71,66,66,0,0.27642,0.694896,0.694896,0.215223,8.840496,135.849172,0.656084,0.305829,0.332821,0.247069,0.15255,0.0429,-0.489377
3,1.098612,0.154151,2.302585,0.192078,0.0,0.0,0.0,1.855883,2.0,0.0,1.098612,0.154151,2.397895,0.233615,0.0,0.0,0.0,1.859033,2.0,0.0,1.098612,0.154151,2.397895,0.233615,0.0,0.0,0.0,1.860761,2.0,0.0,4.717258,1.238057,0.233607,0.209942,0.0,0.0,0.0,1.615345,2.0,0.619671,14.745557,1.113257,0,0.241,0.396,0.155,0.2484,36,36,40,37,38,36,36,0,0.0,1.261312,1.261312,0.635212,15.691485,189.60097,1.127131,0.501912,0.677309,0.013645,0.068055,-0.230252,-0.102497
4,1.94591,0.470004,3.433987,0.493658,1.609438,0.380952,0.211309,1.674485,1.443889,0.278056,1.94591,0.470004,3.367296,0.483797,1.609438,0.380952,0.211309,1.671665,1.428437,0.285782,1.94591,0.470004,3.367296,0.483797,1.609438,0.380952,0.211309,1.671665,1.428437,0.285782,4.118772,0.973915,0.525745,0.422483,1.098612,0.25,0.133531,1.649376,1.504143,0.211085,8.885255,0.649746,1,0.0,0.0,0.0,0.0,45,86,55,100,63,67,46,0,0.247928,0.998063,0.998063,0.33355,11.23584,149.646926,0.816762,0.366127,0.43538,-0.082484,0.074702,-0.237995,-0.360966


In [0]:
train_ft_all.shape[1]

68

In [0]:
# Save the combined training features and upload them.
# NOTE(review): the CSV is written under /tmp/ but upload_to_GCS receives the
# bare file name; presumably the kernel's working directory is /tmp, otherwise
# a different (or missing) file is uploaded. Confirm.
train_ft_all.to_csv('/tmp/train_ft_all.csv', index=False)
upload_to_GCS('train_ft_all.csv')

train_ft_all.csv uploaded


# test w2v

In [0]:
# NOTE(review): ft_bsc is not defined in any cell visible in this notebook and
# this cell has no recorded output; it looks like a leftover from another
# notebook. Confirm before running (it also changes the global `suffix` that
# ft_w2v() reads).
suffix = '_ft_bsc'
ft_bsc('x_test_tk_cvt.csv')

In [0]:
download_from_GCS('x_test_tk_cvt.csv')
test_data = pd.read_csv('x_test_tk_cvt.csv')
test_data.head()

x_test_tk_cvt.csv downloaded


Unnamed: 0,test_id,question1,question2
0,0,how does the surface pro himself 4 compare wit...,why did microsoft choose core m3 and not core ...
1,1,should i have a hair transplant at age 24 how ...,how much cost does hair transplant require
2,2,what but is the best way to send money from ch...,what you send money to china
3,3,which food not emulsifiers,what foods fibre
4,4,how aberystwyth start reading,how their can i start reading


In [0]:
test_data = test_data.fillna(" ")


In [0]:

# Number of test rows processed per call to compute_ft_in_batch.
batch_size = 400000
# Count of full batches; a trailing partial batch is handled by the loop.
batch_num = test_data.shape[0]//batch_size

# Accumulator for the test-set features. It starts EMPTY: the previous version
# seeded it with one all-zero row, which stayed as the first row of the saved
# file and shifted every real feature row down by one relative to test_data.
ft_test = pd.DataFrame(columns=['wmd', 'norm_wmd', 'cos_w2v', 'L1_w2v',
                                'canberra_w2v', 'L2_w2v', 'minkowski_w2v',
                                'braycurtis_w2v', 'skew_q1vec', 'skew_q2vec',
                                'kur_q1vec', 'kur_q2vec'],
                       dtype=float)

In [0]:
def compute_ft_in_batch(start_index, end_index, ft_test):
  """Compute the w2v pair features for one slice of `test_data`.

  Works on rows [start_index, end_index) of the module-level `test_data`,
  adds the same twelve feature columns as ft_w2v(), drops the 'test_id',
  'question1' and 'question2' columns, and returns `ft_test` with the batch
  appended (index renumbered).

  Args:
    start_index: First row position of the batch (inclusive).
    end_index: Last row position of the batch (exclusive).
    ft_test: DataFrame holding the features accumulated so far.

  Returns:
    A new DataFrame: `ft_test` followed by this batch's feature rows.
  """
  # Copy the slice so the new columns are assigned to an independent frame
  # rather than to a view of test_data.
  data = test_data.iloc[start_index:end_index, :].copy()

  print('sentance 2 vector for sentances {} to {} started at '.format(start_index, end_index) + get_time())

  data['wmd'] = data.progress_apply(lambda x: wmd(x['question1'], x['question2']), axis=1)
  data['norm_wmd'] = data.progress_apply(lambda x: norm_wmd(x['question1'], x['question2']), axis=1)

  print('vectoring started at ' + get_time())
  # One row per question, 300 columns per row (assumes 300-d embeddings, as the
  # "negative300" vectors file name suggests).
  question1_vectors = np.zeros((data.shape[0], 300))
  # tqdm wraps the values rather than the enumerate object so it knows the
  # total and can draw a real progress bar.
  for i, q in enumerate(tqdm(data.question1.values)):
    question1_vectors[i, :] = sent2vec(q)

  question2_vectors = np.zeros((data.shape[0], 300))
  for i, q in enumerate(tqdm(data.question2.values)):
    question2_vectors[i, :] = sent2vec(q)

  # sent2vec yields NaN when a question has no usable words; clean both
  # matrices once here instead of re-running nan_to_num for every feature.
  question1_vectors = np.nan_to_num(question1_vectors)
  question2_vectors = np.nan_to_num(question2_vectors)

  # (column name, distance function) pairs, in output column order.
  pair_metrics = (
      ('cos_w2v', cosine),
      ('L1_w2v', cityblock),
      ('canberra_w2v', canberra),
      ('L2_w2v', euclidean),
      ('minkowski_w2v', lambda x, y: minkowski(x, y, 3)),
      ('braycurtis_w2v', braycurtis),
  )
  for column, metric in pair_metrics:
    # The log line now names the feature actually computed (the braycurtis
    # step used to announce itself as "jaccard_w2v").
    print('getting ' + column + ' feature started at ' + get_time())
    data[column] = [metric(x, y) for (x, y) in zip(question1_vectors,
                                                   question2_vectors)]

  print('getting skew_q1vec feature started at ' + get_time())
  data['skew_q1vec'] = [skew(x) for x in question1_vectors]
  data['skew_q2vec'] = [skew(x) for x in question2_vectors]
  data['kur_q1vec'] = [kurtosis(x) for x in question1_vectors]
  data['kur_q2vec'] = [kurtosis(x) for x in question2_vectors]

  print('getting w2v features ended at ' + get_time())

  data = data.drop(['test_id', 'question1', 'question2'], axis=1)

  # pd.concat replaces DataFrame.append, which pandas 2.x no longer provides.
  return pd.concat([ft_test, data], ignore_index=True)

In [0]:
# Walk test_data in steps of batch_size. Stepping the range directly (rather
# than looping batch_num + 1 times) avoids a trailing empty batch when the row
# count is an exact multiple of batch_size, and does nothing for an empty frame.
total_rows = test_data.shape[0]
for start_index in range(0, total_rows, batch_size):
  # Clamp the last batch to the end of the data.
  end_index = min(start_index + batch_size, total_rows)
  ft_test = compute_ft_in_batch(start_index, end_index, ft_test)

In [0]:
ft_test.to_csv('test_ft_w2v.csv', index=False)
upload_to_GCS('test_ft_w2v.csv')

test_ft_w2v.csv uploaded


In [0]:
# download_from_GCS('train_ft_all.csv')
train_ft_all = pd.read_csv('/tmp/train_ft_all.csv')
train_ft_all.head()

Unnamed: 0,len_word_dff_org,len_word_dff_rt_org,len_char_dff_org,len_char_dff_rt_org,common_words_org,common_rate_org,jaccard_similarity_org,tfidf_L1_org,tfidf_L2_org,tfidf_cosine_org,len_word_dff_stm,len_word_dff_rt_stm,len_char_dff_stm,len_char_dff_rt_stm,common_words_stm,common_rate_stm,jaccard_similarity_stm,tfidf_L1_stm,tfidf_L2_stm,tfidf_cosine_stm,len_word_dff_lmtz,len_word_dff_rt_lmtz,len_char_dff_lmtz,len_char_dff_rt_lmtz,common_words_lmtz,common_rate_lmtz,jaccard_similarity_lmtz,tfidf_L1_lmtz,tfidf_L2_lmtz,tfidf_cosine_lmtz,encode_L1,encode_L2,encode_cos,encode_cos_log,common_words_stwd,common_rate_stwd,jaccard_similarity_stwd,tfidf_L1_stwd,tfidf_L2_stwd,cos_glv,L1_glv,L2_glv,same_start_word,diff_sen_neg,diff_sen_neu,diff_sen_pos,diff_sen_com,fuzz_qratio,fuzz_WRatio,fuzz_partial_ratio,fuzz_partial_token_set_ratio,fuzz_partial_token_sort_ratio,fuzz_token_set_ratio,fuzz_token_sort_ratio,repeat,tfidf_cosine_stwd,wmd,norm_wmd,cos_w2v,L1_w2v,canberra_w2v,L2_w2v,minkowski_w2v,braycurtis_w2v,skew_q1vec,skew_q2vec,kur_q1vec,kur_q2vec
0,1.098612,0.143101,2.079442,0.134819,2.484907,0.916667,0.613104,0.308918,0.042975,0.978513,1.098612,0.143101,2.079442,0.137471,2.484907,0.916667,0.613104,0.336295,0.050317,0.974841,1.098612,0.143101,2.079442,0.137471,2.484907,0.916667,0.613104,0.336295,0.050317,0.974841,2.133778,0.269064,0.963802,0.674883,1.791759,0.833333,0.538997,0.205387,0.036422,0.031762,3.456409,0.25204,1,0.0,0.022,0.022,0.0,93,95,100,100,89,100,93,0,0.981789,0.198042,0.198042,0.037908,3.774843,75.949313,0.275348,0.125323,0.137314,0.008893,-0.099771,0.108845,0.344742
1,1.791759,0.336472,3.496508,0.458575,2.079442,0.56,0.328504,1.218322,0.499939,0.75003,1.791759,0.336472,3.367296,0.424157,2.079442,0.56,0.328504,1.209643,0.486068,0.756966,1.791759,0.336472,3.367296,0.424157,2.079442,0.56,0.328504,1.209611,0.486033,0.756984,3.548461,0.724755,0.737365,0.55237,1.609438,0.5,0.287682,1.101979,0.475765,0.266555,8.548248,0.730144,1,0.0,0.075,0.075,0.0,65,86,73,100,75,86,63,0,0.762118,0.87794,0.87794,0.376926,12.33788,168.641909,0.868246,0.389172,0.498692,0.006689,0.118289,0.185005,0.206283
2,1.609438,0.287682,2.397895,0.169899,1.609438,0.32,0.174353,1.667734,1.490253,0.254874,1.609438,0.287682,2.397895,0.189242,1.791759,0.4,0.223144,1.512309,1.215205,0.392397,1.609438,0.287682,2.397895,0.192904,1.791759,0.4,0.223144,1.512309,1.215203,0.392398,2.765091,0.4568,0.895667,0.639571,1.098612,0.333333,0.182322,1.430197,1.44716,0.1189,6.461987,0.487647,1,0.0,0.016,0.016,0.045,54,63,53,100,71,66,66,0,0.27642,0.694896,0.694896,0.215223,8.840496,135.849172,0.656084,0.305829,0.332821,0.247069,0.15255,0.0429,-0.489377
3,1.098612,0.154151,2.302585,0.192078,0.0,0.0,0.0,1.855883,2.0,0.0,1.098612,0.154151,2.397895,0.233615,0.0,0.0,0.0,1.859033,2.0,0.0,1.098612,0.154151,2.397895,0.233615,0.0,0.0,0.0,1.860761,2.0,0.0,4.717258,1.238057,0.233607,0.209942,0.0,0.0,0.0,1.615345,2.0,0.619671,14.745557,1.113257,0,0.241,0.396,0.155,0.2484,36,36,40,37,38,36,36,0,0.0,1.261312,1.261312,0.635212,15.691485,189.60097,1.127131,0.501912,0.677309,0.013645,0.068055,-0.230252,-0.102497
4,1.94591,0.470004,3.433987,0.493658,1.609438,0.380952,0.211309,1.674485,1.443889,0.278056,1.94591,0.470004,3.367296,0.483797,1.609438,0.380952,0.211309,1.671665,1.428437,0.285782,1.94591,0.470004,3.367296,0.483797,1.609438,0.380952,0.211309,1.671665,1.428437,0.285782,4.118772,0.973915,0.525745,0.422483,1.098612,0.25,0.133531,1.649376,1.504143,0.211085,8.885255,0.649746,1,0.0,0.0,0.0,0.0,45,86,55,100,63,67,46,0,0.247928,0.998063,0.998063,0.33355,11.23584,149.646926,0.816762,0.366127,0.43538,-0.082484,0.074702,-0.237995,-0.360966


In [0]:
test_data = pd.read_csv('/tmp/x_test_tk_cvt.csv')
test_data.head()

Unnamed: 0,test_id,question1,question2
0,0,how does the surface pro himself 4 compare wit...,why did microsoft choose core m3 and not core ...
1,1,should i have a hair transplant at age 24 how ...,how much cost does hair transplant require
2,2,what but is the best way to send money from ch...,what you send money to china
3,3,which food not emulsifiers,what foods fibre
4,4,how aberystwyth start reading,how their can i start reading


In [0]:
download_from_GCS('x_train_tk_cvt.csv')
train_data = pd.read_csv('/tmp/x_train_tk_cvt.csv')
train_data.head()

x_train_tk_cvt.csv downloaded


Unnamed: 0,id,question1,question2
0,0,what is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...
1,1,what is the story of kohinoor koh i noor diamond,what would happen if the indian government sto...
2,2,how can i increase the speed of my internet co...,how can internet speed be increased by hacking...
3,3,why am i mentally very lonely how can i solve it,find the remainder when math 23 24 math is div...
4,4,which one dissolve in water quickly sugar salt...,which fish would survive in salt water
