# set up

In [0]:
#=== Load data from GCS

project_id = 'amazing-firefly-153908'
bucket_name = 'nlp_final'

# Authenticate this Colab session before building the storage client.
from google.colab import auth
auth.authenticate_user()

# Build the Cloud Storage (v1) API client used by the download/upload helpers.
from googleapiclient.discovery import build
gcs_service = build('storage', 'v1')
# Import from `googleapiclient` (the name the upload cell also uses);
# `apiclient` is only a legacy alias of the same package.
from googleapiclient.http import MediaIoBaseDownload

In [0]:
def download_from_GCS(file_name):
  """Download object `file_name` from the bucket into a local file of the same name.

  Relies on the notebook-level `gcs_service` client and `bucket_name`. The
  local file is opened in binary write mode (an existing file is overwritten)
  and filled chunk by chunk; a confirmation line is printed once finished.
  """
  with open(file_name, 'wb') as local_file:
    downloader = MediaIoBaseDownload(
        local_file,
        gcs_service.objects().get_media(bucket=bucket_name, object=file_name))

    # next_chunk() returns (progress, done); the progress object is ignored.
    finished = False
    while not finished:
      finished = downloader.next_chunk()[1]
  print(file_name + ' downloaded')

In [0]:
from googleapiclient.http import MediaFileUpload
def upload_to_GCS(file_name):
  """Upload local file `file_name` to the bucket under the same object name.

  Relies on the notebook-level `gcs_service` client and `bucket_name`. The
  file is sent as a resumable 'text/csv' upload; a confirmation line is
  printed once the service returns a response.
  """
  payload = MediaFileUpload(file_name, resumable=True, mimetype='text/csv')
  upload_request = gcs_service.objects().insert(media_body=payload,
                                                name=file_name,
                                                bucket=bucket_name)

  # next_chunk() returns (progress, response); response is None until the
  # upload has completed. The progress object is ignored.
  while True:
    result = upload_request.next_chunk()[1]
    if result is not None:
      break
  print(file_name + ' uploaded')

In [0]:
import numpy as np
import pandas as pd
import time
from tqdm.auto import tqdm
# Register tqdm's progress-bar integration with pandas (e.g. `progress_apply`).
tqdm.pandas()

In [0]:
def get_time():
  """Return the current time of day in UTC+8 as an 'H:M:S' string.

  Fields are not zero-padded (e.g. '9:5:3'). The hour is always in 0..23.
  """
  utc_now = time.gmtime(time.time())  # struct_time in UTC
  # Shift to UTC+8 and wrap around midnight. A plain `> 24` check would let
  # 16:xx UTC through as hour 24 instead of 0, hence the modulo.
  hour = (utc_now.tm_hour + 8) % 24
  return ':'.join(str(part) for part in (hour, utc_now.tm_min, utc_now.tm_sec))

# 合并特征 -- train

In [0]:
# Fetch the train feature file from GCS into the working directory,
# load it into a DataFrame, and preview the first rows.
download_from_GCS('train_ft_all2.csv')
train_ft_all2 = pd.read_csv('train_ft_all2.csv')
train_ft_all2.head()

train_ft_all2.csv downloaded


Unnamed: 0,len_word_dff_org,len_word_dff_rt_org,len_char_dff_org,len_char_dff_rt_org,common_words_org,common_rate_org,jaccard_similarity_org,tfidf_L1_org,tfidf_L2_org,tfidf_cosine_org,len_word_dff_stm,len_word_dff_rt_stm,len_char_dff_stm,len_char_dff_rt_stm,common_words_stm,common_rate_stm,jaccard_similarity_stm,tfidf_L1_stm,tfidf_L2_stm,tfidf_cosine_stm,len_word_dff_lmtz,len_word_dff_rt_lmtz,len_char_dff_lmtz,len_char_dff_rt_lmtz,common_words_lmtz,common_rate_lmtz,jaccard_similarity_lmtz,tfidf_L1_lmtz,tfidf_L2_lmtz,tfidf_cosine_lmtz,encode_L1,encode_L2,encode_cos,encode_cos_log,common_words_stwd,common_rate_stwd,jaccard_similarity_stwd,tfidf_L1_stwd,tfidf_L2_stwd,cos_glv,L1_glv,L2_glv,same_start_word,diff_sen_neg,diff_sen_neu,diff_sen_pos,diff_sen_com,fuzz_qratio,fuzz_WRatio,fuzz_partial_ratio,fuzz_partial_token_set_ratio,fuzz_partial_token_sort_ratio,fuzz_token_set_ratio,fuzz_token_sort_ratio,repeat,tfidf_cosine_stwd,wmd,norm_wmd,cos_w2v,L1_w2v,canberra_w2v,L2_w2v,minkowski_w2v,braycurtis_w2v,skew_q1vec,skew_q2vec,kur_q1vec,kur_q2vec,q1_freq,q2_freq,q1_q2_freq_average,canberra_glv,minkowski_glv,braycurtis_glv,skew_q1vec_glv,skew_q2vec_glv,kur_q1vec_glv,kur_q2vec_glv
0,1.098612,0.143101,2.079442,0.134819,2.484907,0.916667,0.613104,0.308918,0.042975,0.978513,1.098612,0.143101,2.079442,0.137471,2.484907,0.916667,0.613104,0.336295,0.050317,0.974841,1.098612,0.143101,2.079442,0.137471,2.484907,0.916667,0.613104,0.336295,0.050317,0.974841,2.133778,0.269064,0.963802,0.674883,1.791759,0.833333,0.538997,0.205387,0.036422,0.031762,3.456409,0.25204,1,0.0,0.022,0.022,0.0,93,95,100,100,89,100,93,0,0.981789,0.198042,0.198042,0.037908,3.774843,75.949313,0.275348,0.125323,0.137314,0.008893,-0.099771,0.108845,0.344742,1,2,1.5,91.349971,0.114783,0.166479,2.158091,2.455419,25.021147,26.141849
1,1.791759,0.336472,3.496508,0.458575,2.079442,0.56,0.328504,1.218322,0.499939,0.75003,1.791759,0.336472,3.367296,0.424157,2.079442,0.56,0.328504,1.209643,0.486068,0.756966,1.791759,0.336472,3.367296,0.424157,2.079442,0.56,0.328504,1.209611,0.486033,0.756984,3.548461,0.724755,0.737365,0.55237,1.609438,0.5,0.287682,1.101979,0.475765,0.266555,8.548248,0.730144,1,0.0,0.075,0.075,0.0,65,86,73,100,75,86,63,0,0.762118,0.87794,0.87794,0.376926,12.33788,168.641909,0.868246,0.389172,0.498692,0.006689,0.118289,0.185005,0.206283,8,3,5.5,140.639989,0.443313,0.370294,-0.257926,-0.622981,1.54384,20.745921
2,1.609438,0.287682,2.397895,0.169899,1.609438,0.32,0.174353,1.667734,1.490253,0.254874,1.609438,0.287682,2.397895,0.189242,1.791759,0.4,0.223144,1.512309,1.215205,0.392397,1.609438,0.287682,2.397895,0.192904,1.791759,0.4,0.223144,1.512309,1.215203,0.392398,2.765091,0.4568,0.895667,0.639571,1.098612,0.333333,0.182322,1.430197,1.44716,0.1189,6.461987,0.487647,1,0.0,0.016,0.016,0.045,54,63,53,100,71,66,66,0,0.27642,0.694896,0.694896,0.215223,8.840496,135.849172,0.656084,0.305829,0.332821,0.247069,0.15255,0.0429,-0.489377,2,1,1.5,129.054466,0.230156,0.286149,1.10994,-0.135832,16.728574,9.117428
3,1.098612,0.154151,2.302585,0.192078,0.0,0.0,0.0,1.855883,2.0,0.0,1.098612,0.154151,2.397895,0.233615,0.0,0.0,0.0,1.859033,2.0,0.0,1.098612,0.154151,2.397895,0.233615,0.0,0.0,0.0,1.860761,2.0,0.0,4.717258,1.238057,0.233607,0.209942,0.0,0.0,0.0,1.615345,2.0,0.619671,14.745557,1.113257,0,0.241,0.396,0.155,0.2484,36,36,40,37,38,36,36,0,0.0,1.261312,1.261312,0.635212,15.691485,189.60097,1.127131,0.501912,0.677309,0.013645,0.068055,-0.230252,-0.102497,1,1,1.0,198.947606,0.532528,0.803537,0.898184,2.416472,17.295623,16.170487
4,1.94591,0.470004,3.433987,0.493658,1.609438,0.380952,0.211309,1.674485,1.443889,0.278056,1.94591,0.470004,3.367296,0.483797,1.609438,0.380952,0.211309,1.671665,1.428437,0.285782,1.94591,0.470004,3.367296,0.483797,1.609438,0.380952,0.211309,1.671665,1.428437,0.285782,4.118772,0.973915,0.525745,0.422483,1.098612,0.25,0.133531,1.649376,1.504143,0.211085,8.885255,0.649746,1,0.0,0.0,0.0,0.0,45,86,55,100,63,67,46,0,0.247928,0.998063,0.998063,0.33355,11.23584,149.646926,0.816762,0.366127,0.43538,-0.082484,0.074702,-0.237995,-0.360966,3,1,2.0,156.294233,0.299003,0.425088,0.791283,0.917872,13.634911,17.349398


In [0]:
# Sanity check: (rows, columns) of the loaded train feature table.
train_ft_all2.shape

(404290, 78)

In [0]:
# Fetch the encoder-based train features from GCS, load them, and preview.
download_from_GCS('train_ft_encode.csv')
train_ft_encode = pd.read_csv('train_ft_encode.csv')
train_ft_encode.head()

train_ft_encode.csv downloaded


Unnamed: 0,encode_cos,encode_L1,encode_L2,canberra_encode,minkowski_encode,braycurtis_encode,skew_q1vec_encode,skew_q2vec_encode,kur_q1vec_encode,kur_q2vec_encode
0,0.036198,4.553006,0.269064,124.261119,0.118751,0.120247,-0.076135,-0.042752,-0.989944,-0.958032
1,0.262635,12.591572,0.724755,221.270339,0.309102,0.347132,-0.05611,-0.026842,-0.942674,-1.278017
2,0.104333,7.645729,0.4568,163.702191,0.198237,0.202258,-0.035711,0.067302,-1.103482,-1.130741
3,0.766393,22.25252,1.238057,325.741225,0.507923,0.757637,-0.073181,-0.100329,-0.788452,-0.822424
4,0.474255,16.964287,0.973915,262.853599,0.409228,0.504699,-0.018119,-0.107851,-1.244781,-0.965395


In [0]:
# Replace the three encoder features in the train table with the values from
# train_ft_encode. Columns are paired by position and rows by index.
# NOTE(review): the previews show encode_cos changing here (row 0: 0.963802 ->
# 0.036198) while encode_cos_log keeps its old value. Confirm whether
# encode_cos_log should be recomputed from the new encode_cos.
train_ft_all2[['encode_L1', 'encode_L2', 'encode_cos']] = (
    train_ft_encode[['encode_L1', 'encode_L2', 'encode_cos']]
)
train_ft_all2.head()

Unnamed: 0,len_word_dff_org,len_word_dff_rt_org,len_char_dff_org,len_char_dff_rt_org,common_words_org,common_rate_org,jaccard_similarity_org,tfidf_L1_org,tfidf_L2_org,tfidf_cosine_org,len_word_dff_stm,len_word_dff_rt_stm,len_char_dff_stm,len_char_dff_rt_stm,common_words_stm,common_rate_stm,jaccard_similarity_stm,tfidf_L1_stm,tfidf_L2_stm,tfidf_cosine_stm,len_word_dff_lmtz,len_word_dff_rt_lmtz,len_char_dff_lmtz,len_char_dff_rt_lmtz,common_words_lmtz,common_rate_lmtz,jaccard_similarity_lmtz,tfidf_L1_lmtz,tfidf_L2_lmtz,tfidf_cosine_lmtz,encode_L1,encode_L2,encode_cos,encode_cos_log,common_words_stwd,common_rate_stwd,jaccard_similarity_stwd,tfidf_L1_stwd,tfidf_L2_stwd,cos_glv,L1_glv,L2_glv,same_start_word,diff_sen_neg,diff_sen_neu,diff_sen_pos,diff_sen_com,fuzz_qratio,fuzz_WRatio,fuzz_partial_ratio,fuzz_partial_token_set_ratio,fuzz_partial_token_sort_ratio,fuzz_token_set_ratio,fuzz_token_sort_ratio,repeat,tfidf_cosine_stwd,wmd,norm_wmd,cos_w2v,L1_w2v,canberra_w2v,L2_w2v,minkowski_w2v,braycurtis_w2v,skew_q1vec,skew_q2vec,kur_q1vec,kur_q2vec,q1_freq,q2_freq,q1_q2_freq_average,canberra_glv,minkowski_glv,braycurtis_glv,skew_q1vec_glv,skew_q2vec_glv,kur_q1vec_glv,kur_q2vec_glv
0,1.098612,0.143101,2.079442,0.134819,2.484907,0.916667,0.613104,0.308918,0.042975,0.978513,1.098612,0.143101,2.079442,0.137471,2.484907,0.916667,0.613104,0.336295,0.050317,0.974841,1.098612,0.143101,2.079442,0.137471,2.484907,0.916667,0.613104,0.336295,0.050317,0.974841,4.553006,0.269064,0.036198,0.674883,1.791759,0.833333,0.538997,0.205387,0.036422,0.031762,3.456409,0.25204,1,0.0,0.022,0.022,0.0,93,95,100,100,89,100,93,0,0.981789,0.198042,0.198042,0.037908,3.774843,75.949313,0.275348,0.125323,0.137314,0.008893,-0.099771,0.108845,0.344742,1,2,1.5,91.349971,0.114783,0.166479,2.158091,2.455419,25.021147,26.141849
1,1.791759,0.336472,3.496508,0.458575,2.079442,0.56,0.328504,1.218322,0.499939,0.75003,1.791759,0.336472,3.367296,0.424157,2.079442,0.56,0.328504,1.209643,0.486068,0.756966,1.791759,0.336472,3.367296,0.424157,2.079442,0.56,0.328504,1.209611,0.486033,0.756984,12.591572,0.724755,0.262635,0.55237,1.609438,0.5,0.287682,1.101979,0.475765,0.266555,8.548248,0.730144,1,0.0,0.075,0.075,0.0,65,86,73,100,75,86,63,0,0.762118,0.87794,0.87794,0.376926,12.33788,168.641909,0.868246,0.389172,0.498692,0.006689,0.118289,0.185005,0.206283,8,3,5.5,140.639989,0.443313,0.370294,-0.257926,-0.622981,1.54384,20.745921
2,1.609438,0.287682,2.397895,0.169899,1.609438,0.32,0.174353,1.667734,1.490253,0.254874,1.609438,0.287682,2.397895,0.189242,1.791759,0.4,0.223144,1.512309,1.215205,0.392397,1.609438,0.287682,2.397895,0.192904,1.791759,0.4,0.223144,1.512309,1.215203,0.392398,7.645729,0.4568,0.104333,0.639571,1.098612,0.333333,0.182322,1.430197,1.44716,0.1189,6.461987,0.487647,1,0.0,0.016,0.016,0.045,54,63,53,100,71,66,66,0,0.27642,0.694896,0.694896,0.215223,8.840496,135.849172,0.656084,0.305829,0.332821,0.247069,0.15255,0.0429,-0.489377,2,1,1.5,129.054466,0.230156,0.286149,1.10994,-0.135832,16.728574,9.117428
3,1.098612,0.154151,2.302585,0.192078,0.0,0.0,0.0,1.855883,2.0,0.0,1.098612,0.154151,2.397895,0.233615,0.0,0.0,0.0,1.859033,2.0,0.0,1.098612,0.154151,2.397895,0.233615,0.0,0.0,0.0,1.860761,2.0,0.0,22.25252,1.238057,0.766393,0.209942,0.0,0.0,0.0,1.615345,2.0,0.619671,14.745557,1.113257,0,0.241,0.396,0.155,0.2484,36,36,40,37,38,36,36,0,0.0,1.261312,1.261312,0.635212,15.691485,189.60097,1.127131,0.501912,0.677309,0.013645,0.068055,-0.230252,-0.102497,1,1,1.0,198.947606,0.532528,0.803537,0.898184,2.416472,17.295623,16.170487
4,1.94591,0.470004,3.433987,0.493658,1.609438,0.380952,0.211309,1.674485,1.443889,0.278056,1.94591,0.470004,3.367296,0.483797,1.609438,0.380952,0.211309,1.671665,1.428437,0.285782,1.94591,0.470004,3.367296,0.483797,1.609438,0.380952,0.211309,1.671665,1.428437,0.285782,16.964287,0.973915,0.474255,0.422483,1.098612,0.25,0.133531,1.649376,1.504143,0.211085,8.885255,0.649746,1,0.0,0.0,0.0,0.0,45,86,55,100,63,67,46,0,0.247928,0.998063,0.998063,0.33355,11.23584,149.646926,0.816762,0.366127,0.43538,-0.082484,0.074702,-0.237995,-0.360966,3,1,2.0,156.294233,0.299003,0.425088,0.791283,0.917872,13.634911,17.349398


In [0]:
# Append the remaining encoder features as new columns. The three columns
# already copied into train_ft_all2 are dropped first to avoid duplicates.
train_ft_all3 = pd.concat(
    [
        train_ft_all2,
        train_ft_encode.drop(columns=['encode_L1', 'encode_L2', 'encode_cos']),
    ],
    axis=1,
)
train_ft_all3.head()

Unnamed: 0,len_word_dff_org,len_word_dff_rt_org,len_char_dff_org,len_char_dff_rt_org,common_words_org,common_rate_org,jaccard_similarity_org,tfidf_L1_org,tfidf_L2_org,tfidf_cosine_org,len_word_dff_stm,len_word_dff_rt_stm,len_char_dff_stm,len_char_dff_rt_stm,common_words_stm,common_rate_stm,jaccard_similarity_stm,tfidf_L1_stm,tfidf_L2_stm,tfidf_cosine_stm,len_word_dff_lmtz,len_word_dff_rt_lmtz,len_char_dff_lmtz,len_char_dff_rt_lmtz,common_words_lmtz,common_rate_lmtz,jaccard_similarity_lmtz,tfidf_L1_lmtz,tfidf_L2_lmtz,tfidf_cosine_lmtz,encode_L1,encode_L2,encode_cos,encode_cos_log,common_words_stwd,common_rate_stwd,jaccard_similarity_stwd,tfidf_L1_stwd,tfidf_L2_stwd,cos_glv,...,diff_sen_pos,diff_sen_com,fuzz_qratio,fuzz_WRatio,fuzz_partial_ratio,fuzz_partial_token_set_ratio,fuzz_partial_token_sort_ratio,fuzz_token_set_ratio,fuzz_token_sort_ratio,repeat,tfidf_cosine_stwd,wmd,norm_wmd,cos_w2v,L1_w2v,canberra_w2v,L2_w2v,minkowski_w2v,braycurtis_w2v,skew_q1vec,skew_q2vec,kur_q1vec,kur_q2vec,q1_freq,q2_freq,q1_q2_freq_average,canberra_glv,minkowski_glv,braycurtis_glv,skew_q1vec_glv,skew_q2vec_glv,kur_q1vec_glv,kur_q2vec_glv,canberra_encode,minkowski_encode,braycurtis_encode,skew_q1vec_encode,skew_q2vec_encode,kur_q1vec_encode,kur_q2vec_encode
0,1.098612,0.143101,2.079442,0.134819,2.484907,0.916667,0.613104,0.308918,0.042975,0.978513,1.098612,0.143101,2.079442,0.137471,2.484907,0.916667,0.613104,0.336295,0.050317,0.974841,1.098612,0.143101,2.079442,0.137471,2.484907,0.916667,0.613104,0.336295,0.050317,0.974841,4.553006,0.269064,0.036198,0.674883,1.791759,0.833333,0.538997,0.205387,0.036422,0.031762,...,0.022,0.0,93,95,100,100,89,100,93,0,0.981789,0.198042,0.198042,0.037908,3.774843,75.949313,0.275348,0.125323,0.137314,0.008893,-0.099771,0.108845,0.344742,1,2,1.5,91.349971,0.114783,0.166479,2.158091,2.455419,25.021147,26.141849,124.261119,0.118751,0.120247,-0.076135,-0.042752,-0.989944,-0.958032
1,1.791759,0.336472,3.496508,0.458575,2.079442,0.56,0.328504,1.218322,0.499939,0.75003,1.791759,0.336472,3.367296,0.424157,2.079442,0.56,0.328504,1.209643,0.486068,0.756966,1.791759,0.336472,3.367296,0.424157,2.079442,0.56,0.328504,1.209611,0.486033,0.756984,12.591572,0.724755,0.262635,0.55237,1.609438,0.5,0.287682,1.101979,0.475765,0.266555,...,0.075,0.0,65,86,73,100,75,86,63,0,0.762118,0.87794,0.87794,0.376926,12.33788,168.641909,0.868246,0.389172,0.498692,0.006689,0.118289,0.185005,0.206283,8,3,5.5,140.639989,0.443313,0.370294,-0.257926,-0.622981,1.54384,20.745921,221.270339,0.309102,0.347132,-0.05611,-0.026842,-0.942674,-1.278017
2,1.609438,0.287682,2.397895,0.169899,1.609438,0.32,0.174353,1.667734,1.490253,0.254874,1.609438,0.287682,2.397895,0.189242,1.791759,0.4,0.223144,1.512309,1.215205,0.392397,1.609438,0.287682,2.397895,0.192904,1.791759,0.4,0.223144,1.512309,1.215203,0.392398,7.645729,0.4568,0.104333,0.639571,1.098612,0.333333,0.182322,1.430197,1.44716,0.1189,...,0.016,0.045,54,63,53,100,71,66,66,0,0.27642,0.694896,0.694896,0.215223,8.840496,135.849172,0.656084,0.305829,0.332821,0.247069,0.15255,0.0429,-0.489377,2,1,1.5,129.054466,0.230156,0.286149,1.10994,-0.135832,16.728574,9.117428,163.702191,0.198237,0.202258,-0.035711,0.067302,-1.103482,-1.130741
3,1.098612,0.154151,2.302585,0.192078,0.0,0.0,0.0,1.855883,2.0,0.0,1.098612,0.154151,2.397895,0.233615,0.0,0.0,0.0,1.859033,2.0,0.0,1.098612,0.154151,2.397895,0.233615,0.0,0.0,0.0,1.860761,2.0,0.0,22.25252,1.238057,0.766393,0.209942,0.0,0.0,0.0,1.615345,2.0,0.619671,...,0.155,0.2484,36,36,40,37,38,36,36,0,0.0,1.261312,1.261312,0.635212,15.691485,189.60097,1.127131,0.501912,0.677309,0.013645,0.068055,-0.230252,-0.102497,1,1,1.0,198.947606,0.532528,0.803537,0.898184,2.416472,17.295623,16.170487,325.741225,0.507923,0.757637,-0.073181,-0.100329,-0.788452,-0.822424
4,1.94591,0.470004,3.433987,0.493658,1.609438,0.380952,0.211309,1.674485,1.443889,0.278056,1.94591,0.470004,3.367296,0.483797,1.609438,0.380952,0.211309,1.671665,1.428437,0.285782,1.94591,0.470004,3.367296,0.483797,1.609438,0.380952,0.211309,1.671665,1.428437,0.285782,16.964287,0.973915,0.474255,0.422483,1.098612,0.25,0.133531,1.649376,1.504143,0.211085,...,0.0,0.0,45,86,55,100,63,67,46,0,0.247928,0.998063,0.998063,0.33355,11.23584,149.646926,0.816762,0.366127,0.43538,-0.082484,0.074702,-0.237995,-0.360966,3,1,2.0,156.294233,0.299003,0.425088,0.791283,0.917872,13.634911,17.349398,262.853599,0.409228,0.504699,-0.018119,-0.107851,-1.244781,-0.965395


In [0]:
# Write the merged train features to a local CSV (index=False keeps the row
# index out of the file), then upload that file to the GCS bucket.
train_ft_all3.to_csv('train_ft_all3.csv', index=False)
upload_to_GCS('train_ft_all3.csv')

train_ft_all3.csv uploaded


# 合并特征 -- test

In [7]:
# Fetch the test feature file from GCS into the working directory,
# load it into a DataFrame, and preview the first rows.
download_from_GCS('test_ft_all2.csv')
test_ft_all2 = pd.read_csv('test_ft_all2.csv')
test_ft_all2.head()

test_ft_all2.csv downloaded


Unnamed: 0,len_word_dff_org,len_word_dff_rt_org,len_char_dff_org,len_char_dff_rt_org,common_words_org,common_rate_org,jaccard_similarity_org,tfidf_L1_org,tfidf_L2_org,tfidf_cosine_org,len_word_dff_stm,len_word_dff_rt_stm,len_char_dff_stm,len_char_dff_rt_stm,common_words_stm,common_rate_stm,jaccard_similarity_stm,tfidf_L1_stm,tfidf_L2_stm,tfidf_cosine_stm,len_word_dff_lmtz,len_word_dff_rt_lmtz,len_char_dff_lmtz,len_char_dff_rt_lmtz,common_words_lmtz,common_rate_lmtz,jaccard_similarity_lmtz,tfidf_L1_lmtz,tfidf_L2_lmtz,tfidf_cosine_lmtz,encode_L1,encode_L2,encode_cos,encode_cos_log,common_words_stwd,common_rate_stwd,jaccard_similarity_stwd,tfidf_L1_stwd,tfidf_L2_stwd,cos_glv,L1_glv,L2_glv,same_start_word,diff_sen_neg,diff_sen_neu,diff_sen_pos,diff_sen_com,fuzz_qratio,fuzz_WRatio,fuzz_partial_ratio,fuzz_partial_token_set_ratio,fuzz_partial_token_sort_ratio,fuzz_token_set_ratio,fuzz_token_sort_ratio,repeat,tfidf_cosine_stwd,wmd,norm_wmd,cos_w2v,L1_w2v,canberra_w2v,L2_w2v,minkowski_w2v,braycurtis_w2v,skew_q1vec,skew_q2vec,kur_q1vec,kur_q2vec,q1_freq,q2_freq,q1_q2_freq_average
0,1.386294,0.215111,2.197225,0.14842,1.386294,0.25,0.133531,1.719823,1.43517,0.282415,1.386294,0.215111,2.302585,0.173511,1.386294,0.25,0.133531,1.734239,1.47502,0.26249,1.386294,0.215111,2.197225,0.157186,1.386294,0.25,0.133531,1.689093,1.45219,0.273905,3.637099,0.784084,0.692606,0.526269,1.386294,0.4,0.223144,1.487714,1.332189,0.212743,8.859499,0.652293,0,0.0,0.0,0.0,0.0,46,55,45,100,59,58,55,0,0.333906,0.880386,0.880386,0.386389,12.053253,161.856459,0.879077,0.39778,0.480725,0.060989,0.071506,0.206236,-0.317566,1,1,1.0
1,2.079442,0.510826,2.772589,0.296266,1.791759,0.454545,0.257829,1.312116,0.789032,0.605484,2.079442,0.510826,2.890372,0.336472,1.791759,0.454545,0.257829,1.318978,0.733234,0.633383,2.079442,0.510826,2.890372,0.336472,1.791759,0.454545,0.257829,1.318978,0.733234,0.633383,2.865691,0.488314,0.880775,0.631684,1.609438,0.615385,0.367725,1.000223,0.661058,0.082187,5.341142,0.40543,0,0.0,0.0,0.0,0.0,49,86,57,100,64,82,58,0,0.669471,0.421667,0.421667,0.136669,7.267052,120.948291,0.522817,0.234587,0.274652,-0.018201,-0.004014,0.150158,0.010465,2,2,2.0
2,2.079442,0.552069,3.258097,0.533111,1.791759,0.526316,0.305382,1.260385,0.601267,0.699367,2.079442,0.552069,3.258097,0.533111,1.791759,0.526316,0.305382,1.259183,0.597581,0.701209,2.079442,0.552069,3.258097,0.533111,1.791759,0.526316,0.305382,1.28469,0.623996,0.688002,3.039441,0.545916,0.850988,0.61572,1.386294,0.6,0.356675,0.852124,0.390172,0.080816,5.350192,0.402034,1,0.0,0.326,0.326,0.7783,59,86,82,100,68,92,55,0,0.804914,0.630674,0.630674,0.204235,9.068272,137.24531,0.639116,0.284338,0.341486,-0.012669,-0.066157,-0.393252,-0.371144,1,1,1.0
3,0.693147,0.251314,2.302585,0.396415,0.0,0.0,0.0,1.449417,2.0,0.0,0.693147,0.251314,2.197225,0.405465,0.693147,0.25,0.133531,1.233261,1.586042,0.206979,0.693147,0.251314,2.197225,0.405465,0.693147,0.25,0.133531,1.233261,1.586042,0.206979,3.665707,0.754944,0.71503,0.53943,0.0,0.0,0.0,1.317023,2.0,0.391609,12.590033,0.884996,0,0.0,0.0,0.0,0.0,52,53,56,62,62,52,52,0,0.0,0.918765,0.918765,0.288832,10.312107,153.68088,0.760042,0.349093,0.412119,0.094811,0.069599,-0.150121,0.774025,1,1,1.0
4,1.098612,0.336472,1.098612,0.076961,1.386294,0.545455,0.318454,0.676052,0.30544,0.84728,1.098612,0.336472,1.098612,0.087011,1.386294,0.545455,0.318454,0.747521,0.38207,0.808965,1.098612,0.336472,1.098612,0.087011,1.386294,0.545455,0.318454,0.747521,0.38207,0.808965,3.948551,0.896182,0.598429,0.469021,1.098612,0.666667,0.405465,0.0,0.0,0.247688,9.438157,0.703829,1,0.0,0.0,0.0,0.0,69,70,69,100,66,74,66,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.003651,-0.003651,-0.100281,-0.100281,1,1,1.0


In [8]:
# Fetch the encoder-based test features from GCS, load them, and preview.
download_from_GCS('test_ft_encode.csv')
test_ft_encode = pd.read_csv('test_ft_encode.csv')
test_ft_encode.head()

test_ft_encode.csv downloaded


Unnamed: 0,encode_cos,encode_L1,encode_L2,canberra_encode,minkowski_encode,braycurtis_encode,skew_q1vec_encode,skew_q2vec_encode,kur_q1vec_encode,kur_q2vec_encode
0,0.307394,13.228489,0.784084,230.45908,0.335352,0.373265,-0.009512,0.00034,-1.244313,-1.23378
1,0.119225,8.212183,0.488314,167.320022,0.209699,0.216852,0.043952,0.034034,-1.034802,-1.156297
2,0.149012,9.238203,0.545916,186.358232,0.234621,0.254429,0.005193,0.0233,-0.798762,-0.700285
3,0.28497,13.437407,0.754944,248.603338,0.315308,0.387871,0.054974,0.111906,-0.414613,-1.089561
4,0.401571,15.591051,0.896182,269.073492,0.381538,0.475944,0.02018,-0.040126,-0.705323,-0.276419


In [25]:
# Work on an independent copy. A plain `test_ft_all3 = test_ft_all2` only
# creates a second name for the same DataFrame, so every later write to
# test_ft_all3 would also change test_ft_all2, and re-running this cell would
# start from already-modified data instead of the loaded file. The copy costs
# one extra frame's worth of memory.
test_ft_all3 = test_ft_all2.copy()

# Replace the three encoder features with the values from test_ft_encode
# (rows are matched on the index).
test_ft_all3['encode_L1'] = test_ft_encode['encode_L1']
test_ft_all3['encode_L2'] = test_ft_encode['encode_L2']
test_ft_all3['encode_cos'] = test_ft_encode['encode_cos']
# NOTE(review): encode_cos_log is not updated by this cell; confirm whether it
# should be recomputed from the new encode_cos.
test_ft_all3.head()

Unnamed: 0,len_word_dff_org,len_word_dff_rt_org,len_char_dff_org,len_char_dff_rt_org,common_words_org,common_rate_org,jaccard_similarity_org,tfidf_L1_org,tfidf_L2_org,tfidf_cosine_org,len_word_dff_stm,len_word_dff_rt_stm,len_char_dff_stm,len_char_dff_rt_stm,common_words_stm,common_rate_stm,jaccard_similarity_stm,tfidf_L1_stm,tfidf_L2_stm,tfidf_cosine_stm,len_word_dff_lmtz,len_word_dff_rt_lmtz,len_char_dff_lmtz,len_char_dff_rt_lmtz,common_words_lmtz,common_rate_lmtz,jaccard_similarity_lmtz,tfidf_L1_lmtz,tfidf_L2_lmtz,tfidf_cosine_lmtz,encode_L1,encode_L2,encode_cos,encode_cos_log,common_words_stwd,common_rate_stwd,jaccard_similarity_stwd,tfidf_L1_stwd,tfidf_L2_stwd,cos_glv,L1_glv,L2_glv,same_start_word,diff_sen_neg,diff_sen_neu,diff_sen_pos,diff_sen_com,fuzz_qratio,fuzz_WRatio,fuzz_partial_ratio,fuzz_partial_token_set_ratio,fuzz_partial_token_sort_ratio,fuzz_token_set_ratio,fuzz_token_sort_ratio,repeat,tfidf_cosine_stwd,wmd,norm_wmd,cos_w2v,L1_w2v,canberra_w2v,L2_w2v,minkowski_w2v,braycurtis_w2v,skew_q1vec,skew_q2vec,kur_q1vec,kur_q2vec,q1_freq,q2_freq,q1_q2_freq_average
0,1.386294,0.215111,2.197225,0.14842,1.386294,0.25,0.133531,1.719823,1.43517,0.282415,1.386294,0.215111,2.302585,0.173511,1.386294,0.25,0.133531,1.734239,1.47502,0.26249,1.386294,0.215111,2.197225,0.157186,1.386294,0.25,0.133531,1.689093,1.45219,0.273905,13.228489,0.784084,0.307394,0.268036,1.386294,0.4,0.223144,1.487714,1.332189,0.212743,8.859499,0.652293,0,0.0,0.0,0.0,0.0,46,55,45,100,59,58,55,0,0.333906,0.880386,0.880386,0.386389,12.053253,161.856466,0.879077,0.39778,0.480725,0.060989,0.071506,0.206236,-0.317566,1,1,1.0
1,2.079442,0.510826,2.772589,0.296266,1.791759,0.454545,0.257829,1.312116,0.789032,0.605484,2.079442,0.510826,2.890372,0.336472,1.791759,0.454545,0.257829,1.318978,0.733234,0.633383,2.079442,0.510826,2.890372,0.336472,1.791759,0.454545,0.257829,1.318978,0.733234,0.633383,8.212183,0.488314,0.119225,0.112637,1.609438,0.615385,0.367725,1.000223,0.661058,0.082187,5.341142,0.40543,0,0.0,0.0,0.0,0.0,49,86,57,100,64,82,58,0,0.669471,0.421667,0.421667,0.136669,7.267052,120.948289,0.522817,0.234587,0.274652,-0.018201,-0.004014,0.150158,0.010465,2,2,2.0
2,2.079442,0.552069,3.258097,0.533111,1.791759,0.526316,0.305382,1.260385,0.601267,0.699367,2.079442,0.552069,3.258097,0.533111,1.791759,0.526316,0.305382,1.259183,0.597581,0.701209,2.079442,0.552069,3.258097,0.533111,1.791759,0.526316,0.305382,1.28469,0.623996,0.688002,9.238203,0.545916,0.149012,0.138903,1.386294,0.6,0.356675,0.852124,0.390172,0.080816,5.350192,0.402034,1,0.0,0.326,0.326,0.7783,59,86,82,100,68,92,55,0,0.804914,0.630674,0.630674,0.204235,9.068271,137.245312,0.639116,0.284338,0.341486,-0.012669,-0.066156,-0.393252,-0.371144,1,1,1.0
3,0.693147,0.251314,2.302585,0.396415,0.0,0.0,0.0,1.449417,2.0,0.0,0.693147,0.251314,2.197225,0.405465,0.693147,0.25,0.133531,1.233261,1.586042,0.206979,0.693147,0.251314,2.197225,0.405465,0.693147,0.25,0.133531,1.233261,1.586042,0.206979,13.437407,0.754944,0.28497,0.250736,0.0,0.0,0.0,1.317023,2.0,0.391609,12.590033,0.884996,0,0.0,0.0,0.0,0.0,52,53,56,62,62,52,52,0,0.0,0.918765,0.918765,0.288832,10.312107,153.68088,0.760042,0.349093,0.412119,0.094811,0.069599,-0.150121,0.774025,1,1,1.0
4,1.098612,0.336472,1.098612,0.076961,1.386294,0.545455,0.318454,0.676052,0.30544,0.84728,1.098612,0.336472,1.098612,0.087011,1.386294,0.545455,0.318454,0.747521,0.38207,0.808965,1.098612,0.336472,1.098612,0.087011,1.386294,0.545455,0.318454,0.747521,0.38207,0.808965,15.591051,0.896182,0.401571,0.337594,1.098612,0.666667,0.405465,0.0,0.0,0.247688,9.438157,0.703829,1,0.0,0.0,0.0,0.0,69,70,69,100,66,74,66,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.003651,-0.003651,-0.100281,-0.100281,1,1,1.0


In [9]:
# Fetch the GloVe-based test features from GCS, load them, and preview.
download_from_GCS('test_ft_glv.csv')
test_ft_glv = pd.read_csv('test_ft_glv.csv')
test_ft_glv.head()

test_ft_glv.csv downloaded


Unnamed: 0,cos_glv,L1_glv,L2_glv,canberra_glv,minkowski_glv,braycurtis_glv,skew_q1vec_glv,skew_q2vec_glv,kur_q1vec_glv,kur_q2vec_glv
0,0.212743,8.859499,0.652293,157.912241,0.297087,0.420838,1.060148,2.278803,14.769823,25.698478
1,0.082187,5.341142,0.40543,126.617151,0.200035,0.261458,3.529328,2.363401,37.377708,25.717867
2,0.080816,5.350192,0.402034,112.623821,0.18842,0.246372,0.661936,0.434966,28.877237,12.525225
3,0.391609,12.590033,0.884996,177.057767,0.394867,0.542606,-0.385024,0.07923,2.058123,6.054589
4,0.247688,9.438157,0.703829,146.50971,0.33099,0.39548,0.006737,1.313208,1.708022,13.959816


In [26]:
# Replace the three GloVe distance features in the test table with the values
# from test_ft_glv. Columns are paired by position and rows by index.
test_ft_all3[['cos_glv', 'L1_glv', 'L2_glv']] = (
    test_ft_glv[['cos_glv', 'L1_glv', 'L2_glv']]
)
test_ft_all3.head()

Unnamed: 0,len_word_dff_org,len_word_dff_rt_org,len_char_dff_org,len_char_dff_rt_org,common_words_org,common_rate_org,jaccard_similarity_org,tfidf_L1_org,tfidf_L2_org,tfidf_cosine_org,len_word_dff_stm,len_word_dff_rt_stm,len_char_dff_stm,len_char_dff_rt_stm,common_words_stm,common_rate_stm,jaccard_similarity_stm,tfidf_L1_stm,tfidf_L2_stm,tfidf_cosine_stm,len_word_dff_lmtz,len_word_dff_rt_lmtz,len_char_dff_lmtz,len_char_dff_rt_lmtz,common_words_lmtz,common_rate_lmtz,jaccard_similarity_lmtz,tfidf_L1_lmtz,tfidf_L2_lmtz,tfidf_cosine_lmtz,encode_L1,encode_L2,encode_cos,encode_cos_log,common_words_stwd,common_rate_stwd,jaccard_similarity_stwd,tfidf_L1_stwd,tfidf_L2_stwd,cos_glv,L1_glv,L2_glv,same_start_word,diff_sen_neg,diff_sen_neu,diff_sen_pos,diff_sen_com,fuzz_qratio,fuzz_WRatio,fuzz_partial_ratio,fuzz_partial_token_set_ratio,fuzz_partial_token_sort_ratio,fuzz_token_set_ratio,fuzz_token_sort_ratio,repeat,tfidf_cosine_stwd,wmd,norm_wmd,cos_w2v,L1_w2v,canberra_w2v,L2_w2v,minkowski_w2v,braycurtis_w2v,skew_q1vec,skew_q2vec,kur_q1vec,kur_q2vec,q1_freq,q2_freq,q1_q2_freq_average
0,1.386294,0.215111,2.197225,0.14842,1.386294,0.25,0.133531,1.719823,1.43517,0.282415,1.386294,0.215111,2.302585,0.173511,1.386294,0.25,0.133531,1.734239,1.47502,0.26249,1.386294,0.215111,2.197225,0.157186,1.386294,0.25,0.133531,1.689093,1.45219,0.273905,13.228489,0.784084,0.307394,0.268036,1.386294,0.4,0.223144,1.487714,1.332189,0.212743,8.859499,0.652293,0,0.0,0.0,0.0,0.0,46,55,45,100,59,58,55,0,0.333906,0.880386,0.880386,0.386389,12.053253,161.856466,0.879077,0.39778,0.480725,0.060989,0.071506,0.206236,-0.317566,1,1,1.0
1,2.079442,0.510826,2.772589,0.296266,1.791759,0.454545,0.257829,1.312116,0.789032,0.605484,2.079442,0.510826,2.890372,0.336472,1.791759,0.454545,0.257829,1.318978,0.733234,0.633383,2.079442,0.510826,2.890372,0.336472,1.791759,0.454545,0.257829,1.318978,0.733234,0.633383,8.212183,0.488314,0.119225,0.112637,1.609438,0.615385,0.367725,1.000223,0.661058,0.082187,5.341142,0.40543,0,0.0,0.0,0.0,0.0,49,86,57,100,64,82,58,0,0.669471,0.421667,0.421667,0.136669,7.267052,120.948289,0.522817,0.234587,0.274652,-0.018201,-0.004014,0.150158,0.010465,2,2,2.0
2,2.079442,0.552069,3.258097,0.533111,1.791759,0.526316,0.305382,1.260385,0.601267,0.699367,2.079442,0.552069,3.258097,0.533111,1.791759,0.526316,0.305382,1.259183,0.597581,0.701209,2.079442,0.552069,3.258097,0.533111,1.791759,0.526316,0.305382,1.28469,0.623996,0.688002,9.238203,0.545916,0.149012,0.138903,1.386294,0.6,0.356675,0.852124,0.390172,0.080816,5.350192,0.402034,1,0.0,0.326,0.326,0.7783,59,86,82,100,68,92,55,0,0.804914,0.630674,0.630674,0.204235,9.068271,137.245312,0.639116,0.284338,0.341486,-0.012669,-0.066156,-0.393252,-0.371144,1,1,1.0
3,0.693147,0.251314,2.302585,0.396415,0.0,0.0,0.0,1.449417,2.0,0.0,0.693147,0.251314,2.197225,0.405465,0.693147,0.25,0.133531,1.233261,1.586042,0.206979,0.693147,0.251314,2.197225,0.405465,0.693147,0.25,0.133531,1.233261,1.586042,0.206979,13.437407,0.754944,0.28497,0.250736,0.0,0.0,0.0,1.317023,2.0,0.391609,12.590033,0.884996,0,0.0,0.0,0.0,0.0,52,53,56,62,62,52,52,0,0.0,0.918765,0.918765,0.288832,10.312107,153.68088,0.760042,0.349093,0.412119,0.094811,0.069599,-0.150121,0.774025,1,1,1.0
4,1.098612,0.336472,1.098612,0.076961,1.386294,0.545455,0.318454,0.676052,0.30544,0.84728,1.098612,0.336472,1.098612,0.087011,1.386294,0.545455,0.318454,0.747521,0.38207,0.808965,1.098612,0.336472,1.098612,0.087011,1.386294,0.545455,0.318454,0.747521,0.38207,0.808965,15.591051,0.896182,0.401571,0.337594,1.098612,0.666667,0.405465,0.0,0.0,0.247688,9.438157,0.703829,1,0.0,0.0,0.0,0.0,69,70,69,100,66,74,66,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.003651,-0.003651,-0.100281,-0.100281,1,1,1.0


In [13]:
# List the column names currently present in the test feature table.
test_ft_all3.columns

Index(['len_word_dff_org', 'len_word_dff_rt_org', 'len_char_dff_org',
       'len_char_dff_rt_org', 'common_words_org', 'common_rate_org',
       'jaccard_similarity_org', 'tfidf_L1_org', 'tfidf_L2_org',
       'tfidf_cosine_org', 'len_word_dff_stm', 'len_word_dff_rt_stm',
       'len_char_dff_stm', 'len_char_dff_rt_stm', 'common_words_stm',
       'common_rate_stm', 'jaccard_similarity_stm', 'tfidf_L1_stm',
       'tfidf_L2_stm', 'tfidf_cosine_stm', 'len_word_dff_lmtz',
       'len_word_dff_rt_lmtz', 'len_char_dff_lmtz', 'len_char_dff_rt_lmtz',
       'common_words_lmtz', 'common_rate_lmtz', 'jaccard_similarity_lmtz',
       'tfidf_L1_lmtz', 'tfidf_L2_lmtz', 'tfidf_cosine_lmtz', 'encode_L1',
       'encode_L2', 'encode_cos', 'encode_cos_log', 'common_words_stwd',
       'common_rate_stwd', 'jaccard_similarity_stwd', 'tfidf_L1_stwd',
       'tfidf_L2_stwd', 'cos_glv', 'L1_glv', 'L2_glv', 'same_start_word',
       'diff_sen_neg', 'diff_sen_neu', 'diff_sen_pos', 'diff_sen_com',
       'f

In [10]:
# Fetch the word2vec-based test features from GCS, load them, and preview.
download_from_GCS('test_ft_w2v.csv')
test_ft_w2v = pd.read_csv('test_ft_w2v.csv')
test_ft_w2v.head()

test_ft_w2v.csv downloaded


Unnamed: 0,wmd,norm_wmd,cos_w2v,L1_w2v,canberra_w2v,L2_w2v,minkowski_w2v,braycurtis_w2v,skew_q1vec,skew_q2vec,kur_q1vec,kur_q2vec
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.880386,0.880386,0.386389,12.053253,161.856466,0.879077,0.39778,0.480725,0.060989,0.071506,0.206236,-0.317566
2,0.421667,0.421667,0.136669,7.267052,120.948289,0.522817,0.234587,0.274652,-0.018201,-0.004014,0.150158,0.010465
3,0.630674,0.630674,0.204235,9.068271,137.245312,0.639116,0.284338,0.341486,-0.012669,-0.066156,-0.393252,-0.371144
4,0.918765,0.918765,0.288832,10.312107,153.68088,0.760042,0.349093,0.412119,0.094811,0.069599,-0.150121,0.774025


In [16]:
# Sanity check: (rows, columns) of the loaded word2vec feature table.
test_ft_w2v.shape

(2345797, 12)

In [19]:
# Re-read the word2vec feature file, discard its first row, and renumber the
# remaining rows from 0 without keeping the old index as a column.
test_ft_w2v = (
    pd.read_csv('test_ft_w2v.csv')
    .iloc[1:]
    .reset_index(drop=True)
)
test_ft_w2v.head()

Unnamed: 0,wmd,norm_wmd,cos_w2v,L1_w2v,canberra_w2v,L2_w2v,minkowski_w2v,braycurtis_w2v,skew_q1vec,skew_q2vec,kur_q1vec,kur_q2vec
0,0.880386,0.880386,0.386389,12.053253,161.856466,0.879077,0.39778,0.480725,0.060989,0.071506,0.206236,-0.317566
1,0.421667,0.421667,0.136669,7.267052,120.948289,0.522817,0.234587,0.274652,-0.018201,-0.004014,0.150158,0.010465
2,0.630674,0.630674,0.204235,9.068271,137.245312,0.639116,0.284338,0.341486,-0.012669,-0.066156,-0.393252,-0.371144
3,0.918765,0.918765,0.288832,10.312107,153.68088,0.760042,0.349093,0.412119,0.094811,0.069599,-0.150121,0.774025
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.003651,-0.003651,-0.100281,-0.100281


In [27]:
# The word2vec-derived feature columns to refresh, named once so both sides of
# the assignment are guaranteed to use the same list.
w2v_feature_cols = ['wmd', 'norm_wmd', 'cos_w2v', 'L1_w2v',
                    'canberra_w2v', 'L2_w2v', 'minkowski_w2v', 'braycurtis_w2v',
                    'skew_q1vec', 'skew_q2vec', 'kur_q1vec', 'kur_q2vec']

# The .loc assignment matches rows on the index. The raw test_ft_w2v.csv loads
# with one extra leading row, so if the row-dropping cell was not run the
# values would be written one row off without any error. Fail loudly instead.
assert len(test_ft_w2v) == len(test_ft_all3), (
    'test_ft_w2v has %d rows but test_ft_all3 has %d; '
    'drop the extra leading row of test_ft_w2v before merging'
    % (len(test_ft_w2v), len(test_ft_all3)))

test_ft_all3.loc[:, w2v_feature_cols] = test_ft_w2v.loc[:, w2v_feature_cols]
test_ft_all3.head()

Unnamed: 0,len_word_dff_org,len_word_dff_rt_org,len_char_dff_org,len_char_dff_rt_org,common_words_org,common_rate_org,jaccard_similarity_org,tfidf_L1_org,tfidf_L2_org,tfidf_cosine_org,len_word_dff_stm,len_word_dff_rt_stm,len_char_dff_stm,len_char_dff_rt_stm,common_words_stm,common_rate_stm,jaccard_similarity_stm,tfidf_L1_stm,tfidf_L2_stm,tfidf_cosine_stm,len_word_dff_lmtz,len_word_dff_rt_lmtz,len_char_dff_lmtz,len_char_dff_rt_lmtz,common_words_lmtz,common_rate_lmtz,jaccard_similarity_lmtz,tfidf_L1_lmtz,tfidf_L2_lmtz,tfidf_cosine_lmtz,encode_L1,encode_L2,encode_cos,encode_cos_log,common_words_stwd,common_rate_stwd,jaccard_similarity_stwd,tfidf_L1_stwd,tfidf_L2_stwd,cos_glv,L1_glv,L2_glv,same_start_word,diff_sen_neg,diff_sen_neu,diff_sen_pos,diff_sen_com,fuzz_qratio,fuzz_WRatio,fuzz_partial_ratio,fuzz_partial_token_set_ratio,fuzz_partial_token_sort_ratio,fuzz_token_set_ratio,fuzz_token_sort_ratio,repeat,tfidf_cosine_stwd,wmd,norm_wmd,cos_w2v,L1_w2v,canberra_w2v,L2_w2v,minkowski_w2v,braycurtis_w2v,skew_q1vec,skew_q2vec,kur_q1vec,kur_q2vec,q1_freq,q2_freq,q1_q2_freq_average
0,1.386294,0.215111,2.197225,0.14842,1.386294,0.25,0.133531,1.719823,1.43517,0.282415,1.386294,0.215111,2.302585,0.173511,1.386294,0.25,0.133531,1.734239,1.47502,0.26249,1.386294,0.215111,2.197225,0.157186,1.386294,0.25,0.133531,1.689093,1.45219,0.273905,13.228489,0.784084,0.307394,0.268036,1.386294,0.4,0.223144,1.487714,1.332189,0.212743,8.859499,0.652293,0,0.0,0.0,0.0,0.0,46,55,45,100,59,58,55,0,0.333906,0.880386,0.880386,0.386389,12.053253,161.856466,0.879077,0.39778,0.480725,0.060989,0.071506,0.206236,-0.317566,1,1,1.0
1,2.079442,0.510826,2.772589,0.296266,1.791759,0.454545,0.257829,1.312116,0.789032,0.605484,2.079442,0.510826,2.890372,0.336472,1.791759,0.454545,0.257829,1.318978,0.733234,0.633383,2.079442,0.510826,2.890372,0.336472,1.791759,0.454545,0.257829,1.318978,0.733234,0.633383,8.212183,0.488314,0.119225,0.112637,1.609438,0.615385,0.367725,1.000223,0.661058,0.082187,5.341142,0.40543,0,0.0,0.0,0.0,0.0,49,86,57,100,64,82,58,0,0.669471,0.421667,0.421667,0.136669,7.267052,120.948289,0.522817,0.234587,0.274652,-0.018201,-0.004014,0.150158,0.010465,2,2,2.0
2,2.079442,0.552069,3.258097,0.533111,1.791759,0.526316,0.305382,1.260385,0.601267,0.699367,2.079442,0.552069,3.258097,0.533111,1.791759,0.526316,0.305382,1.259183,0.597581,0.701209,2.079442,0.552069,3.258097,0.533111,1.791759,0.526316,0.305382,1.28469,0.623996,0.688002,9.238203,0.545916,0.149012,0.138903,1.386294,0.6,0.356675,0.852124,0.390172,0.080816,5.350192,0.402034,1,0.0,0.326,0.326,0.7783,59,86,82,100,68,92,55,0,0.804914,0.630674,0.630674,0.204235,9.068271,137.245312,0.639116,0.284338,0.341486,-0.012669,-0.066156,-0.393252,-0.371144,1,1,1.0
3,0.693147,0.251314,2.302585,0.396415,0.0,0.0,0.0,1.449417,2.0,0.0,0.693147,0.251314,2.197225,0.405465,0.693147,0.25,0.133531,1.233261,1.586042,0.206979,0.693147,0.251314,2.197225,0.405465,0.693147,0.25,0.133531,1.233261,1.586042,0.206979,13.437407,0.754944,0.28497,0.250736,0.0,0.0,0.0,1.317023,2.0,0.391609,12.590033,0.884996,0,0.0,0.0,0.0,0.0,52,53,56,62,62,52,52,0,0.0,0.918765,0.918765,0.288832,10.312107,153.68088,0.760042,0.349093,0.412119,0.094811,0.069599,-0.150121,0.774025,1,1,1.0
4,1.098612,0.336472,1.098612,0.076961,1.386294,0.545455,0.318454,0.676052,0.30544,0.84728,1.098612,0.336472,1.098612,0.087011,1.386294,0.545455,0.318454,0.747521,0.38207,0.808965,1.098612,0.336472,1.098612,0.087011,1.386294,0.545455,0.318454,0.747521,0.38207,0.808965,15.591051,0.896182,0.401571,0.337594,1.098612,0.666667,0.405465,0.0,0.0,0.247688,9.438157,0.703829,1,0.0,0.0,0.0,0.0,69,70,69,100,66,74,66,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.003651,-0.003651,-0.100281,-0.100281,1,1,1.0


In [28]:
# Eyeball the last 13 rows of the merged frame after the column assignment.
test_ft_all3.tail(13)

Unnamed: 0,len_word_dff_org,len_word_dff_rt_org,len_char_dff_org,len_char_dff_rt_org,common_words_org,common_rate_org,jaccard_similarity_org,tfidf_L1_org,tfidf_L2_org,tfidf_cosine_org,len_word_dff_stm,len_word_dff_rt_stm,len_char_dff_stm,len_char_dff_rt_stm,common_words_stm,common_rate_stm,jaccard_similarity_stm,tfidf_L1_stm,tfidf_L2_stm,tfidf_cosine_stm,len_word_dff_lmtz,len_word_dff_rt_lmtz,len_char_dff_lmtz,len_char_dff_rt_lmtz,common_words_lmtz,common_rate_lmtz,jaccard_similarity_lmtz,tfidf_L1_lmtz,tfidf_L2_lmtz,tfidf_cosine_lmtz,encode_L1,encode_L2,encode_cos,encode_cos_log,common_words_stwd,common_rate_stwd,jaccard_similarity_stwd,tfidf_L1_stwd,tfidf_L2_stwd,cos_glv,L1_glv,L2_glv,same_start_word,diff_sen_neg,diff_sen_neu,diff_sen_pos,diff_sen_com,fuzz_qratio,fuzz_WRatio,fuzz_partial_ratio,fuzz_partial_token_set_ratio,fuzz_partial_token_sort_ratio,fuzz_token_set_ratio,fuzz_token_sort_ratio,repeat,tfidf_cosine_stwd,wmd,norm_wmd,cos_w2v,L1_w2v,canberra_w2v,L2_w2v,minkowski_w2v,braycurtis_w2v,skew_q1vec,skew_q2vec,kur_q1vec,kur_q2vec,q1_freq,q2_freq,q1_q2_freq_average
2345783,2.833213,0.359374,4.382027,0.41081,2.302585,0.310345,0.168623,2.186061,1.597993,0.201003,2.833213,0.359374,4.174387,0.365601,2.302585,0.310345,0.168623,2.17892,1.571927,0.214036,2.833213,0.359374,4.189655,0.36931,2.302585,0.310345,0.168623,2.177192,1.60741,0.196295,21.887905,1.217412,0.741046,0.554486,0.693147,0.071429,0.036368,2.019223,1.80982,0.169717,7.985647,0.582609,1,0.079,0.046,0.125,0.754,42,86,48,100,53,54,53,0,0.09509,1.100431,1.100431,0.321171,11.097486,153.995164,0.801463,0.361096,0.436163,0.02038,0.048424,0.040814,-0.375181,1,1,1.0
2345784,0.693147,0.200671,1.386294,0.158224,1.609438,0.8,0.510826,0.249104,0.056565,0.971717,0.693147,0.200671,1.386294,0.167054,1.609438,0.8,0.510826,0.255141,0.059268,0.970366,0.693147,0.200671,1.386294,0.167054,1.609438,0.8,0.510826,0.255141,0.059268,0.970366,7.298631,0.441887,0.097632,0.093155,0.693147,0.666667,0.405465,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,90,95,100,100,79,100,90,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.020046,-0.020046,0.128642,0.128642,1,5,3.0
2345785,0.693147,0.111226,2.302585,0.225956,0.693147,0.111111,0.057158,1.740869,1.970196,0.014902,0.693147,0.111226,2.079442,0.189757,0.693147,0.111111,0.057158,1.740577,1.96849,0.015755,0.693147,0.111226,1.94591,0.167054,0.693147,0.111111,0.057158,1.745813,1.96906,0.01547,24.822374,1.3774,0.948616,0.667119,0.0,0.0,0.0,1.59997,2.0,0.581983,13.892348,1.078873,1,0.0,0.05,0.05,0.2757,44,45,42,100,45,47,47,0,0.0,1.284402,1.284402,0.689169,16.113905,190.493371,1.174026,0.534138,0.714795,0.015695,0.246982,-0.045998,-0.006974,1,1,1.0
2345786,0.693147,0.143101,1.791759,0.185717,0.693147,0.142857,0.074108,1.532796,1.478952,0.260524,0.693147,0.143101,1.791759,0.185717,0.693147,0.142857,0.074108,1.532787,1.477612,0.261194,0.693147,0.143101,1.609438,0.154151,0.693147,0.142857,0.074108,1.539753,1.490127,0.254936,19.893303,1.13394,0.64291,0.496469,0.693147,0.333333,0.182322,1.115218,1.328856,0.30285,10.367119,0.778268,0,0.0,0.0,0.0,0.0,37,45,41,100,41,47,37,0,0.335572,0.874798,0.874798,0.445123,13.194197,176.613459,0.943528,0.423556,0.551717,0.154236,0.073064,0.081606,-0.034445,1,1,1.0
2345787,0.693147,0.090972,3.433987,0.411099,0.0,0.0,0.0,1.956498,2.0,0.0,0.693147,0.090972,2.772589,0.274437,0.0,0.0,0.0,1.922617,2.0,0.0,0.693147,0.090972,2.772589,0.274437,0.0,0.0,0.0,1.922616,2.0,0.0,22.170279,1.230076,0.756544,0.563348,0.0,0.0,0.0,1.84393,2.0,0.41033,11.499119,0.905903,0,0.286,0.286,0.0,0.5574,39,39,42,38,38,34,34,0,0.0,1.285143,1.285143,0.592873,15.110735,185.722198,1.08892,0.490143,0.649007,0.160918,0.081306,0.521031,-0.084391,1,1,1.0
2345788,2.079442,0.372675,3.367296,0.324661,1.791759,0.333333,0.182322,1.760721,1.362068,0.318966,2.079442,0.372675,3.332205,0.336472,1.791759,0.333333,0.182322,1.807264,1.403851,0.298074,2.079442,0.372675,3.295837,0.340927,1.791759,0.333333,0.182322,1.830591,1.471667,0.264167,16.117073,0.914016,0.417713,0.349045,1.386294,0.315789,0.17185,1.581044,1.357799,0.169286,7.627158,0.58187,0,0.0,0.289,0.289,0.2163,53,86,63,100,66,61,63,0,0.3211,0.90918,0.90918,0.355251,11.468057,161.116861,0.842912,0.383261,0.459987,-0.019253,-0.19302,0.250527,0.345161,1,1,1.0
2345789,0.693147,0.040005,2.197225,0.08426,1.609438,0.190476,0.100083,2.1231,1.917366,0.041317,0.693147,0.040005,1.386294,0.036149,1.609438,0.195122,0.102654,2.10761,1.914591,0.042704,0.693147,0.040005,1.386294,0.036149,1.791759,0.25,0.133531,2.07745,1.861524,0.069238,17.489933,1.019381,0.519569,0.418427,0.0,0.0,0.0,1.966482,2.0,0.547357,12.738727,1.046286,0,0.0,0.0,0.0,0.0,39,45,40,100,43,47,43,0,0.0,1.284466,1.284466,0.647171,16.055525,195.902385,1.137692,0.506425,0.702426,-0.074447,-0.172877,-0.13827,-0.228354,1,1,1.0
2345790,2.890372,0.651762,4.127134,0.623394,1.609438,0.275862,0.14842,1.835111,1.774129,0.112935,2.890372,0.651762,4.094345,0.642256,1.791759,0.344828,0.189242,1.687388,1.29612,0.35194,2.890372,0.651762,4.110874,0.653926,1.94591,0.413793,0.231802,1.655829,1.247595,0.376203,20.903042,1.164076,0.677536,0.517326,0.693147,0.133333,0.068993,1.7186,1.831935,0.310712,10.37597,0.788305,1,0.0,0.0,0.0,0.0,36,86,47,100,49,54,42,0,0.084032,1.139397,1.139397,0.629165,15.675251,186.621139,1.121753,0.501653,0.679637,0.015262,0.149636,0.454326,0.082635,1,1,1.0
2345791,1.098612,0.182322,2.484907,0.245673,0.693147,0.095238,0.04879,1.67635,1.432886,0.283557,1.098612,0.182322,2.302585,0.215111,0.693147,0.095238,0.04879,1.714318,1.560342,0.219829,1.098612,0.182322,2.197225,0.195745,0.693147,0.1,0.051293,1.709162,1.56618,0.21691,18.053555,1.014984,0.515096,0.415479,0.693147,0.166667,0.087011,1.465798,1.379817,0.463771,12.648247,0.96309,0,0.32,0.32,0.0,0.6908,39,43,43,100,48,45,43,0,0.310091,1.096173,1.096173,0.592424,14.874222,177.014612,1.088507,0.496921,0.628627,0.000444,0.049164,-0.546281,-0.312402,1,1,1.0
2345792,1.609438,0.451985,1.791759,0.131769,0.693147,0.133333,0.068993,1.541286,1.405133,0.297434,1.609438,0.451985,2.079442,0.206614,0.693147,0.133333,0.068993,1.528475,1.3103,0.34485,1.609438,0.451985,2.079442,0.206614,0.693147,0.133333,0.068993,1.528435,1.309391,0.345305,11.19372,0.659808,0.217673,0.196942,0.693147,0.222222,0.117783,1.332691,1.350336,0.33868,11.090839,0.823019,0,0.0,0.0,0.0,0.0,46,46,46,100,46,48,43,0,0.324832,0.993352,0.993352,0.542961,14.251065,184.995636,1.042076,0.475223,0.616715,0.067357,0.038022,0.555009,0.25525,1,1,1.0


In [0]:
# Recompute encode_cos_log = log(1 + encode_cos) for the whole column in a
# single vectorized ufunc call instead of a per-element Python lambda;
# log1p is NumPy's dedicated (and numerically more accurate) form of log(1 + x).
test_ft_all3['encode_cos_log'] = np.log1p(test_ft_all3['encode_cos'])

In [30]:
# Append the columns of test_ft_glv from position 3 onward to the merged
# frame, side by side (the first three glv columns are left out).
test_ft_all_t = pd.concat(
    [test_ft_all3, test_ft_glv.iloc[:, 3:]],
    axis='columns',
)
test_ft_all_t.head()

Unnamed: 0,len_word_dff_org,len_word_dff_rt_org,len_char_dff_org,len_char_dff_rt_org,common_words_org,common_rate_org,jaccard_similarity_org,tfidf_L1_org,tfidf_L2_org,tfidf_cosine_org,len_word_dff_stm,len_word_dff_rt_stm,len_char_dff_stm,len_char_dff_rt_stm,common_words_stm,common_rate_stm,jaccard_similarity_stm,tfidf_L1_stm,tfidf_L2_stm,tfidf_cosine_stm,len_word_dff_lmtz,len_word_dff_rt_lmtz,len_char_dff_lmtz,len_char_dff_rt_lmtz,common_words_lmtz,common_rate_lmtz,jaccard_similarity_lmtz,tfidf_L1_lmtz,tfidf_L2_lmtz,tfidf_cosine_lmtz,encode_L1,encode_L2,encode_cos,encode_cos_log,common_words_stwd,common_rate_stwd,jaccard_similarity_stwd,tfidf_L1_stwd,tfidf_L2_stwd,cos_glv,L1_glv,L2_glv,same_start_word,diff_sen_neg,diff_sen_neu,diff_sen_pos,diff_sen_com,fuzz_qratio,fuzz_WRatio,fuzz_partial_ratio,fuzz_partial_token_set_ratio,fuzz_partial_token_sort_ratio,fuzz_token_set_ratio,fuzz_token_sort_ratio,repeat,tfidf_cosine_stwd,wmd,norm_wmd,cos_w2v,L1_w2v,canberra_w2v,L2_w2v,minkowski_w2v,braycurtis_w2v,skew_q1vec,skew_q2vec,kur_q1vec,kur_q2vec,q1_freq,q2_freq,q1_q2_freq_average,canberra_glv,minkowski_glv,braycurtis_glv,skew_q1vec_glv,skew_q2vec_glv,kur_q1vec_glv,kur_q2vec_glv
0,1.386294,0.215111,2.197225,0.14842,1.386294,0.25,0.133531,1.719823,1.43517,0.282415,1.386294,0.215111,2.302585,0.173511,1.386294,0.25,0.133531,1.734239,1.47502,0.26249,1.386294,0.215111,2.197225,0.157186,1.386294,0.25,0.133531,1.689093,1.45219,0.273905,13.228489,0.784084,0.307394,0.268036,1.386294,0.4,0.223144,1.487714,1.332189,0.212743,8.859499,0.652293,0,0.0,0.0,0.0,0.0,46,55,45,100,59,58,55,0,0.333906,0.880386,0.880386,0.386389,12.053253,161.856466,0.879077,0.39778,0.480725,0.060989,0.071506,0.206236,-0.317566,1,1,1.0,157.912241,0.297087,0.420838,1.060148,2.278803,14.769823,25.698478
1,2.079442,0.510826,2.772589,0.296266,1.791759,0.454545,0.257829,1.312116,0.789032,0.605484,2.079442,0.510826,2.890372,0.336472,1.791759,0.454545,0.257829,1.318978,0.733234,0.633383,2.079442,0.510826,2.890372,0.336472,1.791759,0.454545,0.257829,1.318978,0.733234,0.633383,8.212183,0.488314,0.119225,0.112637,1.609438,0.615385,0.367725,1.000223,0.661058,0.082187,5.341142,0.40543,0,0.0,0.0,0.0,0.0,49,86,57,100,64,82,58,0,0.669471,0.421667,0.421667,0.136669,7.267052,120.948289,0.522817,0.234587,0.274652,-0.018201,-0.004014,0.150158,0.010465,2,2,2.0,126.617151,0.200035,0.261458,3.529328,2.363401,37.377708,25.717867
2,2.079442,0.552069,3.258097,0.533111,1.791759,0.526316,0.305382,1.260385,0.601267,0.699367,2.079442,0.552069,3.258097,0.533111,1.791759,0.526316,0.305382,1.259183,0.597581,0.701209,2.079442,0.552069,3.258097,0.533111,1.791759,0.526316,0.305382,1.28469,0.623996,0.688002,9.238203,0.545916,0.149012,0.138903,1.386294,0.6,0.356675,0.852124,0.390172,0.080816,5.350192,0.402034,1,0.0,0.326,0.326,0.7783,59,86,82,100,68,92,55,0,0.804914,0.630674,0.630674,0.204235,9.068271,137.245312,0.639116,0.284338,0.341486,-0.012669,-0.066156,-0.393252,-0.371144,1,1,1.0,112.623821,0.18842,0.246372,0.661936,0.434966,28.877237,12.525225
3,0.693147,0.251314,2.302585,0.396415,0.0,0.0,0.0,1.449417,2.0,0.0,0.693147,0.251314,2.197225,0.405465,0.693147,0.25,0.133531,1.233261,1.586042,0.206979,0.693147,0.251314,2.197225,0.405465,0.693147,0.25,0.133531,1.233261,1.586042,0.206979,13.437407,0.754944,0.28497,0.250736,0.0,0.0,0.0,1.317023,2.0,0.391609,12.590033,0.884996,0,0.0,0.0,0.0,0.0,52,53,56,62,62,52,52,0,0.0,0.918765,0.918765,0.288832,10.312107,153.68088,0.760042,0.349093,0.412119,0.094811,0.069599,-0.150121,0.774025,1,1,1.0,177.057767,0.394867,0.542606,-0.385024,0.07923,2.058123,6.054589
4,1.098612,0.336472,1.098612,0.076961,1.386294,0.545455,0.318454,0.676052,0.30544,0.84728,1.098612,0.336472,1.098612,0.087011,1.386294,0.545455,0.318454,0.747521,0.38207,0.808965,1.098612,0.336472,1.098612,0.087011,1.386294,0.545455,0.318454,0.747521,0.38207,0.808965,15.591051,0.896182,0.401571,0.337594,1.098612,0.666667,0.405465,0.0,0.0,0.247688,9.438157,0.703829,1,0.0,0.0,0.0,0.0,69,70,69,100,66,74,66,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.003651,-0.003651,-0.100281,-0.100281,1,1,1.0,146.50971,0.33099,0.39548,0.006737,1.313208,1.708022,13.959816


In [0]:
# Make the widened frame the new base for the next concat. This is a plain
# rebinding: both names now refer to the same DataFrame object (no copy).
test_ft_all3 = test_ft_all_t

In [32]:
# Append the columns of test_ft_encode from position 3 onward to the merged
# frame, side by side (the first three encode columns are left out).
test_ft_all_t = pd.concat(
    [test_ft_all3, test_ft_encode.iloc[:, 3:]],
    axis='columns',
)
test_ft_all_t.head()

Unnamed: 0,len_word_dff_org,len_word_dff_rt_org,len_char_dff_org,len_char_dff_rt_org,common_words_org,common_rate_org,jaccard_similarity_org,tfidf_L1_org,tfidf_L2_org,tfidf_cosine_org,len_word_dff_stm,len_word_dff_rt_stm,len_char_dff_stm,len_char_dff_rt_stm,common_words_stm,common_rate_stm,jaccard_similarity_stm,tfidf_L1_stm,tfidf_L2_stm,tfidf_cosine_stm,len_word_dff_lmtz,len_word_dff_rt_lmtz,len_char_dff_lmtz,len_char_dff_rt_lmtz,common_words_lmtz,common_rate_lmtz,jaccard_similarity_lmtz,tfidf_L1_lmtz,tfidf_L2_lmtz,tfidf_cosine_lmtz,encode_L1,encode_L2,encode_cos,encode_cos_log,common_words_stwd,common_rate_stwd,jaccard_similarity_stwd,tfidf_L1_stwd,tfidf_L2_stwd,cos_glv,...,diff_sen_pos,diff_sen_com,fuzz_qratio,fuzz_WRatio,fuzz_partial_ratio,fuzz_partial_token_set_ratio,fuzz_partial_token_sort_ratio,fuzz_token_set_ratio,fuzz_token_sort_ratio,repeat,tfidf_cosine_stwd,wmd,norm_wmd,cos_w2v,L1_w2v,canberra_w2v,L2_w2v,minkowski_w2v,braycurtis_w2v,skew_q1vec,skew_q2vec,kur_q1vec,kur_q2vec,q1_freq,q2_freq,q1_q2_freq_average,canberra_glv,minkowski_glv,braycurtis_glv,skew_q1vec_glv,skew_q2vec_glv,kur_q1vec_glv,kur_q2vec_glv,canberra_encode,minkowski_encode,braycurtis_encode,skew_q1vec_encode,skew_q2vec_encode,kur_q1vec_encode,kur_q2vec_encode
0,1.386294,0.215111,2.197225,0.14842,1.386294,0.25,0.133531,1.719823,1.43517,0.282415,1.386294,0.215111,2.302585,0.173511,1.386294,0.25,0.133531,1.734239,1.47502,0.26249,1.386294,0.215111,2.197225,0.157186,1.386294,0.25,0.133531,1.689093,1.45219,0.273905,13.228489,0.784084,0.307394,0.268036,1.386294,0.4,0.223144,1.487714,1.332189,0.212743,...,0.0,0.0,46,55,45,100,59,58,55,0,0.333906,0.880386,0.880386,0.386389,12.053253,161.856466,0.879077,0.39778,0.480725,0.060989,0.071506,0.206236,-0.317566,1,1,1.0,157.912241,0.297087,0.420838,1.060148,2.278803,14.769823,25.698478,230.45908,0.335352,0.373265,-0.009512,0.00034,-1.244313,-1.23378
1,2.079442,0.510826,2.772589,0.296266,1.791759,0.454545,0.257829,1.312116,0.789032,0.605484,2.079442,0.510826,2.890372,0.336472,1.791759,0.454545,0.257829,1.318978,0.733234,0.633383,2.079442,0.510826,2.890372,0.336472,1.791759,0.454545,0.257829,1.318978,0.733234,0.633383,8.212183,0.488314,0.119225,0.112637,1.609438,0.615385,0.367725,1.000223,0.661058,0.082187,...,0.0,0.0,49,86,57,100,64,82,58,0,0.669471,0.421667,0.421667,0.136669,7.267052,120.948289,0.522817,0.234587,0.274652,-0.018201,-0.004014,0.150158,0.010465,2,2,2.0,126.617151,0.200035,0.261458,3.529328,2.363401,37.377708,25.717867,167.320022,0.209699,0.216852,0.043952,0.034034,-1.034802,-1.156297
2,2.079442,0.552069,3.258097,0.533111,1.791759,0.526316,0.305382,1.260385,0.601267,0.699367,2.079442,0.552069,3.258097,0.533111,1.791759,0.526316,0.305382,1.259183,0.597581,0.701209,2.079442,0.552069,3.258097,0.533111,1.791759,0.526316,0.305382,1.28469,0.623996,0.688002,9.238203,0.545916,0.149012,0.138903,1.386294,0.6,0.356675,0.852124,0.390172,0.080816,...,0.326,0.7783,59,86,82,100,68,92,55,0,0.804914,0.630674,0.630674,0.204235,9.068271,137.245312,0.639116,0.284338,0.341486,-0.012669,-0.066156,-0.393252,-0.371144,1,1,1.0,112.623821,0.18842,0.246372,0.661936,0.434966,28.877237,12.525225,186.358232,0.234621,0.254429,0.005193,0.0233,-0.798762,-0.700285
3,0.693147,0.251314,2.302585,0.396415,0.0,0.0,0.0,1.449417,2.0,0.0,0.693147,0.251314,2.197225,0.405465,0.693147,0.25,0.133531,1.233261,1.586042,0.206979,0.693147,0.251314,2.197225,0.405465,0.693147,0.25,0.133531,1.233261,1.586042,0.206979,13.437407,0.754944,0.28497,0.250736,0.0,0.0,0.0,1.317023,2.0,0.391609,...,0.0,0.0,52,53,56,62,62,52,52,0,0.0,0.918765,0.918765,0.288832,10.312107,153.68088,0.760042,0.349093,0.412119,0.094811,0.069599,-0.150121,0.774025,1,1,1.0,177.057767,0.394867,0.542606,-0.385024,0.07923,2.058123,6.054589,248.603338,0.315308,0.387871,0.054974,0.111906,-0.414613,-1.089561
4,1.098612,0.336472,1.098612,0.076961,1.386294,0.545455,0.318454,0.676052,0.30544,0.84728,1.098612,0.336472,1.098612,0.087011,1.386294,0.545455,0.318454,0.747521,0.38207,0.808965,1.098612,0.336472,1.098612,0.087011,1.386294,0.545455,0.318454,0.747521,0.38207,0.808965,15.591051,0.896182,0.401571,0.337594,1.098612,0.666667,0.405465,0.0,0.0,0.247688,...,0.0,0.0,69,70,69,100,66,74,66,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.003651,-0.003651,-0.100281,-0.100281,1,1,1.0,146.50971,0.33099,0.39548,0.006737,1.313208,1.708022,13.959816,269.073492,0.381538,0.475944,0.02018,-0.040126,-0.705323,-0.276419


In [33]:
# Write the merged test features to CSV (row index omitted) and push the
# file to the bucket as a checkpoint.
out_csv = 'test_ft_all3.csv'
test_ft_all_t.to_csv(out_csv, index=False)
upload_to_GCS(out_csv)

test_ft_all3.csv uploaded


In [34]:
# Pull the snw feature file for the test set from the bucket, load it, and
# preview the first rows.
snw_csv = 'test_ft_snw.csv'
download_from_GCS(snw_csv)
test_ft_snw = pd.read_csv(snw_csv)
test_ft_snw.head()

test_ft_snw.csv downloaded


Unnamed: 0,same_end_word,id
0,0,0
1,0,1
2,0,2
3,0,3
4,1,4


In [35]:
# Append the test_ft_snw columns to the merged frame.
# This cell rebinds test_ft_all3 to its own previous output, so a plain
# concat would add the snw columns a second time whenever the cell is
# re-run. Dropping any already-present copies first keeps the cell
# idempotent; on the first run nothing is dropped (errors='ignore').
test_ft_all3 = test_ft_all_t
test_ft_all_t = pd.concat(
    [
        test_ft_all3.drop(columns=test_ft_snw.columns, errors='ignore'),
        test_ft_snw,
    ],
    axis=1,
)
test_ft_all_t.head()

Unnamed: 0,len_word_dff_org,len_word_dff_rt_org,len_char_dff_org,len_char_dff_rt_org,common_words_org,common_rate_org,jaccard_similarity_org,tfidf_L1_org,tfidf_L2_org,tfidf_cosine_org,len_word_dff_stm,len_word_dff_rt_stm,len_char_dff_stm,len_char_dff_rt_stm,common_words_stm,common_rate_stm,jaccard_similarity_stm,tfidf_L1_stm,tfidf_L2_stm,tfidf_cosine_stm,len_word_dff_lmtz,len_word_dff_rt_lmtz,len_char_dff_lmtz,len_char_dff_rt_lmtz,common_words_lmtz,common_rate_lmtz,jaccard_similarity_lmtz,tfidf_L1_lmtz,tfidf_L2_lmtz,tfidf_cosine_lmtz,encode_L1,encode_L2,encode_cos,encode_cos_log,common_words_stwd,common_rate_stwd,jaccard_similarity_stwd,tfidf_L1_stwd,tfidf_L2_stwd,cos_glv,...,fuzz_qratio,fuzz_WRatio,fuzz_partial_ratio,fuzz_partial_token_set_ratio,fuzz_partial_token_sort_ratio,fuzz_token_set_ratio,fuzz_token_sort_ratio,repeat,tfidf_cosine_stwd,wmd,norm_wmd,cos_w2v,L1_w2v,canberra_w2v,L2_w2v,minkowski_w2v,braycurtis_w2v,skew_q1vec,skew_q2vec,kur_q1vec,kur_q2vec,q1_freq,q2_freq,q1_q2_freq_average,canberra_glv,minkowski_glv,braycurtis_glv,skew_q1vec_glv,skew_q2vec_glv,kur_q1vec_glv,kur_q2vec_glv,canberra_encode,minkowski_encode,braycurtis_encode,skew_q1vec_encode,skew_q2vec_encode,kur_q1vec_encode,kur_q2vec_encode,same_end_word,id
0,1.386294,0.215111,2.197225,0.14842,1.386294,0.25,0.133531,1.719823,1.43517,0.282415,1.386294,0.215111,2.302585,0.173511,1.386294,0.25,0.133531,1.734239,1.47502,0.26249,1.386294,0.215111,2.197225,0.157186,1.386294,0.25,0.133531,1.689093,1.45219,0.273905,13.228489,0.784084,0.307394,0.268036,1.386294,0.4,0.223144,1.487714,1.332189,0.212743,...,46,55,45,100,59,58,55,0,0.333906,0.880386,0.880386,0.386389,12.053253,161.856466,0.879077,0.39778,0.480725,0.060989,0.071506,0.206236,-0.317566,1,1,1.0,157.912241,0.297087,0.420838,1.060148,2.278803,14.769823,25.698478,230.45908,0.335352,0.373265,-0.009512,0.00034,-1.244313,-1.23378,0,0
1,2.079442,0.510826,2.772589,0.296266,1.791759,0.454545,0.257829,1.312116,0.789032,0.605484,2.079442,0.510826,2.890372,0.336472,1.791759,0.454545,0.257829,1.318978,0.733234,0.633383,2.079442,0.510826,2.890372,0.336472,1.791759,0.454545,0.257829,1.318978,0.733234,0.633383,8.212183,0.488314,0.119225,0.112637,1.609438,0.615385,0.367725,1.000223,0.661058,0.082187,...,49,86,57,100,64,82,58,0,0.669471,0.421667,0.421667,0.136669,7.267052,120.948289,0.522817,0.234587,0.274652,-0.018201,-0.004014,0.150158,0.010465,2,2,2.0,126.617151,0.200035,0.261458,3.529328,2.363401,37.377708,25.717867,167.320022,0.209699,0.216852,0.043952,0.034034,-1.034802,-1.156297,0,1
2,2.079442,0.552069,3.258097,0.533111,1.791759,0.526316,0.305382,1.260385,0.601267,0.699367,2.079442,0.552069,3.258097,0.533111,1.791759,0.526316,0.305382,1.259183,0.597581,0.701209,2.079442,0.552069,3.258097,0.533111,1.791759,0.526316,0.305382,1.28469,0.623996,0.688002,9.238203,0.545916,0.149012,0.138903,1.386294,0.6,0.356675,0.852124,0.390172,0.080816,...,59,86,82,100,68,92,55,0,0.804914,0.630674,0.630674,0.204235,9.068271,137.245312,0.639116,0.284338,0.341486,-0.012669,-0.066156,-0.393252,-0.371144,1,1,1.0,112.623821,0.18842,0.246372,0.661936,0.434966,28.877237,12.525225,186.358232,0.234621,0.254429,0.005193,0.0233,-0.798762,-0.700285,0,2
3,0.693147,0.251314,2.302585,0.396415,0.0,0.0,0.0,1.449417,2.0,0.0,0.693147,0.251314,2.197225,0.405465,0.693147,0.25,0.133531,1.233261,1.586042,0.206979,0.693147,0.251314,2.197225,0.405465,0.693147,0.25,0.133531,1.233261,1.586042,0.206979,13.437407,0.754944,0.28497,0.250736,0.0,0.0,0.0,1.317023,2.0,0.391609,...,52,53,56,62,62,52,52,0,0.0,0.918765,0.918765,0.288832,10.312107,153.68088,0.760042,0.349093,0.412119,0.094811,0.069599,-0.150121,0.774025,1,1,1.0,177.057767,0.394867,0.542606,-0.385024,0.07923,2.058123,6.054589,248.603338,0.315308,0.387871,0.054974,0.111906,-0.414613,-1.089561,0,3
4,1.098612,0.336472,1.098612,0.076961,1.386294,0.545455,0.318454,0.676052,0.30544,0.84728,1.098612,0.336472,1.098612,0.087011,1.386294,0.545455,0.318454,0.747521,0.38207,0.808965,1.098612,0.336472,1.098612,0.087011,1.386294,0.545455,0.318454,0.747521,0.38207,0.808965,15.591051,0.896182,0.401571,0.337594,1.098612,0.666667,0.405465,0.0,0.0,0.247688,...,69,70,69,100,66,74,66,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.003651,-0.003651,-0.100281,-0.100281,1,1,1.0,146.50971,0.33099,0.39548,0.006737,1.313208,1.708022,13.959816,269.073492,0.381538,0.475944,0.02018,-0.040126,-0.705323,-0.276419,1,4


In [36]:
# (rows, columns) of the fully merged test feature frame before saving.
test_ft_all_t.shape

(2345796, 87)

In [37]:
# Overwrite the local CSV with the final merged test features (row index
# omitted) and upload it to the bucket under the same object name.
final_csv = 'test_ft_all3.csv'
test_ft_all_t.to_csv(final_csv, index=False)
upload_to_GCS(final_csv)

test_ft_all3.csv uploaded
