In [1751]:
import pandas as pd
import numpy as np
import glob
import jieba
from sklearn.metrics.pairwise import cosine_similarity
from jaccard_index.jaccard import jaccard_index
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
import pickle
from scipy import spatial
from scipy.spatial import distance
import nltk
import lightgbm as lgb
from sklearn.externals import joblib

In [2]:
# Load the pretrained Tencent word embedding through the jslearn wrapper;
# `embedding.tc_wv` is the object get_vec queries for word vectors.
# NOTE(review): './' is presumably the directory holding the model files — confirm.
from jslearn import nlp
embedding = nlp.TencentWordEmbedding()
embedding.load_pretrained('./')

  from numpy.core.umath_tests import inner1d


Loading model
Finish loading model


# get all data in one frame

In [3]:
# Read every Excel workbook under ./data/ and stack them into one frame.
# The frames are collected first and concatenated once: calling pd.concat
# inside the loop re-copies every previously loaded row on each iteration.
# sorted() makes the row order independent of the filesystem's listing order,
# which matters later because duplicates are resolved by keeping the first row.
_frames = [pd.read_excel(file_name) for file_name in sorted(glob.glob('./data/' + '*.xlsx'))]
# pd.concat raises on an empty list, so fall back to an empty frame (what the
# original loop produced) when no workbook is found.
glued_data = pd.concat(_frames, axis=0) if _frames else pd.DataFrame()

In [4]:
# Cache the combined frame on disk. index=False is not passed, so the frame's
# index is written as an unnamed first column of df.csv.
glued_data.to_csv('df.csv')

In [5]:
# Reload the cached frame and keep only the three columns used below.
# Selecting by column name also discards the index column written by to_csv.
data = pd.read_csv('df.csv').reset_index(drop=True)[['name','product_path','S_SEGMENT_ITEM']]

In [592]:
# Inspect the frame.
# NOTE(review): the rendered output below includes a `vector` column that is
# only added by a later cell, so this output comes from out-of-order execution.
data

Unnamed: 0,name,product_path,S_SEGMENT_ITEM,vector
0,其他,0x0x,其他业务,"[0.303986, 0.243021, -0.185901, 0.503245, -0.4..."
1,其他,0x0x,其他主营业务,"[0.21007699, -0.109142505, 0.054424997, 0.4106..."
2,其他,0x0x,其他设备,"[0.040454, -0.140426, -0.10517, 0.514267, 0.02..."
3,其他,0x0x,其他收入,"[0.360702, -0.237471, 0.496045, 0.37673, 0.018..."
4,其他,0x0x,其他金属产品,"[0.0133914985, -0.093531005, 0.284247, 0.60189..."
...,...,...,...,...
185238,非传统电信运营商,WC003>WC00201,互联网带宽资源租赁,"[0.09264301, -0.23342049, -0.23131675, 0.13014..."
185239,互联网接入服务,WC003>WC00201>WC0020101,有线宽带业务收入,"[0.16999866, -0.14123634, -0.28885368, 0.41308..."
185240,互联网接入服务,WC003>WC00201>WC0020101,互联网接入,"[0.121864, 0.365356, -0.34149, 0.280265, -0.30..."
185241,互联网接入服务,WC003>WC00201>WC0020101,数据业务收入,"[0.100543, 0.158805, -0.15554, 0.514435, -0.26..."


# function for vector calculation and cosine similarity

In [6]:
def get_vec(x):
    '''Return an embedding vector for the phrase ``x``, or None.

    The whole phrase is looked up in the pretrained embedding first. If that
    lookup fails, the phrase is segmented with jieba and the vectors of the
    tokens that can be looked up are averaged.

    Parameters:
    x (str): the phrase to embed.

    Returns:
    The phrase vector, the mean of the token vectors, or None when no token
    has a vector or when ``x`` is not a string (e.g. a NaN cell).
    '''
    # Missing cells read back from CSV are not strings and cannot be
    # segmented; report "no vector" so callers can filter the row out.
    if not isinstance(x, str):
        return None
    try:
        return embedding.tc_wv.get_vector(x)
    except Exception:
        # Best-effort by design: any lookup failure falls through to the
        # token-level average. `Exception` (rather than a bare except) keeps
        # KeyboardInterrupt/SystemExit working during long .apply() runs.
        pass

    token_vectors = []
    # A separate loop name is used so the parameter `x` is not overwritten.
    for token in jieba.cut(x):
        try:
            token_vectors.append(embedding.tc_wv.get_vector(token))
        except Exception:
            # Tokens without a vector are skipped and do not count in the mean.
            continue
    if not token_vectors:
        return None
    return sum(token_vectors) / len(token_vectors)

In [1154]:
# %timeit
# data_unique_item['cos_sim_best_match'] = data['vector'].apply(lambda x: get_score(x,unique_name))

# get y and its vector, drop rows where vector cannot be calculated

In [14]:
# One row per distinct label name, keeping the original row labels as index.
unique_name = pd.DataFrame({'name': data['name'].drop_duplicates()})

In [15]:
# Embed every label name; get_vec yields None when no vector can be built.
unique_name['y_vec'] = unique_name['name'].apply(get_vec)

In [16]:
# Discard names without a vector and renumber the rows from 0.
unique_name = unique_name.dropna(subset=['y_vec']).reset_index(drop=True)

# get x and its vector, drop rows where vector cannot be calculated

In [17]:
# Embed every segment item; get_vec yields None when no vector can be built.
data['vector'] = data['S_SEGMENT_ITEM'].apply(get_vec)

In [18]:
# Keep only the rows for which a vector could be computed.
data = data.dropna(subset=['vector'])

In [19]:
# Keep the first row of every distinct segment item so the expensive pairwise
# features are computed once per item; they can be merged back onto `data` later.
_repeated_item = data.duplicated(subset='S_SEGMENT_ITEM')
data_unique_item = data[~_repeated_item]

# get data for training

In [631]:
# Inspect the label table: one row per distinct name with its embedding (y_vec).
unique_name

Unnamed: 0,name,y_vec
0,其他,"[-0.087862, -0.174683, 0.140601, 0.450342, 0.2..."
1,内部抵消,"[0.394045, 0.079835, -0.167608, 0.22675, 0.367..."
2,汽车玻璃升降器,"[0.559684, 0.302653, 0.233626, 0.473966, -0.14..."
3,汽车发动机整机,"[0.125825, -0.08049067, -0.22326434, 0.3616963..."
4,汽车活塞,"[0.577598, -0.039549, -0.005468, 0.363049, 0.1..."
...,...,...
3519,原纸,"[0.681907, -0.891124, -0.133084, 0.833789, -0...."
3520,复方甘草酸苷,"[0.099956, -0.778768, 0.051812, 0.117857, -0.3..."
3521,辐照技术服务,"[0.040421665, 0.06281, -0.079647996, 0.1520723..."
3522,冷却系统,"[0.269672, 0.235671, -0.247349, 0.249258, 0.28..."


In [1715]:
# Inspect the de-duplicated segment items that feed the pair-generation loop.
data_unique_item

Unnamed: 0,name,product_path,S_SEGMENT_ITEM,vector
0,其他,0x0x,其他业务,"[0.303986, 0.243021, -0.185901, 0.503245, -0.4..."
1,其他,0x0x,其他主营业务,"[0.21007699, -0.109142505, 0.054424997, 0.4106..."
2,其他,0x0x,其他设备,"[0.040454, -0.140426, -0.10517, 0.514267, 0.02..."
3,其他,0x0x,其他收入,"[0.360702, -0.237471, 0.496045, 0.37673, 0.018..."
4,其他,0x0x,其他金属产品,"[0.0133914985, -0.093531005, 0.284247, 0.60189..."
...,...,...,...,...
184253,物业管理,RE004>RE002002,其他(含来料加工、物业管理等),"[-0.11124044, -0.10276186, 0.004679567, 0.1835..."
184381,商品房开发,RE005>RE003>RE00304,房产销售收入-陆家嘴花园,"[0.19557619, -0.31296283, -0.020488398, 0.2395..."
184384,商品房开发,RE005>RE003>RE00304,房产销售收入-闵浦新苑四村,"[0.148133, -0.23593426, -0.025224999, 0.247650..."
184410,商品房开发,RE005>RE003>RE00304,玉龙宫项目房产销售,"[0.1999072, -0.2212204, -0.0047128024, 0.13620..."


In [992]:
#create dataset for filtering
ml_ready_df = pd.DataFrame()
for row_tuple in data_unique_item.iterrows():
    row = row_tuple[1]
    def jac_calc(y):
        try:
            return jaccard_index(row['S_SEGMENT_ITEM'],y['name'])
        except:
            return 0
    unique_name_c  = unique_name.copy()
    temp = unique_name_c[unique_name_c['name'] == row['name']] #get row with correct x and y match
    temp = temp.append(unique_name_c[unique_name_c['name'] != row['name']].sample(100)) # get all the other rows with incorrect match of x and y
#     not_sampled = unique_name_c.drop(temp.index).index
#     print(row['vector'])
    cos_score = temp.apply(lambda y: 1 - spatial.distance.cosine(row['vector'],y['y_vec']), axis = 1)# get cosine similarity score for semantic
    jac_score = temp.apply(lambda y: jac_calc(y),axis =1) # get jaccard similarity score for vocab distance
    lev_distance = temp.apply(lambda y: nltk.edit_distance(row['S_SEGMENT_ITEM'],y['name']), axis = 1)

    xy_df = pd.DataFrame()
    xy_df['y_name'] = temp['name']
    xy_df['x_name'] = row['S_SEGMENT_ITEM']
    xy_df['cos'] = cos_score
    xy_df['jac'] = jac_score
    xy_df['lev'] = lev_distance
    xy_df['len(x)'] = len(row['S_SEGMENT_ITEM'])
    xy_df['len(y)'] = temp['name'].apply(lambda y: len(y))
    xy_df['y'] = (temp['name'] == row['name']).astype(int)
    ml_ready_df = ml_ready_df.append(xy_df)
ml_ready_df = ml_ready_df.reset_index(drop = True)
ml_ready_df.to_csv('ml_ready_df.csv')

In [1737]:
# Reload the cached pair table. iloc[:, 1:] drops the first column, which is
# the index that to_csv wrote when the table was saved.
ml_ready_df = pd.read_csv('ml_ready_df.csv').iloc[:,1:]
ml_ready_df.shape

(2444574, 8)

In [1433]:
# Inspect the pairwise feature table (one row per x/y candidate pair).
ml_ready_df

Unnamed: 0,y_name,x_name,cos,jac,lev,len(x),len(y),y
1,车用空调,其他业务,0.293106,0.0,4,4,4,0
2,汽车租赁,其他业务,0.498689,0.0,4,4,4,0
3,注射器,其他业务,0.187009,0.0,4,4,3,0
4,导电银浆,其他业务,0.262048,0.0,4,4,4,0
5,交通运输系统集成,其他业务,0.504732,0.0,8,4,8,0
...,...,...,...,...,...,...,...,...
2444468,船舶电气设备,玉龙宫项目房产销售,0.252388,0.0,9,9,6,0
2444469,热轧钢板,玉龙宫项目房产销售,0.287268,0.0,9,9,4,0
2444470,中药批发,玉龙宫项目房产销售,0.338719,0.0,9,9,4,0
2444471,观光电车,玉龙宫项目房产销售,0.381770,0.0,9,9,4,0


# train

### filtering: remove pairs that cannot be detected, e.g. x: 普度 → y: 其他

In [1752]:
def train_n_filter(ml_ready_df, n_rounds=2, test_size=0.30, y_fn_threshold=0.05,
                   x_drop_frac=0.3, random_state=None):
    '''
    Given a dataframe with the columns y_name, x_name, cos, jac, lev, len(x),
    len(y), y, filter out false-negative (FN) data and train the model.

    Each filtering round fits a LightGBM classifier on a train split, finds the
    false negatives on the test split, and then:
    - drops every y whose FN count exceeds ``y_fn_threshold`` of all rows
      carrying that y
    - drops the x of the FN rows with the lowest ``x_drop_frac`` share of
      cosine similarity to their original y word
    A final classifier is then fitted on a train split of the filtered data.

    Parameters:
    ml_ready_df (pandas dataframe): a dataframe that has y_name,x_name,cos,jac,lev,len(x),len(y),y columns
    n_rounds (int): number of filtering rounds (default 2)
    test_size (float): share of rows held out in every split (default 0.30)
    y_fn_threshold (float): FN share above which a y is dropped (default 0.05)
    x_drop_frac (float): share of lowest-cosine FN rows whose x is dropped (default 0.3)
    random_state: forwarded to train_test_split; None keeps the splits random

    Returns:
    bst trained model, a list of x to remove in inference, a list of y to remove in inference
    '''
    non_feature_cols = ['y', 'x_name', 'y_name']
    # Plain lists replace the growing pd.Series: Series.append no longer
    # exists in pandas 2 and re-copied the data on every call.
    y_removed = []
    x_removed = []
    for _ in range(n_rounds):
        X_train, X_test, y_train, y_test = train_test_split(
            ml_ready_df.drop(columns=non_feature_cols), ml_ready_df['y'],
            test_size=test_size, random_state=random_state)
        bst = lgb.LGBMClassifier(max_bin=510).fit(X_train, y_train)
        y_pred = bst.predict(X_test)
        # true label 1, predicted 0 -> false negative
        fn_index = y_test[(y_test - y_pred) == 1].index

        # Per y: FN count divided by the number of rows that carry this y.
        fn_counts = ml_ready_df.loc[fn_index, 'y_name'].value_counts()
        y_totals = ml_ready_df['y_name'].value_counts()
        # Built explicitly instead of reset_index().rename(...), whose column
        # names differ between pandas versions.
        proportion_to_drop = pd.DataFrame(
            {'y_name': fn_counts.index, 'count': fn_counts.values},
            columns=['y_name', 'count'])
        proportion_to_drop['perc'] = fn_counts.values / y_totals.reindex(fn_counts.index).values
        proportion_to_drop = proportion_to_drop[proportion_to_drop['perc'] > y_fn_threshold]

        filter_comp = proportion_to_drop['y_name'].tolist()
        y_removed.extend(filter_comp)
        display(proportion_to_drop)

        is_dropped_y = ml_ready_df['y_name'].isin(filter_comp)
        # NOTE(review): this collects the x of every row paired with a dropped
        # y, including the randomly sampled negative pairs — confirm intended.
        x_removed.extend(ml_ready_df.loc[is_dropped_y, 'x_name'])  # x removed through y removal
        print('x_removed appended')

        ml_ready_df = ml_ready_df[~is_dropped_y]

        # Some FN rows were just removed together with their y; keep the
        # survivors (in fn_index order) and skip rows with missing values.
        surviving_fn = fn_index[fn_index.isin(ml_ready_df.index)]
        falsely_pred = ml_ready_df.loc[surviving_fn].dropna()
        # Drop the lowest x_drop_frac share by cosine similarity. After an
        # ascending sort these are the FIRST rows; the previous slice
        # `.iloc[int(n * 0.3):]` selected the highest 70% instead.
        n_low = int(falsely_pred.shape[0] * x_drop_frac)
        x_to_drop = falsely_pred.sort_values(by=['cos']).iloc[:n_low]['x_name']
        filter_x = list(x_to_drop)

        x_removed.extend(filter_x)

        print('number of x being dropped: ' + str(len(filter_x)))

        ml_ready_df = ml_ready_df[~ml_ready_df['x_name'].isin(filter_x)]

    y_removed_final = y_removed
    # De-duplicate while keeping first-occurrence order.
    x_removed_final = pd.Series(x_removed, dtype=object).drop_duplicates().to_list()
    # The final model is fitted on the train split only; the held-out part of
    # this last split is not used.
    X_train, X_test, y_train, y_test = train_test_split(
        ml_ready_df.drop(columns=non_feature_cols), ml_ready_df['y'],
        test_size=test_size, random_state=random_state)
    bst = lgb.LGBMClassifier(max_bin=510)
    bst.fit(X_train, y_train)
    return bst, x_removed_final, y_removed_final

In [1754]:
# Run the filter-and-train routine; keeps the fitted model plus the x and y
# names that were filtered out during training.
bst,x_removed_final,y_removed_final = train_n_filter(ml_ready_df)

In [None]:
# Keep the unfiltered label table, because the next cell overwrites
# `unique_name` with a filtered version.
unique_name_saved = unique_name.copy() #save a copy of unique name just in case

In [None]:
# Drop the y names that were removed during training. isin() replaces the
# per-row `True if x not in list else False` scan with a single hashed lookup.
unique_name = unique_name[~unique_name['name'].isin(y_removed_final)]

# saving model and data needed for inference

In [None]:
bst.booster_.save_model('model.txt') #saving model
# Pickle the objects needed at inference time. Each file is opened exactly
# once inside a `with` block so the handle that pickle writes to is flushed
# and closed deterministically; previously a second, never-closed handle was
# passed to pickle.dump while the closed one had nothing written to it.
# The '.plk' file names are runtime paths and are kept unchanged.
for _path, _obj in (
    ('./x_removed_final.plk', x_removed_final),
    ('./y_removed_final.plk', y_removed_final),
    ('./unique_name.plk', unique_name),
    ('./data_unique_item.plk', data_unique_item),
):
    with open(_path, 'wb') as _fh:
        pickle.dump(_obj, _fh)

In [None]:
# Also persist the fitted classifier object itself with joblib; the previous
# cell saves only its underlying booster as a text file.
# NOTE(review): joblib is imported from sklearn.externals, which newer
# scikit-learn releases no longer provide — confirm the pinned version.
joblib.dump(bst,'model.pkl')