In [1]:
import pandas as pd
import numpy as np
import glob
import jieba
from sklearn.metrics.pairwise import cosine_similarity
from jaccard_index.jaccard import jaccard_index
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
from scipy import spatial
from scipy.spatial import distance
import pickle
import nltk
import lightgbm as lgb
from sklearn.externals import joblib

In [21]:
# Load the pretrained Tencent word embedding through the jslearn wrapper.
# get_vec() below reads word vectors from `embedding.tc_wv`.
from jslearn import nlp
embedding = nlp.TencentWordEmbedding()
# NOTE(review): presumably './' is the directory holding the pretrained files — confirm against jslearn.
embedding.load_pretrained('./')

  from numpy.core.umath_tests import inner1d


Loading model
Finish loading model


In [2]:
# Load the x / y names that were removed from training, plus the category
# table (unique_name) and the product table (data_unique_item).
# Each file is opened exactly once and closed by the `with` block; the handle
# names (x_r, y_r, ...) stay bound to the closed file objects as before.
# NOTE: pickle.load can execute arbitrary code — only load files you trust.
with open('./x_removed_final.plk','rb') as x_r:
    x_removed_final = pickle.load(x_r)
with open('./y_removed_final.plk','rb') as y_r:
    y_removed_final = pickle.load(y_r)
with open('./unique_name.plk','rb') as unique_name_r:
    unique_name = pickle.load(unique_name_r)
with open('./data_unique_item.plk','rb') as data_unique_item_r:
    data_unique_item = pickle.load(data_unique_item_r)

In [3]:
# Trained classifier; nlp_inference() calls predict / predict_proba on it.
bst = joblib.load('model.pkl') # load model

# inference

In [36]:
def get_vec(x):
    '''Return the embedding vector for a name, or None if none can be built.

    The full string is looked up in the Tencent embedding first. If that
    lookup fails, the string is segmented with jieba and the vectors of the
    tokens that can be looked up are averaged.

    Parameters:
    x (str): the name to embed

    Returns:
    the vector for `x`, the mean of its token vectors, or None when neither
    the full string nor any of its tokens has a vector
    '''
    try:
        return embedding.tc_wv.get_vector(x)
    except Exception:
        # Full string not available: fall back to per-token averaging below.
        # `Exception` (not a bare except) so Ctrl-C still interrupts long runs.
        pass

    token_vectors = []
    # Use a distinct loop name so the parameter `x` is not overwritten.
    for token in jieba.cut(x):
        try:
            token_vectors.append(embedding.tc_wv.get_vector(token))
        except Exception:
            # Tokens without a vector are left out of the average.
            continue

    if not token_vectors:
        return None
    return sum(token_vectors) / len(token_vectors)

In [37]:
def single_item_df(prod_name,x_removed_final,y_removed_final):
    '''
    Pair one product name with every candidate category name and compute the
    pairwise features: semantic cosine similarity, Jaccard similarity and
    Levenshtein (edit) distance, plus the two string lengths.

    Candidate categories come from the module-level `unique_name` frame
    (columns 'name' and 'y_vec'), minus the names in y_removed_final.

    Parameters:
    prod_name (str): any product name
    x_removed_final (list): a list of x to remove (derived from train_n_filter)
    y_removed_final (list): a list of y to remove (derived from train_n_filter)

    Returns:
    dataframe with columns y_name, x_name, cos, jac, lev, len(x), len(y), one
    row per candidate category; or None if prod_name is in x_removed_final or
    no embedding vector can be built for it
    '''
    # Products that were excluded from training are not scored.
    if prod_name in x_removed_final:
        return None

    # The product vector does not depend on the category row, so compute it
    # once instead of once per candidate.
    prod_vec = get_vec(prod_name)
    if prod_vec is None:
        # Without a vector the cosine feature cannot be computed.
        return None

    def jac_calc(y):
        # Best-effort: a failing Jaccard computation counts as no overlap.
        try:
            return jaccard_index(prod_name,y['name'])
        except Exception:
            return 0

    # Keep only the categories that were part of training.
    candidates = unique_name[unique_name['name'].apply(lambda name: name not in y_removed_final)]

    xy_df = pd.DataFrame()
    xy_df['y_name'] = candidates['name']
    xy_df['x_name'] = prod_name
    # Cosine similarity = 1 - cosine distance (semantic feature).
    xy_df['cos'] = candidates.apply(lambda y: 1 - spatial.distance.cosine(prod_vec,y['y_vec']), axis = 1)
    # Jaccard similarity (vocabulary overlap feature).
    xy_df['jac'] = candidates.apply(jac_calc, axis = 1)
    xy_df['lev'] = candidates.apply(lambda y: nltk.edit_distance(prod_name,y['name']), axis = 1)
    xy_df['len(x)'] = len(prod_name)
    xy_df['len(y)'] = candidates['name'].apply(len)
    return xy_df.reset_index(drop = True)

In [38]:
def get_inference_data(data_unique_item,x_removed_final,y_removed_final):
    '''
    For every product in data_unique_item, pair it with every category name
    in the module-level `unique_name` frame and compute the pairwise
    features: semantic cosine similarity, Jaccard similarity and Levenshtein
    (edit) distance, plus the two string lengths.

    Parameters:
    data_unique_item (pandas dataframe): one row per product, with the product
        name in 'S_SEGMENT_ITEM' and its category name in 'name'
    x_removed_final (list): a list of x to remove (derived from train_n_filter)
    y_removed_final (list): a list of y to remove (derived from train_n_filter)

    Returns:
    dataframe with columns y_name, x_name, cos, jac, lev, len(x), len(y), one
    row per (product, category) pair. Rows of data_unique_item whose product
    is in x_removed_final, whose category is in y_removed_final, or whose
    product has no embedding vector are skipped. Empty dataframe if nothing
    remains.
    '''
    def jac_calc(item, y):
        # Best-effort: a failing Jaccard computation counts as no overlap.
        try:
            return jaccard_index(item,y['name'])
        except Exception:
            return 0

    # Category name lengths are the same for every product: compute once.
    category_lengths = unique_name['name'].apply(len)

    # Collect per-product frames and concatenate once at the end; growing a
    # DataFrame inside the loop re-copies all previous rows on each iteration.
    frames = []
    for _, row in data_unique_item.iterrows():
        item = row['S_SEGMENT_ITEM']

        #remove x and y that were not trained
        if item in x_removed_final or row['name'] in y_removed_final:
            continue

        # The product vector does not depend on the category row.
        item_vec = get_vec(item)
        if item_vec is None:
            # No vector -> the cosine feature cannot be computed for this item.
            continue

        xy_df = pd.DataFrame()
        xy_df['y_name'] = unique_name['name']
        xy_df['x_name'] = item
        # Cosine similarity = 1 - cosine distance (semantic feature).
        xy_df['cos'] = unique_name.apply(lambda y: 1 - spatial.distance.cosine(item_vec,y['y_vec']), axis = 1)
        # Jaccard similarity (vocabulary overlap feature).
        xy_df['jac'] = unique_name.apply(lambda y: jac_calc(item,y), axis = 1)
        xy_df['lev'] = unique_name.apply(lambda y: nltk.edit_distance(item,y['name']), axis = 1)
        xy_df['len(x)'] = len(item)
        xy_df['len(y)'] = category_lengths
        frames.append(xy_df)

    if not frames:
        # pd.concat raises on an empty list; keep the empty-frame result.
        return pd.DataFrame()
    return pd.concat(frames, ignore_index = True)

In [39]:
# Build the pairwise feature table for every product in data_unique_item and
# save it. to_csv also writes the index as the first CSV column by default.
inference_df = get_inference_data(data_unique_item,x_removed_final,y_removed_final)
inference_df.to_csv('inference_df_filtered.csv')

In [45]:
# Reload a saved feature table; .iloc[:,1:] drops the first CSV column.
# NOTE(review): this reads 'inference_df.csv', while the previous cell writes
# 'inference_df_filtered.csv' — confirm which file is intended.
inference_df = pd.read_csv('inference_df.csv').iloc[:,1:]
inference_df

Unnamed: 0,y_name,x_name,cos,jac,lev,len(x),len(y),y
0,其他,其他业务,0.493589,0.333333,2,4,2,1
1,内部抵消,其他业务,0.371301,0.000000,4,4,4,0
2,汽车玻璃升降器,其他业务,0.258197,0.000000,7,4,7,0
3,汽车发动机整机,其他业务,0.390052,0.000000,7,4,7,0
4,汽车活塞,其他业务,0.182966,0.000000,4,4,4,0
...,...,...,...,...,...,...,...,...
85294891,原纸,移动互联网终端CPU芯片,0.394128,0.000000,12,12,2,0
85294892,复方甘草酸苷,移动互联网终端CPU芯片,0.307312,0.000000,12,12,6,0
85294893,辐照技术服务,移动互联网终端CPU芯片,0.680462,0.000000,12,12,6,0
85294894,冷却系统,移动互联网终端CPU芯片,0.420053,0.000000,12,12,4,0


In [46]:
# Remove the label column `y` so that only the columns nlp_inference expects
# remain. errors='ignore' keeps this cell safe to re-run and lets it accept a
# feature table that was saved without `y`.
inference_df = inference_df.drop(columns = ['y'], errors = 'ignore')

In [47]:
def nlp_inference(inference_df,bst,threshold = 0.5):
    '''
    Use the trained LightGBM classifier to pick, for each product name in
    inference_df, its most probable category.

    Parameters:
    inference_df (pandas dataframe): the dataframe with y_name, x_name, cos, jac, lev, len(x), len(y) columns derived from get_inference_data
        (or single_item_df); None is accepted and yields an empty result
    bst (trained lightGBM.LGBMClassifier model from train_n_filter function): trained LGBMClassifier;
        only its predict_proba method is used
    threshold (float): a product is reported only if the probability of its
        best category is strictly greater than this value (default 0.5)

    Returns:
    dataframe with at most one row per x_name: the candidate with the highest
    positive-class probability, stored in the added 'perc' column. Products
    whose best probability is not above `threshold` are omitted. The input
    dataframe is not modified.
    '''
    if inference_df is None:
        # single_item_df returns None for products it cannot score.
        return pd.DataFrame()

    # Everything except the two name columns is a model feature.
    features = inference_df.drop(columns = ['y_name','x_name'])
    proba = bst.predict_proba(features)

    scored = inference_df.copy()
    # Column 1 of predict_proba is the probability of the positive class.
    scored['perc'] = np.asarray(proba)[:, 1]

    # Sort so the highest probability comes first within each product, then
    # keep that first row per product.
    best = scored.sort_values(by = ['x_name','perc'],ascending= False).drop_duplicates(subset = ['x_name'])
    return best[best['perc'] > threshold]

In [49]:
# result = nlp_inference(inference_df,bst)

In [51]:
# Score a single product name against all candidate categories and keep the
# best match that nlp_inference accepts.
result = nlp_inference(single_item_df('矿泉水',x_removed_final,y_removed_final),bst)

  if diff:


In [52]:
# Display the predicted category row for the product scored above.
result

Unnamed: 0,y_name,x_name,cos,jac,lev,len(x),len(y),perc
152,饮用水,矿泉水,0.73956,0.0,2,3,3,0.55106


# Chinese-companies-NLP

There are hundreds of thousands of products from the companies listed on the stock market. If your job were to assign each product to a category, how would you do it?

This is the problem my NLP model aims to solve. Given a set of categories and a set of company products that have not yet been matched to each other, the model predicts each product's category together with a confidence percentage.

This model uses five features for each category–product pair.
To take semantics into account:
- cosine similarity of the word embeddings (based on TencentWordEmbedding)
To take syntax (surface form) into account:
- Jaccard similarity of the words
- edit distance of the words
- length of the product string
- length of the category string

Result:
The model can predict, with a high degree of accuracy (above 70% most of the time for intuitive products), the category of most products whose names are intuitively or at least loosely related to the category name. Example: 矿泉水 (product) to 饮用水 (category)
However, the model cannot predict the category of specialized product names that give no semantic or textual hint of their category, such as: A100 (product) to 打印机 (category)