# Tags Sentiment Analysis - 情感极性

## Dependencies

In [None]:
!pip install ktrain
!pip install https://github.com/amaiya/eli5-tf/archive/refs/heads/master.zip

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import ktrain
import eli5
import jieba
from ktrain import text
import matplotlib.pyplot as plt
import pickle

## Data Preparation

In [2]:
# Load each split and discard the 'id' and 'star' columns, which are not used below.
data_train = pd.read_csv('datasets/train.csv').drop(columns=['id', 'star'])
data_test = pd.read_csv('datasets/test.csv').drop(columns=['id', 'star'])
data_val = pd.read_csv('datasets/dev.csv').drop(columns=['id', 'star'])

In [3]:
# Build the label value
def sentiBin(score):
    """Map a signed score to a polarity label.

    Returns '1' for a positive score, '-1' for a negative score, and None
    for anything else (for example a score of exactly 0).
    """
    if score < 0:
        return '-1'
    if score > 0:
        return '1'
    return None

In [4]:
# Fine-grained tag columns, grouped by the coarse aspect they belong to.
tag_list = [
    ['Location#Transportation', 'Location#Downtown', 'Location#Easy_to_find'],
    ['Service#Queue', 'Service#Hospitality', 'Service#Parking', 'Service#Timely'],
    ['Price#Level', 'Price#Cost_effective', 'Price#Discount'],
    ['Ambience#Decoration', 'Ambience#Noise', 'Ambience#Space', 'Ambience#Sanitary'],
    ['Food#Portion', 'Food#Taste', 'Food#Appearance', 'Food#Recommend'],
]

for dataset in [data_train, data_val, data_test]:
    # Convert -2 to 0 so it does not contribute to the per-aspect sums below.
    # NOTE(review): presumably -2 marks "aspect not mentioned" - confirm against the dataset description.
    dataset.replace(-2, 0, inplace=True)
    for aspect_tags in tag_list:
        # The aspect name is the part before '#', e.g. 'Location'.
        class_name = aspect_tags[0].partition('#')[0]
        aspect_score = dataset[aspect_tags].sum(axis=1)
        print(class_name)
        # Rows whose tag scores sum to 0 get no label here (sentiBin returns None for them).
        dataset[class_name] = aspect_score.map(sentiBin)
        print(dataset[class_name].value_counts())
        # The fine-grained columns are no longer needed once the aspect label exists.
        dataset.drop(columns=aspect_tags, inplace=True)

Location
1     12010
-1     1193
Name: Location, dtype: int64
Service
1     15165
-1     5050
Name: Service, dtype: int64
Price
1     11863
-1     4624
Name: Price, dtype: int64
Ambience
1     15591
-1     3181
Name: Ambience, dtype: int64
Food
1     24154
-1     3548
Name: Food, dtype: int64
Location
1     1590
-1     168
Name: Location, dtype: int64
Service
1     1987
-1     673
Name: Service, dtype: int64
Price
1     1660
-1     630
Name: Price, dtype: int64
Ambience
1     2148
-1     423
Name: Ambience, dtype: int64
Food
1     3155
-1     521
Name: Food, dtype: int64
Location
1     1597
-1     186
Name: Location, dtype: int64
Service
1     1961
-1     683
Name: Service, dtype: int64
Price
1     1571
-1     624
Name: Price, dtype: int64
Ambience
1     2029
-1     451
Name: Ambience, dtype: int64
Food
1     3155
-1     499
Name: Food, dtype: int64


In [5]:
# One (review, label) frame per aspect and split; rows without a label for that aspect are dropped.
Location_train = data_train[['review', 'Location']].dropna()
Location_val = data_val[['review', 'Location']].dropna()
Location_test = data_test[['review', 'Location']].dropna()

Service_train = data_train[['review', 'Service']].dropna()
Service_val = data_val[['review', 'Service']].dropna()
Service_test = data_test[['review', 'Service']].dropna()

Price_train = data_train[['review', 'Price']].dropna()
Price_val = data_val[['review', 'Price']].dropna()
Price_test = data_test[['review', 'Price']].dropna()

Ambience_train = data_train[['review', 'Ambience']].dropna()
Ambience_val = data_val[['review', 'Ambience']].dropna()
Ambience_test = data_test[['review', 'Ambience']].dropna()

Food_train = data_train[['review', 'Food']].dropna()
Food_val = data_val[['review', 'Food']].dropna()
Food_test = data_test[['review', 'Food']].dropna()

In [6]:
# Oversample the '-1' class: the original rows plus 9 extra copies of the negative rows.
mask_neg = Location_train['Location'].eq('-1')
Location_train_neg = Location_train.loc[mask_neg]
Location_train_oversamp = pd.concat(
    [Location_train] + [Location_train_neg] * 9,
    ignore_index=True,
)
Location_train_oversamp.iloc[:, 1:].value_counts()

Location
1           12010
-1          11930
dtype: int64

In [7]:
# Oversample the '-1' class: the original rows plus 2 extra copies of the negative rows.
mask_neg = Service_train['Service'].eq('-1')
Service_train_neg = Service_train.loc[mask_neg]
Service_train_oversamp = pd.concat(
    [Service_train] + [Service_train_neg] * 2,
    ignore_index=True,
)
Service_train_oversamp.iloc[:, 1:].value_counts()

Service
1          15165
-1         15150
dtype: int64

In [8]:
# Oversample the '-1' class: the original rows plus 2 extra copies of the negative rows.
mask_neg = Price_train['Price'].eq('-1')
Price_train_neg = Price_train.loc[mask_neg]
Price_train_oversamp = pd.concat(
    [Price_train] + [Price_train_neg] * 2,
    ignore_index=True,
)
Price_train_oversamp.iloc[:, 1:].value_counts()

Price
-1       13872
1        11863
dtype: int64

In [9]:
# Oversample the '-1' class: the original rows plus 4 extra copies of the negative rows.
mask_neg = Ambience_train['Ambience'].eq('-1')
Ambience_train_neg = Ambience_train.loc[mask_neg]
Ambience_train_oversamp = pd.concat(
    [Ambience_train] + [Ambience_train_neg] * 4,
    ignore_index=True,
)
Ambience_train_oversamp.iloc[:, 1:].value_counts()

Ambience
-1          15905
1           15591
dtype: int64

In [10]:
# Oversample the '-1' class: the original rows plus 6 extra copies of the negative rows.
mask_neg = Food_train['Food'].eq('-1')
Food_train_neg = Food_train.loc[mask_neg]
Food_train_oversamp = pd.concat(
    [Food_train] + [Food_train_neg] * 6,
    ignore_index=True,
)
Food_train_oversamp.iloc[:, 1:].value_counts()

Food
-1      24836
1       24154
dtype: int64

## Logistic Regression (LR)

### Tokenization

In [11]:
# Load the stop-word list, one entry per line.
# The context manager closes the file handle as soon as the lines have been read
# (the handle was previously left open for the lifetime of the kernel).
with open("stopwords-zh.txt", encoding='utf-8') as infile:
    stopwords_lst = infile.readlines()
# Strip the trailing newline (and any surrounding whitespace) from every entry.
stopwords = [x.strip() for x in stopwords_lst]

# Chinese word segmentation
def tokenization_dataset(train_data):
    """Segment every text in ``train_data`` with jieba.

    ``train_data`` must support ``.apply`` (the notebook passes a pandas Series
    of review strings). Each text is replaced by its jieba tokens joined with
    single spaces.
    """
    def _segment(review):
        return ' '.join(jieba.cut(review))

    return train_data.apply(_segment)

def tokenization_text(text):
    """Segment a single string with jieba and return the tokens joined by spaces."""
    tokens = jieba.cut(text)
    return ' '.join(tokens)

In [12]:
# Segmented review text (x_*) for every aspect and split.
x_Location_train, x_Location_val, x_Location_test = (
    tokenization_dataset(split['review'])
    for split in (Location_train, Location_val, Location_test)
)
x_Service_train, x_Service_val, x_Service_test = (
    tokenization_dataset(split['review'])
    for split in (Service_train, Service_val, Service_test)
)
x_Price_train, x_Price_val, x_Price_test = (
    tokenization_dataset(split['review'])
    for split in (Price_train, Price_val, Price_test)
)
x_Ambience_train, x_Ambience_val, x_Ambience_test = (
    tokenization_dataset(split['review'])
    for split in (Ambience_train, Ambience_val, Ambience_test)
)
x_Food_train, x_Food_val, x_Food_test = (
    tokenization_dataset(split['review'])
    for split in (Food_train, Food_val, Food_test)
)

# Matching polarity labels (y_*) for every aspect and split.
y_Location_train, y_Location_val, y_Location_test = (
    split['Location'] for split in (Location_train, Location_val, Location_test)
)
y_Service_train, y_Service_val, y_Service_test = (
    split['Service'] for split in (Service_train, Service_val, Service_test)
)
y_Price_train, y_Price_val, y_Price_test = (
    split['Price'] for split in (Price_train, Price_val, Price_test)
)
y_Ambience_train, y_Ambience_val, y_Ambience_test = (
    split['Ambience'] for split in (Ambience_train, Ambience_val, Ambience_test)
)
y_Food_train, y_Food_val, y_Food_test = (
    split['Food'] for split in (Food_train, Food_val, Food_test)
)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\miuGrey\AppData\Local\Temp\jieba.cache
Loading model cost 0.605 seconds.
Prefix dict has been built successfully.


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Use TF-IDF to turn the segmented text into vectors; one independent vectorizer per aspect.
# NOTE(review): TfidfVectorizer's default token_pattern only keeps tokens of two or more
# word characters, so single-character words produced by jieba are presumably discarded
# here - confirm that this is intended.
tv_1, tv_2, tv_3, tv_4, tv_5 = (
    TfidfVectorizer(stop_words=stopwords, max_features=30000, lowercase=False)
    for _ in range(5)
)

tv_1.fit(x_Location_train)
tv_2.fit(x_Service_train)
tv_3.fit(x_Price_train)
tv_4.fit(x_Ambience_train)
tv_5.fit(x_Food_train)

TfidfVectorizer(lowercase=False, max_features=30000,
                stop_words=['$', '0', '1', '2', '3', '4', '5', '6', '7', '8',
                            '9', '?', '_', '“', '”', '、', '。', '《', '》', '一',
                            '一些', '一何', '一切', '一则', '一方面', '一旦', '一来', '一样',
                            '一般', '一转眼', ...])

In [None]:
# Save the fitted vectorizers
# NOTE(review): tv_1 is not written by this cell, and the "LR Oversampled" section
# pickles its own vectorizers to these same paths - whichever cell runs last wins.
for _path, _vectorizer in (
    ('LR_model_senti/tv_2_model.pickle', tv_2),
    ('LR_model_senti/tv_3_model.pickle', tv_3),
    ('LR_model_senti/tv_4_model.pickle', tv_4),
    ('LR_model_senti/tv_5_model.pickle', tv_5),
):
    with open(_path, 'wb') as f:
        pickle.dump(_vectorizer, f)

### Train

In [14]:
from sklearn.linear_model import LogisticRegression

In [15]:
# Fit the Location classifier on its TF-IDF features, then score it on the validation split.
model_Location = LogisticRegression(max_iter=10000).fit(
    tv_1.transform(x_Location_train), y_Location_train
)
model_Location.score(tv_1.transform(x_Location_val), y_Location_val)

0.906712172923777

In [16]:
# Fit the Service classifier on its TF-IDF features, then score it on the validation split.
model_Service = LogisticRegression(max_iter=10000).fit(
    tv_2.transform(x_Service_train), y_Service_train
)
model_Service.score(tv_2.transform(x_Service_val), y_Service_val)

0.856015037593985

In [17]:
# Fit the Price classifier on its TF-IDF features, then score it on the validation split.
model_Price = LogisticRegression(max_iter=10000).fit(
    tv_3.transform(x_Price_train), y_Price_train
)
model_Price.score(tv_3.transform(x_Price_val), y_Price_val)

0.8213973799126637

In [18]:
# Fit the Ambience classifier on its TF-IDF features, then score it on the validation split.
model_Ambience = LogisticRegression(max_iter=10000).fit(
    tv_4.transform(x_Ambience_train), y_Ambience_train
)
model_Ambience.score(tv_4.transform(x_Ambience_val), y_Ambience_val)

0.8697005056398288

In [19]:
# Fit the Food classifier on its TF-IDF features, then score it on the validation split.
model_Food = LogisticRegression(max_iter=10000).fit(
    tv_5.transform(x_Food_train), y_Food_train
)
model_Food.score(tv_5.transform(x_Food_val), y_Food_val)

0.8906420021762785

In [None]:
# Pickle the fitted LR classifiers.
# NOTE(review): model_Location is not written by this cell, and the "LR Oversampled"
# section pickles its own models to these same paths - whichever cell runs last wins.
for _path, _classifier in (
    ('LR_model_senti/model_Service_LR.pickle', model_Service),
    ('LR_model_senti/model_Price_LR.pickle', model_Price),
    ('LR_model_senti/model_Ambience_LR.pickle', model_Ambience),
    ('LR_model_senti/model_Food_LR.pickle', model_Food),
):
    with open(_path, 'wb') as f:
        pickle.dump(_classifier, f)

### Evaluation for LR Model

In [20]:
# Predict test-split labels with each aspect's classifier and its matching vectorizer.
y_Location_pred, y_Service_pred, y_Price_pred, y_Ambience_pred, y_Food_pred = (
    classifier.predict(vectorizer.transform(x_test))
    for classifier, vectorizer, x_test in (
        (model_Location, tv_1, x_Location_test),
        (model_Service, tv_2, x_Service_test),
        (model_Price, tv_3, x_Price_test),
        (model_Ambience, tv_4, x_Ambience_test),
        (model_Food, tv_5, x_Food_test),
    )
)

In [21]:
# Classification report for each aspect, in the order Location, Service, Price, Ambience, Food.
from sklearn.metrics import classification_report
for _y_true, _y_pred in (
    (y_Location_test, y_Location_pred),
    (y_Service_test, y_Service_pred),
    (y_Price_test, y_Price_pred),
    (y_Ambience_test, y_Ambience_pred),
    (y_Food_test, y_Food_pred),
):
    print(classification_report(_y_true, _y_pred))

              precision    recall  f1-score   support

          -1       0.71      0.03      0.05       186
           1       0.90      1.00      0.95      1597

    accuracy                           0.90      1783
   macro avg       0.81      0.51      0.50      1783
weighted avg       0.88      0.90      0.85      1783

              precision    recall  f1-score   support

          -1       0.87      0.53      0.66       683
           1       0.85      0.97      0.91      1961

    accuracy                           0.86      2644
   macro avg       0.86      0.75      0.78      2644
weighted avg       0.86      0.86      0.84      2644

              precision    recall  f1-score   support

          -1       0.88      0.51      0.65       624
           1       0.83      0.97      0.90      1571

    accuracy                           0.84      2195
   macro avg       0.86      0.74      0.77      2195
weighted avg       0.85      0.84      0.83      2195

              preci

In [22]:
# Confusion matrix for each aspect, in the order Location, Service, Price, Ambience, Food.
from sklearn.metrics import confusion_matrix
for _y_true, _y_pred in (
    (y_Location_test, y_Location_pred),
    (y_Service_test, y_Service_pred),
    (y_Price_test, y_Price_pred),
    (y_Ambience_test, y_Ambience_pred),
    (y_Food_test, y_Food_pred),
):
    print(confusion_matrix(_y_true, _y_pred))

[[   5  181]
 [   2 1595]]
[[ 359  324]
 [  54 1907]]
[[ 321  303]
 [  45 1526]]
[[ 128  323]
 [  17 2012]]
[[ 156  343]
 [  23 3132]]


In [23]:
def lr_explain(model, sentence, vec):
    """Return eli5's explanation of ``model``'s prediction for ``sentence``.

    The sentence is first segmented with ``tokenization_text``; eli5 is then
    given the vectorizer ``vec`` together with its feature names.
    """
    segmented = tokenization_text(sentence)
    feature_names = vec.get_feature_names_out()
    return eli5.show_prediction(model, segmented, vec=vec, feature_names=feature_names)

In [24]:
data_test.iloc[53]

review      虽然就在家附近，但很久没有来明记吃饭了。因为附近太多选择，什么上渡食家，凤厨等等这些比较出名...
Location                                                 None
Service                                                    -1
Price                                                      -1
Ambience                                                    1
Food                                                        1
Name: 53, dtype: object

In [25]:
# Explain every aspect model's prediction for the test review with index label 53.
for _model, _vectorizer in (
    (model_Location, tv_1),
    (model_Service, tv_2),
    (model_Price, tv_3),
    (model_Ambience, tv_4),
    (model_Food, tv_5),
):
    display(lr_explain(_model, data_test['review'][53], _vectorizer))

Contribution?,Feature
2.295,<BIAS>
0.683,Highlighted in text (sum)


Contribution?,Feature
0.963,Highlighted in text (sum)
-0.615,<BIAS>


Contribution?,Feature
0.59,Highlighted in text (sum)
-0.532,<BIAS>


Contribution?,Feature
1.16,<BIAS>
0.034,Highlighted in text (sum)


Contribution?,Feature
1.669,<BIAS>
-0.201,Highlighted in text (sum)


In [26]:
test1 = '很好吃，环境好，所有员工的态度都很好，上菜快，服务也很好，味道好吃，都是用蒸馏水煮的，推荐，超好吃' # 5-star positive review
test2 = '糯米外皮不绵滑，豆沙馅粗躁，没有香甜味。12元一碗不值。' # 1-star negative review
# 4-star review
test3 = '昨儿晚上来凯德1818的绿茶吃饭，点了以下10个菜，这里面有好几个都是推荐菜品，咱挨个说啊～  第一张 “客家茄子煲”：一个字：咸、俩字：很咸、仨字：非常咸、四个字：咸（hou)死我了... ... 我很怀疑是不是师傅在放酱油的时候手抖了 然后整锅煲就是一锅酱色。里面的咸鱼粒很显然没有经过任何处理（泡一下水去掉部分盐分等），再加之菜本身很咸根本无法入口，整锅煲基本没动；第二张 “鱼头诱惑 ”：这道菜是翔哥点的 还不错 首先鱼头很新鲜 其次鱼头蒸的火候刚好 既入味还不老，建议除了小米辣再加上点泡椒，这样无论是颜色和口味上都会更棒的！这道菜基本消灭； 第三张：“小锅土豆”：厚片的土豆挂满酱汁，炉子在下面慢慢加热，后来加一片放到嘴里，几乎是入口即化、口感绵软，也是一道不错的下饭菜；第四张：“农家小菜”其实就是少了橄榄菜和肉末的豆角粒，味道还可以，如果把豆角再煸的干一点（表皮起皱）口感会更好；第五张“菜心金钩豆腐” 和没上图的“老乡浓汤”一起说了，两道分不清是汤、羹还是菜的东东，都是推荐菜 分不清也无所谓，重点是上菜的时候都不是热的 都是温的 “菜心”里面有蛋黄、“浓汤”里面有猪肚，这两种食材变冷后都会有腥味且口感不好，加之一碗有淀粉类的羹 温温的很难喝，建议出品后马上传菜到客人面前，口味上冷了就都不美味了；第六张：“麻酱油麦菜”很多地方也叫“麻酱凤尾” 麻酱用的像是麻酱和花生酱混合的甜口的酱料，重点是没有稀释！整个一坨粘在油麦菜上 根本拌不开 相比甜口的麻酱我更喜欢热干面那种咸口的、稀点儿的；第七张“面包诱惑”：这道是敬菜，口味还不错，冰淇淋也很好吃 就是卖相太一般了 既然要推广 至少要给点儿装饰，不用太复杂 拉个巧克力线条、给个蛋卷、水果粒稍稍点缀一下就会好看很多 面包本身口感和冰淇淋都很好吃 比很多外面装饰的那种美美的好吃多了 就差了一点装饰；第八张“绿茶烤鸡”很多人点这道菜，相比前几道的重口，烤鸡显得淡了一些。鸡皮的保护基本没有 所以整个鸡肉又干有柴、也挂不住调味料、下次有机会点个整只的或许会好一点；最后还有未上图的绿茶饼和糖醋里脊，绿茶饼还不错 可以一试 只要火候控制的好不炸过基本没问题，里面糯糯的馅儿很好吃；“糖醋里脊” 这是我吃过最难吃的版本，没有之一！上次有一小哥吐了一通槽 我还想就一个糖醋里脊也不是啥难菜 不至于吧 这次就抱着猎奇的心里点了一个 菜刚上来我就后悔了---老抽色（shai）的、外面一圈淀粉（粉面子）基本没吃着肉，糊哒哒粘成一坨 看起来和那个茄子煲没什么太大区别。建议找一个做糖醋里脊的店吃一次 看一看 就上个浆 过个油 浇个汁儿的事情 没那么难  肉类的菜品价格可以适当调高一些 我宁可多花点儿钱吃肉也不想吃这一坨坨粉面子。 总体而言环境还不错 价格很平民 服务中规中矩 但是菜品质量很是有待提高。看墙上的照片（不知道老板是不是做青旅起家的）菜品不会因为价格低而有销路 只会因为质量好才畅销（相比低价 现在的人们更喜欢质量好而有保障的食品 价格好一点反而是保证） 希望越做越好。地址在凯德1818  5楼手aaa扶梯旁。'

In [27]:
# Explain every aspect model's prediction for the sample review test1.
for _model, _vectorizer in (
    (model_Location, tv_1),
    (model_Service, tv_2),
    (model_Price, tv_3),
    (model_Ambience, tv_4),
    (model_Food, tv_5),
):
    display(lr_explain(_model, test1, _vectorizer))

Contribution?,Feature
2.295,<BIAS>
0.215,Highlighted in text (sum)


Contribution?,Feature
1.529,Highlighted in text (sum)
0.615,<BIAS>


Contribution?,Feature
0.532,<BIAS>
0.243,Highlighted in text (sum)


Contribution?,Feature
2.3,Highlighted in text (sum)
1.16,<BIAS>


Contribution?,Feature
1.669,<BIAS>
0.897,Highlighted in text (sum)


In [28]:
# Explain every aspect model's prediction for the sample review test2.
for _model, _vectorizer in (
    (model_Location, tv_1),
    (model_Service, tv_2),
    (model_Price, tv_3),
    (model_Ambience, tv_4),
    (model_Food, tv_5),
):
    display(lr_explain(_model, test2, _vectorizer))

Contribution?,Feature
2.295,<BIAS>
-0.293,Highlighted in text (sum)


Contribution?,Feature
0.747,Highlighted in text (sum)
-0.615,<BIAS>


Contribution?,Feature
2.253,Highlighted in text (sum)
-0.532,<BIAS>


Contribution?,Feature
1.395,Highlighted in text (sum)
-1.16,<BIAS>


Contribution?,Feature
2.013,Highlighted in text (sum)
-1.669,<BIAS>


In [29]:
# Explain every aspect model's prediction for the sample review test3.
for _model, _vectorizer in (
    (model_Location, tv_1),
    (model_Service, tv_2),
    (model_Price, tv_3),
    (model_Ambience, tv_4),
    (model_Food, tv_5),
):
    display(lr_explain(_model, test3, _vectorizer))

Contribution?,Feature
2.295,<BIAS>
0.251,Highlighted in text (sum)


Contribution?,Feature
0.615,<BIAS>
0.459,Highlighted in text (sum)


Contribution?,Feature
0.532,<BIAS>
0.292,Highlighted in text (sum)


Contribution?,Feature
1.16,<BIAS>
0.658,Highlighted in text (sum)


Contribution?,Feature
1.669,<BIAS>
-0.927,Highlighted in text (sum)


## Logistic Regression (LR) with Oversampled Training Data

### Tokenization

In [30]:
# Load the stop-word list, one entry per line.
# The context manager closes the file handle as soon as the lines have been read
# (the handle was previously left open for the lifetime of the kernel).
with open("stopwords-zh.txt", encoding='utf-8') as infile:
    stopwords_lst = infile.readlines()
# Strip the trailing newline (and any surrounding whitespace) from every entry.
stopwords = [x.strip() for x in stopwords_lst]

# Chinese word segmentation
# NOTE(review): this redefines the tokenization_dataset helper already defined in the "LR" section.
def tokenization_dataset(train_data):
    """Segment every text in ``train_data`` with jieba.

    ``train_data`` must support ``.apply`` (the notebook passes a pandas Series
    of review strings). Each text is replaced by its jieba tokens joined with
    single spaces.
    """
    def _segment(review):
        return ' '.join(jieba.cut(review))

    return train_data.apply(_segment)

def tokenization_text(text):
    """Segment a single string with jieba and return the tokens joined by spaces."""
    tokens = jieba.cut(text)
    return ' '.join(tokens)

In [31]:
# Segmented review text of the oversampled training frames, one per aspect.
(
    x_Location_train_o,
    x_Service_train_o,
    x_Price_train_o,
    x_Ambience_train_o,
    x_Food_train_o,
) = (
    tokenization_dataset(frame['review'])
    for frame in (
        Location_train_oversamp,
        Service_train_oversamp,
        Price_train_oversamp,
        Ambience_train_oversamp,
        Food_train_oversamp,
    )
)

# Matching polarity labels of the oversampled training frames.
y_Location_train_o = Location_train_oversamp.loc[:, 'Location']
y_Service_train_o = Service_train_oversamp.loc[:, 'Service']
y_Price_train_o = Price_train_oversamp.loc[:, 'Price']
y_Ambience_train_o = Ambience_train_oversamp.loc[:, 'Ambience']
y_Food_train_o = Food_train_oversamp.loc[:, 'Food']

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Use TF-IDF to turn the segmented text into vectors; one independent vectorizer per aspect,
# fitted on the oversampled training text.
# NOTE(review): TfidfVectorizer's default token_pattern only keeps tokens of two or more
# word characters, so single-character words produced by jieba are presumably discarded
# here - confirm that this is intended.
tv_1_o, tv_2_o, tv_3_o, tv_4_o, tv_5_o = (
    TfidfVectorizer(stop_words=stopwords, max_features=30000, lowercase=False)
    for _ in range(5)
)

tv_1_o.fit(x_Location_train_o)
tv_2_o.fit(x_Service_train_o)
tv_3_o.fit(x_Price_train_o)
tv_4_o.fit(x_Ambience_train_o)
tv_5_o.fit(x_Food_train_o)

TfidfVectorizer(lowercase=False, max_features=30000,
                stop_words=['$', '0', '1', '2', '3', '4', '5', '6', '7', '8',
                            '9', '?', '_', '“', '”', '、', '。', '《', '》', '一',
                            '一些', '一何', '一切', '一则', '一方面', '一旦', '一来', '一样',
                            '一般', '一转眼', ...])

In [52]:
# Save the vectorizers fitted on the oversampled data.
# NOTE(review): these paths are the same ones the non-oversampled "LR" section writes
# its vectorizers to, so this cell overwrites those files.
for _path, _vectorizer in (
    ('LR_model_senti/tv_1_model.pickle', tv_1_o),
    ('LR_model_senti/tv_2_model.pickle', tv_2_o),
    ('LR_model_senti/tv_3_model.pickle', tv_3_o),
    ('LR_model_senti/tv_4_model.pickle', tv_4_o),
    ('LR_model_senti/tv_5_model.pickle', tv_5_o),
):
    with open(_path, 'wb') as f:
        pickle.dump(_vectorizer, f)

### Train

In [33]:
from sklearn.linear_model import LogisticRegression

In [34]:
# Fit on the oversampled Location data; score on the (non-oversampled) validation split.
model_Location_o = LogisticRegression(max_iter=10000).fit(
    tv_1_o.transform(x_Location_train_o), y_Location_train_o
)
model_Location_o.score(tv_1_o.transform(x_Location_val), y_Location_val)

0.8987485779294653

In [35]:
# Fit on the oversampled Service data; score on the (non-oversampled) validation split.
model_Service_o = LogisticRegression(max_iter=10000).fit(
    tv_2_o.transform(x_Service_train_o), y_Service_train_o
)
model_Service_o.score(tv_2_o.transform(x_Service_val), y_Service_val)

0.8466165413533835

In [36]:
# Fit on the oversampled Price data; score on the (non-oversampled) validation split.
model_Price_o = LogisticRegression(max_iter=10000).fit(
    tv_3_o.transform(x_Price_train_o), y_Price_train_o
)
model_Price_o.score(tv_3_o.transform(x_Price_val), y_Price_val)

0.8126637554585153

In [37]:
# Fit on the oversampled Ambience data; score on the (non-oversampled) validation split.
model_Ambience_o = LogisticRegression(max_iter=10000).fit(
    tv_4_o.transform(x_Ambience_train_o), y_Ambience_train_o
)
model_Ambience_o.score(tv_4_o.transform(x_Ambience_val), y_Ambience_val)

0.838973162193699

In [38]:
# Fit on the oversampled Food data; score on the (non-oversampled) validation split.
model_Food_o = LogisticRegression(max_iter=10000).fit(
    tv_5_o.transform(x_Food_train_o), y_Food_train_o
)
model_Food_o.score(tv_5_o.transform(x_Food_val), y_Food_val)

0.8574537540805223

In [51]:
# Pickle the classifiers trained on the oversampled data.
# NOTE(review): these paths are the same ones the non-oversampled "LR" section writes
# its models to, so this cell overwrites those files.
for _path, _classifier in (
    ('LR_model_senti/model_Location_LR.pickle', model_Location_o),
    ('LR_model_senti/model_Service_LR.pickle', model_Service_o),
    ('LR_model_senti/model_Price_LR.pickle', model_Price_o),
    ('LR_model_senti/model_Ambience_LR.pickle', model_Ambience_o),
    ('LR_model_senti/model_Food_LR.pickle', model_Food_o),
):
    with open(_path, 'wb') as f:
        pickle.dump(_classifier, f)

### Evaluation for LR Model

In [39]:
# Predict test-split labels with each oversampled-data classifier and its matching vectorizer.
(
    y_Location_pred_o,
    y_Service_pred_o,
    y_Price_pred_o,
    y_Ambience_pred_o,
    y_Food_pred_o,
) = (
    classifier.predict(vectorizer.transform(x_test))
    for classifier, vectorizer, x_test in (
        (model_Location_o, tv_1_o, x_Location_test),
        (model_Service_o, tv_2_o, x_Service_test),
        (model_Price_o, tv_3_o, x_Price_test),
        (model_Ambience_o, tv_4_o, x_Ambience_test),
        (model_Food_o, tv_5_o, x_Food_test),
    )
)

In [40]:
# Classification report for each aspect, in the order Location, Service, Price, Ambience, Food.
from sklearn.metrics import classification_report
for _y_true, _y_pred in (
    (y_Location_test, y_Location_pred_o),
    (y_Service_test, y_Service_pred_o),
    (y_Price_test, y_Price_pred_o),
    (y_Ambience_test, y_Ambience_pred_o),
    (y_Food_test, y_Food_pred_o),
):
    print(classification_report(_y_true, _y_pred))

              precision    recall  f1-score   support

          -1       0.48      0.59      0.53       186
           1       0.95      0.93      0.94      1597

    accuracy                           0.89      1783
   macro avg       0.72      0.76      0.73      1783
weighted avg       0.90      0.89      0.90      1783

              precision    recall  f1-score   support

          -1       0.65      0.77      0.71       683
           1       0.92      0.86      0.88      1961

    accuracy                           0.83      2644
   macro avg       0.78      0.81      0.80      2644
weighted avg       0.85      0.83      0.84      2644

              precision    recall  f1-score   support

          -1       0.63      0.83      0.72       624
           1       0.92      0.81      0.86      1571

    accuracy                           0.81      2195
   macro avg       0.78      0.82      0.79      2195
weighted avg       0.84      0.81      0.82      2195

              preci

In [41]:
# Confusion matrix for each aspect, in the order Location, Service, Price, Ambience, Food.
from sklearn.metrics import confusion_matrix
for _y_true, _y_pred in (
    (y_Location_test, y_Location_pred_o),
    (y_Service_test, y_Service_pred_o),
    (y_Price_test, y_Price_pred_o),
    (y_Ambience_test, y_Ambience_pred_o),
    (y_Food_test, y_Food_pred_o),
):
    print(confusion_matrix(_y_true, _y_pred))

[[ 110   76]
 [ 119 1478]]
[[ 527  156]
 [ 281 1680]]
[[ 519  105]
 [ 302 1269]]
[[ 343  108]
 [ 245 1784]]
[[ 360  139]
 [ 317 2838]]


In [42]:
# NOTE(review): this redefines the lr_explain helper already defined in the "LR" section.
def lr_explain(model, sentence, vec):
    """Return eli5's explanation of ``model``'s prediction for ``sentence``.

    The sentence is first segmented with ``tokenization_text``; eli5 is then
    given the vectorizer ``vec`` together with its feature names.
    """
    segmented = tokenization_text(sentence)
    feature_names = vec.get_feature_names_out()
    return eli5.show_prediction(model, segmented, vec=vec, feature_names=feature_names)

In [43]:
data_test.iloc[53]

review      虽然就在家附近，但很久没有来明记吃饭了。因为附近太多选择，什么上渡食家，凤厨等等这些比较出名...
Location                                                 None
Service                                                    -1
Price                                                      -1
Ambience                                                    1
Food                                                        1
Name: 53, dtype: object

In [44]:
# Explain every oversampled-data model's prediction for the test review with index label 53.
for _model, _vectorizer in (
    (model_Location_o, tv_1_o),
    (model_Service_o, tv_2_o),
    (model_Price_o, tv_3_o),
    (model_Ambience_o, tv_4_o),
    (model_Food_o, tv_5_o),
):
    display(lr_explain(_model, data_test['review'][53], _vectorizer))

Contribution?,Feature
2.088,Highlighted in text (sum)
0.294,<BIAS>


Contribution?,Feature
0.621,<BIAS>
0.503,Highlighted in text (sum)


Contribution?,Feature
0.709,<BIAS>
0.31,Highlighted in text (sum)


Contribution?,Feature
1.296,Highlighted in text (sum)
-0.872,<BIAS>


Contribution?,Feature
1.526,Highlighted in text (sum)
-0.499,<BIAS>


In [45]:
test1 = '很好吃，环境好，所有员工的态度都很好，上菜快，服务也很好，味道好吃，都是用蒸馏水煮的，推荐，超好吃' # 5-star positive review
test2 = '糯米外皮不绵滑，豆沙馅粗躁，没有香甜味。12元一碗不值。' # 1-star negative review
# 4-star review
test3 = '昨儿晚上来凯德1818的绿茶吃饭，点了以下10个菜，这里面有好几个都是推荐菜品，咱挨个说啊～  第一张 “客家茄子煲”：一个字：咸、俩字：很咸、仨字：非常咸、四个字：咸（hou)死我了... ... 我很怀疑是不是师傅在放酱油的时候手抖了 然后整锅煲就是一锅酱色。里面的咸鱼粒很显然没有经过任何处理（泡一下水去掉部分盐分等），再加之菜本身很咸根本无法入口，整锅煲基本没动；第二张 “鱼头诱惑 ”：这道菜是翔哥点的 还不错 首先鱼头很新鲜 其次鱼头蒸的火候刚好 既入味还不老，建议除了小米辣再加上点泡椒，这样无论是颜色和口味上都会更棒的！这道菜基本消灭； 第三张：“小锅土豆”：厚片的土豆挂满酱汁，炉子在下面慢慢加热，后来加一片放到嘴里，几乎是入口即化、口感绵软，也是一道不错的下饭菜；第四张：“农家小菜”其实就是少了橄榄菜和肉末的豆角粒，味道还可以，如果把豆角再煸的干一点（表皮起皱）口感会更好；第五张“菜心金钩豆腐” 和没上图的“老乡浓汤”一起说了，两道分不清是汤、羹还是菜的东东，都是推荐菜 分不清也无所谓，重点是上菜的时候都不是热的 都是温的 “菜心”里面有蛋黄、“浓汤”里面有猪肚，这两种食材变冷后都会有腥味且口感不好，加之一碗有淀粉类的羹 温温的很难喝，建议出品后马上传菜到客人面前，口味上冷了就都不美味了；第六张：“麻酱油麦菜”很多地方也叫“麻酱凤尾” 麻酱用的像是麻酱和花生酱混合的甜口的酱料，重点是没有稀释！整个一坨粘在油麦菜上 根本拌不开 相比甜口的麻酱我更喜欢热干面那种咸口的、稀点儿的；第七张“面包诱惑”：这道是敬菜，口味还不错，冰淇淋也很好吃 就是卖相太一般了 既然要推广 至少要给点儿装饰，不用太复杂 拉个巧克力线条、给个蛋卷、水果粒稍稍点缀一下就会好看很多 面包本身口感和冰淇淋都很好吃 比很多外面装饰的那种美美的好吃多了 就差了一点装饰；第八张“绿茶烤鸡”很多人点这道菜，相比前几道的重口，烤鸡显得淡了一些。鸡皮的保护基本没有 所以整个鸡肉又干有柴、也挂不住调味料、下次有机会点个整只的或许会好一点；最后还有未上图的绿茶饼和糖醋里脊，绿茶饼还不错 可以一试 只要火候控制的好不炸过基本没问题，里面糯糯的馅儿很好吃；“糖醋里脊” 这是我吃过最难吃的版本，没有之一！上次有一小哥吐了一通槽 我还想就一个糖醋里脊也不是啥难菜 不至于吧 这次就抱着猎奇的心里点了一个 菜刚上来我就后悔了---老抽色（shai）的、外面一圈淀粉（粉面子）基本没吃着肉，糊哒哒粘成一坨 看起来和那个茄子煲没什么太大区别。建议找一个做糖醋里脊的店吃一次 看一看 就上个浆 过个油 浇个汁儿的事情 没那么难  肉类的菜品价格可以适当调高一些 我宁可多花点儿钱吃肉也不想吃这一坨坨粉面子。 总体而言环境还不错 价格很平民 服务中规中矩 但是菜品质量很是有待提高。看墙上的照片（不知道老板是不是做青旅起家的）菜品不会因为价格低而有销路 只会因为质量好才畅销（相比低价 现在的人们更喜欢质量好而有保障的食品 价格好一点反而是保证） 希望越做越好。地址在凯德1818  5楼手aaa扶梯旁。'

In [46]:
# Explain every oversampled-data model's prediction for the sample review test1.
for _model, _vectorizer in (
    (model_Location_o, tv_1_o),
    (model_Service_o, tv_2_o),
    (model_Price_o, tv_3_o),
    (model_Ambience_o, tv_4_o),
    (model_Food_o, tv_5_o),
):
    display(lr_explain(_model, test1, _vectorizer))

Contribution?,Feature
0.579,Highlighted in text (sum)
0.294,<BIAS>


Contribution?,Feature
1.846,Highlighted in text (sum)
-0.621,<BIAS>


Contribution?,Feature
0.709,<BIAS>
-0.543,Highlighted in text (sum)


Contribution?,Feature
4.071,Highlighted in text (sum)
-0.872,<BIAS>


Contribution?,Feature
1.467,Highlighted in text (sum)
-0.499,<BIAS>


In [47]:
# Explain every oversampled-data model's prediction for the sample review test2.
for _model, _vectorizer in (
    (model_Location_o, tv_1_o),
    (model_Service_o, tv_2_o),
    (model_Price_o, tv_3_o),
    (model_Ambience_o, tv_4_o),
    (model_Food_o, tv_5_o),
):
    display(lr_explain(_model, test2, _vectorizer))

Contribution?,Feature
0.535,Highlighted in text (sum)
-0.294,<BIAS>


Contribution?,Feature
0.665,Highlighted in text (sum)
0.621,<BIAS>


Contribution?,Feature
2.229,Highlighted in text (sum)
0.709,<BIAS>


Contribution?,Feature
2.035,Highlighted in text (sum)
0.872,<BIAS>


Contribution?,Feature
1.903,Highlighted in text (sum)
0.499,<BIAS>


In [48]:
# Explain every oversampled-data model's prediction for the sample review test3.
for _model, _vectorizer in (
    (model_Location_o, tv_1_o),
    (model_Service_o, tv_2_o),
    (model_Price_o, tv_3_o),
    (model_Ambience_o, tv_4_o),
    (model_Food_o, tv_5_o),
):
    display(lr_explain(_model, test3, _vectorizer))

Contribution?,Feature
1.485,Highlighted in text (sum)
0.294,<BIAS>


Contribution?,Feature
1.083,Highlighted in text (sum)
-0.621,<BIAS>


Contribution?,Feature
0.833,Highlighted in text (sum)
-0.709,<BIAS>


Contribution?,Feature
2.173,Highlighted in text (sum)
-0.872,<BIAS>


Contribution?,Feature
0.499,<BIAS>
-0.025,Highlighted in text (sum)


## BERT

### Location

#### Data Preparation and Tokenization

In [None]:
# Preprocess the Location train/validation frames with ktrain in 'bert' mode.
# This uses the original Location_train frame, not the oversampled one.
(
    (x_Location_train_bert, y_Location_train_bert),
    (x_Location_val_bert, y_Location_val_bert),
    Location_preproc_bert,
) = text.texts_from_df(
    train_df=Location_train,
    val_df=Location_val,
    text_column='review',
    label_columns='Location',
    max_features=30000,
    maxlen=256,
    preprocess_mode='bert',
)

#### Model Preparation

In [None]:
# Build the ktrain 'bert' text classifier for the preprocessed Location training data.
Location_model_bert = text.text_classifier(
    name='bert',
    train_data=(x_Location_train_bert, y_Location_train_bert),
    preproc=Location_preproc_bert,
)

In [None]:
# Wrap the model and its train/validation data in a ktrain learner (batches of 16).
Location_learner_bert = ktrain.get_learner(
    model=Location_model_bert,
    train_data=(x_Location_train_bert, y_Location_train_bert),
    val_data=(x_Location_val_bert, y_Location_val_bert),
    batch_size=16,
)

#### Train

In [None]:
Location_learner_bert.lr_find(show_plot=True, suggest=True, max_epochs = 5)

In [None]:
# Train with a fixed learning rate, checkpointing into the tmp folder.
# NOTE(review): 2.21E-04 was presumably read off the lr_find plot above - confirm.
Location_learner_bert.autofit(
    lr=2.21E-04,
    checkpoint_folder='bert-meituan-tags-senti/Location/tmp',
)
# Bundle the trained model with its preprocessor and save the predictor to disk.
Location_predictor_bert = ktrain.get_predictor(
    Location_learner_bert.model,
    Location_preproc_bert,
)
Location_predictor_bert.save('bert-meituan-tags-senti/Location')

#### Evaluation

In [None]:
# Raw (unsegmented) test reviews and their labels; the predictor is given the underlying array.
x_Location_test_bert, y_Location_test_bert = (
    Location_test['review'],
    Location_test['Location'],
)
y_Location_pred_bert = Location_predictor_bert.predict(x_Location_test_bert.values)

In [None]:
# classification report
from sklearn.metrics import classification_report
print(classification_report(y_Location_test_bert, y_Location_pred_bert))

In [None]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_Location_test_bert, y_Location_pred_bert))

In [None]:
data_test.iloc[53]

In [None]:
Location_predictor_bert.explain(data_test.iloc[53]['review'])

In [None]:
# Ad-hoc test inputs: reviews taken from Dianping (three are defined in this cell).
test1 = '很好吃，环境好，所有员工的态度都很好，上菜快，服务也很好，味道好吃，都是用蒸馏水煮的，推荐，超好吃' # 5-star positive review
test2 = '糯米外皮不绵滑，豆沙馅粗躁，没有香甜味。12元一碗不值。' # 1-star negative review
# 4 stars: a "troublemaker" sample -- a long review mixing praise and complaints
test3 = '昨儿晚上来凯德1818的绿茶吃饭，点了以下10个菜，这里面有好几个都是推荐菜品，咱挨个说啊～  第一张 “客家茄子煲”：一个字：咸、俩字：很咸、仨字：非常咸、四个字：咸（hou)死我了... ... 我很怀疑是不是师傅在放酱油的时候手抖了 然后整锅煲就是一锅酱色。里面的咸鱼粒很显然没有经过任何处理（泡一下水去掉部分盐分等），再加之菜本身很咸根本无法入口，整锅煲基本没动；第二张 “鱼头诱惑 ”：这道菜是翔哥点的 还不错 首先鱼头很新鲜 其次鱼头蒸的火候刚好 既入味还不老，建议除了小米辣再加上点泡椒，这样无论是颜色和口味上都会更棒的！这道菜基本消灭； 第三张：“小锅土豆”：厚片的土豆挂满酱汁，炉子在下面慢慢加热，后来加一片放到嘴里，几乎是入口即化、口感绵软，也是一道不错的下饭菜；第四张：“农家小菜”其实就是少了橄榄菜和肉末的豆角粒，味道还可以，如果把豆角再煸的干一点（表皮起皱）口感会更好；第五张“菜心金钩豆腐” 和没上图的“老乡浓汤”一起说了，两道分不清是汤、羹还是菜的东东，都是推荐菜 分不清也无所谓，重点是上菜的时候都不是热的 都是温的 “菜心”里面有蛋黄、“浓汤”里面有猪肚，这两种食材变冷后都会有腥味且口感不好，加之一碗有淀粉类的羹 温温的很难喝，建议出品后马上传菜到客人面前，口味上冷了就都不美味了；第六张：“麻酱油麦菜”很多地方也叫“麻酱凤尾” 麻酱用的像是麻酱和花生酱混合的甜口的酱料，重点是没有稀释！整个一坨粘在油麦菜上 根本拌不开 相比甜口的麻酱我更喜欢热干面那种咸口的、稀点儿的；第七张“面包诱惑”：这道是敬菜，口味还不错，冰淇淋也很好吃 就是卖相太一般了 既然要推广 至少要给点儿装饰，不用太复杂 拉个巧克力线条、给个蛋卷、水果粒稍稍点缀一下就会好看很多 面包本身口感和冰淇淋都很好吃 比很多外面装饰的那种美美的好吃多了 就差了一点装饰；第八张“绿茶烤鸡”很多人点这道菜，相比前几道的重口，烤鸡显得淡了一些。鸡皮的保护基本没有 所以整个鸡肉又干有柴、也挂不住调味料、下次有机会点个整只的或许会好一点；最后还有未上图的绿茶饼和糖醋里脊，绿茶饼还不错 可以一试 只要火候控制的好不炸过基本没问题，里面糯糯的馅儿很好吃；“糖醋里脊” 这是我吃过最难吃的版本，没有之一！上次有一小哥吐了一通槽 我还想就一个糖醋里脊也不是啥难菜 不至于吧 这次就抱着猎奇的心里点了一个 菜刚上来我就后悔了---老抽色（shai）的、外面一圈淀粉（粉面子）基本没吃着肉，糊哒哒粘成一坨 看起来和那个茄子煲没什么太大区别。建议找一个做糖醋里脊的店吃一次 看一看 就上个浆 过个油 浇个汁儿的事情 没那么难  肉类的菜品价格可以适当调高一些 我宁可多花点儿钱吃肉也不想吃这一坨坨粉面子。 总体而言环境还不错 价格很平民 服务中规中矩 但是菜品质量很是有待提高。看墙上的照片（不知道老板是不是做青旅起家的）菜品不会因为价格低而有销路 只会因为质量好才畅销（相比低价 现在的人们更喜欢质量好而有保障的食品 价格好一点反而是保证） 希望越做越好。地址在凯德1818  5楼手扶梯旁。'

In [None]:
# Explain the Location prediction for test1 (the 5-star positive sample).
Location_predictor_bert.explain(test1)

In [None]:
# Explain the Location prediction for test2 (the 1-star negative sample).
Location_predictor_bert.explain(test2)

In [None]:
# Explain the Location prediction for test3 (the long 4-star sample).
Location_predictor_bert.explain(test3)

### Service

#### Data Preparation and Tokenization

In [None]:
# BERT-mode preprocessing of the Service frames. The call yields the training
# pair, the validation pair and the preprocessor object, in that order.
(
    (x_Service_train_bert, y_Service_train_bert),
    (x_Service_val_bert, y_Service_val_bert),
    Service_preproc_bert,
) = text.texts_from_df(
    Service_train,
    'review',
    label_columns='Service',
    val_df=Service_val,
    max_features=30000,
    maxlen=256,
    preprocess_mode='bert',
)

#### Model Preparation

In [None]:
# Build the BERT text classifier for the Service aspect from the
# preprocessed training pair and its matching preprocessor.
Service_model_bert = text.text_classifier(
    'bert',
    (x_Service_train_bert, y_Service_train_bert),
    preproc=Service_preproc_bert,
)

In [None]:
# Wrap the Service model together with its train/validation pairs in a
# ktrain learner; batches of 16 samples.
Service_learner_bert = ktrain.get_learner(
    Service_model_bert,
    train_data=(x_Service_train_bert, y_Service_train_bert),
    val_data=(x_Service_val_bert, y_Service_val_bert),
    batch_size=16,
)

#### Train

In [None]:
# Learning-rate search for the Service learner: capped at 2 epochs, with the
# plot shown and rate suggestions requested.
Service_learner_bert.lr_find(
    max_epochs=2,
    suggest=True,
    show_plot=True,
)

In [None]:
# Train the Service learner. No `epochs` argument is passed, so the stopping
# rule is left to ktrain's defaults; checkpoints go to the tmp sub-folder.
# NOTE(review): 8E-04 is well above the 2e-5..5e-5 range commonly used to
# fine-tune BERT -- confirm against the lr_find plot.
Service_learner_bert.autofit(
    lr=8E-04,
    checkpoint_folder='bert-meituan-tags-senti/Service/tmp',
)

# Pair the trained model with its preprocessor and store the predictor on disk.
Service_predictor_bert = ktrain.get_predictor(
    model=Service_learner_bert.model,
    preproc=Service_preproc_bert,
)
Service_predictor_bert.save('bert-meituan-tags-senti/Service')

#### Evaluation

In [None]:
# Take the review texts and gold Service labels from the test frame, then
# predict a label for every review (the predictor receives the raw values).
x_Service_test_bert, y_Service_test_bert = Service_test['review'], Service_test['Service']
y_Service_pred_bert = Service_predictor_bert.predict(x_Service_test_bert.values)

In [None]:
# Text report of the Service predictions against the gold test labels.
from sklearn.metrics import classification_report

print(
    classification_report(
        y_true=y_Service_test_bert,
        y_pred=y_Service_pred_bert,
    )
)

In [None]:
# Confusion matrix of gold vs. predicted Service labels on the test split.
from sklearn.metrics import confusion_matrix

print(
    confusion_matrix(
        y_true=y_Service_test_bert,
        y_pred=y_Service_pred_bert,
    )
)

In [None]:
# Display the test row at positional index 53; its review text is explained in the next cell.
data_test.iloc[53]

In [None]:
# Explain the Service predictor's output for the review text of test row 53.
Service_predictor_bert.explain(data_test.iloc[53]['review'])

In [None]:
# Ad-hoc test inputs: reviews taken from Dianping (three are defined in this cell).
test1 = '很好吃，环境好，所有员工的态度都很好，上菜快，服务也很好，味道好吃，都是用蒸馏水煮的，推荐，超好吃' # 5-star positive review
test2 = '糯米外皮不绵滑，豆沙馅粗躁，没有香甜味。12元一碗不值。' # 1-star negative review
# 4 stars: a "troublemaker" sample -- a long review mixing praise and complaints
test3 = '昨儿晚上来凯德1818的绿茶吃饭，点了以下10个菜，这里面有好几个都是推荐菜品，咱挨个说啊～  第一张 “客家茄子煲”：一个字：咸、俩字：很咸、仨字：非常咸、四个字：咸（hou)死我了... ... 我很怀疑是不是师傅在放酱油的时候手抖了 然后整锅煲就是一锅酱色。里面的咸鱼粒很显然没有经过任何处理（泡一下水去掉部分盐分等），再加之菜本身很咸根本无法入口，整锅煲基本没动；第二张 “鱼头诱惑 ”：这道菜是翔哥点的 还不错 首先鱼头很新鲜 其次鱼头蒸的火候刚好 既入味还不老，建议除了小米辣再加上点泡椒，这样无论是颜色和口味上都会更棒的！这道菜基本消灭； 第三张：“小锅土豆”：厚片的土豆挂满酱汁，炉子在下面慢慢加热，后来加一片放到嘴里，几乎是入口即化、口感绵软，也是一道不错的下饭菜；第四张：“农家小菜”其实就是少了橄榄菜和肉末的豆角粒，味道还可以，如果把豆角再煸的干一点（表皮起皱）口感会更好；第五张“菜心金钩豆腐” 和没上图的“老乡浓汤”一起说了，两道分不清是汤、羹还是菜的东东，都是推荐菜 分不清也无所谓，重点是上菜的时候都不是热的 都是温的 “菜心”里面有蛋黄、“浓汤”里面有猪肚，这两种食材变冷后都会有腥味且口感不好，加之一碗有淀粉类的羹 温温的很难喝，建议出品后马上传菜到客人面前，口味上冷了就都不美味了；第六张：“麻酱油麦菜”很多地方也叫“麻酱凤尾” 麻酱用的像是麻酱和花生酱混合的甜口的酱料，重点是没有稀释！整个一坨粘在油麦菜上 根本拌不开 相比甜口的麻酱我更喜欢热干面那种咸口的、稀点儿的；第七张“面包诱惑”：这道是敬菜，口味还不错，冰淇淋也很好吃 就是卖相太一般了 既然要推广 至少要给点儿装饰，不用太复杂 拉个巧克力线条、给个蛋卷、水果粒稍稍点缀一下就会好看很多 面包本身口感和冰淇淋都很好吃 比很多外面装饰的那种美美的好吃多了 就差了一点装饰；第八张“绿茶烤鸡”很多人点这道菜，相比前几道的重口，烤鸡显得淡了一些。鸡皮的保护基本没有 所以整个鸡肉又干有柴、也挂不住调味料、下次有机会点个整只的或许会好一点；最后还有未上图的绿茶饼和糖醋里脊，绿茶饼还不错 可以一试 只要火候控制的好不炸过基本没问题，里面糯糯的馅儿很好吃；“糖醋里脊” 这是我吃过最难吃的版本，没有之一！上次有一小哥吐了一通槽 我还想就一个糖醋里脊也不是啥难菜 不至于吧 这次就抱着猎奇的心里点了一个 菜刚上来我就后悔了---老抽色（shai）的、外面一圈淀粉（粉面子）基本没吃着肉，糊哒哒粘成一坨 看起来和那个茄子煲没什么太大区别。建议找一个做糖醋里脊的店吃一次 看一看 就上个浆 过个油 浇个汁儿的事情 没那么难  肉类的菜品价格可以适当调高一些 我宁可多花点儿钱吃肉也不想吃这一坨坨粉面子。 总体而言环境还不错 价格很平民 服务中规中矩 但是菜品质量很是有待提高。看墙上的照片（不知道老板是不是做青旅起家的）菜品不会因为价格低而有销路 只会因为质量好才畅销（相比低价 现在的人们更喜欢质量好而有保障的食品 价格好一点反而是保证） 希望越做越好。地址在凯德1818  5楼手扶梯旁。'

In [None]:
# Explain the Service prediction for test1 (the 5-star positive sample).
Service_predictor_bert.explain(test1)

In [None]:
# Explain the Service prediction for test2 (the 1-star negative sample).
Service_predictor_bert.explain(test2)

In [None]:
# Explain the Service prediction for test3 (the long 4-star sample).
Service_predictor_bert.explain(test3)

### Price

#### Data Preparation and Tokenization

In [None]:
# BERT-mode preprocessing of the Price frames. The call yields the training
# pair, the validation pair and the preprocessor object, in that order.
(
    (x_Price_train_bert, y_Price_train_bert),
    (x_Price_val_bert, y_Price_val_bert),
    Price_preproc_bert,
) = text.texts_from_df(
    Price_train,
    'review',
    label_columns='Price',
    val_df=Price_val,
    max_features=30000,
    maxlen=256,
    preprocess_mode='bert',
)

#### Model Preparation

In [None]:
# Build the BERT text classifier for the Price aspect from the
# preprocessed training pair and its matching preprocessor.
Price_model_bert = text.text_classifier(
    'bert',
    (x_Price_train_bert, y_Price_train_bert),
    preproc=Price_preproc_bert,
)

In [None]:
# Wrap the Price model together with its train/validation pairs in a
# ktrain learner; batches of 16 samples.
Price_learner_bert = ktrain.get_learner(
    Price_model_bert,
    train_data=(x_Price_train_bert, y_Price_train_bert),
    val_data=(x_Price_val_bert, y_Price_val_bert),
    batch_size=16,
)

#### Train

In [None]:
# Learning-rate search for the Price learner: capped at 2 epochs, with the
# plot shown and rate suggestions requested.
Price_learner_bert.lr_find(
    max_epochs=2,
    suggest=True,
    show_plot=True,
)

In [None]:
# Train the Price learner. No `epochs` argument is passed, so the stopping
# rule is left to ktrain's defaults; checkpoints go to the tmp sub-folder.
# NOTE(review): 8.46E-04 is well above the 2e-5..5e-5 range commonly used to
# fine-tune BERT -- confirm against the lr_find plot.
Price_learner_bert.autofit(
    lr=8.46E-04,
    checkpoint_folder='bert-meituan-tags-senti/Price/tmp',
)

# Pair the trained model with its preprocessor and store the predictor on disk.
Price_predictor_bert = ktrain.get_predictor(
    model=Price_learner_bert.model,
    preproc=Price_preproc_bert,
)
Price_predictor_bert.save('bert-meituan-tags-senti/Price')

#### Evaluation

In [None]:
# Take the review texts and gold Price labels from the test frame, then
# predict a label for every review (the predictor receives the raw values).
x_Price_test_bert, y_Price_test_bert = Price_test['review'], Price_test['Price']
y_Price_pred_bert = Price_predictor_bert.predict(x_Price_test_bert.values)

In [None]:
# Text report of the Price predictions against the gold test labels.
from sklearn.metrics import classification_report

print(
    classification_report(
        y_true=y_Price_test_bert,
        y_pred=y_Price_pred_bert,
    )
)

In [None]:
# Confusion matrix of gold vs. predicted Price labels on the test split.
from sklearn.metrics import confusion_matrix

print(
    confusion_matrix(
        y_true=y_Price_test_bert,
        y_pred=y_Price_pred_bert,
    )
)

In [None]:
# Display the test row at positional index 53; its review text is explained in the next cell.
data_test.iloc[53]

In [None]:
# Explain the Price predictor's output for the review text of test row 53.
Price_predictor_bert.explain(data_test.iloc[53]['review'])

In [None]:
# Ad-hoc test inputs: reviews taken from Dianping (three are defined in this cell).
test1 = '很好吃，环境好，所有员工的态度都很好，上菜快，服务也很好，味道好吃，都是用蒸馏水煮的，推荐，超好吃' # 5-star positive review
test2 = '糯米外皮不绵滑，豆沙馅粗躁，没有香甜味。12元一碗不值。' # 1-star negative review
# 4 stars: a "troublemaker" sample -- a long review mixing praise and complaints
test3 = '昨儿晚上来凯德1818的绿茶吃饭，点了以下10个菜，这里面有好几个都是推荐菜品，咱挨个说啊～  第一张 “客家茄子煲”：一个字：咸、俩字：很咸、仨字：非常咸、四个字：咸（hou)死我了... ... 我很怀疑是不是师傅在放酱油的时候手抖了 然后整锅煲就是一锅酱色。里面的咸鱼粒很显然没有经过任何处理（泡一下水去掉部分盐分等），再加之菜本身很咸根本无法入口，整锅煲基本没动；第二张 “鱼头诱惑 ”：这道菜是翔哥点的 还不错 首先鱼头很新鲜 其次鱼头蒸的火候刚好 既入味还不老，建议除了小米辣再加上点泡椒，这样无论是颜色和口味上都会更棒的！这道菜基本消灭； 第三张：“小锅土豆”：厚片的土豆挂满酱汁，炉子在下面慢慢加热，后来加一片放到嘴里，几乎是入口即化、口感绵软，也是一道不错的下饭菜；第四张：“农家小菜”其实就是少了橄榄菜和肉末的豆角粒，味道还可以，如果把豆角再煸的干一点（表皮起皱）口感会更好；第五张“菜心金钩豆腐” 和没上图的“老乡浓汤”一起说了，两道分不清是汤、羹还是菜的东东，都是推荐菜 分不清也无所谓，重点是上菜的时候都不是热的 都是温的 “菜心”里面有蛋黄、“浓汤”里面有猪肚，这两种食材变冷后都会有腥味且口感不好，加之一碗有淀粉类的羹 温温的很难喝，建议出品后马上传菜到客人面前，口味上冷了就都不美味了；第六张：“麻酱油麦菜”很多地方也叫“麻酱凤尾” 麻酱用的像是麻酱和花生酱混合的甜口的酱料，重点是没有稀释！整个一坨粘在油麦菜上 根本拌不开 相比甜口的麻酱我更喜欢热干面那种咸口的、稀点儿的；第七张“面包诱惑”：这道是敬菜，口味还不错，冰淇淋也很好吃 就是卖相太一般了 既然要推广 至少要给点儿装饰，不用太复杂 拉个巧克力线条、给个蛋卷、水果粒稍稍点缀一下就会好看很多 面包本身口感和冰淇淋都很好吃 比很多外面装饰的那种美美的好吃多了 就差了一点装饰；第八张“绿茶烤鸡”很多人点这道菜，相比前几道的重口，烤鸡显得淡了一些。鸡皮的保护基本没有 所以整个鸡肉又干有柴、也挂不住调味料、下次有机会点个整只的或许会好一点；最后还有未上图的绿茶饼和糖醋里脊，绿茶饼还不错 可以一试 只要火候控制的好不炸过基本没问题，里面糯糯的馅儿很好吃；“糖醋里脊” 这是我吃过最难吃的版本，没有之一！上次有一小哥吐了一通槽 我还想就一个糖醋里脊也不是啥难菜 不至于吧 这次就抱着猎奇的心里点了一个 菜刚上来我就后悔了---老抽色（shai）的、外面一圈淀粉（粉面子）基本没吃着肉，糊哒哒粘成一坨 看起来和那个茄子煲没什么太大区别。建议找一个做糖醋里脊的店吃一次 看一看 就上个浆 过个油 浇个汁儿的事情 没那么难  肉类的菜品价格可以适当调高一些 我宁可多花点儿钱吃肉也不想吃这一坨坨粉面子。 总体而言环境还不错 价格很平民 服务中规中矩 但是菜品质量很是有待提高。看墙上的照片（不知道老板是不是做青旅起家的）菜品不会因为价格低而有销路 只会因为质量好才畅销（相比低价 现在的人们更喜欢质量好而有保障的食品 价格好一点反而是保证） 希望越做越好。地址在凯德1818  5楼手扶梯旁。'

In [None]:
# Explain the Price prediction for test1 (the 5-star positive sample).
Price_predictor_bert.explain(test1)

In [None]:
# Explain the Price prediction for test2 (the 1-star negative sample).
Price_predictor_bert.explain(test2)

In [None]:
# Explain the Price prediction for test3 (the long 4-star sample).
Price_predictor_bert.explain(test3)

### Ambience

#### Data Preparation and Tokenization

In [None]:
# BERT-mode preprocessing of the Ambience frames. The call yields the training
# pair, the validation pair and the preprocessor object, in that order.
(
    (x_Ambience_train_bert, y_Ambience_train_bert),
    (x_Ambience_val_bert, y_Ambience_val_bert),
    Ambience_preproc_bert,
) = text.texts_from_df(
    Ambience_train,
    'review',
    label_columns='Ambience',
    val_df=Ambience_val,
    max_features=30000,
    maxlen=256,
    preprocess_mode='bert',
)

#### Model Preparation

In [None]:
# Build the BERT text classifier for the Ambience aspect from the
# preprocessed training pair and its matching preprocessor.
Ambience_model_bert = text.text_classifier(
    'bert',
    (x_Ambience_train_bert, y_Ambience_train_bert),
    preproc=Ambience_preproc_bert,
)

In [None]:
# Wrap the Ambience model together with its train/validation pairs in a
# ktrain learner; batches of 16 samples.
Ambience_learner_bert = ktrain.get_learner(
    Ambience_model_bert,
    train_data=(x_Ambience_train_bert, y_Ambience_train_bert),
    val_data=(x_Ambience_val_bert, y_Ambience_val_bert),
    batch_size=16,
)

#### Train

In [None]:
# Learning-rate search for the Ambience learner: capped at 2 epochs, with the
# plot shown and rate suggestions requested.
Ambience_learner_bert.lr_find(
    max_epochs=2,
    suggest=True,
    show_plot=True,
)

In [None]:
# Train the Ambience learner. No `epochs` argument is passed, so the stopping
# rule is left to ktrain's defaults; checkpoints go to the tmp sub-folder.
# NOTE(review): 2E-04 is well above the 2e-5..5e-5 range commonly used to
# fine-tune BERT -- confirm against the lr_find plot.
Ambience_learner_bert.autofit(
    lr=2E-04,
    checkpoint_folder='bert-meituan-tags-senti/Ambience/tmp',
)

# Pair the trained model with its preprocessor and store the predictor on disk.
Ambience_predictor_bert = ktrain.get_predictor(
    model=Ambience_learner_bert.model,
    preproc=Ambience_preproc_bert,
)
Ambience_predictor_bert.save('bert-meituan-tags-senti/Ambience')

#### Evaluation

In [None]:
# Take the review texts and gold Ambience labels from the test frame, then
# predict a label for every review (the predictor receives the raw values).
x_Ambience_test_bert, y_Ambience_test_bert = Ambience_test['review'], Ambience_test['Ambience']
y_Ambience_pred_bert = Ambience_predictor_bert.predict(x_Ambience_test_bert.values)

In [None]:
# Text report of the Ambience predictions against the gold test labels.
from sklearn.metrics import classification_report

print(
    classification_report(
        y_true=y_Ambience_test_bert,
        y_pred=y_Ambience_pred_bert,
    )
)

In [None]:
# Confusion matrix of gold vs. predicted Ambience labels on the test split.
from sklearn.metrics import confusion_matrix

print(
    confusion_matrix(
        y_true=y_Ambience_test_bert,
        y_pred=y_Ambience_pred_bert,
    )
)

In [None]:
# Display the test row at positional index 53; its review text is explained in the next cell.
data_test.iloc[53]

In [None]:
# Explain the Ambience predictor's output for the review text of test row 53.
Ambience_predictor_bert.explain(data_test.iloc[53]['review'])

In [None]:
# Ad-hoc test inputs: reviews taken from Dianping (three are defined in this cell).
test1 = '很好吃，环境好，所有员工的态度都很好，上菜快，服务也很好，味道好吃，都是用蒸馏水煮的，推荐，超好吃' # 5-star positive review
test2 = '糯米外皮不绵滑，豆沙馅粗躁，没有香甜味。12元一碗不值。' # 1-star negative review
# 4 stars: a "troublemaker" sample -- a long review mixing praise and complaints
test3 = '昨儿晚上来凯德1818的绿茶吃饭，点了以下10个菜，这里面有好几个都是推荐菜品，咱挨个说啊～  第一张 “客家茄子煲”：一个字：咸、俩字：很咸、仨字：非常咸、四个字：咸（hou)死我了... ... 我很怀疑是不是师傅在放酱油的时候手抖了 然后整锅煲就是一锅酱色。里面的咸鱼粒很显然没有经过任何处理（泡一下水去掉部分盐分等），再加之菜本身很咸根本无法入口，整锅煲基本没动；第二张 “鱼头诱惑 ”：这道菜是翔哥点的 还不错 首先鱼头很新鲜 其次鱼头蒸的火候刚好 既入味还不老，建议除了小米辣再加上点泡椒，这样无论是颜色和口味上都会更棒的！这道菜基本消灭； 第三张：“小锅土豆”：厚片的土豆挂满酱汁，炉子在下面慢慢加热，后来加一片放到嘴里，几乎是入口即化、口感绵软，也是一道不错的下饭菜；第四张：“农家小菜”其实就是少了橄榄菜和肉末的豆角粒，味道还可以，如果把豆角再煸的干一点（表皮起皱）口感会更好；第五张“菜心金钩豆腐” 和没上图的“老乡浓汤”一起说了，两道分不清是汤、羹还是菜的东东，都是推荐菜 分不清也无所谓，重点是上菜的时候都不是热的 都是温的 “菜心”里面有蛋黄、“浓汤”里面有猪肚，这两种食材变冷后都会有腥味且口感不好，加之一碗有淀粉类的羹 温温的很难喝，建议出品后马上传菜到客人面前，口味上冷了就都不美味了；第六张：“麻酱油麦菜”很多地方也叫“麻酱凤尾” 麻酱用的像是麻酱和花生酱混合的甜口的酱料，重点是没有稀释！整个一坨粘在油麦菜上 根本拌不开 相比甜口的麻酱我更喜欢热干面那种咸口的、稀点儿的；第七张“面包诱惑”：这道是敬菜，口味还不错，冰淇淋也很好吃 就是卖相太一般了 既然要推广 至少要给点儿装饰，不用太复杂 拉个巧克力线条、给个蛋卷、水果粒稍稍点缀一下就会好看很多 面包本身口感和冰淇淋都很好吃 比很多外面装饰的那种美美的好吃多了 就差了一点装饰；第八张“绿茶烤鸡”很多人点这道菜，相比前几道的重口，烤鸡显得淡了一些。鸡皮的保护基本没有 所以整个鸡肉又干有柴、也挂不住调味料、下次有机会点个整只的或许会好一点；最后还有未上图的绿茶饼和糖醋里脊，绿茶饼还不错 可以一试 只要火候控制的好不炸过基本没问题，里面糯糯的馅儿很好吃；“糖醋里脊” 这是我吃过最难吃的版本，没有之一！上次有一小哥吐了一通槽 我还想就一个糖醋里脊也不是啥难菜 不至于吧 这次就抱着猎奇的心里点了一个 菜刚上来我就后悔了---老抽色（shai）的、外面一圈淀粉（粉面子）基本没吃着肉，糊哒哒粘成一坨 看起来和那个茄子煲没什么太大区别。建议找一个做糖醋里脊的店吃一次 看一看 就上个浆 过个油 浇个汁儿的事情 没那么难  肉类的菜品价格可以适当调高一些 我宁可多花点儿钱吃肉也不想吃这一坨坨粉面子。 总体而言环境还不错 价格很平民 服务中规中矩 但是菜品质量很是有待提高。看墙上的照片（不知道老板是不是做青旅起家的）菜品不会因为价格低而有销路 只会因为质量好才畅销（相比低价 现在的人们更喜欢质量好而有保障的食品 价格好一点反而是保证） 希望越做越好。地址在凯德1818  5楼手扶梯旁。'

In [None]:
# Explain the Ambience prediction for test1 (the 5-star positive sample).
Ambience_predictor_bert.explain(test1)

In [None]:
# Explain the Ambience prediction for test2 (the 1-star negative sample).
Ambience_predictor_bert.explain(test2)

In [None]:
# Explain the Ambience prediction for test3 (the long 4-star sample).
Ambience_predictor_bert.explain(test3)

### Food

#### Data Preparation and Tokenization

In [None]:
# BERT-mode preprocessing of the Food frames. The call yields the training
# pair, the validation pair and the preprocessor object, in that order.
(
    (x_Food_train_bert, y_Food_train_bert),
    (x_Food_val_bert, y_Food_val_bert),
    Food_preproc_bert,
) = text.texts_from_df(
    Food_train,
    'review',
    label_columns='Food',
    val_df=Food_val,
    max_features=30000,
    maxlen=256,
    preprocess_mode='bert',
)

#### Model Preparation

In [None]:
# Build the BERT text classifier for the Food aspect from the
# preprocessed training pair and its matching preprocessor.
Food_model_bert = text.text_classifier(
    'bert',
    (x_Food_train_bert, y_Food_train_bert),
    preproc=Food_preproc_bert,
)

In [None]:
# Wrap the Food model together with its train/validation pairs in a
# ktrain learner; batches of 16 samples.
Food_learner_bert = ktrain.get_learner(
    Food_model_bert,
    train_data=(x_Food_train_bert, y_Food_train_bert),
    val_data=(x_Food_val_bert, y_Food_val_bert),
    batch_size=16,
)

#### Train

In [None]:
# Learning-rate search for the Food learner: capped at 2 epochs, with the
# plot shown and rate suggestions requested.
Food_learner_bert.lr_find(
    max_epochs=2,
    suggest=True,
    show_plot=True,
)

In [None]:
# Train the Food learner. No `epochs` argument is passed, so the stopping
# rule is left to ktrain's defaults; checkpoints go to the tmp sub-folder.
# NOTE(review): 8.46E-04 is well above the 2e-5..5e-5 range commonly used to
# fine-tune BERT -- confirm against the lr_find plot.
Food_learner_bert.autofit(
    lr=8.46E-04,
    checkpoint_folder='bert-meituan-tags-senti/Food/tmp',
)

# Pair the trained model with its preprocessor and store the predictor on disk.
Food_predictor_bert = ktrain.get_predictor(
    model=Food_learner_bert.model,
    preproc=Food_preproc_bert,
)
Food_predictor_bert.save('bert-meituan-tags-senti/Food')

#### Evaluation

In [None]:
# Take the review texts and gold Food labels from the test frame, then
# predict a label for every review (the predictor receives the raw values).
x_Food_test_bert, y_Food_test_bert = Food_test['review'], Food_test['Food']
y_Food_pred_bert = Food_predictor_bert.predict(x_Food_test_bert.values)

In [None]:
# Text report of the Food predictions against the gold test labels.
from sklearn.metrics import classification_report

print(
    classification_report(
        y_true=y_Food_test_bert,
        y_pred=y_Food_pred_bert,
    )
)

In [None]:
# Confusion matrix of gold vs. predicted Food labels on the test split.
from sklearn.metrics import confusion_matrix

print(
    confusion_matrix(
        y_true=y_Food_test_bert,
        y_pred=y_Food_pred_bert,
    )
)

In [None]:
# Display the test row at positional index 53; its review text is explained in the next cell.
data_test.iloc[53]

In [None]:
# Explain the Food predictor's output for the review text of test row 53.
Food_predictor_bert.explain(data_test.iloc[53]['review'])

In [None]:
# Ad-hoc test inputs: reviews taken from Dianping (three are defined in this cell).
test1 = '很好吃，环境好，所有员工的态度都很好，上菜快，服务也很好，味道好吃，都是用蒸馏水煮的，推荐，超好吃' # 5-star positive review
test2 = '糯米外皮不绵滑，豆沙馅粗躁，没有香甜味。12元一碗不值。' # 1-star negative review
# 4 stars: a "troublemaker" sample -- a long review mixing praise and complaints
test3 = '昨儿晚上来凯德1818的绿茶吃饭，点了以下10个菜，这里面有好几个都是推荐菜品，咱挨个说啊～  第一张 “客家茄子煲”：一个字：咸、俩字：很咸、仨字：非常咸、四个字：咸（hou)死我了... ... 我很怀疑是不是师傅在放酱油的时候手抖了 然后整锅煲就是一锅酱色。里面的咸鱼粒很显然没有经过任何处理（泡一下水去掉部分盐分等），再加之菜本身很咸根本无法入口，整锅煲基本没动；第二张 “鱼头诱惑 ”：这道菜是翔哥点的 还不错 首先鱼头很新鲜 其次鱼头蒸的火候刚好 既入味还不老，建议除了小米辣再加上点泡椒，这样无论是颜色和口味上都会更棒的！这道菜基本消灭； 第三张：“小锅土豆”：厚片的土豆挂满酱汁，炉子在下面慢慢加热，后来加一片放到嘴里，几乎是入口即化、口感绵软，也是一道不错的下饭菜；第四张：“农家小菜”其实就是少了橄榄菜和肉末的豆角粒，味道还可以，如果把豆角再煸的干一点（表皮起皱）口感会更好；第五张“菜心金钩豆腐” 和没上图的“老乡浓汤”一起说了，两道分不清是汤、羹还是菜的东东，都是推荐菜 分不清也无所谓，重点是上菜的时候都不是热的 都是温的 “菜心”里面有蛋黄、“浓汤”里面有猪肚，这两种食材变冷后都会有腥味且口感不好，加之一碗有淀粉类的羹 温温的很难喝，建议出品后马上传菜到客人面前，口味上冷了就都不美味了；第六张：“麻酱油麦菜”很多地方也叫“麻酱凤尾” 麻酱用的像是麻酱和花生酱混合的甜口的酱料，重点是没有稀释！整个一坨粘在油麦菜上 根本拌不开 相比甜口的麻酱我更喜欢热干面那种咸口的、稀点儿的；第七张“面包诱惑”：这道是敬菜，口味还不错，冰淇淋也很好吃 就是卖相太一般了 既然要推广 至少要给点儿装饰，不用太复杂 拉个巧克力线条、给个蛋卷、水果粒稍稍点缀一下就会好看很多 面包本身口感和冰淇淋都很好吃 比很多外面装饰的那种美美的好吃多了 就差了一点装饰；第八张“绿茶烤鸡”很多人点这道菜，相比前几道的重口，烤鸡显得淡了一些。鸡皮的保护基本没有 所以整个鸡肉又干有柴、也挂不住调味料、下次有机会点个整只的或许会好一点；最后还有未上图的绿茶饼和糖醋里脊，绿茶饼还不错 可以一试 只要火候控制的好不炸过基本没问题，里面糯糯的馅儿很好吃；“糖醋里脊” 这是我吃过最难吃的版本，没有之一！上次有一小哥吐了一通槽 我还想就一个糖醋里脊也不是啥难菜 不至于吧 这次就抱着猎奇的心里点了一个 菜刚上来我就后悔了---老抽色（shai）的、外面一圈淀粉（粉面子）基本没吃着肉，糊哒哒粘成一坨 看起来和那个茄子煲没什么太大区别。建议找一个做糖醋里脊的店吃一次 看一看 就上个浆 过个油 浇个汁儿的事情 没那么难  肉类的菜品价格可以适当调高一些 我宁可多花点儿钱吃肉也不想吃这一坨坨粉面子。 总体而言环境还不错 价格很平民 服务中规中矩 但是菜品质量很是有待提高。看墙上的照片（不知道老板是不是做青旅起家的）菜品不会因为价格低而有销路 只会因为质量好才畅销（相比低价 现在的人们更喜欢质量好而有保障的食品 价格好一点反而是保证） 希望越做越好。地址在凯德1818  5楼手扶梯旁。'

In [None]:
# Explain the Food prediction for test1 (the 5-star positive sample).
Food_predictor_bert.explain(test1)

In [None]:
# Explain the Food prediction for test2 (the 1-star negative sample).
Food_predictor_bert.explain(test2)

In [None]:
# Explain the Food prediction for test3 (the long 4-star sample).
Food_predictor_bert.explain(test3)