In [14]:
import pandas as pd
pd_corpus = pd.read_csv('ChnSentiCorp_htl_all.csv')
pd_corpus.head()

Unnamed: 0,label,review
0,1,"距离川沙公路较近,但是公交指示不对,如果是""蔡陆线""的话,会非常麻烦.建议用别的路线.房间较..."
1,1,商务大床房，房间很大，床有2M宽，整体感觉经济实惠不错!
2,1,早餐太差，无论去多少人，那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。
3,1,宾馆在小街道上，不大好找，但还好北京热心同胞很多~宾馆设施跟介绍的差不多，房间很小，确实挺小...
4,1,"CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风"


In [5]:
pd_positive = pd_corpus[pd_corpus['label']==1]
pd_negative = pd_corpus[pd_corpus['label']==0]
print(f'Total:{len(pd_corpus)}, Postive: {len(pd_positive)}, Negative: {len(pd_negative)}')

Total:7766, Postive: 5322, Negative: 2444


In [15]:
pd_corpus.dropna(inplace = True)

## Create Jieba Cutting Class

In [7]:
import jieba

In [28]:
class JiebaCuttingClass(object):
  def __init__(self,key_to_cut:str, dic:str=None, userdict:str=None):
    if dic is not None:
      jieba.set_dictionary(dic)
    if userdict is not None:
      jieba.load_userdict(userdict)
    self.key_to_cut = key_to_cut
    jieba.enable_paddle()
  
  @staticmethod
  def cut_single_sentence(sentence, use_paddle = False, use_full = False, use_search = False):
    if use_search:
      out = jieba.cut_for_search(sentence)
    else:
      out = jieba.cut(sentence,use_paddle=use_paddle, cut_all = use_full)
    return list(out)
    
  def cut_corpus(self, corpus: pd.DataFrame, mode: str) -> pd.DataFrame:
    if mode not in ['paddle','full','precise','search']:
      raise TypeError(f'only support `paddle`,`full`,`precise`, and `search` mode, but get {mode}')

    if mode == 'paddle':
      out = self._paddle_cut(corpus)
    elif mode == 'full':
      out = self._full_cut(corpus)   
    elif mode == 'precise':
      out = self._precise_cut(corpus)
    elif mode == 'search':
      out = self._search_cut(corpus)
    return out

  def _paddle_cut(self, corpus):
    jieba.enable_paddle()
    
    out = []
    for single_review in corpus[self.key_to_cut]:
      out.append([word for word in JiebaCuttingClass.cut_single_sentence(single_review,use_paddle=True)])
    
    corpus['cut'] = out 
    return corpus

  def _full_cut(self, corpus):
    
    out = []
    for single_review in corpus[self.key_to_cut]:
      out.append([word for word in JiebaCuttingClass.cut_single_sentence(single_review,use_full=True)])
    
    corpus['cut'] = out 
    return corpus

  def _precise_cut(self, corpus):
    
    out = []
    for single_review in corpus[self.key_to_cut]:
      out.append([word for word in JiebaCuttingClass.cut_single_sentence(single_review)])
    
    corpus['cut'] = out 
    return corpus

  def _search_cut(self, corpus):
    
    out = []
    for single_review in corpus[self.key_to_cut]:
      out.append([word for word in JiebaCuttingClass.cut_single_sentence(single_review,use_search=True)])
    
    corpus['cut'] = out 
    return corpus
  

In [29]:
jieba_cut = JiebaCuttingClass(key_to_cut = 'review')
pd_cut = jieba_cut.cut_corpus(pd_corpus.loc[:50,:],mode = 'precise')
pd_cut.head()

Paddle enabled successfully......
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,label,review,cut
0,1,"距离川沙公路较近,但是公交指示不对,如果是""蔡陆线""的话,会非常麻烦.建议用别的路线.房间较...","[距离, 川沙, 公路, 较近, ,, 但是, 公交, 指示, 不, 对, ,, 如果, 是..."
1,1,商务大床房，房间很大，床有2M宽，整体感觉经济实惠不错!,"[商务, 大床, 房, ，, 房间, 很大, ，, 床有, 2M, 宽, ，, 整体, 感觉..."
2,1,早餐太差，无论去多少人，那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。,"[早餐, 太差, ，, 无论, 去, 多少, 人, ，, 那边, 也, 不加, 食品, 的,..."
3,1,宾馆在小街道上，不大好找，但还好北京热心同胞很多~宾馆设施跟介绍的差不多，房间很小，确实挺小...,"[宾馆, 在, 小, 街道, 上, ，, 不大好, 找, ，, 但, 还好, 北京, 热心,..."
4,1,"CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风","[CBD, 中心, ,, 周围, 没什么, 店铺, ,, 说, 5, 星, 有点, 勉强, ..."


## Use Dict

In [30]:
test_string = '我愛cupoy自然語言處理馬拉松課程'
jieba_cut = JiebaCuttingClass(key_to_cut='', dic='dict.txt.big')


out_string = jieba_cut.cut_single_sentence(test_string,use_paddle=True)
print(f'Paddle模式: {[string for string in out_string]}')

out_string = jieba_cut.cut_single_sentence(test_string,use_full=True)
print(f'全模式: {[string for string in out_string]}')

out_string = jieba_cut.cut_single_sentence(test_string,use_search=True)
print(f'搜尋模式: {[string for string in out_string]}')

out_string = jieba_cut.cut_single_sentence(test_string)
print(f'精確模式: {[string for string in out_string]}')


Paddle enabled successfully......
Building prefix dict from /content/dict.txt.big ...
Loading model from cache /tmp/jieba.u501edca284da514cb68b53a20324f4e3.cache


Paddle模式: ['我', '愛', 'cupoy', '自然', '語言', '處理', '馬拉松', '課程']


Loading model cost 1.519 seconds.
Prefix dict has been built successfully.


全模式: ['我', '愛', 'cupoy', '自然', '自然語言', '語言', '處理', '馬拉', '馬拉松', '課程']
搜尋模式: ['我', '愛', 'cupoy', '自然', '語言', '自然語言', '處理', '馬拉', '馬拉松', '課程']
精確模式: ['我', '愛', 'cupoy', '自然語言', '處理', '馬拉松', '課程']
