# BOW & tf-idf

In [7]:
def stop_words(data, path='data/', file='stopwords.txt'):
    """
    input:
        data: pd.Series with list element
        path: str, path of stopwords file
        file: str, name of stopwords file

    output:
        data: pd.Series
    """
    s = ''
    with open(path+file, 'r', encoding='utf8') as r:
        for i in r.readlines():
            s += i.strip()
    data = data.map(lambda x:[i for i in x if(i not in s) and (len(i) > 1)])
    return data

from sklearn.feature_extraction.text import CountVectorizer
def bow(X_train, X_test, max_features=None, sparse=True):
    """
    input:
        X_train: pd.Series with list element
        X_test: pd.Series with list element
        max_features: int, default None
    
    output:
        X_train: csr_matrix or ndarray
        X_test: csr_matrix or ndarray
        feature_name: list
    """
    print('BOWing...')
    # 将 pd.Series 转换为 list
    # 每个句子为一个 str
    # 每个单词由空格分开
    X_train = X_train.map(lambda line:' '.join(line))    
    X_test = X_test.map(lambda line:' '.join(line))  
    
    BOW = CountVectorizer(max_features=max_features)
    X_train = BOW.fit_transform(X_train)          
    X_test = BOW.transform(X_test)
    feature_name = BOW.get_feature_names()
    
    if sparse==True:
        print('X is a Sparse Matrix')
    else:
        print('X is a dense Matrix')
        # 将稀疏矩阵转换为 ndarray
        X_train = X_train.A    
        X_test = X_test.A    
    return X_train, X_test, feature_name


from sklearn.feature_extraction.text import TfidfTransformer
def tf_idf(X_train, X_test, max_features=None, sparse=True):
    """
    input:
        X_train: pd.Series with list element
        X_test: pd.Series with list element
        max_features: int, default None
        sparse: bool, return sparse matrix, default True
    
    output:
        X_train: csr_matrix or ndarray
        X_test: csr_matrix or ndarray
        feature_name: list
    """
    X_train, X_test, feature_name = bow(X_train, X_test, max_features)
    print('tf_idfing...')
    TF_idf = TfidfTransformer()
    X_train = TF_idf.fit_transform(X_train)     # 返回的是稀疏矩阵
    X_test = TF_idf.transform(X_test)
    if sparse==True:
        print('X is a Sparse Matrix')
    else:
        print('X is a dense Matrix')
        # 将稀疏矩阵转换为 ndarray
        X_train = X_train.A    
        X_test = X_test.A    
    return X_train, X_test, feature_name

def print_info(X_train, X_test, Y_train, Y_test):
    print('\nX_train shape:', X_train.shape, 'type:', type(X_train))
    print('X_test shape:', X_test.shape, 'type:', type(X_test))
    print('Y_train shape:', Y_train.shape, 'type:', type(Y_train))
    print('Y_test shape:', Y_test.shape, 'type:', type(Y_test))
    

import jieba
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
def load_data(path='data/', file='广东(数据清洗4).xlsx', 
              vectorize='tf_idf', drop_stop_words=True, 
              max_features=None, sparse=True):
    """
    input:
        path: str, path of file, default 'data/'
        file: str, name of file, default '广东(数据清洗4).xlsx'
        vectorize: str, ['bow','tf_idf'], default 'tf_idf'
        drop_stop_words: bool, default True
        max_features: int, max_features of X, default None
        sparse: bool, whether 'tf_idf' return sparse matrix, default True
            
    output:
        X_train: csr_matrix or ndarray
        X_test: csr_matrix or ndarray
        Y_train: pd.Series
        Y_test: pd.Series
        feature_name: list
    """
    # 读取数据
    print('Reading data...')
    data = pd.read_excel(path+file, index_col=0)
    data.drop(columns=['景点','昵称','等级','时间'], inplace=True)
    data.columns = ['Corpus', 'Sentiment']
    
    Y = data['Sentiment']
    Y = Y.map({0:0, 1:0, 2:0, 3:0, 4:1, 5:1})       # 将评分转换为情感倾向
    X = data['Corpus']
    
    # 分词
    print('Splitting...')
    X = X.map(lambda x:jieba.lcut(x))    
    
    # 去除停用词
    if(drop_stop_words):
        print('Dropping Stop Words...')
        X = stop_words(X, path, file='stopwords.txt') 
           
    # 划分训练集和测试集
    X_train, X_test, Y_train, Y_test = \
    train_test_split(X, Y, test_size=0.2, random_state=0)
    
    # 向量化
    if(vectorize=='bow'):                           # Bag of Word
        X_train, X_test, feature_name = \
        bow(X_train, X_test, max_features=max_features, sparse=sparse)
    elif(vectorize=='tf_idf'):                      # tf-idf
        X_train, X_test, feature_name = \
        tf_idf(X_train, X_test, max_features=max_features, sparse=sparse)
    elif(vectorize==None):
        print_info(X_train, X_test, Y_train, Y_test)
        return X_train, X_test, Y_train, Y_test
        
    print_info(X_train, X_test, Y_train, Y_test)
    return X_train, X_test, Y_train, Y_test, feature_name

In [8]:
X_train, X_test, Y_train, Y_test, feature_name = load_data()

Reading data...
Splitting...
Droping Stop Words...
BOWing...
X is a Sparse Matrix
tf_idfing...
X is a Sparse Matrix

X_train shape: (31920, 39716) type: <class 'scipy.sparse.csr.csr_matrix'>
X_test shape: (7981, 39716) type: <class 'scipy.sparse.csr.csr_matrix'>
Y_train shape: (31920,) type: <class 'pandas.core.series.Series'>
Y_test shape: (7981,) type: <class 'pandas.core.series.Series'>


# Word2vec

In [132]:
import gensim
import warnings
warnings.filterwarnings("ignore")
from tqdm.notebook import tqdm as tqdm
def w2v_transformer(X, model, size):
    '''
    input:
        X: pd.DataFrame
        model: word2vec model
        size: int, size of X in word2vec model
        
    output:
        X_vec: np.ndarray, shape: (num of sample, num of features)
    '''
    print('Transforming...')
    X_vec = np.zeros((size,))
    for corpus in tqdm(X):
        vec = np.zeros((size,))
        length = 0
        for word in corpus:
            try:
                vec += model[word]
                length += 1
            except KeyError:
                continue
        length = length if length!=0 else 1
        vec = vec/length
        X_vec = np.vstack([X_vec, vec])
    X_vec = np.delete(X_vec, 0, axis=0)
    return X_vec


import os
def save_data(X_train, X_test, Y_train, Y_test, 
              path='data/word2vec/', file='New_data'):
    '''
    input:
        X_train: ndarray
        X_test: ndarray
        Y_train: ndarray
        Y_test: ndarray
    '''
    print('Saving data...')
    if(os.path.exists(path+file+'.npz')):
        for i in range(1,10):
            if(not(os.path.exists(path+file+str(i)+'.npz'))):
                file = file+'_'+str(i)+'.npz'
                break
    else:
        file = file+'.npz'
        
    np.savez(path+file, 
             X_train=X_train, Y_train=Y_train,
             X_test=X_test, Y_test=Y_test) 
    print('Data saved in :', path+file)


def Word2vec(path='data/', file='广东(数据清洗4).xlsx', 
             vectorize='CBOW', size=100,
             drop_stop_words=False, save_file=True):
    """
    input:
        path: str, path of file, default 'data/'
        file: str, name of file, default '广东(数据清洗4).xlsx'
        vectorize: str, ['CBOW', 'Skip_Gram'], default 'CBOW'
        drop_stop_words: bool, default False
        size: int, max_features of X, default 100
        save_file： bool defalut True
            
    output:
        X_train: ndarray
        X_test: ndarray
        Y_train: ndarray
        Y_test: ndarray
    """
    # 读取数据
    print('Reading data...')
    data = pd.read_excel(path+file, index_col=0)
    data.drop(columns=['景点','昵称','等级','时间'], inplace=True)
    data.columns = ['Corpus', 'Sentiment']
    
    Y = data['Sentiment']
    Y = Y.map({0:0, 1:0, 2:0, 3:0, 4:1, 5:1}).values   # 将评分转换为情感倾向
    X = data['Corpus']
    
    # 分词
    print('Splitting...')
    X = X.map(lambda x:jieba.lcut(x))    
    
    # 去除停用词
    if(drop_stop_words):
        print('Dropping Stop Words...')
        X = stop_words(X, path, file='stopwords.txt') 
        
    # word2vec
    if(vectorize=='CBOW'):
        sg=0
    elif(vectorize=='Skip_Gram'):
        sg=1
    else:
        print('vectorize type error')
        return None
    sentences = X.tolist()
    model = gensim.models.Word2Vec(
        sentences,           # 语料
        size=size,           # 词向量大小
        sg=sg,               # 模型的训练算法: 1: skip-gram; 0: CBOW
        window=5,            # 句子中当前单词和被预测单词的最大距离
        hs=0,                # 1: 采用hierarchical softmax训练模型; 0: 使用负采样
        negative=5,          # 使用负采样，设置多个负采样(通常在5-20之间)
        ns_exponent=0.75,    # 负采样分布指数。1.0样本值与频率成正比，0.0样本所有单词均等，负值更多地采样低频词。
        min_count=5,         # 忽略词频小于此值的单词
        alpha=0.025,         # 初始学习率
        min_alpha=0.0001,    # 随着训练的进行，学习率线性下降到min_alpha
        sample=0.001,        # 高频词随机下采样的配置阈值
        cbow_mean=1,         # 0: 使用上下文单词向量的总和; 1: 使用均值，适用于使用CBOW。
        seed=1,              # 随机种子
        workers=4            # 线程数
    )
    
    # 将 X 转换为向量形式
    X_vec = w2v_transformer(X, model, size)
    
    # 划分训练集和测试集
    X_train, X_test, Y_train, Y_test = \
    train_test_split(X_vec, Y, test_size=0.2, random_state=0)
    
    print_info(X_train, X_test, Y_train, Y_test)
    
    if save_file:
        save_data(X_train, X_test, Y_train, Y_test, 
                  file=vectorize+'_'+str(size))
    
    return X_train, X_test, Y_train, Y_test


def load_data_w2v(vectorize, size, path='data/word2vec/'):
    '''
    input:
        vectorize: str, ['CBOW', 'Skip_Gram']
        size: int, max_features of X
    
    output:
        X_train: ndarray
        X_test: ndarray
        Y_train: ndarray
        Y_test: ndarray
    '''
    print('Loading '+str(vectorize)+'_'+str(size)+'.npz')
    data = np.load(path+str(vectorize)+'_'+str(size)+'.npz')
    X_train = data['X_train']
    X_test = data['X_test']
    Y_train = data['Y_train']
    Y_test = data['Y_test']
    print_info(X_train, X_test, Y_train, Y_test)
    return X_train, X_test, Y_train, Y_test

训练词向量

In [125]:
vectorize = 'CBOW'
size = 200
X_train, X_test, Y_train, Y_test = Word2vec(vectorize=vectorize, size=size)

Reading data...
Splitting...
Transforming...


HBox(children=(FloatProgress(value=0.0, max=39901.0), HTML(value='')))



X_train shape: (31920, 100) type: <class 'numpy.ndarray'>
X_test shape: (7981, 100) type: <class 'numpy.ndarray'>
Y_train shape: (31920,) type: <class 'pandas.core.series.Series'>
Y_test shape: (7981,) type: <class 'pandas.core.series.Series'>
Saving data...
Data saved in : data/word2vec/Skip_Gram_100.npz


加载数据

## Skip_Gram

In [151]:
vectorize = 'Skip_Gram'
size = 100
X_train, X_test, Y_train, Y_test = load_data_w2v(vectorize, size)

Loading Skip_Gram_100.npz

X_train shape: (31920, 100) type: <class 'numpy.ndarray'>
X_test shape: (7981, 100) type: <class 'numpy.ndarray'>
Y_train shape: (31920,) type: <class 'numpy.ndarray'>
Y_test shape: (7981,) type: <class 'numpy.ndarray'>


In [152]:
from sklearn import metrics
from sklearn.naive_bayes import BernoulliNB
# 建立朴素贝叶斯分类模型
model = BernoulliNB()
model.fit(X_train, Y_train)
# 进行模型训练和预测
Y_predict = model.predict(X_test)
# 打印混淆矩阵和分类报告
print('classification_report on test set:\n', 
      metrics.classification_report(Y_test, Y_predict))

print('classification_report on train set:\n', 
      metrics.classification_report(Y_train, model.predict(X_train)))

classification_report on test set:
               precision    recall  f1-score   support

           0       0.61      0.70      0.65      4063
           1       0.63      0.54      0.58      3918

    accuracy                           0.62      7981
   macro avg       0.62      0.62      0.62      7981
weighted avg       0.62      0.62      0.62      7981

classification_report on train set:
               precision    recall  f1-score   support

           0       0.61      0.69      0.65     16125
           1       0.64      0.56      0.59     15795

    accuracy                           0.62     31920
   macro avg       0.63      0.62      0.62     31920
weighted avg       0.63      0.62      0.62     31920



## CBOW

In [133]:
vectorize = 'CBOW'
size = 200
X_train, X_test, Y_train, Y_test = load_data_w2v(vectorize, size)

Loading CBOW_200.npz

X_train shape: (31920, 200) type: <class 'numpy.ndarray'>
X_test shape: (7981, 200) type: <class 'numpy.ndarray'>
Y_train shape: (31920,) type: <class 'numpy.ndarray'>
Y_test shape: (7981,) type: <class 'numpy.ndarray'>


In [134]:
from sklearn import metrics
from sklearn.naive_bayes import BernoulliNB
# 建立朴素贝叶斯分类模型
model = BernoulliNB()
model.fit(X_train, Y_train)
# 进行模型训练和预测
Y_predict = model.predict(X_test)
# 打印混淆矩阵和分类报告
print('classification_report on test set:\n', 
      metrics.classification_report(Y_test, Y_predict))

print('classification_report on train set:\n', 
      metrics.classification_report(Y_train, model.predict(X_train)))

classification_report on test set:
               precision    recall  f1-score   support

           0       0.60      0.71      0.65      4063
           1       0.63      0.52      0.57      3918

    accuracy                           0.62      7981
   macro avg       0.62      0.61      0.61      7981
weighted avg       0.62      0.62      0.61      7981

classification_report on train set:
               precision    recall  f1-score   support

           0       0.60      0.71      0.65     16125
           1       0.64      0.52      0.58     15795

    accuracy                           0.62     31920
   macro avg       0.62      0.62      0.61     31920
weighted avg       0.62      0.62      0.61     31920



# 模型比较

In [146]:
vectorize = 'CBOW'
size = 200
X_train, X_test, Y_train, Y_test = load_data_w2v(vectorize, size)

n_train = Y_train.shape[0]
n_test = Y_test.shape[0]
X = np.vstack([X_train, X_test])
Y = np.vstack([Y_train.reshape(n_train,1), 
               Y_test.reshape(n_test,1)])
data_df = pd.concat([pd.DataFrame(X), pd.Series(Y.ravel())], axis=1)
col_name = ['X'+str(i) for i in range(0, X.shape[1])]
col_name.append('Y')
data_df.columns = col_name
data_df.head()

Loading CBOW_200.npz

X_train shape: (31920, 200) type: <class 'numpy.ndarray'>
X_test shape: (7981, 200) type: <class 'numpy.ndarray'>
Y_train shape: (31920,) type: <class 'numpy.ndarray'>
Y_test shape: (7981,) type: <class 'numpy.ndarray'>


Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X191,X192,X193,X194,X195,X196,X197,X198,X199,Y
0,0.029393,0.177788,0.047445,-0.121467,-0.08532,0.324261,-0.01949,-0.189239,0.151248,-0.425302,...,-0.024507,-0.106273,0.105524,0.352464,-0.096433,0.207977,0.206285,0.185514,0.020179,1
1,-0.018025,0.011907,-0.004808,-0.079012,-0.115963,0.17388,-0.24647,-0.323955,-0.156629,-0.37339,...,0.220696,-0.305901,-0.140108,0.094671,-0.041857,0.427454,0.203194,0.133867,-0.120397,1
2,-0.064865,0.096922,0.001968,-0.298051,0.021166,0.178036,-0.106695,-0.394553,0.169719,-0.482603,...,-0.01368,-0.244887,0.079051,0.266706,0.047921,0.461391,0.36063,0.21055,-0.021371,0
3,-0.027654,-0.066606,0.028027,-0.120977,-0.168097,0.282191,-0.254473,-0.231427,-0.135479,-0.366067,...,0.108531,-0.34485,-0.141817,0.176757,-0.137319,0.449372,0.125032,0.176738,-0.116875,0
4,-0.256325,-0.175646,0.125794,-0.355323,-0.082088,0.336249,-0.310871,-0.108484,0.04,-0.272495,...,0.008906,-0.359041,-0.296457,0.070864,0.02292,0.637443,0.075743,0.298623,0.123394,1


In [147]:
# 初始化模型
from pycaret.classification import *
exp1 = setup(data_df, target='Y')

 
Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,3194
1,Target Type,Binary
2,Label Encoded,
3,Original Data,"(39901, 201)"
4,Missing Values,False
5,Numeric Features,200
6,Categorical Features,0
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


In [148]:
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa
0,Extra Trees Classifier,0.6883,0.7681,0.6812,0.6861,0.6834,0.3764
1,CatBoost Classifier,0.6883,0.7584,0.6975,0.6798,0.6882,0.3766
2,Light Gradient Boosting Machine,0.6851,0.7484,0.6906,0.678,0.6839,0.3701
3,Extreme Gradient Boosting,0.6632,0.7192,0.6841,0.6519,0.6675,0.3267
4,Ridge Classifier,0.6555,0.0,0.6841,0.6421,0.6623,0.3114
5,Linear Discriminant Analysis,0.6544,0.6992,0.6754,0.6428,0.6585,0.3092
6,Logistic Regression,0.6535,0.697,0.675,0.642,0.658,0.3074
7,Gradient Boosting Classifier,0.653,0.7201,0.6699,0.6433,0.6561,0.3062
8,Random Forest Classifier,0.6528,0.7247,0.5616,0.6805,0.6151,0.3042
9,Quadratic Discriminant Analysis,0.6516,0.7003,0.6815,0.6383,0.659,0.3036
