In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.neural_network import MLPClassifier


### 数据预处理

In [6]:
df = pd.read_excel('data_processed.xlsx')
drop_cols = ['发帖吧','情绪','多空','发帖id','发帖内容','查询时间','发帖时间']
def preprocess(df, drop_cols):
    word_number = []
    for i in df['发帖内容']:
        word_number.append(len(str(i)))
    df['发帖字数'] = word_number
    df.drop(columns=drop_cols,inplace=True)
    df = df.replace('NAN',0)
    return df

df = preprocess(df, drop_cols)
df = df[df['身份']!=0]
print(df.isnull().sum())
df.info()

身份            0
发帖人           0
是否为主贴         0
是否有图片         0
阅读量           0
评论量           0
点赞量           0
发帖人发帖数        0
发帖人评论数        0
发帖人关注数        0
发帖人粉丝数        0
发帖人影响力        0
发帖人吧龄         0
发帖人总访问量       0
发帖人今日访问量      0
Time_delta    0
近n日发帖数        0
发帖字数          0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2244 entries, 0 to 2329
Data columns (total 18 columns):
身份            2244 non-null int64
发帖人           2244 non-null object
是否为主贴         2244 non-null int64
是否有图片         2244 non-null int64
阅读量           2244 non-null int64
评论量           2244 non-null int64
点赞量           2244 non-null int64
发帖人发帖数        2244 non-null int64
发帖人评论数        2244 non-null int64
发帖人关注数        2244 non-null int64
发帖人粉丝数        2244 non-null int64
发帖人影响力        2244 non-null int64
发帖人吧龄         2244 non-null float64
发帖人总访问量       2244 non-null int64
发帖人今日访问量      2244 non-null int64
Time_delta    2244 non-null float64
近n日发帖数        2244 non-null int64
发帖字数     

In [7]:
# LabelEncoder
encoded_df = df.drop(columns='身份').apply(LabelEncoder().fit_transform)
encoded_df.head()


Unnamed: 0,发帖人,是否为主贴,是否有图片,阅读量,评论量,点赞量,发帖人发帖数,发帖人评论数,发帖人关注数,发帖人粉丝数,发帖人影响力,发帖人吧龄,发帖人总访问量,发帖人今日访问量,Time_delta,近n日发帖数,发帖字数
0,526,1,1,92,2,0,21,48,0,0,2,51,424,35,31,12,90
1,526,1,0,0,0,0,21,48,0,0,2,51,424,35,31,12,32
2,526,1,0,0,0,0,21,48,0,0,2,51,424,35,31,12,27
3,284,1,1,142,0,0,383,393,66,100,6,76,935,31,32,7,29
4,260,1,0,205,1,0,386,353,49,141,7,97,1086,2,34,4,170


In [4]:
# Onehot
cols = ['发帖人']
onehot_df = pd.get_dummies(df, columns=cols)
onehot_df.drop(columns=['身份'],inplace=True)
onehot_df.head()

Unnamed: 0,是否为主贴,是否有图片,阅读量,评论量,点赞量,发帖人发帖数,发帖人评论数,发帖人关注数,发帖人粉丝数,发帖人影响力,...,发帖人_骑熊红牛仔,发帖人_鬼手王承,发帖人_鲲鹏雕虫,发帖人_鹤Au99818,发帖人_麦道1688,发帖人_黄金期货原油分析,发帖人_黑嘴大阴棒特使,发帖人_黑芝麻小散,发帖人_黑金策略,发帖人_龙游城空
0,1,1,382,2,0,21,49,0,0,2,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,21,49,0,0,2,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,21,49,0,0,2,...,0,0,0,0,0,0,0,0,0,0
3,1,1,464,0,0,1211,4364,111,176,6,...,0,0,0,0,0,0,0,0,0,0
4,1,0,582,1,0,1222,2019,60,936,7,...,0,0,0,0,0,0,0,0,0,0


### CART决策树预测模型

In [11]:
X_le = encoded_df.values
Y_le = df['身份']
X_train_le, X_test_le, y_train_le, y_test_le = train_test_split(X_le, Y_le, test_size=0.3, random_state=88)
X_oh = onehot_df.values
Y_oh = df['身份']
X_train_oh, X_test_oh, y_train_oh, y_test_oh = train_test_split(X_oh, Y_oh, test_size=0.3, random_state=88)

DTC_oh = tree.DecisionTreeClassifier(max_depth=20,min_samples_leaf=10)
DTC_oh.fit(X_train_oh,y_train_oh.ravel())
DTC_oh_pred = DTC_oh.predict(X_test_oh)
print('The accuracy of OneHot DecisionTree is', metrics.accuracy_score(DTC_oh_pred, y_test_oh))
print('特征重要性排序前五：',onehot_df.columns[DTC_oh.feature_importances_.argsort()[::-1][0:5]])
#print(DTC_oh.feature_importances_)

DTC_le = tree.DecisionTreeClassifier(max_depth=20,min_samples_leaf=10,max_features=16,max_leaf_nodes=10)
DTC_le.fit(X_train_le,y_train_le.ravel())
DTC_le_pred = DTC_le.predict(X_test_le)
print('The accuracy of LabelEncoder DecisionTree is', metrics.accuracy_score(DTC_le_pred, y_test_le))
print('特征重要性排序前五：',encoded_df.columns[DTC_le.feature_importances_.argsort()[::-1][0:5]])
#print(DTC_le.feature_importances_)

The accuracy of OneHot DecisionTree is 0.9673590504451038
特征重要性排序前五： Index(['发帖字数', '发帖人粉丝数', '阅读量', '发帖人_李李杰杰', '发帖人总访问量'], dtype='object')
The accuracy of LabelEncoder DecisionTree is 0.9614243323442137
特征重要性排序前五： Index(['发帖字数', '发帖人粉丝数', '阅读量', '发帖人总访问量', '发帖人今日访问量'], dtype='object')


### 随机森林预测结果

In [8]:
encoded_user = df[['发帖人']].apply(LabelEncoder().fit_transform)
df['发帖人'] = encoded_user
X = np.array(df.drop(columns='身份'))
y = np.array(df['身份'])
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(X, y, test_size=0.2, random_state=7)

rfc = RandomForestClassifier(n_estimators=20,max_depth=15,max_leaf_nodes=20)
rfc.fit(X_train_rf,y_train_rf.ravel())
rfc_pred = rfc.predict(X_test_rf)
print('The accuracy of RandomForest is', metrics.accuracy_score(rfc_pred, y_test_rf))
print('特征重要性排序前五：',encoded_df.columns[rfc.feature_importances_.argsort()[::-1][0:5]])
print('Feature Importance:')
dict(zip(encoded_df.columns,rfc.feature_importances_))

The accuracy of RandomForest is 0.9799554565701559
特征重要性排序前五： Index(['发帖字数', '发帖人评论数', '发帖人粉丝数', '阅读量', 'Time_delta'], dtype='object')
Feature Importance:


{'发帖人': 0.02930349534650372,
 '是否为主贴': 0.005878783450528902,
 '是否有图片': 0.010608298256623266,
 '阅读量': 0.10027191381243002,
 '评论量': 0.0157100772818809,
 '点赞量': 0.004194247441092892,
 '发帖人发帖数': 0.02472486861900697,
 '发帖人评论数': 0.14406526522975666,
 '发帖人关注数': 0.012887545077356904,
 '发帖人粉丝数': 0.12942690065884443,
 '发帖人影响力': 0.018103817289718306,
 '发帖人吧龄': 0.038737902184375596,
 '发帖人总访问量': 0.05629639458967767,
 '发帖人今日访问量': 0.0429615907964332,
 'Time_delta': 0.07041855020443755,
 '近n日发帖数': 0.029051796511610283,
 '发帖字数': 0.26735855324972274}

### 神经网络预测效果

In [214]:
X_train_mlp, X_test_mlp, y_train_mlp, y_test_mlp = train_test_split(X, y, test_size=0.2, random_state=7)

mlp = MLPClassifier(hidden_layer_sizes=(100,20),alpha=0.0001,max_iter=600,solver='lbfgs') 
mlp.fit(X_train_mlp,y_train_mlp.ravel())
mlp_pred = mlp.predict(X_test_mlp)
print('The loss on training set is',mlp.loss_)
print('The accuracy of Multi-layer Perceptron Classifier is', metrics.accuracy_score(mlp_pred, y_test_mlp))

matrix_train = metrics.confusion_matrix(y_train_mlp,mlp.predict(X_train_mlp))
print('Confusion matrix on training set：\n',matrix_train)
report_train = metrics.classification_report(y_train_mlp,mlp.predict(X_train_mlp))
print('Classification on training set：\n',report_train)


The loss on training set is 0.2660709443164722
The accuracy of Multi-layer Perceptron Classifier is 0.9192139737991266
Confusion matrix on training set：
 [[1522  119]
 [  35  152]]
Classification on training set：
               precision    recall  f1-score   support

           1       0.98      0.93      0.95      1641
           2       0.56      0.81      0.66       187

    accuracy                           0.92      1828
   macro avg       0.77      0.87      0.81      1828
weighted avg       0.93      0.92      0.92      1828



### CrossValidation参数调优并使用调优后的参数再次进行上述预测

In [209]:
from sklearn.model_selection import GridSearchCV
param_grid = {'n_estimators': [10,15,20],
              'max_depth': [10,15,20],
              'max_leaf_nodes': [20,30,40]}
rfc = RandomForestClassifier()
rfc_cv = GridSearchCV(rfc, param_grid=param_grid, cv = 10)
rfc_cv.fit(X, y.ravel())
print('Best hyper-parameter of RandomForrest model:',rfc_cv.best_params_)
print(rfc_cv.best_score_)

param_grid = {'alpha': [0.01,0.001,0.0001],
              'hidden_layer_sizes': [(100,),(100,20),(50,10),(30,10)]}
mlp = MLPClassifier(max_iter = 1000)
mlp_cv = GridSearchCV(mlp, param_grid=param_grid, cv = 3)
mlp_cv.fit(X, y.ravel())
print('Best hyper-parameter of MLP model:',mlp_cv.best_params_)
print(mlp_cv.best_score_)

param_grid = {'max_features': [10,15,16],
              'max_depth': [15,20,25,30],
              'max_leaf_nodes': [5,10,15,20,30,40]}
dtc = tree.DecisionTreeClassifier()
dtc_cv = GridSearchCV(dtc, param_grid=param_grid, cv = 10)
dtc_cv.fit(X_le, Y_le.ravel())
print('Best hyper-parameter of DecisionTree on LabelEncoder model:',dtc_cv.best_params_)
print(dtc_cv.best_score_)
dtc_cv.fit(X_oh, Y_oh.ravel())
print('Best hyper-parameter of DecisionTree on OneHot model:',dtc_cv.best_params_)
print(dtc_cv.best_score_)

Best hyper-parameter of RandomForrest model: {'max_depth': 15, 'max_leaf_nodes': 20, 'n_estimators': 20}
0.9755030621172354
Best hyper-parameter of MLP model: {'alpha': 0.0001, 'hidden_layer_sizes': (100, 20)}
0.9396325459317585
Best hyper-parameter of DecisionTree on LabelEncoder model: {'max_depth': 20, 'max_features': 16, 'max_leaf_nodes': 10}
0.9671916010498688
Best hyper-parameter of DecisionTree on OneHot model: {'max_depth': 25, 'max_features': 15, 'max_leaf_nodes': 40}
0.9431321084864392




In [251]:
scores = [DTC_le.score(X_test_le,y_test_le),
          DTC_oh.score(X_test_oh,y_test_oh), 
          rfc.score(X_test_rf,y_test_rf), 
          mlp.score(X_test_mlp,y_test_mlp)]
score_df = pd.DataFrame({
    'Model' : ['Decision Tree on LabelEncoder','Decision Tree on OneHot','Random Forest','MLP'],
    'Scores' : scores
})
score_df.set_index('Model')
score_df

Unnamed: 0,Model,Scores
0,Decision Tree on LabelEncoder,0.963557
1,Decision Tree on OneHot,0.962099
2,Random Forest,0.978166
3,MLP,0.919214


In [15]:
# 将评论内容写成分开的文本进行人工情绪分类
reviews = pd.read_excel('data_processed.xlsx')[['label','发帖内容','情绪','多空']]
reviews = reviews[reviews['label']==1].reset_index().drop(columns=['index','label'])[['发帖内容','情绪','多空']]

# 情绪文本分类
for i in range(-1,2):
    reviews_i = reviews[reviews['情绪']==i] 
    if i == 1:
        for line in range(len(reviews_i)):
            f = open('train/pos/{}_1.txt'.format(line),'w+') # labeled comments
            f.write(reviews_i[i])
            f.close()
    elif i == 0:
        for line in range(len(reviews_i)):
            f = open('train/neu/{}_0.txt'.format(line),'w+') # labeled comments
            f.write(reviews_i[i])
            f.close()      
    else:
        for line in range(len(reviews_i)):
            f = open('train/neg/{}_-1.txt'.format(line),'w+') # labeled comments
            f.write(reviews_i[i])
            f.close()
# 多空分类
for i in range(-1,3):
    reviews_i = reviews[reviews['多空']==i] 
    if i == 1:
        for line in range(len(reviews_i)):
            f = open('train/pos/{}_1.txt'.format(line),'w+') # labeled comments
            f.write(reviews_i[i])
            f.close()
    elif i == 0:
        for line in range(len(reviews_i)):
            f = open('train/null/{}_0.txt'.format(line),'w+') # labeled comments
            f.write(reviews_i[i])
            f.close()      
    elif: i == -1
        for line in range(len(reviews_i)):
            f = open('train/neg/{}_-1.txt'.format(line),'w+') # labeled comments
            f.write(reviews_i[i])
            f.close()
    else:
        for line in range(len(reviews_i)):
            f = open('train/Undefined/{}_2.txt'.format(line),'w+') # labeled comments
            f.write(reviews_i[i])
            f.close()

In [9]:
test = pd.read_excel('test.xlsx')
drop_cols = ['发帖吧','发帖id','查询时间']
test = preprocess(test, drop_cols)

encoded_test = test[['发帖人']].apply(LabelEncoder().fit_transform)
test['发帖人'] = encoded_test
X_test = np.array(test.drop(columns=['发帖内容','发帖时间']))

y_pred = rfc.predict(X_test)
test['身份'] = y_pred
test.to_excel('identity_labeled.xlsx')