In [1]:
"""
Author: Runyao Yu
runyao.yu@tum.de
Research Internship in ETH Zurich
For Academic Use Purpose only
"""

# Basics + Viz
import pandas as pd

# Text pre-processing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
import paddlehub as hub

# Models
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Metrics
from sklearn.metrics import classification_report

import os
import re

In [4]:
topics_dict ={  
                "Appearance and figure": '外貌长相身材',
                "Daily Life": '日常生活',
                "Education and Careers": '教育和职业',
                "Expressing emotional opinions and sharing experiences": '表达情感见解看法，分享过往经历',
                "For more information": '想要获得更多信息',
                "Interaction": '和博主互动',
                "Location": '地理位置',
                "Personal Information": '个人信息',
                "Relationship": '情感恋爱',
                "Sex": '性',
                "Socialization": '社交朋友',
                "Unrelated": '与以上均无关',
                "Wanna know each other": '想要认识对方'}

In [5]:
def remove_punctuation(line):
    line = str(line)
    if line.strip()=='':
        return ''
    rule = re.compile(u"[^A-Z0-1\u4E00-\u9FA5\！\？\!\?]")
    line = rule.sub('',line)
    return line

In [6]:
def file_processing(file_path, file_name):
    df = pd.read_excel(file_path)
    topic_name = topics_dict[file_name.split(".")[0]]
    df["Topic"] = df[topic_name]
    
    # 处理格式/processing data
    for i in range(df["Topic"].shape[0]):
        cla = df[topic_name][i]
        if cla == 1:
            df["Topic"][i] = topic_name
        else:
            df["Topic"][i] = "无关"
            
    df = df.drop([topic_name], axis = 1)
    
    df['topic_id'] = df['Topic'].factorize()[0]
    topic_id_df = df[['Topic', 'topic_id']].drop_duplicates().sort_values('topic_id').reset_index(drop=True)
    topic_to_id = dict(topic_id_df.values)
    id_to_topic = dict(topic_id_df[['topic_id', 'Topic']].values)
    
    #处理数据/processing data
    df['clean_review'] = df['content'].apply(remove_punctuation)
    lac = hub.Module(name="lac")
    df['cut_review'] = df['clean_review'].apply(lambda x: " ".join([w for w in lac.lexical_analysis(texts=[x], use_gpu=False, batch_size=1)[0]["word"]]))
    
    return df

In [7]:
def training(df):
    train, test = train_test_split(df[['cut_review', 'topic_id']],
                                test_size=0.2,
                                random_state = 0)
    onevr = Pipeline(steps=[('tfidf',TfidfVectorizer(sublinear_tf=True, min_df=1,
                            ngram_range=(1, 2))),
                            ('clasfi', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=-1))])
    
    topic_id_df = df[['Topic', 'topic_id']].drop_duplicates().sort_values('topic_id').reset_index(drop=True)
    topic_to_id = dict(topic_id_df.values)
    id_to_topic = dict(topic_id_df[['topic_id', 'Topic']].values)
    
    
    for i in range(len(topic_to_id)):
        print(f'Processing **{id_to_topic[i]}** posts...')

        # Training logistic regression model on train data
        onevr.fit(train['cut_review'], train['topic_id'] == i)

        # calculating test accuracy
        prediction = onevr.predict(test['cut_review'])
    #     print(prediction)
        temp_y = test['topic_id'].values == i

        print(classification_report(temp_y, prediction))
    #     print(f'Test accuracy is {accuracy_score(temp_y, prediction)}')
        print("\n")

In [8]:
# 只需改变files_path，也就是存放每一类增补sh/only need to change the path where the augmented data stored
files_path = './增补的数据'
all_file = pd.DataFrame(os.walk(files_path))
files = all_file[2][0]
for file_name in files:
    lujin = files_path + os.sep + file_name
    df = file_processing(lujin, file_name) 
    training(df)
    print("=" * 100)



Processing **外貌长相身材** posts...
              precision    recall  f1-score   support

       False       0.83      0.99      0.90       268
        True       0.89      0.23      0.36        70

    accuracy                           0.83       338
   macro avg       0.86      0.61      0.63       338
weighted avg       0.84      0.83      0.79       338



Processing **无关** posts...
              precision    recall  f1-score   support

       False       0.89      0.23      0.36        70
        True       0.83      0.99      0.90       268

    accuracy                           0.83       338
   macro avg       0.86      0.61      0.63       338
weighted avg       0.84      0.83      0.79       338







Processing **日常生活** posts...
              precision    recall  f1-score   support

       False       0.80      1.00      0.89       273
        True       0.80      0.06      0.11        71

    accuracy                           0.80       344
   macro avg       0.80      0.53      0.50       344
weighted avg       0.80      0.80      0.73       344



Processing **无关** posts...
              precision    recall  f1-score   support

       False       0.80      0.06      0.11        71
        True       0.80      1.00      0.89       273

    accuracy                           0.80       344
   macro avg       0.80      0.53      0.50       344
weighted avg       0.80      0.80      0.73       344







Processing **教育和职业** posts...
              precision    recall  f1-score   support

       False       0.79      1.00      0.88       254
        True       1.00      0.50      0.66       133

    accuracy                           0.83       387
   macro avg       0.90      0.75      0.77       387
weighted avg       0.86      0.83      0.81       387



Processing **无关** posts...
              precision    recall  f1-score   support

       False       1.00      0.50      0.66       133
        True       0.79      1.00      0.88       254

    accuracy                           0.83       387
   macro avg       0.90      0.75      0.77       387
weighted avg       0.86      0.83      0.81       387







Processing **表达情感见解看法，分享过往经历** posts...
              precision    recall  f1-score   support

       False       0.95      0.80      0.87       271
        True       0.85      0.96      0.90       308

    accuracy                           0.89       579
   macro avg       0.90      0.88      0.89       579
weighted avg       0.89      0.89      0.89       579



Processing **无关** posts...
              precision    recall  f1-score   support

       False       0.85      0.96      0.90       308
        True       0.95      0.80      0.87       271

    accuracy                           0.89       579
   macro avg       0.90      0.88      0.89       579
weighted avg       0.89      0.89      0.89       579







Processing **想要获得更多信息** posts...
              precision    recall  f1-score   support

       False       0.75      0.98      0.85       248
        True       0.90      0.31      0.46       119

    accuracy                           0.77       367
   macro avg       0.83      0.65      0.66       367
weighted avg       0.80      0.77      0.72       367



Processing **无关** posts...
              precision    recall  f1-score   support

       False       0.90      0.31      0.46       119
        True       0.75      0.98      0.85       248

    accuracy                           0.77       367
   macro avg       0.83      0.65      0.66       367
weighted avg       0.80      0.77      0.72       367







Processing **和博主互动** posts...
              precision    recall  f1-score   support

       False       0.74      0.44      0.56       124
        True       0.69      0.89      0.78       176

    accuracy                           0.71       300
   macro avg       0.72      0.67      0.67       300
weighted avg       0.71      0.71      0.69       300



Processing **无关** posts...
              precision    recall  f1-score   support

       False       0.69      0.89      0.78       176
        True       0.74      0.44      0.56       124

    accuracy                           0.71       300
   macro avg       0.72      0.67      0.67       300
weighted avg       0.71      0.71      0.69       300







Processing **地理位置** posts...
              precision    recall  f1-score   support

       False       0.78      0.99      0.87       235
        True       0.97      0.49      0.65       129

    accuracy                           0.81       364
   macro avg       0.87      0.74      0.76       364
weighted avg       0.85      0.81      0.79       364



Processing **无关** posts...
              precision    recall  f1-score   support

       False       0.97      0.49      0.65       129
        True       0.78      0.99      0.87       235

    accuracy                           0.81       364
   macro avg       0.87      0.74      0.76       364
weighted avg       0.85      0.81      0.79       364







Processing **个人信息** posts...
              precision    recall  f1-score   support

       False       0.85      1.00      0.92       275
        True       0.88      0.13      0.23        54

    accuracy                           0.85       329
   macro avg       0.86      0.56      0.57       329
weighted avg       0.86      0.85      0.81       329



Processing **无关** posts...
              precision    recall  f1-score   support

       False       0.88      0.13      0.23        54
        True       0.85      1.00      0.92       275

    accuracy                           0.85       329
   macro avg       0.86      0.56      0.57       329
weighted avg       0.86      0.85      0.81       329







Processing **情感恋爱** posts...
              precision    recall  f1-score   support

       False       0.85      0.92      0.88       244
        True       0.88      0.78      0.82       178

    accuracy                           0.86       422
   macro avg       0.86      0.85      0.85       422
weighted avg       0.86      0.86      0.86       422



Processing **无关** posts...
              precision    recall  f1-score   support

       False       0.88      0.78      0.82       178
        True       0.85      0.92      0.88       244

    accuracy                           0.86       422
   macro avg       0.86      0.85      0.85       422
weighted avg       0.86      0.86      0.86       422







Processing **性** posts...
              precision    recall  f1-score   support

       False       0.84      1.00      0.91       274
        True       1.00      0.06      0.11        54

    accuracy                           0.84       328
   macro avg       0.92      0.53      0.51       328
weighted avg       0.87      0.84      0.78       328



Processing **无关** posts...
              precision    recall  f1-score   support

       False       1.00      0.06      0.11        54
        True       0.84      1.00      0.91       274

    accuracy                           0.84       328
   macro avg       0.92      0.53      0.51       328
weighted avg       0.87      0.84      0.78       328







Processing **社交朋友** posts...
              precision    recall  f1-score   support

       False       0.88      1.00      0.94       275
        True       1.00      0.27      0.43        51

    accuracy                           0.89       326
   macro avg       0.94      0.64      0.68       326
weighted avg       0.90      0.89      0.86       326



Processing **无关** posts...
              precision    recall  f1-score   support

       False       1.00      0.27      0.43        51
        True       0.88      1.00      0.94       275

    accuracy                           0.89       326
   macro avg       0.94      0.64      0.68       326
weighted avg       0.90      0.89      0.86       326







Processing **与以上均无关** posts...
              precision    recall  f1-score   support

       False       0.70      0.93      0.79       201
        True       0.81      0.45      0.58       147

    accuracy                           0.72       348
   macro avg       0.76      0.69      0.69       348
weighted avg       0.75      0.72      0.70       348



Processing **无关** posts...
              precision    recall  f1-score   support

       False       0.81      0.45      0.58       147
        True       0.70      0.93      0.79       201

    accuracy                           0.72       348
   macro avg       0.76      0.69      0.69       348
weighted avg       0.75      0.72      0.70       348







Processing **想要认识对方** posts...
              precision    recall  f1-score   support

       False       0.84      0.98      0.90       256
        True       0.85      0.41      0.56        82

    accuracy                           0.84       338
   macro avg       0.84      0.70      0.73       338
weighted avg       0.84      0.84      0.82       338



Processing **无关** posts...
              precision    recall  f1-score   support

       False       0.85      0.41      0.56        82
        True       0.84      0.98      0.90       256

    accuracy                           0.84       338
   macro avg       0.84      0.70      0.73       338
weighted avg       0.84      0.84      0.82       338



