In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import json
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

nltk.download('wordnet')
nltk.download('punkt')

!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

random_state = 42

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: /usr/share/nltk_data/corpora/wordnet/
  inflating: /usr/share/nltk_data/corpora/wordnet/lexnames  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adv  
  inflating: /usr/share/nltk_data/corpora/wordnet/adv.exc  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.verb  
  inflating: /usr/share/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /usr/share/nltk_data/corpora/wordnet/data.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/index.adj  
  inflating: /usr/share/nltk_data/corpora/wordnet/LICENSE  
  inflating: /usr/share/nltk_data/corpora/wordnet/citation.bib  
  inflating: /usr/share/nltk_data/corpora/

# 一、数据预处理

## 1. 数据加载

In [2]:
df = pd.read_csv("../input/mbti-type/mbti_1.csv")
df.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


## 2. 数据预处理

将MBTI类型转为序号：

In [3]:
unique_type_list = ['INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP', 'ENFP',
                    'ISFP', 'ISTP', 'ISFJ', 'ISTJ', 'ESTP', 'ESFP', 'ESTJ', 'ESFJ']
type_to_idx = {mbti_type: idx for idx, mbti_type in enumerate(unique_type_list)}

df['type_idx'] = df['type'].map(type_to_idx)
df.head()

Unnamed: 0,type,posts,type_idx
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,0
1,ENTP,'I'm finding the lack of me in these posts ver...,1
2,INTP,'Good one _____ https://www.youtube.com/wat...,2
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",3
4,ENTJ,'You're fired.|||That's another silly misconce...,4


类别不平衡权重的计算：

In [4]:
type_count = df.groupby(['type']).count()
type_count_sum = len(df)

class_weights = []

for mbti_type in unique_type_list:
    class_weight = type_count_sum / (type_count.loc[mbti_type, 'posts'] * len(type_count))
    class_weights.append(class_weight)

class_weights

[0.36883503401360546,
 0.7915145985401459,
 0.41578796012269936,
 0.4969637946837763,
 2.347132034632035,
 2.8536184210526314,
 0.2959538755458515,
 0.8032407407407407,
 2.000691881918819,
 1.6088649851632046,
 3.2661897590361444,
 2.644817073170732,
 6.091994382022472,
 11.295572916666666,
 13.90224358974359,
 12.90922619047619]

读取 YouTube URL 到标题的映射文件：

In [5]:
import json

youtube_title_file = json.load(open("/kaggle/input/mbti-youtube/youtube_video_info.json"))

youtube_title = {}
for line in youtube_title_file:
    youtube_title[line['url']] = line['title']

# 二、统计特征提取

文本清洗，包括将 YouTube URL 替换为标题：

In [6]:
# 词形还原
lemmatiser = WordNetLemmatizer()

# 停用词
useless_words = stopwords.words("english")

# 提取YouTube链接
def extract_youtube_links(posts_str):
    youtube_pattern = r'https://(?:www\.)?(?:youtube\.com/(?:watch\?v=|embed/)|youtu\.be/)[\w\-]+'
    
    # 拆分多条发言
    posts = posts_str.split('|||')
    # 提取所有匹配的YouTube链接
    links = []
    for post in posts:
        # 使用 re.findall 提取完整链接
        links.extend(re.findall(youtube_pattern, post))
    return links

# 数据预处理
def pre_process_text(data, remove_stop_words=True, remove_mbti_profiles=True, replace_youtube_url=True):
    list_personality = []
    list_posts = []
    
    for row in data.iterrows():
        posts = row[1].posts
        temp = posts

        # 将 YouTube URL 替换为标题
        if replace_youtube_url:
            youtube_urls = extract_youtube_links(temp)
            for youtube_url in youtube_urls:
                temp = temp.replace(youtube_url, youtube_title.get(youtube_url, youtube_url))
        
        # 移除URL链接
        temp = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ', temp)
        
        # 去除非字母字符 - 只保留单词
        temp = re.sub("[^a-zA-Z]", " ", temp)
        
        # 移除多余的空格（大于1个的空格）
        temp = re.sub(' +', ' ', temp).lower()
        
        # 移除重复字母的单词
        temp = re.sub(r'([a-z])\1{2,}[\s|\w]*', '', temp)
        
        # 移除停用词
        if remove_stop_words:
            temp = " ".join([lemmatiser.lemmatize(w) for w in temp.split(' ') if w not in useless_words])
        else:
            temp = " ".join([lemmatiser.lemmatize(w) for w in temp.split(' ')])
        
        # 从帖子中移除MBTI人格类型词汇
        if remove_mbti_profiles:
            for t in unique_type_list:
                temp = temp.replace(t.lower(),"mbtitype")

        list_posts.append(temp)
        list_personality.append(row[1].type_idx)
    
    # 返回结果
    list_posts = np.array(list_posts)
    list_personality = np.array(list_personality)
    return list_posts, list_personality

# 预处理
df_copy = df.copy()
list_posts, list_personality = pre_process_text(df_copy, remove_stop_words=True, remove_mbti_profiles=False, replace_youtube_url=False)
list_posts_youtube, list_personality_youtube = pre_process_text(df_copy, remove_stop_words=True, remove_mbti_profiles=False, replace_youtube_url=True)

# 展示一个样本
print('posts:', list_posts[1])
print('type:', list_personality[1])

posts:  finding lack post alarming sex boring position often example girlfriend currently environment creatively use cowgirl missionary enough giving new meaning game theory hello entp grin take converse flirting acknowledge presence return word smooth wordplay cheeky grin lack balance hand eye coordination real iq test score internet iq test funny score higher like former response thread mention believe iq test banish know entp vanish site year half return find people still commenting post liking idea thought know entp think thing sometimes go old sherlock holmes quote perhaps man special knowledge special power like rather encourages seek complex cheshirewolf tumblr com post really never thought e j p real function judge use use ne ti dominates fe emotion rarely si also use ni due strength know though ingenious saying really want try see happens playing first person shooter back drive around want see look rock paper one best make lol guy lucky really high tumblr system hear new first

将文本转为TF-IDF特征：

In [7]:
# 文本 -> TF
cntizer = CountVectorizer(analyzer="word", 
                             max_features=1000,  
                             max_df=0.7,
                             min_df=0.1) 
X_cnt = cntizer.fit_transform(list_posts)
X_cnt_youtube = cntizer.fit_transform(list_posts_youtube)

# TF -> TF-IDF
tfizer = TfidfTransformer()
X_tfidf = tfizer.fit_transform(X_cnt).toarray()
X_tfidf_youtube = tfizer.fit_transform(X_cnt_youtube).toarray()
print(X_tfidf.shape)

(8675, 615)


# 三、语义特征提取

由于提取时间较长，使用下面的代码将提取出的特征进行保存：

In [8]:
# import torch
# from tqdm import tqdm
# from transformers import pipeline
# from transformers import BertTokenizer, BertForSequenceClassification

# # 加载预训练的 BERT 模型和分词器
# model_name = 'bhadresh-savani/bert-base-uncased-emotion'
# tokenizer = BertTokenizer.from_pretrained(model_name)
# model = BertForSequenceClassification.from_pretrained(model_name, output_hidden_states=True, return_dict=True).cuda()

# # # 获取文本的 BERT 特征
# # def get_bert_embeddings(text):
# #     with torch.no_grad():
# #         inputs = tokenizer(text, truncation=True, max_length=512, return_tensors='pt').to('cuda')
# #         outputs = model(**inputs)
    
# #         # 获取 [CLS] token
# #         embeddings = outputs['hidden_states'][-1][:, 0, :].squeeze()

# #         # 获取概率
# #         logits = outputs['logits']
# #         predictions = torch.nn.functional.softmax(logits, dim=-1)
    
# #         return embeddings, predictions

# # 获取文本的情感分类
# def get_bert_embeddings(text):
#     with torch.no_grad():
#         inputs = tokenizer(text, truncation=True, max_length=512, return_tensors='pt').to('cuda')
#         outputs = model(**inputs)
#         predictions = torch.nn.functional.softmax(outputs['logits'], dim=-1)
    
#         return predictions

# # 数据预处理
# def pre_process_text_bert(data, remove_mbti_profiles=True, replace_youtube_url=True):
#     list_embeddings = []
    
#     for row in tqdm(data.iterrows(), total=len(data)):
#         posts = row[1].posts
#         temp = posts

#         # 将 YouTube URL 替换为标题
#         if replace_youtube_url:
#             youtube_urls = extract_youtube_links(temp)
#             for youtube_url in youtube_urls:
#                 temp = temp.replace(youtube_url, youtube_title.get(youtube_url, youtube_url))
        
#         # 移除URL链接
#         temp = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ', temp)
        
#         # 从帖子中移除MBTI人格类型词汇增强模型泛化性
#         if remove_mbti_profiles:
#             for t in unique_type_list:
#                 temp = temp.replace(t.lower(),"mbtitype")

#         # 提取特征
#         emotions = [0] * 6
#         with torch.no_grad():
#             for post in temp.split('|||'):
#                 if any(char.isalpha() for char in post):
#                     scores = get_bert_embeddings(post)
#                     if scores.max() > 0.9:
#                         emotions[scores.argmax().item()] += 1

#         emotions = np.array(emotions)
#         if emotions.sum().item() == 0:
#             emotions = np.zeros((6))
#         else:
#             emotions = emotions / emotions.sum()
#         list_embeddings.append(emotions)
    
#     # 返回结果
#     list_embeddings = np.array(list_embeddings)
#     return list_embeddings

# # 预处理
# df_copy = df.copy()
# X_bert = pre_process_text_bert(df_copy, remove_mbti_profiles=False, replace_youtube_url=False)
# np.save("bert.npy", X_bert)

# # 展示一个样本
# print(X_bert[1].shape)

加载保存的特征：

In [9]:
X_bert = np.load("/kaggle/input/bert-feature/bert.npy")
X_bert = np.concatenate([X_bert, X_tfidf_youtube], axis=1)

# 四、实验

按7:3的比例划分数据集：

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, list_personality, test_size=0.3, random_state=random_state)
X_train_youtube, X_test_youtube, _, _ = train_test_split(X_tfidf_youtube, list_personality_youtube, test_size=0.3, random_state=random_state)
X_train_bert, X_test_bert, _, _ = train_test_split(X_bert, list_personality_youtube, test_size=0.3, random_state=random_state)

定义评估函数：

In [11]:
def idx_to_type(idx_array):
    IE, NS, TF, JP = [], [], [], []
    for idx in idx_array:
        mbti = unique_type_list[idx]
        IE.append(mbti[0] == 'E')
        NS.append(mbti[1] == 'S')
        TF.append(mbti[2] == 'F')
        JP.append(mbti[3] == 'P')

    return IE, NS, TF, JP

def evaluate(y_pred):
    # 计算总体准确率
    total_acc = accuracy_score(y_test, y_pred)

    # 拆成四个维度
    y_test_IE, y_test_NS, y_test_TF, y_test_JP = idx_to_type(y_test)
    y_pred_IE, y_pred_NS, y_pred_TF, y_pred_JP = idx_to_type(y_pred)

    # 计算各维度准确率
    IE_acc = accuracy_score(y_test_IE, y_pred_IE)
    NS_acc = accuracy_score(y_test_NS, y_pred_NS)
    TF_acc = accuracy_score(y_test_TF, y_pred_TF)
    JP_acc = accuracy_score(y_test_JP, y_pred_JP)
    
    return {'IE': IE_acc, 'NS':NS_acc, 'TF':TF_acc, 'JP':JP_acc, 'Total': total_acc}

## 1. Baseline

In [12]:
from sklearn.svm import SVC

# 在训练数据上拟合模型
model = SVC(random_state=random_state)
model.fit(X_train, y_train)

# 对测试数据进行预测
y_pred = model.predict(X_test)
print(evaluate(y_pred))

{'IE': 0.8079139454475605, 'NS': 0.88897426046869, 'TF': 0.7921628889742605, 'JP': 0.7598924318094507, 'Total': 0.5485977718017672}


## 2. 加入 YouTube 标题

In [13]:
from sklearn.svm import SVC

# 在训练数据上拟合模型
model = SVC(random_state=random_state)
model.fit(X_train_youtube, y_train)

# 对测试数据进行预测
y_pred = model.predict(X_test_youtube)
print(evaluate(y_pred))

{'IE': 0.8086822896657703, 'NS': 0.8885900883595851, 'TF': 0.7917787168651556, 'JP': 0.7645024971187092, 'Total': 0.5509028044563965}


## 3. 类别加权

In [14]:
from sklearn.svm import SVC

# 在训练数据上拟合模型
model = SVC(random_state=random_state, class_weight={i: weight for i, weight in enumerate(class_weights)})
model.fit(X_train, y_train)

# 对测试数据进行预测
y_pred = model.predict(X_test)
print(evaluate(y_pred))

{'IE': 0.79369957741068, 'NS': 0.8905109489051095, 'TF': 0.8079139454475605, 'JP': 0.7752593161736457, 'Total': 0.5597387629658087}


## 4. 加入 YouTube 标题 + 类别加权

In [15]:
from sklearn.svm import SVC

# 在训练数据上拟合模型
model = SVC(random_state=random_state, class_weight={i: weight for i, weight in enumerate(class_weights)})
model.fit(X_train_youtube, y_train)

# 对测试数据进行预测
y_pred = model.predict(X_test_youtube)
print(evaluate(y_pred))

{'IE': 0.7940837495197849, 'NS': 0.8897426046868997, 'TF': 0.8056089127929312, 'JP': 0.774490971955436, 'Total': 0.5605071071840184}


## 5. 加入 YouTube 标题 + 类别加权 + 语义特征

In [16]:
from sklearn.svm import SVC

# 在训练数据上拟合模型
model = SVC(random_state=random_state, class_weight={i: weight for i, weight in enumerate(class_weights)})
model.fit(X_train_bert, y_train)

# 对测试数据进行预测
y_pred = model.predict(X_test_bert)
print(evaluate(y_pred))

{'IE': 0.8025355359200922, 'NS': 0.879369957741068, 'TF': 0.8152132155205533, 'JP': 0.7748751440645409, 'Total': 0.5724164425662697}
