In [1]:
def clean_post(post):
    """Split a raw multi-post string into cleaned text segments.

    The input is split on the ``'|||'`` delimiter first, and every segment is
    then cleaned independently. Splitting first matters: when a URL sits
    directly in front of a delimiter (``'http://x.com/a|||next post'``), a
    greedy non-whitespace URL match applied to the whole string would swallow
    the delimiter together with the first word of the following segment.

    Cleaning applied to each segment, in order:

    1. Remove links: ``http://`` / ``https://`` URLs and bare ``www.`` hosts.
       The pattern is anchored on ``://`` and ``www.`` so ordinary words that
       merely contain those letters (e.g. ``"awwww"``) are left alone.
    2. Remove every character that is not an ASCII letter, digit or whitespace.
    3. Collapse runs of whitespace into a single space and trim both ends.

    Args:
        post: String holding one or more posts separated by ``'|||'``.

    Returns:
        A list of cleaned strings, one per delimiter-separated segment and in
        the original order. A segment that contained only links and/or
        punctuation yields an empty string, so the list length always equals
        the number of segments in ``post``.
    """
    cleaned_segments = []
    # Split before stripping URLs so a URL touching the delimiter cannot swallow it.
    for segment in post.split('|||'):
        # Drop explicit links only: scheme-prefixed URLs and bare "www." hosts.
        segment = re.sub(r'https?://\S+|\bwww\.\S+', '', segment)
        # Keep ASCII letters, digits and whitespace; everything else is removed.
        segment = re.sub(r'[^A-Za-z0-9\s]', '', segment)
        # Collapse whitespace runs (e.g. the gap left by a removed URL) and trim.
        cleaned_segments.append(re.sub(r'\s+', ' ', segment).strip())
    return cleaned_segments

In [3]:
import pandas as pd
import re

# Path of the input CSV, relative to the notebook's working directory
file_path = 'dataset/mbti_1.csv'

# Read the raw table, decoding the file as latin1
data = pd.read_csv(file_path, encoding='latin1')

# Clean every value of the 'posts' column; clean_post returns a list of segments per row
data['cleaned_posts'] = data['posts'].apply(clean_post)

# Keep only the cleaned column, give every list element its own row,
# and renumber the resulting rows from 0
cleaned_data = (
    data[['cleaned_posts']]
    .explode('cleaned_posts')
    .reset_index(drop=True)
)

In [4]:
# Measure each cleaned post in whitespace-separated words
cleaned_data['word_count'] = cleaned_data['cleaned_posts'].map(lambda text: len(text.split()))

# Keep posts of at least 15 words, then discard the helper word_count column
# from the filtered result (cleaned_data itself keeps the column)
filtered_data = (
    cleaned_data
    .loc[cleaned_data['word_count'] >= 15]
    .drop(columns='word_count')
)

In [5]:
# Number the remaining posts 1..N and put that id column ahead of the text column
filtered_data = filtered_data.assign(
    id=range(1, len(filtered_data) + 1)
)[['id', 'cleaned_posts']]

# Preview the first few rows
print(filtered_data.head())

In [6]:
# Number of posts that passed the length filter; the bare name on the last
# line makes the notebook display the value
num_rows = len(filtered_data)
num_rows

In [7]:
# Write the id / cleaned_posts table to disk as CSV, leaving out the DataFrame index
filtered_data.to_csv(path_or_buf='dataset/cleaned_mbti.csv', index=False)