#2025年truth文本数据

In [6]:
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from datetime import datetime
import nltk
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/wangyijie/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/wangyijie/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# === 1. 加载合并后的文本数据 ===
input_file = "01文本抓取与预处理/2025_truth.csv"
df = pd.read_csv(input_file)
print(df.head())


  web-scraper-order                              web-scraper-start-url  \
0     1753354145-67  https://rollcall.com/factbase/trump/topic/soci...   
1     1753354145-68  https://rollcall.com/factbase/trump/topic/soci...   
2     1753354145-69  https://rollcall.com/factbase/trump/topic/soci...   
3     1753354145-70  https://rollcall.com/factbase/trump/topic/soci...   
4     1753354145-71  https://rollcall.com/factbase/trump/topic/soci...   

                         date  \
0  July 20, 2025 @ 8:53 PM ET   
1  July 20, 2025 @ 8:16 PM ET   
2  July 20, 2025 @ 8:06 PM ET   
3  July 20, 2025 @ 8:06 PM ET   
4  July 20, 2025 @ 7:56 PM ET   

                                                text  
0  Adam “Shifty” Schiff is in BIG TROUBLE! He fal...  
1  RT: https://truthsocial.com/users/realDonaldTr...  
2  Go get the GREAT NEW BOOK by Mark Levin. It’s ...  
3  RT: https://truthsocial.com/users/realDonaldTr...  
4   HOW DID SAMANTHA POWER MAKE ALL OF THAT MONEY???  


In [42]:
# 2. 保留重要字段
df = df[['date', 'text']].dropna()
# 3. 转换日期格式
def parse_date(raw_date):
    try:
        return datetime.strptime(raw_date.replace('ET', '').strip(), '%B %d, %Y @ %I:%M %p')
    except:
        return pd.NaT

df['date'] = df['date'].apply(parse_date)
df = df.dropna(subset=['date'])

# 4. 基础清洗函数
def clean_text(text):
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # 去除 URL
    text = re.sub(r"@\w+", "", text)            # 去用户名（如有）
    text = re.sub(r"#\w+", "", text)            # 去话题标签
    #text = re.sub(r"\brt\b", "", text, flags=re.IGNORECASE)  # 删除“rt”（忽略大小写）
    text = re.sub(r"\n", " ", text)             # 换行符换成空格
    text = re.sub(r"\s+", " ", text).strip()    # 多空格压缩
    
    return text

df['clean_text'] = df['text'].apply(clean_text)

# 5. 可选：小写 + 去标点 + 去停用词 + 词形还原
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def normalize(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = text.split()
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return ' '.join(words)

df['clean_text'] = df['clean_text'].apply(normalize)

In [47]:
# 删除clean_text列中是空值或仅包含空白的行
df = df[df['clean_text'].notna() & (df['clean_text'].str.strip() != '')]


In [44]:
import re

def remove_emoji(text):
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # 表情符号
        u"\U0001F300-\U0001F5FF"  # 符号&象形文字
        u"\U0001F680-\U0001F6FF"  # 交通工具&地图符号
        u"\U0001F1E0-\U0001F1FF"  # 国旗
        u"\U00002700-\U000027BF"  # 杂项符号
        u"\U0001F900-\U0001F9FF"  # 补充表情符号
        u"\U00002600-\U000026FF"  # 杂项符号
        u"\U0001FA70-\U0001FAFF"  # 补充符号和象形文字
        "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

# 应用到 clean_text 字段
df['clean_text'] = df['clean_text'].apply(remove_emoji)

In [45]:
df['clean_text'] = df['clean_text'].str.replace(r'\brt\b', '', case=False, regex=True)
df['clean_text'] = df['clean_text'].str.replace(r'\s+', ' ', regex=True).str.strip()  # 再次去空格


In [48]:
# 7. 保存清洗后的数据
df[['date', 'clean_text']].to_csv("2025_truth_cleaned.csv", index=False)


# 第一届总统任期数据清洗预处理


In [8]:
input_file = "/Users/wangyijie/Visual_Studio_code/毕业论文项目/01文本抓取与预处理/trump_第一任期.csv"
df_17 = pd.read_csv(input_file)
print(df_17.head())

                    id                                               text  \
0    98454970654916608  Republicans and Democrats have both created ou...   
1  1234653427789070336  I was thrilled to be back in the Great city of...   
2  1218010753434820614  RT @CBS_Herridge: READ: Letter to surveillance...   
3  1304875170860015617  The Unsolicited Mail In Ballot Scam is a major...   
4  1218159531554897920  RT @MZHemingway: Very friendly telling of even...   

  isRetweet isDeleted              device  favorites  retweets  \
0         f         f           TweetDeck         49       255   
1         f         f  Twitter for iPhone      73748     17404   
2         t         f  Twitter for iPhone          0      7396   
3         f         f  Twitter for iPhone      80527     23502   
4         t         f  Twitter for iPhone          0      9081   

                  date isFlagged  
0  2011-08-02 18:07:48         f  
1  2020-03-03 01:34:50         f  
2  2020-01-17 03:22:47         f  


In [9]:
# 2. 保留重要字段
df_17 = df_17[['date', 'text']].dropna()
# 3. 转换日期格式
df_17['date'] = pd.to_datetime(df_17['date'], errors='coerce')
df_17 = df_17.dropna(subset=['date'])  # 删除无法解析为时间的行（如果有）

# 4. 基础清洗函数
def clean_text(text):
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # 去除 URL
    text = re.sub(r"@\w+", "", text)            # 去用户名（如有）
    text = re.sub(r"#\w+", "", text)            # 去话题标签
    #text = re.sub(r"\brt\b", "", text, flags=re.IGNORECASE)  # 删除“rt”（忽略大小写）
    text = re.sub(r"\n", " ", text)             # 换行符换成空格
    text = re.sub(r"\s+", " ", text).strip()    # 多空格压缩
    text = re.sub(r'[\u200e\u200f\u202a-\u202e\u2066-\u2069]', '', text)        # 方向控制符
    text = re.sub(r'[\u2190-\u21ff]', '', text)                                 # 箭头符号
    text = text.replace('‼️', '')                                               # 特定emoji
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

df_17['clean_text'] = df_17['text'].apply(clean_text)

# 5. 可选：小写 + 去标点 + 去停用词 + 词形还原
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def normalize(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = text.split()
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return ' '.join(words)

df_17['clean_text'] = df_17['clean_text'].apply(normalize)

In [15]:
# 删除clean_text列中是空值或仅包含空白的行
df_17 = df_17[df_17['clean_text'].notna() & (df_17['clean_text'].str.strip() != '')]

In [11]:
import re

def remove_emoji(text):
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # 表情符号
        u"\U0001F300-\U0001F5FF"  # 符号&象形文字
        u"\U0001F680-\U0001F6FF"  # 交通工具&地图符号
        u"\U0001F1E0-\U0001F1FF"  # 国旗
        u"\U00002700-\U000027BF"  # 杂项符号
        u"\U0001F900-\U0001F9FF"  # 补充表情符号
        u"\U00002600-\U000026FF"  # 杂项符号
        u"\U0001FA70-\U0001FAFF"  # 补充符号和象形文字
        "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

# 应用到 clean_text 字段
df_17['clean_text'] = df_17['clean_text'].apply(remove_emoji)

In [12]:
df_17['clean_text'] = df_17['clean_text'].str.replace(r'\brt\b', '', case=False, regex=True)
df_17['clean_text'] = df_17['clean_text'].str.replace(r'\s+', ' ', regex=True).str.strip()  # 再次去空格


In [13]:
%pip install emoji
import emoji

# 删除所有 emoji，包括 ⬇️⬆️▶️🟥⏰‼️ 等
df_17['clean_text'] = df_17['clean_text'].apply(lambda x: emoji.replace_emoji(x, replace=''))


Note: you may need to restart the kernel to use updated packages.


In [16]:
# 7. 保存清洗后的数据
df_17[['date', 'clean_text']].to_csv("first_term_cleaned.csv", index=False)


In [18]:
input_file = "/Users/wangyijie/Visual_Studio_code/毕业论文项目/01文本抓取与预处理/first_term_cleaned.csv"
df_17_cleaned = pd.read_csv(input_file)
print(df_17_cleaned.head())

                  date                                         clean_text
0  2011-08-02 18:07:48       republican democrat created economic problem
1  2020-03-03 01:34:50  thrilled back great city charlotte north carol...
2  2020-01-17 03:22:47  read letter surveillance court obtained cbs ne...
3  2020-09-12 20:10:58  unsolicited mail ballot scam major threat demo...
4  2020-01-17 13:13:59  friendly telling event comeys apparent leaking...


In [26]:

df_17_cleaned = df_17_cleaned.sort_values(by='date', ascending=False)
df_17_cleaned.head(20)


Unnamed: 0,date,clean_text
272,2021-01-08 15:44:28,asked going inauguration january 20th
269,2021-01-08 14:46:38,75000000 great american patriot voted america ...
261,2021-01-06 23:01:04,thing event happen sacred landslide election v...
252,2021-01-06 20:13:26,asking everyone u capitol remain peaceful viol...
232,2021-01-06 19:38:58,please support capitol police law enforcement ...
228,2021-01-06 19:24:22,mike penny didn’t courage done protect country...
218,2021-01-06 15:44:31,scoundrel toying great guy vote didn’t want an...
212,2021-01-06 14:16:30,even mexico us voter id
210,2021-01-06 14:15:07,state want redo vote found voted fraud legisla...
200,2021-01-06 14:00:12,happened find 50000 ballot late last night usa...


In [None]:
# 筛选2017年1月20日及以后的数据

start_date = pd.to_datetime('2017-01-20')
df_17_cleaned['date'] = pd.to_datetime(df_17_cleaned['date'], errors='coerce')
df_17_filtered = df_17_cleaned[df_17_cleaned['date'] >= start_date]

# 显示筛选后的数据,查看前10行和后10行
df_17_filtered.head(10),  df_17_filtered.tail(10)


(                   date                                         clean_text
 272 2021-01-08 15:44:28              asked going inauguration january 20th
 269 2021-01-08 14:46:38  75000000 great american patriot voted america ...
 261 2021-01-06 23:01:04  thing event happen sacred landslide election v...
 252 2021-01-06 20:13:26  asking everyone u capitol remain peaceful viol...
 232 2021-01-06 19:38:58  please support capitol police law enforcement ...
 228 2021-01-06 19:24:22  mike penny didn’t courage done protect country...
 218 2021-01-06 15:44:31  scoundrel toying great guy vote didn’t want an...
 212 2021-01-06 14:16:30                            even mexico us voter id
 210 2021-01-06 14:15:07  state want redo vote found voted fraud legisla...
 200 2021-01-06 14:00:12  happened find 50000 ballot late last night usa...,
                      date                                         clean_text
 44127 2017-01-20 17:55:44  follow two simple rule buy american amp hire a...
 44128 

In [30]:
# 保存为trump17_21.csv
df_17_filtered.to_csv('trump17_21.csv', index=False)

In [None]:
# 合并为trump_merged_2017_2025.csv
df_17_filtered = pd.read_csv('trump17_21.csv')
df_25_cleaned = pd.read_csv('2025_truth_cleaned.csv')
df_25_cleaned['date'] = pd.to_datetime(df_25_cleaned['date'], errors='coerce')
df_17_filtered['date'] = pd.to_datetime(df_17_filtered['date'], errors='coerce')
df_merged = pd.concat([df_17_filtered, df_25_cleaned], ignore_index=True)
df_merged = df_merged.sort_values(by='date', ascending=False)
df_merged.to_csv('trump_merged_2017_2025.csv', index=False)