In [None]:
# 判断用户操作系统

import os
import sys

import matplotlib

# Map sys.platform prefixes to the OS label used later for font selection.
# Any platform that is neither Windows nor Linux falls back to 'macOS'.
_PLATFORM_LABELS = (
    ('win', 'Windows'),
    ('linux', 'Linux'),
)
OperatingSystem = next(
    (label for prefix, label in _PLATFORM_LABELS if sys.platform.startswith(prefix)),
    'macOS',
)

In [None]:
import pandas as pd

# CSV export to read and the single column to extract from it.
filename = "./data/user_archive.csv"
col_name = "post_raw"

# Load only that column, then flatten it into a plain Python list.
_posts_frame = pd.read_csv(filename, usecols=[col_name])
result = _posts_frame[col_name].tolist()
print(result)

In [None]:
# 清洗数据,这个正则是GPT写的，写的不好见谅喵 :XD
import re
# Alternatives that mark an entry as noise; an entry matching any of them
# anywhere in its text is dropped entirely.
_noise_fragments = (
    r'（帖子已被作者删除）',    # placeholder text: post deleted by its author
    r'（话题已被作者删除）',    # placeholder text: topic deleted by its author
    r'#.+?添加',               # "#<something>添加" form
    r'从 #.+? 到 #.+?:.+?',    # "从 #<a> 到 #<b>:<text>" form
    r'!\[[^\]]*\]\([^)]+\)',   # Markdown image syntax: ![alt](url)
)
pattern = re.compile('|'.join(_noise_fragments))

# Keep only string entries in which none of the alternatives occurs
# (non-string values such as missing cells are skipped).
cleaned = []
for _post in result:
    if not isinstance(_post, str):
        continue
    if pattern.search(_post) is None:
        cleaned.append(_post)
print(cleaned)

In [None]:
# 总结高频词汇
from collections import Counter

# Tally how often each cleaned entry occurs.
# NOTE(review): entries are counted whole (one key per distinct post text),
# not split into words -- confirm this is the intended "hot word" metric.
counter = Counter(cleaned)

# Full ranking as (entry, count) pairs, most frequent first.
freq_list = counter.most_common()
# The ten most frequent entries are the head of that ranking.
hotword = freq_list[:10]

print(hotword)

In [None]:
# 查询回复时间
import pandas as pd

# CSV export to read and the timestamp column to extract from it.
filename = "./data/user_archive.csv"
col_name = "created_at"

# Load only that column and flatten it into a plain Python list.
_created_frame = pd.read_csv(filename, usecols=[col_name])
post_create = _created_frame[col_name].tolist()
print(post_create)

In [None]:
# 查询点赞时间
import pandas as pd

# CSV export to read and the timestamp column to extract from it.
filename = "./data/likes.csv"
col_name = "updated_at"

# Load only that column and flatten it into a plain Python list.
_likes_frame = pd.read_csv(filename, usecols=[col_name])
like_create = _likes_frame[col_name].tolist()
print(like_create)

In [None]:
# 清洗数据，合并回复和点赞时间
import pandas as pd

# post_create and like_create are lists of raw timestamp values; pool them.
all_times = post_create + like_create

# Parse everything as UTC; values that cannot be parsed become NaT.
_parsed_utc = pd.to_datetime(pd.Series(all_times), utc=True, errors='coerce')

# Discard the unparseable entries, shift to Asia/Shanghai time and order
# chronologically.
_valid_utc = _parsed_utc[_parsed_utc.notna()]
s = _valid_utc.dt.tz_convert('Asia/Shanghai').sort_values()

# Render every timestamp in one uniform text format.
active_time = list(s.dt.strftime('%Y-%m-%d %H:%M:%S'))


print(active_time)

In [None]:
from matplotlib.font_manager import fontManager
# Print the names of registered fonts whose name contains "Hei" or "Arial".
_font_keywords = ('Hei', 'Arial')
print([
    font.name
    for font in fontManager.ttflist
    if any(keyword in font.name for keyword in _font_keywords)
])

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
from mpl_toolkits.axes_grid1 import make_axes_locatable

# ====== Configuration ======
# Months to display: None shows every month present in the data, a list such
# as [10, 11, 12] restricts the charts to exactly those months.
SHOW_MONTHS = None  # e.g. [10, 11, 12] to show only those three months
# ===========================

# Re-parse the formatted activity timestamps (strings) into datetimes.
s = pd.to_datetime(active_time)

# Select a font per platform so that the Chinese labels can be rendered.
if OperatingSystem == 'Windows':
    matplotlib.rc('font', family='Microsoft YaHei')
elif OperatingSystem == 'Linux':
    matplotlib.rc('font', family='SimHei')
elif OperatingSystem == 'macOS':
    matplotlib.rc('font', family='Heiti TC')

# Fix: this used to be set inside the macOS branch only. Every branch above
# switches to a CJK font, and such fonts commonly lack the Unicode minus
# glyph, so fall back to the ASCII hyphen on all platforms.
matplotlib.rcParams['axes.unicode_minus'] = False

# Use one gray (128, 128, 128) for all text, tick and axis-edge colours.
gray_color = '#808080'
for _rc_key in (
    'text.color',
    'axes.labelcolor',
    'xtick.color',
    'ytick.color',
    'axes.edgecolor',
):
    matplotlib.rcParams[_rc_key] = gray_color

# One row per activity timestamp, tagged with its calendar month.
df = pd.DataFrame({'dt': s})
df['month'] = df['dt'].dt.month

# Apply the optional month filter from the configuration.
if SHOW_MONTHS is not None:
    # Fix: take an explicit copy. The filtered frame is a slice of the
    # original, and the column assignments below would otherwise raise
    # pandas' SettingWithCopyWarning.
    df = df[df['month'].isin(SHOW_MONTHS)].copy()
    months_to_show = SHOW_MONTHS
else:
    months_to_show = sorted(df['month'].unique())

df['hour'] = df['dt'].dt.hour
# NOTE: despite the column name, minutes are floored to 15-minute buckets
# (0, 15, 30, 45). The name is kept so the frame's columns stay unchanged.
df['minute_5'] = (df['dt'].dt.minute // 15) * 15
# Bucket start expressed as fractional hours, e.g. 13:30 -> 13.5.
df['time_slot'] = df['hour'] + df['minute_5'] / 60.0

# Number of activities per (month, time slot).
activity_counts = df.groupby(['month', 'time_slot']).size().reset_index(name='count')

# Marker area grows linearly with the activity count of the slot.
size_scale = activity_counts['count'] * 12 + 20

# Colour stops of the time-of-day gradient; the first and last stop share a
# colour so that both ends of the 0..1 range look the same.
_stop_positions = (0.0, 0.25, 0.5, 0.75, 1.0)
_stop_colors = ('#9c27b0', '#ff9800', '#ffeb3b', '#4caf50', '#9c27b0')
colors_list = list(zip(_stop_positions, _stop_colors))

# Build the colormap from the (position, colour) pairs.
cmap = LinearSegmentedColormap.from_list('time_of_day', list(colors_list))

# Grow the figure height with the number of month rows (never below 3).
fig_height = max(3, len(months_to_show) * 1.5)
fig, ax = plt.subplots(figsize=(14, fig_height))

# One bubble per (month, time slot): x = time of day, y = month,
# area = activity count, colour = time of day.
scatter = ax.scatter(
    activity_counts['time_slot'],
    activity_counts['month'],
    s=size_scale,
    c=activity_counts['time_slot'],
    cmap=cmap,
    vmin=0,
    vmax=24,
    alpha=0.7,
    edgecolors='white',
    linewidths=0.5
)

# Attach a horizontal colour bar directly below the plot; it carries the
# hour ticks, so the main axes show no x ticks of their own.
divider = make_axes_locatable(ax)
cax = divider.append_axes("bottom", size="5%", pad=0.0)

cbar = fig.colorbar(scatter, cax=cax, orientation='horizontal')
cbar.set_ticks(range(0, 25, 2))
cbar.set_ticklabels([f'{h}h' for h in range(0, 25, 2)])
cbar.set_label('小时')
cbar.ax.tick_params(colors=gray_color)
cbar.outline.set_edgecolor(gray_color)

ax.set_xlim(0, 24)
ax.set_xticks([])
ax.set_xlabel('')

ax.set_yticks(months_to_show)
ax.set_yticklabels([f'{m}月' for m in months_to_show])
# min()/max() raise ValueError on an empty sequence, so only fix the y range
# when there is at least one month to show.
if len(months_to_show) > 0:
    ax.set_ylim(min(months_to_show) - 0.5, max(months_to_show) + 0.5)
ax.set_ylabel('月份')
ax.set_title('活跃时间')
ax.grid(axis='both', alpha=0.3, linestyle='--', color=gray_color)

plt.tight_layout()

# Fix: write the file BEFORE plt.show(). Once show() returns, the figure may
# no longer be the current pyplot figure, in which case plt.savefig() would
# write a blank image. Saving through the explicit `fig` handle avoids
# relying on pyplot's current-figure state at all.
os.makedirs('./fig', exist_ok=True)  # the output folder may not exist yet
fig.savefig('./fig/time.png', transparent=True)
plt.show()

In [None]:
# 提取分区数据和时间，绘制累积面积图
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# ====== Configuration: one colour per forum category ======
# (The original author notes these follow LinuxDO's official colour scheme.)
# The '_default' entry is the fallback for categories not listed here.
CATEGORY_COLORS = dict([
    ('开发调优', '#32c3c3'),
    ('国产替代', '#D12C25'),
    ('资源荟萃', '#12A89D'),
    ('网盘资源', '#16b176'),
    ('文档共建', '#9cb6c4'),
    ('跳蚤市场', '#ED207B'),
    ('非我莫属', '#a8c6fe'),
    ('读书成诗', '#e0d900'),
    ('扬帆起航', '#ff9838'),
    ('前沿快讯', '#BB8FCE'),
    ('网络记忆', '#F7941D'),
    ('福利羊毛', '#E45735'),
    ('搞七捻三', '#3AB54A'),
    ('社区孵化', '#ffbb00'),
    ('运营反馈', '#808281'),
    ('深海幽域', '#45B7D1'),
    ('_default', '#9e9e9e'),
])
# ===========================================================

# Load the category and creation-time columns of the archive.
filename = "./data/user_archive.csv"
df = pd.read_csv(filename, usecols=['categories', 'created_at'])

# Keep only the first '|'-separated category of each row; non-string values
# (e.g. missing cells) are labelled 'Unknown'.
df['category'] = df['categories'].apply(
    lambda c: c.split('|', 1)[0].strip() if isinstance(c, str) else 'Unknown'
)

# Drop rows whose category is the '-' placeholder or empty.
# Fix: take an explicit copy. The filtered frame is a slice, and assigning
# the 'dt' column to it below would otherwise raise pandas'
# SettingWithCopyWarning.
_is_placeholder = df['category'].str.strip().isin(['-', ''])
df = df[~_is_placeholder].copy()

# Parse the creation time as UTC (unparseable values become NaT and are
# dropped), then convert to Asia/Shanghai time.
df['dt'] = pd.to_datetime(df['created_at'], utc=True, errors='coerce')
df = df.dropna(subset=['dt']).copy()
df['dt'] = df['dt'].dt.tz_convert('Asia/Shanghai')

# First and last day-of-year of every month, derived from the month lengths
# of a non-leap (365-day) year, plus the tick label of each month.
_days_per_month = (31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)
month_starts = []
month_ends = []
_elapsed_days = 0
for _month_length in _days_per_month:
    month_starts.append(_elapsed_days + 1)
    _elapsed_days += _month_length
    month_ends.append(_elapsed_days)
month_labels = [f'{month_no}月' for month_no in range(1, 13)]

# Restrict the rows to the configured months (if any) and derive the nominal
# day-of-year range of the x axis from the month boundary tables.
if SHOW_MONTHS is not None:
    df = df[df['dt'].dt.month.isin(SHOW_MONTHS)]
    x_min = month_starts[min(SHOW_MONTHS) - 1]
    x_max = month_ends[max(SHOW_MONTHS) - 1]
else:
    x_min = 1
    x_max = 365

# assign() returns a new frame, so no column is written into a filtered
# slice (which would raise pandas' SettingWithCopyWarning).
df = df.assign(day_of_year=df['dt'].dt.dayofyear)

# Fix: the boundary tables describe a 365-day year. In a leap year the
# day-of-year of dates after Feb 28 is one higher, so the last day of the
# range (e.g. Dec 31 = day 366) fell outside x_max and was silently dropped
# by the left join below, i.e. it was missing from the cumulative totals.
# Widen the range to cover every day that actually occurs in the data.
if not df.empty:
    x_max = max(x_max, int(df['day_of_year'].max()))

# Number of posts per day and category (days as rows, categories as columns).
daily_counts = df.groupby(['day_of_year', 'category']).size().unstack(fill_value=0)

# Make sure every day of the range has a row; days without posts count 0.
all_days = pd.DataFrame(index=range(x_min, x_max + 1))
daily_counts = all_days.join(daily_counts).fillna(0)

# Running total of posts per category over the days of the range.
cumulative = daily_counts.cumsum()

# Look up the configured colour of every category column, falling back to
# the '_default' entry for categories without one.
_fallback_color = CATEGORY_COLORS['_default']
colors = [CATEGORY_COLORS.get(col, _fallback_color) for col in cumulative.columns]

# Apply the gray theme to all text, tick and axis-edge colours.
gray_color = '#808080'
for _rc_key in (
    'text.color',
    'axes.labelcolor',
    'xtick.color',
    'ytick.color',
    'axes.edgecolor',
):
    matplotlib.rcParams[_rc_key] = gray_color

fig, ax = plt.subplots(figsize=(14, 6))

# Stacked area chart: one layer per category, using the cumulative counts.
ax.stackplot(
    cumulative.index,
    [cumulative[col] for col in cumulative.columns],
    labels=cumulative.columns,
    colors=colors,
    alpha=0.7
)

# Axis range, labels, legend and horizontal grid lines.
ax.set_xlim(x_min, x_max)
ax.set_xlabel('日期')
ax.set_ylabel('发帖数')
ax.set_title('各分区累积发帖')
ax.legend(loc='upper left', fontsize=8, ncol=2)
ax.grid(axis='y', alpha=0.3, linestyle='--', color=gray_color)

# Put one tick at the first day of each displayed month.
if SHOW_MONTHS is not None:
    filtered_starts = [month_starts[m-1] for m in SHOW_MONTHS]
    filtered_labels = [month_labels[m-1] for m in SHOW_MONTHS]
    ax.set_xticks(filtered_starts)
    ax.set_xticklabels(filtered_labels)
else:
    ax.set_xticks(month_starts)
    ax.set_xticklabels(month_labels)

plt.tight_layout()

# Fix: write the file BEFORE plt.show(). Once show() returns, the figure may
# no longer be the current pyplot figure, in which case plt.savefig() would
# write a blank image. Saving through the explicit `fig` handle avoids
# relying on pyplot's current-figure state at all.
os.makedirs('./fig', exist_ok=True)  # the output folder may not exist yet
fig.savefig('./fig/cat.png', transparent=True)
plt.show()

In [None]:
# 提取分区数据
import pandas as pd

# CSV export to read and the column to extract from it.
filename = "./data/user_archive.csv"
col_name = "categories"

categories = pd.read_csv(filename, usecols=[col_name])[col_name].tolist()
print(categories)

# Reduce every string entry to the part before its first '|', stripped of
# surrounding whitespace; non-string entries are passed through unchanged.
clean_categories = []
for _raw_entry in categories:
    if isinstance(_raw_entry, str):
        clean_categories.append(_raw_entry.partition('|')[0].strip())
    else:
        clean_categories.append(_raw_entry)

# Show the cleaned list.
print(clean_categories)


In [None]:
from collections import Counter
import matplotlib.pyplot as plt

# Relies on clean_categories from the previous cell. Keep only string
# entries that are neither blank nor the '-' placeholder.
data = [
    c for c in clean_categories
    if isinstance(c, str) and c.strip() not in ('', '-')
]

counter = Counter(data)

# Categories ordered from most to least frequent, split into parallel lists
# of names and counts.
sorted_items = counter.most_common()
labels = [name for name, _ in sorted_items]
sizes = [count for _, count in sorted_items]

# Colour of each slice; categories without a configured colour get the
# '_default' entry.
_fallback_color = CATEGORY_COLORS['_default']
colors = [CATEGORY_COLORS.get(label, _fallback_color) for label in labels]

# Gray used for the slice labels and the title.
gray_color = '#808080'

fig, ax = plt.subplots(figsize=(12, 8))

# Label each slice "<category> (<share>%)"; slices below 2% get an empty
# label.
total = sum(sizes)
labels_with_pct = [
    f'{label} ({size/total*100:.1f}%)' if size/total*100 >= 2 else ''
    for label, size in zip(labels, sizes)
]

# Clockwise pie starting at 12 o'clock, largest category first.
wedges, texts = ax.pie(
    sizes,
    labels=labels_with_pct,
    startangle=90,
    counterclock=False,
    colors=colors,
    labeldistance=1.05,
    wedgeprops=dict(edgecolor='white', linewidth=1, alpha=0.7)
)

# Style the slice labels.
for text in texts:
    text.set_color(gray_color)
    text.set_fontsize(10)

ax.set_title('各分区发帖占比', color=gray_color)
ax.axis('equal')

plt.tight_layout()
# plt.show() stays disabled here, as in the original cell.
# Fix: create the output folder first so saving cannot fail with
# FileNotFoundError, and save through the explicit figure handle instead of
# pyplot's current-figure state.
os.makedirs('./fig', exist_ok=True)
fig.savefig('./fig/pie.png', transparent=True)


In [None]:
# 还有个每日访问量统计，等会做吧