In [None]:
# %% 导入库和配置

import dask.dataframe as dd
import matplotlib.pyplot as plt
import seaborn as sns
from dask.diagnostics import ProgressBar
from dask.distributed import Client
import pandas as pd
from config import *
import os

pd.set_option("display.max_columns", None)  # show every column
pd.set_option("display.max_rows", None)  # show every row
pd.set_option("display.max_colwidth", None)  # never truncate cell contents

# Connect to the Dask distributed scheduler at this local address.
# NOTE(review): presumably the scheduler has to be started separately before this cell runs — confirm.
client = Client("tcp://127.0.0.1:8786")
# CSV file handed to load_and_clean_data() in the loading cell.
TRAIN_FILE = 'ggg_sg.csv'

In [None]:
# %% 数据加载与初步预处理

def load_and_clean_data(file_path):
    """Read the analysis columns from a CSV file and discard incomplete rows.

    Only Title, DateTime, ContextualText, DomainCountryCode and DocTone are
    read; every row with a missing value in any of them is dropped.
    """
    wanted_columns = ["Title", "DateTime", "ContextualText", "DomainCountryCode", "DocTone"]
    raw = dd.read_csv(file_path, usecols=wanted_columns)
    complete_rows = raw.dropna()
    return complete_rows

# Build the Dask DataFrame that the following cells analyse.
df = load_and_clean_data(TRAIN_FILE)

In [None]:
#%%

# Number of rows per country code; .compute() materialises the result so it can be displayed.
country_counts = df.groupby("DomainCountryCode").size().compute()
country_counts

DomainCountryCode
AM       3973
AO        365
BT        470
DA       1344
IS      16332
JA      78673
PE        234
SL       2632
TD       3079
TP        462
BL          9
ML          6
IV          5
GR       4896
GY        737
MV        383
TZ        333
US    3699803
FI        265
MH         21
MB          8
AF       4898
BD       1478
EG       8715
KS      47103
LE        686
UK     425092
LH        219
CS        153
CV          2
MD         24
CI         57
RB         13
VI         12
SX          2
BM      11872
EC        259
GH      12531
GM      54186
GQ       1060
MX       4002
NO       1072
NZ      64846
SZ      19358
BN          7
DO         19
MR          1
AJ       6398
AS     263285
CH     403656
GI         98
IR       5338
KV        485
SA      23840
UG       1705
UV         20
MA          7
AL        246
BH        201
ID      34552
IZ       5688
LI        539
MJ         51
NG        153
NI      32191
PK      61944
ST        187
SY      15058
TT          1
BA       2114
CD

In [None]:
#%%

# Sort the per-country counts from largest to smallest.
country_counts = country_counts.sort_values(ascending=False)
country_counts

DomainCountryCode
US    3699803
SN    1597004
IN     520575
UK     425092
CH     403656
MY     344802
AS     263285
RP     165861
CA     127235
VM      83748
JA      78673
TH      71496
NZ      64846
PK      61944
GM      54186
QA      51453
KS      47103
AE      41134
CE      40349
BG      39774
SF      38463
ID      34552
NI      32191
TW      26591
BX      24940
HK      24078
SA      23840
RS      21721
SZ      19358
SP      17708
CB      17428
IS      16332
EI      16255
SY      15058
FR      14316
GH      12531
BM      11872
NP      11123
ZI      10625
KE       9317
EG       8715
IT       8485
TU       8255
RO       8097
MC       7495
AJ       6398
JM       6148
IZ       5688
IR       5338
AF       4898
GR       4896
KN       4694
FJ       4656
MU       4286
BR       4017
MX       4002
AM       3973
NL       3695
TD       3079
SL       2632
AR       2517
BA       2114
MT       2011
TV       2003
KU       1970
SW       1889
JO       1861
UG       1705
CU       1518
BD       1478
CY

In [None]:
# %% 数据过滤

def filter_countries_by_threshold(df, threshold=100000):
    """Keep only the rows of countries that have more than ``threshold`` rows.

    The comparison is strict: a country with exactly ``threshold`` rows is
    removed as well.
    """
    rows_per_country = df.groupby("DomainCountryCode").size()
    is_large = rows_per_country > threshold
    # .compute() materialises the surviving country codes so they can be used as an isin() list.
    kept_codes = rows_per_country[is_large].index.compute()
    keep_row = df["DomainCountryCode"].isin(kept_codes)
    return df[keep_row]

# Replace df with the subset belonging to high-volume countries (default threshold).
# NOTE(review): dask.diagnostics.ProgressBar is aimed at the local schedulers; with the
# distributed Client created in the first cell it may display nothing — confirm.
with ProgressBar():
    df = filter_countries_by_threshold(df)

In [None]:
# %% 分组统计分析

def calculate_avg_doctone(df):
    """Return the mean DocTone per country code, highest mean first."""
    tone_by_country = df.groupby("DomainCountryCode")["DocTone"]
    mean_tone = tone_by_country.mean().compute()
    ranked = mean_tone.sort_values(ascending=False)
    return ranked

def get_country_counts(df, avg_doctone):
    """Count rows per country code, ordered like the index of ``avg_doctone``.

    Codes present in ``avg_doctone`` but absent from ``df`` come back as NaN,
    because the counts are reindexed rather than looked up.
    """
    rows_per_country = df.groupby("DomainCountryCode").size().compute()
    desired_order = avg_doctone.index
    return rows_per_country.reindex(desired_order)

# Compute the per-country mean tone first, then line the row counts up with its ordering.
# NOTE(review): dask.diagnostics.ProgressBar is aimed at the local schedulers; with the
# distributed Client created in the first cell it may display nothing — confirm.
with ProgressBar():
    avg_doctone = calculate_avg_doctone(df)
    country_code_counts = get_country_counts(df, avg_doctone)

In [None]:
# %% 可视化国家代码的DocTone分布并保存到本地

def plot_doctone_distribution(avg_doctone, country_code_counts, filename="doctone_distribution.png"):
    """Draw a bar chart of the mean DocTone per country code and save it.

    Parameters
    ----------
    avg_doctone : pandas.Series
        Mean DocTone indexed by country code; bars are drawn in this order.
    country_code_counts : pandas.Series
        Row counts per country, positionally aligned with ``avg_doctone``
        (the i-th count is printed above the i-th bar).
    filename : str, os.PathLike or file-like
        Destination handed to ``Figure.savefig``. For path-like values the
        parent directory is created when it does not exist yet.
    """
    # savefig does not create directories; make sure e.g. "graph/" exists first.
    if isinstance(filename, (str, os.PathLike)):
        out_dir = os.path.dirname(filename)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    fig, ax = plt.subplots(figsize=(14, 8))
    try:
        # Passing `palette` without `hue` is deprecated in seaborn (see the warning this
        # notebook captured); mapping x to hue with legend=False gives the same colouring.
        sns.barplot(
            x=avg_doctone.index,
            y=avg_doctone.values,
            hue=avg_doctone.index,
            palette="viridis",
            legend=False,
            ax=ax,
        )
        # Annotate every bar with its mean and, in parentheses, the number of rows behind it.
        for position, mean_tone in enumerate(avg_doctone.values):
            row_count = country_code_counts.iloc[position]
            ax.text(position, mean_tone, f"{mean_tone:.2f}\n({row_count})", ha="center", va="bottom")
        ax.set_title("Average DocTone by Country Code")
        ax.set_xlabel("Country Code")
        ax.set_ylabel("Average DocTone")
        fig.savefig(filename)
    finally:
        # Always release the figure, even when plotting or saving fails.
        plt.close(fig)

# Save the per-country bar chart under graph/.
plot_doctone_distribution(avg_doctone, country_code_counts, 'graph/doctone_distribution.png')


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=avg_doctone.index, y=avg_doctone.values, palette="viridis")


In [None]:
# %% 分析目标国家的DocTone年度变化并保存到本地

def plot_doctone_change_over_years(df, target_countries, filename="doctone_yearly_change.png"):
    """Plot the yearly mean DocTone of the selected countries and save the figure.

    Parameters
    ----------
    df : dask DataFrame
        Must provide the columns DateTime, DomainCountryCode and DocTone.
        The frame is not modified: derived columns live on an internal copy.
    target_countries : list-like
        Country codes to include, one line per code.
    filename : str, os.PathLike or file-like
        Destination handed to ``Figure.savefig``. For path-like values the
        parent directory is created when it does not exist yet.
    """
    # savefig does not create directories; make sure e.g. "graph/" exists first.
    if isinstance(filename, (str, os.PathLike)):
        out_dir = os.path.dirname(filename)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    # assign() returns a new frame; item assignment (df["Year"] = ...) would alter the
    # caller's frame, which the other cells of the notebook keep using.
    parsed_dates = dd.to_datetime(df["DateTime"])
    df_with_year = df.assign(Year=parsed_dates.dt.year)
    df_target = df_with_year[df_with_year["DomainCountryCode"].isin(target_countries)]

    avg_doctone_yearly = (
        df_target.groupby(["Year", "DomainCountryCode"])["DocTone"].mean().compute().reset_index()
    )

    fig, ax = plt.subplots(figsize=(14, 8))
    try:
        sns.lineplot(
            data=avg_doctone_yearly, x="Year", y="DocTone", hue="DomainCountryCode", marker="o", ax=ax
        )
        ax.set_title("Average DocTone Change Over the Years by Country")
        ax.set_xlabel("Year")
        ax.set_ylabel("Average DocTone")
        ax.legend(title="Country Code")
        fig.savefig(filename)
    finally:
        # Always release the figure, even when plotting or saving fails.
        plt.close(fig)

# Countries whose yearly DocTone trend is plotted; the figure is saved under graph/.
target_countries = ["US", "SN", "CH", "MY", "IN"]
plot_doctone_change_over_years(df, target_countries, 'graph/doctone_yearly_change.png')

In [None]:
# %% 分析目标国家的DocTone月度变化并保存到本地

def plot_doctone_change_over_months(df, target_countries, window=6, filename="graph"):
    """Plot the smoothed monthly mean DocTone of the selected countries and save it.

    The monthly means are smoothed per country with a trailing rolling mean of
    ``window`` months (``min_periods=1``, so the first months use fewer points).

    Parameters
    ----------
    df : dask DataFrame
        Must provide the columns DateTime, DomainCountryCode and DocTone.
        The frame is not modified: derived columns live on an internal copy.
    target_countries : list-like
        Country codes to include, one line per code.
    window : int
        Number of consecutive monthly values averaged by the rolling mean.
    filename : str, os.PathLike or file-like
        Destination handed to ``Figure.savefig``. For path-like values the
        parent directory is created when it does not exist yet.
    """
    # savefig does not create directories; make sure e.g. "graph/" exists first.
    if isinstance(filename, (str, os.PathLike)):
        out_dir = os.path.dirname(filename)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    # assign() returns a new frame; item assignment (df["YearMonth"] = ...) would alter
    # the caller's frame, which the other cells of the notebook keep using.
    parsed_dates = dd.to_datetime(df["DateTime"])
    df_monthly = df.assign(YearMonth=parsed_dates.dt.to_period("M").astype(str))
    df_target = df_monthly[df_monthly["DomainCountryCode"].isin(target_countries)]

    # "YYYY-MM" strings sort chronologically, so an explicit sort guarantees that the
    # rolling window below runs in time order regardless of how the groupby ordered rows.
    avg_doctone_monthly = (
        df_target.groupby(["YearMonth", "DomainCountryCode"])["DocTone"]
        .mean()
        .compute()
        .reset_index()
        .sort_values(["YearMonth", "DomainCountryCode"], ignore_index=True)
    )

    # transform() always returns one value per input row. groupby.apply() with a
    # Series result collapses into a wide frame when every country covers the same
    # months, which leaves no YearMonth/DocTone columns to plot.
    smoothed_data = avg_doctone_monthly.assign(
        DocTone=avg_doctone_monthly.groupby("DomainCountryCode")["DocTone"].transform(
            lambda tone: tone.rolling(window=window, min_periods=1).mean()
        )
    )

    fig, ax = plt.subplots(figsize=(14, 8))
    try:
        sns.lineplot(
            data=smoothed_data, x="YearMonth", y="DocTone", hue="DomainCountryCode", marker="o", ax=ax
        )
        ax.set_title("Average DocTone Change Over the Months by Country")
        ax.set_xlabel("Year-Month")
        ax.set_ylabel("Average DocTone")
        ax.tick_params(axis="x", labelrotation=45)
        ax.legend(title="Country Code")
        fig.savefig(filename)
    finally:
        # Always release the figure, even when plotting or saving fails.
        plt.close(fig)

# Narrower country selection for the monthly view; the figure is saved under graph/.
target_countries = ["SN", "CH", "MY"]
plot_doctone_change_over_months(df, target_countries, filename='graph/doctone_monthly_change.png')

  smoothed_data = avg_doctone_monthly.groupby('DomainCountryCode').apply(
