In [4]:
import pandas as pd
import numpy as np
import ast
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
from itertools import combinations
from matplotlib.lines import Line2D # 用于创建自定义图例
from networkx.algorithms import community as nx_comm # 【核心修正】导入正确的社区发现模块
from IPython.display import display # 【新增】导入display函数以美化表格输出

# --- Configurations ---
input_filename = 'Spotify_Weekly_Data_Cleaned.csv'
output_centrality_filename = 'artist_centrality_scores.csv'
output_community_summary_filename = 'community_summary.csv'
output_top10_summary_filename = 'top_10_collaborators_summary.csv'
output_correlation_plot = 'hotness_vs_points_correlation.png'

def analyze_rq3_network_as_tables(file_path):
    """
    执行RQ3的深度分析，重点在于生成清晰的表格来展示网络结构和洞察。
    """
    try:
        # --- Step 1: 加载并准备数据 ---
        print("--- Step 1: Loading and preparing data ---")
        df = pd.read_csv(file_path, sep=';')
        
        if isinstance(df['Artist (Ind.)'].iloc[0], str):
            df['Artist (Ind.)'] = df['Artist (Ind.)'].apply(ast.literal_eval)
            df['Continent'] = df['Continent'].apply(ast.literal_eval)
            df['Nationality'] = df['Nationality'].apply(ast.literal_eval)

        print(f"File loaded successfully. Shape: {df.shape}")

        # --- 【优化建议】数据的进一步清洗和验证 ---
        print("\n--- Additional Data Validation ---")
        print("Distribution of Artist_Count:")
        print(df['Artist_Count'].describe())
        collab_df = df[df['Artist_Count'] > 1].copy()

        # --- Step 2: 构建艺人合作网络 ---
        print("\n--- Step 2: Building the artist collaboration network ---")
        
        edge_weights = defaultdict(int)
        for artists_list in collab_df['Artist (Ind.)']:
            unique_artists = sorted(list(set(artists_list)))
            if len(unique_artists) > 1:
                for artist1, artist2 in combinations(unique_artists, 2):
                    edge = tuple(sorted((artist1, artist2)))
                    edge_weights[edge] += 1
        
        G = nx.Graph()
        for (artist1, artist2), weight in edge_weights.items():
            G.add_edge(artist1, artist2, weight=weight)
            
        print(f"Network built successfully.")
        print(f"Number of artists (nodes): {G.number_of_nodes()}")
        print(f"Number of collaborations (edges): {G.number_of_edges()}")

        # --- Step 3: 计算多种网络中心性指标 ---
        print("\n--- Step 3: Calculating multiple network centrality metrics ---")
        
        degree_centrality = nx.degree_centrality(G)
        betweenness_centrality = nx.betweenness_centrality(G)
        closeness_centrality = nx.closeness_centrality(G)
        eigenvector_centrality = nx.eigenvector_centrality(G, max_iter=1000)
        
        centrality_df = pd.DataFrame({
            'Artist': list(degree_centrality.keys()),
            'Degree_Centrality': list(degree_centrality.values()),
            'Betweenness_Centrality': list(betweenness_centrality.values()),
            'Closeness_Centrality': list(closeness_centrality.values()),
            'Eigenvector_Centrality': list(eigenvector_centrality.values())
        }).sort_values(by='Degree_Centrality', ascending=False).reset_index(drop=True)
        
        centrality_df.to_csv(output_centrality_filename, index=False, sep=';', encoding='utf-8-sig')
        print(f"Centrality scores saved to '{output_centrality_filename}'")
        print("\nTop 10 most central artists (by Degree):")
        # 【修改】使用 display 进行美化输出
        display(centrality_df.head(10))

        # --- 【核心修改】步骤 4: 社区发现与表格化分析 ---
        print("\n--- Step 4: Performing community detection and generating summary tables ---")

        communities = nx_comm.louvain_communities(G, seed=42)
        communities = sorted(communities, key=len, reverse=True)
        print(f"Found {len(communities)} communities. Top 5 sizes: {[len(c) for c in communities[:5]]}")

        node_to_community = {node: i for i, community in enumerate(communities) for node in community}
        centrality_df['Community_ID'] = centrality_df['Artist'].map(node_to_community)

        artist_continent_map = {}
        artist_nationality_map = {} # 新增国籍映射
        unique_collab_df = collab_df.drop_duplicates(subset=['id'])
        for _, row in unique_collab_df.iterrows():
            artists = row['Artist (Ind.)']
            continents = row['Continent']
            nationalities = row['Nationality']
            if isinstance(artists, list) and isinstance(continents, list) and isinstance(nationalities, list):
                for artist, continent, nationality in zip(artists, continents, nationalities):
                    if artist not in artist_continent_map:
                        artist_continent_map[artist] = continent
                        artist_nationality_map[artist] = nationality
        centrality_df['Continent'] = centrality_df['Artist'].map(artist_continent_map)
        centrality_df['Nationality'] = centrality_df['Artist'].map(artist_nationality_map)

        summary_data = []
        for i in range(min(5, len(communities))):
            community_df = centrality_df[centrality_df['Community_ID'] == i]
            leader = community_df.loc[community_df['Degree_Centrality'].idxmax()]
            top_members = community_df.head(5)['Artist'].tolist()
            continent_dist = community_df['Continent'].value_counts(normalize=True).head(2)
            summary_data.append({
                'Community_ID': i, 'Total_Members': len(community_df),
                'Community_Leader': leader['Artist'], 'Leader_Degree': f"{leader['Degree_Centrality']:.3f}",
                'Top_5_Members': ", ".join(top_members),
                'Continent_Distribution': f"{continent_dist.index[0]} ({continent_dist.iloc[0]:.0%})" if not continent_dist.empty else "N/A"
            })
        community_summary_df = pd.DataFrame(summary_data)
        print("\n--- Community Analysis Summary ---")
        # 【修改】使用 display 进行美化输出
        display(community_summary_df)
        community_summary_df.to_csv(output_community_summary_filename, index=False, sep=';', encoding='utf-8-sig')
        print(f"\nCommunity summary saved to '{output_community_summary_filename}'")

        # --- 【新增】步骤 5: 生成Top 10合作者分析表 ---
        print("\n--- Step 5: Generating Top 10 Collaborators Analysis Table ---")
        top_10_artists = centrality_df.head(10).copy()
        # 计算合作次数（即网络图中的“度”）
        top_10_artists['Collaboration_Count'] = top_10_artists['Artist'].apply(lambda x: G.degree[x])
        top_10_summary = top_10_artists[['Artist', 'Collaboration_Count', 'Nationality', 'Continent', 'Degree_Centrality']]
        
        print("\n--- Top 10 Collaborators Summary ---")
        # 【修改】使用 display 进行美化输出
        display(top_10_summary)
        top_10_summary.to_csv(output_top10_summary_filename, index=False, sep=';', encoding='utf-8-sig')
        print(f"\nTop 10 collaborators summary saved to '{output_top10_summary_filename}'")
        
        # --- Step 6: 验证合作效应 (保持不变) ---
        print("\n--- Step 6: Validating the 'Collaboration Effect' ---")
        
        artist_to_degree = centrality_df.set_index('Artist')['Degree_Centrality'].to_dict()
        def get_collaborator_hotness(artists):
            if not isinstance(artists, list) or len(artists) < 2: return 0, 0
            degrees = [artist_to_degree.get(artist, 0) for artist in artists]
            return max(degrees), np.mean(degrees)
        hotness_scores = collab_df['Artist (Ind.)'].apply(get_collaborator_hotness)
        collab_df['Max_Collaborator_Hotness'] = [score[0] for score in hotness_scores]
        collab_df['Mean_Collaborator_Hotness'] = [score[1] for score in hotness_scores]
        corr_max = collab_df[['Max_Collaborator_Hotness', 'Points (Total)']].corr().iloc[0, 1]
        corr_mean = collab_df[['Mean_Collaborator_Hotness', 'Points (Total)']].corr().iloc[0, 1]
        print(f"\nCorrelation between Max Collaborator Hotness and Points (Total): {corr_max:.3f}")
        print(f"Correlation between Mean Collaborator Hotness and Points (Total): {corr_mean:.3f}")

        # 可视化相关性分析
        plt.figure(figsize=(10, 6))
        sns.regplot(x='Max_Collaborator_Hotness', y='Points (Total)', data=collab_df, scatter_kws={'alpha':0.2}, line_kws={'color':'red'})
        plt.title('Collaboration Hotness vs. Song Points', fontsize=16)
        plt.xlabel('Max Collaborator Degree Centrality (Hotness)')
        plt.ylabel('Points (Total)')
        plt.tight_layout()
        plt.savefig(output_correlation_plot)
        plt.close() # 关闭图形，因为我们只保存不显示
        print(f"\nCorrelation plot saved to '{output_correlation_plot}'")

    except (FileNotFoundError, KeyError, ValueError) as e:
        print(f"An error occurred: {e}. Please check the input file and its format.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

# --- 主程序入口 ---
if __name__ == "__main__":
    analyze_rq3_network_as_tables(input_filename)



--- Step 1: Loading and preparing data ---
File loaded successfully. Shape: (464475, 21)

--- Additional Data Validation ---
Distribution of Artist_Count:
count    464475.00000
mean          1.39465
std           0.80745
min           1.00000
25%           1.00000
50%           1.00000
75%           2.00000
max           9.00000
Name: Artist_Count, dtype: float64

--- Step 2: Building the artist collaboration network ---
Network built successfully.
Number of artists (nodes): 1561
Number of collaborations (edges): 3037

--- Step 3: Calculating multiple network centrality metrics ---
Centrality scores saved to 'artist_centrality_scores.csv'

Top 10 most central artists (by Degree):


Unnamed: 0,Artist,Degree_Centrality,Betweenness_Centrality,Closeness_Centrality,Eigenvector_Centrality
0,Bad Bunny,0.040385,0.046588,0.213913,0.255943
1,J Balvin,0.035897,0.098182,0.234427,0.216252
2,Ozuna,0.029487,0.017412,0.209056,0.249955
3,Anuel AA,0.029487,0.035785,0.215694,0.250623
4,Daddy Yankee,0.026923,0.026356,0.209267,0.203761
5,Rauw Alejandro,0.026282,0.013446,0.202467,0.200873
6,Myke Towers,0.023718,0.007724,0.196654,0.1893
7,David Guetta,0.023718,0.044152,0.200459,0.052909
8,Drake,0.023077,0.016766,0.194029,0.010836
9,Farruko,0.022436,0.024973,0.205683,0.192215



--- Step 4: Performing community detection and generating summary tables ---
Found 192 communities. Top 5 sizes: [111, 104, 103, 97, 90]

--- Community Analysis Summary ---


Unnamed: 0,Community_ID,Total_Members,Community_Leader,Leader_Degree,Top_5_Members,Continent_Distribution
0,0,111,Drake,0.023,"Drake, Future, Young Thug, Travis Scott, Tripp...",Anglo-America (71%)
1,1,104,Bad Bunny,0.04,"Bad Bunny, J Balvin, Ozuna, Anuel AA, Daddy Ya...",Latin-America (57%)
2,2,103,David Guetta,0.024,"David Guetta, Becky G, Natti Natasha, Calvin H...",Europe (43%)
3,3,97,DJ Khaled,0.017,"DJ Khaled, Chance the Rapper, The Weeknd, Quav...",Anglo-America (76%)
4,4,90,Pharrell Williams,0.015,"Pharrell Williams, Tyler, The Creator, Capital...",Europe (73%)



Community summary saved to 'community_summary.csv'

--- Step 5: Generating Top 10 Collaborators Analysis Table ---

--- Top 10 Collaborators Summary ---


Unnamed: 0,Artist,Collaboration_Count,Nationality,Continent,Degree_Centrality
0,Bad Bunny,63,Puerto Rico,Latin-America,0.040385
1,J Balvin,56,Colombia,Latin-America,0.035897
2,Ozuna,46,Puerto Rico,Latin-America,0.029487
3,Anuel AA,46,Puerto Rico,Latin-America,0.029487
4,Daddy Yankee,42,Puerto Rico,Latin-America,0.026923
5,Rauw Alejandro,41,Puerto Rico,Latin-America,0.026282
6,Myke Towers,37,Puerto Rico,Latin-America,0.023718
7,David Guetta,37,France,Europe,0.023718
8,Drake,36,Canada,Anglo-America,0.023077
9,Farruko,35,Puerto Rico,Latin-America,0.022436



Top 10 collaborators summary saved to 'top_10_collaborators_summary.csv'

--- Step 6: Validating the 'Collaboration Effect' ---

Correlation between Max Collaborator Hotness and Points (Total): 0.120
Correlation between Mean Collaborator Hotness and Points (Total): 0.116

Correlation plot saved to 'hotness_vs_points_correlation.png'


# RQ3 分析报告：艺人合作网络与成功预测

RQ3的核心任务是探索艺人之间的合作关系网络，并分析“与热门艺人合作”这一策略是否能有效提升歌曲的榜单表现。通过构建一个包含1561位艺人和3037条合作关系的复杂网络，我们得出了以下核心洞察：

## 发现一：音乐世界的“中心-边缘”结构与拉丁音乐圈的核心地位

我们的网络可视化清晰地揭示了音乐合作并非随机，而是呈现出明显的**“中心-边缘”结构**。少数艺人位于网络的绝对核心，拥有极其密集的合作关系，而大量艺人则处于网络的边缘，合作稀少。

### 关键数据点：
- **核心玩家**: 在影响力排名前十的艺人中，有 8位 来自拉丁美洲（如 Bad Bunny, J Balvin, Ozuna），清晰地表明拉丁音乐生态是当前全球流行音乐合作网络的核心驱动力。
- **地区聚集性**: 社区发现算法进一步证实了这一点，最大的几个“艺人圈子”分别以拉丁音乐、北美嘻哈和欧洲流行为主，显示出合作的高度地区性和流派聚集性。

## 发现二：关键角色的量化识别：“社交中心”与“文化桥梁”

通过多种中心性指标，我们能够量化并区分艺人在网络中的不同战略角色：

### “社交中心” (高 Degree Centrality):
以 Bad Bunny（合作63次）为首的艺人，拥有最多的合作对象。他们是各自音乐圈内的“流量中心”，与他们合作意味着能直接触达最广泛的听众基础。

### “文化桥梁” (高 Betweenness Centrality):
J Balvin 的“桥梁”分数（0.098）遥遥领先，数据证明他不仅仅是一个合作者，更是连接拉丁音乐圈与北美流行/嘻哈圈的关键枢纽。

David Guetta（0.044）也扮演了类似的角色，有效连接了欧洲电音与美洲的主流市场。

这些“桥梁”艺人对于音乐风格的融合与全球化趋势的形成起到了不可或缺的作用。

## 发现三：对“合作效应”的量化验证：锦上添花，而非雪中送炭

我们的核心问题是：“抱大腿”真的有用吗？数据给出了一个微妙而清晰的答案。

### 数据证据:
我们发现，一首合作歌曲的“合作热度”（以合作者中最大Degree Centrality衡量）与其最终获得的榜单积分（Points (Total)）之间，存在一个统计上显著但微弱的正相关（r = 0.119）。

### 这说明了什么？
- **合作确实有效**: 与网络中心的“大牌”艺人合作，确实能为歌曲带来可量化的优势，使其榜单表现的期望值更高。
- **效果非常有限**: 然而，这种优势远非决定性因素。散点图显示，即使与顶级艺人合作，歌曲积分依然存在巨大波动。

### 最终结论:
这有力地证明了，合作的质量（如歌曲本身、营销契合度）远比单纯的合作行为更重要。与热门艺人合作更像是“锦上添花”，能够在一定程度上提升歌曲的表现，但并不是成功的决定性因素。
