In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# 读取数据
medal_data = pd.read_csv("summerOly_medal_counts.csv")
sport_data = pd.read_csv("summary.csv")

# 查看数据结构
print(medal_data.head())
print(sport_data.head())


   Rank            NOC  Gold  Silver  Bronze  Total  Year
0     1  United States    11       7       2     20  1896
1     2         Greece    10      18      19     47  1896
2     3        Germany     6       5       2     13  1896
3     4         France     5       4       2     11  1896
4     5  Great Britain     2       3       2      7  1896
                    Team  Year         Sport  Gold_Count  Silver_Count  \
0                Februar  1952       Sailing           0             0   
1  A North American Team  1900          Polo           0             0   
2                    AIN  2024  Canoe Sprint           0             0   
3                    AIN  2024  Cycling Road           0             0   
4                    AIN  2024        Rowing           0             1   

   Bronze_Count  
0             0  
1             4  
2             0  
3             0  
4             0  


In [3]:
# 确保所有必要列存在
print(medal_data.columns)
print(sport_data.columns)

# 确保 Year 和 Total 列为数值类型
medal_data['Year'] = pd.to_numeric(medal_data['Year'], errors='coerce')
medal_data['Gold'] = pd.to_numeric(medal_data['Gold'], errors='coerce')
medal_data['Total'] = pd.to_numeric(medal_data['Total'], errors='coerce')

# 检查并处理空值
medal_data.dropna(subset=['Year', 'Gold', 'Total'], inplace=True)



In [10]:
# 检查数据列名
print(medal_data.columns)

# 确保 Year 和 Total 列是数值类型
medal_data['Year'] = pd.to_numeric(medal_data['Year'], errors='coerce')
medal_data['Total'] = pd.to_numeric(medal_data['Total'], errors='coerce')

# 加权滑动平均计算函数
def calculate_weighted_average(data, years, medal_col, decay=0.1):
    # 检查列是否存在
    if medal_col not in data.columns:
        raise KeyError(f"Column {medal_col} not found in data.")
    if years.name not in data.columns:
        raise KeyError(f"Column {years.name} not found in data.")
    
    # 计算权重
    weights = np.exp(-decay * (max(years) - years))
    if len(weights) == 0 or len(data[medal_col]) == 0:
        return 0  # 如果数据为空，返回0
    
    # 计算加权平均
    weighted_average = (data[medal_col] * weights).sum() / weights.sum()
    return weighted_average

# 计算每个国家的历史加权奖牌数
historical_medal_avg = []
for noc in medal_data['NOC'].unique():
    noc_data = medal_data[medal_data['NOC'] == noc]
    print(f"Processing NOC: {noc}")
    avg = calculate_weighted_average(noc_data, noc_data['Year'], 'Total')
    historical_medal_avg.append({'NOC': noc, 'Weighted_Total': avg})

# 转为DataFrame
weighted_medal_df = pd.DataFrame(historical_medal_avg)
print(weighted_medal_df.head())



Index(['Rank', 'NOC', 'Gold', 'Silver', 'Bronze', 'Total', 'Year'], dtype='object')
Processing NOC: United States
Processing NOC: Greece
Processing NOC: Germany
Processing NOC: France
Processing NOC: Great Britain
Processing NOC: Hungary
Processing NOC: Austria
Processing NOC: Australia
Processing NOC: Denmark
Processing NOC: Switzerland
Processing NOC: Mixed team
Processing NOC: Belgium
Processing NOC: Italy
Processing NOC: Cuba
Processing NOC: Canada
Processing NOC: Spain
Processing NOC: Luxembourg
Processing NOC: Norway
Processing NOC: Netherlands
Processing NOC: India
Processing NOC: Bohemia
Processing NOC: Sweden
Processing NOC: Australasia
Processing NOC: Russian Empire
Processing NOC: Finland
Processing NOC: South Africa
Processing NOC: Estonia
Processing NOC: Brazil
Processing NOC: Japan
Processing NOC: Czechoslovakia
Processing NOC: New Zealand
Processing NOC: Yugoslavia
Processing NOC: Argentina
Processing NOC: Uruguay
Processing NOC: Poland
Processing NOC: Haiti
Processing N

In [11]:
# 合并历史表现和项目参与度
merged_data = medal_data.merge(weighted_medal_df, on='NOC', how='left')

# 计算项目参与效益
sport_data['Total_Medals'] = sport_data['Gold_Count'] + sport_data['Silver_Count'] + sport_data['Bronze_Count']
project_efficiency = sport_data.groupby(['Team', 'Year'])['Total_Medals'].sum().reset_index()
project_efficiency.rename(columns={'Total_Medals': 'Project_Efficiency'}, inplace=True)

# 合并项目参与数据
final_data = merged_data.merge(project_efficiency, left_on=['NOC', 'Year'], right_on=['Team', 'Year'], how='left')
final_data.fillna(0, inplace=True)


In [12]:
# 特征选择
X = final_data[['Weighted_Total', 'Project_Efficiency']]
y = final_data['Total']

# 标准化特征
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# 线性回归模型
reg = LinearRegression()
reg.fit(X_train, y_train)

# 预测和评估
y_pred = reg.predict(X_test)
print("总奖牌数模型评估:")
print("均方误差 (MSE):", mean_squared_error(y_test, y_pred))
print("决定系数 (R²):", r2_score(y_test, y_pred))


总奖牌数模型评估:
均方误差 (MSE): 91.60857547485452
决定系数 (R²): 0.8729050027301677


In [13]:
# 特征选择
X_gold = final_data[['Weighted_Total', 'Project_Efficiency']]
y_gold = final_data['Gold']

# 标准化特征
X_gold_scaled = scaler.fit_transform(X_gold)

# 划分训练集和测试集
X_train_gold, X_test_gold, y_train_gold, y_test_gold = train_test_split(X_gold_scaled, y_gold, test_size=0.2, random_state=42)

# 线性回归模型
gold_reg = LinearRegression()
gold_reg.fit(X_train_gold, y_train_gold)

# 预测和评估
y_pred_gold = gold_reg.predict(X_test_gold)
print("\n金牌数模型评估:")
print("均方误差 (MSE):", mean_squared_error(y_test_gold, y_pred_gold))
print("决定系数 (R²):", r2_score(y_test_gold, y_pred_gold))



金牌数模型评估:
均方误差 (MSE): 18.14166479023816
决定系数 (R²): 0.8480421973381872


In [14]:
# 特征选择
logistic_data = final_data.copy()
logistic_data['First_Medal'] = (logistic_data['Total'] > 0).astype(int)

X_logistic = logistic_data[['Project_Efficiency', 'Weighted_Total']]
y_logistic = logistic_data['First_Medal']

# Logistic 回归
log_reg = LogisticRegression()
log_reg.fit(X_logistic, y_logistic)

# 预测首次获奖概率
logistic_data['Medal_Prob'] = log_reg.predict_proba(X_logistic)[:, 1]
print("\n首次获奖国家预测:")
print(logistic_data[['NOC', 'Year', 'Medal_Prob']].head(10))


ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1

In [None]:
# 总奖牌数预测 vs 真实值
plt.scatter(y_test, y_pred, alpha=0.7, label='Predicted vs Actual')
plt.xlabel("实际总奖牌数")
plt.ylabel("预测总奖牌数")
plt.title("总奖牌数预测")
plt.legend()
plt.show()

# 金牌数预测 vs 真实值
plt.scatter(y_test_gold, y_pred_gold, alpha=0.7, color='orange', label='Predicted vs Actual')
plt.xlabel("实际金牌数")
plt.ylabel("预测金牌数")
plt.title("金牌数预测")
plt.legend()
plt.show()
# 首次获奖概率分布
sns.histplot(logistic_data['Medal_Prob'], bins=20, kde=True, color='green')
plt.title("首次获奖概率分布")
plt.xlabel("获奖概率")
plt.ylabel("国家数量")
plt.show()
