# COMP5310 Project Stage 2 Code

## Group Component 1

In [1]:
# install the required packages
%pip install scikit-learn
%pip install pandas
%pip install numpy==1.26.4
%pip install scipy==1.11.4
%pip install matplotlib seaborn
%pip install xgboost

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting scipy==1.11.4
  Downloading scipy-1.11.4-cp312-cp312-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.4 kB ? eta -:--:--
     ------ --------------------------------- 10.2/60.4 kB ? eta -:--:--
     ------------------- ------------------ 30.7/60.4 kB 262.6 kB/s eta 0:00:01
     -------------------------------------- 60.4/60.4 kB 401.3 kB/s eta 0:00:00
Downloading scipy-1.11.4-cp312-cp312-win_amd64.whl (43.7 MB)
   ---------------------------------------- 0.0/43.7 MB ? eta -:--:--
   ---------------------------------------- 0.3/43.7 MB 8.6 MB/s eta 0:00:06
    --------------------------------------- 0.7/43.7 MB 8.7 MB/s eta 0:00:05
    --------------------------------------- 1.0/43.7 MB 8.3 MB/s eta 0:00:06
   - -----------------------------------

In [7]:
# import the required packages
from sklearn.model_selection import train_test_split, GridSearchCV
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
# Lift pandas' row and column display limits (None means "no limit") so
# that printed frames and series are shown in full.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

### Preliminary Changes to Data

In [4]:
# Load the cleaned data set; 'MarketValue' is the regression target and
# every other column is a candidate feature.
df = pd.read_csv('Group1_Activity9_FinalCleanData.csv')
labels = df['MarketValue']
features = df.drop(columns='MarketValue')

# Columns that relate to the market
market_related_columns = ['ReleaseClause', 'WeeklyWage']
# Columns that hold a comprehensive evaluation of the player
comprehensive_evaluation_columns = ['Rating in Scale 100', 'Potential in Scale 100', 'BestOverall']

# Feature set without the market-related columns
features_without_market_related_factors = features.drop(columns=market_related_columns)

# Feature set without the market-related and comprehensive-evaluation columns
features_without_market_and_comprehensive_evaluation = features.drop(
    columns=market_related_columns + comprehensive_evaluation_columns
)

### Data split for train/validation/test sets

In [5]:
# Split the full feature set: 30% is held out first, then that held-out part
# is halved into validation and test sets (70% / 15% / 15% overall).
(
    train_feature,
    rest_feature,
    train_labels,
    rest_labels,
) = train_test_split(features, labels, test_size=0.3, random_state=42)
(
    validation_feature,
    test_feature,
    validation_labels,
    test_labels,
) = train_test_split(rest_feature, rest_labels, test_size=0.5, random_state=42)

# Apply the same two-step split (same sizes, same seed) to the feature set
# without the market-related and comprehensive-evaluation columns.
(
    train_feature_without_market_and_comprehensive_evaluation,
    rest_feature_without_market_and_comprehensive_evaluation,
    train_labels_without_market_and_comprehensive_evaluation,
    rest_labels_without_market_and_comprehensive_evaluation,
) = train_test_split(
    features_without_market_and_comprehensive_evaluation,
    labels,
    test_size=0.3,
    random_state=42,
)
(
    validation_feature_without_market_and_comprehensive_evaluation,
    test_feature_without_market_and_comprehensive_evaluation,
    validation_labels_without_market_and_comprehensive_evaluation,
    test_labels_without_market_and_comprehensive_evaluation,
) = train_test_split(
    rest_feature_without_market_and_comprehensive_evaluation,
    rest_labels_without_market_and_comprehensive_evaluation,
    test_size=0.5,
    random_state=42,
)

In [6]:
# Report the (rows, columns) shape of every split as a sanity check.
full_feature_splits = [
    ('train_feature:', train_feature),
    ('validation_feature:', validation_feature),
    ('test_feature:', test_feature),
]
reduced_feature_splits = [
    ('train_feature_without_market_and_comprehensive_evaluation',
     train_feature_without_market_and_comprehensive_evaluation),
    ('validation_feature_without_market_and_comprehensive_evaluation',
     validation_feature_without_market_and_comprehensive_evaluation),
    ('test_feature_without_market_and_comprehensive_evaluation',
     test_feature_without_market_and_comprehensive_evaluation),
]

for heading, frame in full_feature_splits:
    print(heading)
    print(frame.shape)

# visual separator between the full and the reduced feature sets
print('--------------------------------------------------------')

for heading, frame in reduced_feature_splits:
    print(heading)
    print(frame.shape)

train_feature:
(10048, 93)
validation_feature:
(2153, 93)
test_feature:
(2154, 93)
--------------------------------------------------------
train_feature_without_market_and_comprehensive_evaluation
(10048, 88)
validation_feature_without_market_and_comprehensive_evaluation
(2153, 88)
test_feature_without_market_and_comprehensive_evaluation
(2154, 88)


----------------------- End of Group Component 1 -----------------------

# Individual Component - Unikey: cjia0865

Title: Gradient Boosting Decision Trees (GBDT)

Unikey: cjia0865

## Data Processing

In [17]:
# Check for missing values: count the nulls in each column of the reduced feature set
missing_value_counts = features_without_market_and_comprehensive_evaluation.isna().sum()
print(missing_value_counts)

Age                        0
Height                     0
Weight                     0
PreferredFoot              0
AttackingSkills            0
CrossingAbility            0
FinishingAbility           0
HeadingAccuracy            0
ShortPassing               0
Volleys                    0
SkillAttributes            0
Dribbling                  0
CurveAbility               0
FreeKickAccuracy           0
LongPassing                0
BallControl                0
MovementSkills             0
Acceleration               0
SprintSpeed                0
Agility                    0
Reactions                  0
Balance                    0
PowerSkills                0
ShotPower                  0
JumpingAbility             0
Stamina                    0
Strength                   0
LongShots                  0
MentalAttributes           0
Aggression                 0
Interceptions              0
Positioning                0
Vision                     0
Penalties                  0
Composure     

## Initial Model Development and Evaluation

### Initialize and train the model

In [8]:
# Baseline decision tree with default hyperparameters; the fixed seed keeps
# the fitted tree reproducible across runs.
dt_regressor = DecisionTreeRegressor(random_state=42)

# Fit on the training split of the full feature set
dt_regressor.fit(X=train_feature, y=train_labels)

### Validate the model

In [9]:
# Predict on the validation set with the baseline tree
validation_predictions = dt_regressor.predict(X=validation_feature)

# Evaluation metrics: mean squared error and R² score
mse = mean_squared_error(y_true=validation_labels, y_pred=validation_predictions)
r2 = r2_score(y_true=validation_labels, y_pred=validation_predictions)

print("验证集均方误差:", mse)
print("验证集R²得分:", r2)

验证集均方误差: 0.01975575986435505
验证集R²得分: 0.9867617675258415


## Model Optimization

In [10]:
# Parameter grid: candidate hyperparameter values for the decision tree
param_grid = dict(
    max_depth=[None, 5, 10, 15, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Grid search over the grid with 5-fold cross-validation, scored by
# negative mean squared error
base_tree = DecisionTreeRegressor(random_state=42)
grid_search = GridSearchCV(
    base_tree,
    param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=5,
)

# Run the search on the training split of the full feature set
grid_search.fit(X=train_feature, y=train_labels)

# Report the best parameter combination
print("找到的最佳参数:", grid_search.best_params_)

# Keep the best estimator found by the search
best_dt_regressor = grid_search.best_estimator_


找到的最佳参数: {'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 10}


In [11]:
# Predict on the validation set with the best model from the grid search
validation_predictions = best_dt_regressor.predict(X=validation_feature)

# Evaluation metrics: mean squared error and R² score
mse = mean_squared_error(y_true=validation_labels, y_pred=validation_predictions)
r2 = r2_score(y_true=validation_labels, y_pred=validation_predictions)

print("调优后验证集均方误差:", mse)
print("调优后验证集R²得分:", r2)


调优后验证集均方误差: 0.017368161240901497
调优后验证集R²得分: 0.9883616850106298


In [12]:
# Predict on the test set with the best model from the grid search
test_predictions = best_dt_regressor.predict(X=test_feature)

# Evaluation metrics: mean squared error and R² score
test_mse = mean_squared_error(y_true=test_labels, y_pred=test_predictions)
test_r2 = r2_score(y_true=test_labels, y_pred=test_predictions)

print("测试集均方误差:", test_mse)
print("测试集R²得分:", test_r2)


测试集均方误差: 0.019113272950273662
测试集R²得分: 0.9884849945052876


In [13]:
# Alternative baseline tree for the feature set without the market-related
# and comprehensive-evaluation columns
dt_regressor_alt = DecisionTreeRegressor(random_state=42)

# Fit on the training split of the reduced feature set
dt_regressor_alt.fit(
    X=train_feature_without_market_and_comprehensive_evaluation,
    y=train_labels_without_market_and_comprehensive_evaluation,
)

In [14]:
# Predict on the validation set of the reduced feature set
validation_predictions_alt = dt_regressor_alt.predict(
    X=validation_feature_without_market_and_comprehensive_evaluation
)

# Evaluate: mean squared error and R² score
mse_alt = mean_squared_error(
    y_true=validation_labels_without_market_and_comprehensive_evaluation,
    y_pred=validation_predictions_alt,
)
r2_alt = r2_score(
    y_true=validation_labels_without_market_and_comprehensive_evaluation,
    y_pred=validation_predictions_alt,
)

print("替代模型验证集均方误差:", mse_alt)
print("替代模型验证集R²得分:", r2_alt)


替代模型验证集均方误差: 0.21998983400630162
替代模型验证集R²得分: 0.852585950400141


In [15]:
# Parameter grid: candidate hyperparameter values for the alternative tree
param_grid_alt = dict(
    max_depth=[None, 5, 10, 15, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Grid search over the grid with 5-fold cross-validation, scored by
# negative mean squared error
base_tree_alt = DecisionTreeRegressor(random_state=42)
grid_search_alt = GridSearchCV(
    base_tree_alt,
    param_grid_alt,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=5,
)

# Run the search on the training split of the reduced feature set
grid_search_alt.fit(
    X=train_feature_without_market_and_comprehensive_evaluation,
    y=train_labels_without_market_and_comprehensive_evaluation,
)

# Report the best parameter combination
print("替代模型找到的最佳参数:", grid_search_alt.best_params_)

# Keep the best estimator found by the search
best_dt_regressor_alt = grid_search_alt.best_estimator_


替代模型找到的最佳参数: {'max_depth': 15, 'min_samples_leaf': 4, 'min_samples_split': 10}


In [16]:
# Predict on the validation set with the best alternative model
validation_predictions_alt = best_dt_regressor_alt.predict(
    X=validation_feature_without_market_and_comprehensive_evaluation
)

# Evaluation metrics: mean squared error and R² score
mse_alt = mean_squared_error(
    y_true=validation_labels_without_market_and_comprehensive_evaluation,
    y_pred=validation_predictions_alt,
)
r2_alt = r2_score(
    y_true=validation_labels_without_market_and_comprehensive_evaluation,
    y_pred=validation_predictions_alt,
)

print("替代模型调优后验证集均方误差:", mse_alt)
print("替代模型调优后验证集R²得分:", r2_alt)


替代模型调优后验证集均方误差: 0.1863324719165698
替代模型调优后验证集R²得分: 0.875139574602403


## Model results

----------------------- End of Individual Component - Unikey: cjia0865 -----------------------

## Group Component 2

### Optimal Model Comparison

### Final Model Recommendation

----------------------- End of Group Component 2 -----------------------