In [54]:
import pandas as pd

# 设定文件路径
train_csv_path = 'train.csv'
test_csv_path = 'test.csv'

# 加载数据
train_data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)

# 显示训练集和测试集的前几行，以确认加载正确
print(train_data.head())
print(test_data.head())


   Unnamed: 0 policy_id  policy_tenure            age_of_car  \
0       15194        P1             54  5 years and 3 months   
1       26329        P2             36              8 months   
2        1174        P3             79              4 months   
3       55371        P4             46              0 months   
4       45259        P5             66  5 years and 3 months   

   age_of_policyholder area_cluster  population_density  make segment model  \
0                   37           C9               17804     3      C2    M4   
1                   42           C3                4076     1       A    M1   
2                   37           C5               34738     1       A    M1   
3                   37           C9               17804     1      B1    M8   
4                   36          C10               73430     3      C2    M4   

   ... is_brake_assist is_power_door_locks is_central_locking  \
0  ...             Yes                 Yes                Yes   
1  ...    

In [55]:
# 继续上述代码，首先检查缺失值
missing_values_train = train_data.isnull().sum()
missing_values_test = test_data.isnull().sum()

# 打印出每个数据集中缺失值的统计数据
print("Missing values in train data:\n", missing_values_train)
print("Missing values in test data:\n", missing_values_test)

# 接下来，检查数据集中的不一致数据
# 对于年龄，我们可能需要考虑一个合理的上限，例如100岁
# 车龄应该是非负数，我们也可以检查是否有异常高的车龄值



Missing values in train data:
 Unnamed: 0                          0
policy_id                           0
policy_tenure                       0
age_of_car                          0
age_of_policyholder                 0
area_cluster                        0
population_density                  0
make                                0
segment                             0
model                               0
fuel_type                           0
max_torque                          0
max_power                           0
engine_type                         0
airbags                             0
is_esc                              0
is_adjustable_steering              0
is_tpms                             0
is_parking_sensors                  0
is_parking_camera                   0
rear_brakes_type                    0
displacement                        0
cylinder                            0
transmission_type                   0
gear_box                            0
steering_type      

In [56]:
# 接下来，检查数据集中的不一致数据
# 对于年龄，我们可能需要考虑一个合理的上限，例如100岁
# 车龄应该是非负数，我们也可以检查是否有异常高的车龄值

# 我们可以假设 age_of_policyholder 应当是一个0到100之间的数
illogical_age_policyholder_train = train_data[(train_data['age_of_policyholder'] < 0) | (train_data['age_of_policyholder'] > 100)]
illogical_age_policyholder_test = test_data[(test_data['age_of_policyholder'] < 0) | (test_data['age_of_policyholder'] > 100)]

def convert_age_to_months(age_str):
    # 如果数据已经是整数，直接返回
    if isinstance(age_str, int):
        return age_str
    # 否则，执行字符串到月数的转换
    parts = age_str.split(' ')
    years = int(parts[0]) if parts[0].isdigit() else 0
    months = int(parts[2]) if len(parts) > 2 and parts[2].isdigit() else 0
    return years * 12 + months

# 应用转换函数到数据集中，并确保数据类型是字符串
train_data['age_of_car'] = train_data['age_of_car'].apply(lambda x: convert_age_to_months(x) if isinstance(x, str) else x)
test_data['age_of_car'] = test_data['age_of_car'].apply(lambda x: convert_age_to_months(x) if isinstance(x, str) else x)

illogical_age_of_car_train = train_data[train_data['age_of_car'] < 0]
illogical_age_of_car_test = test_data[test_data['age_of_car'] < 0]

# 打印出不一致数据的统计数据
print("train_data['age_of_car']", train_data['age_of_car'])
print("Illogical age of policyholder in train data:\n", illogical_age_policyholder_train)
print("Illogical age of policyholder in test data:\n", illogical_age_policyholder_test)
print("Illogical age of car in train data:\n", illogical_age_of_car_train)
print("Illogical age of car in test data:\n", illogical_age_of_car_test)

train_data['age_of_car'] 0        60
1        96
2        48
3         0
4        60
         ..
39995    12
39996    36
39997    96
39998    24
39999    24
Name: age_of_car, Length: 40000, dtype: int64
Illogical age of policyholder in train data:
 Empty DataFrame
Columns: [Unnamed: 0, policy_id, policy_tenure, age_of_car, age_of_policyholder, area_cluster, population_density, make, segment, model, fuel_type, max_torque, max_power, engine_type, airbags, is_esc, is_adjustable_steering, is_tpms, is_parking_sensors, is_parking_camera, rear_brakes_type, displacement, cylinder, transmission_type, gear_box, steering_type, turning_radius, length, width, height, gross_weight, is_front_fog_lights, is_rear_window_wiper, is_rear_window_washer, is_rear_window_defogger, is_brake_assist, is_power_door_locks, is_central_locking, is_power_steering, is_driver_seat_height_adjustable, is_day_night_rear_view_mirror, is_ecw, is_speed_alert, ncap_rating, is_claim]
Index: []

[0 rows x 45 columns]
Illogical 

In [57]:
import pandas as pd

# 设定年龄分组的区间
age_bins = [18, 30, 45, 60, 75, 90, 105]  # 你可以根据需要调整这些区间
age_labels = ['18-29', '30-44', '45-59', '60-74', '75-89', '90+']

# 对年龄进行分组
train_data['age_group'] = pd.cut(train_data['age_of_policyholder'], bins=age_bins, labels=age_labels, include_lowest=True)

# 计算每个年龄组的索赔比例
# claim_rates_by_age_group = train_data.groupby('age_group')['is_claim'].mean()
claim_rates_by_age_group = pd.crosstab(train_data['age_group'], train_data['is_claim'], normalize='index')
# 输出每个年龄组的索赔比例
print(claim_rates_by_age_group)


is_claim          0         1
age_group                    
18-29      0.946694  0.053306
30-44      0.934435  0.065565
45-59      0.933997  0.066003
60-74      0.903846  0.096154
75-89      0.833333  0.166667


In [58]:
# 基于12个月为间隔创建车辆年龄的分组
car_age_bins = list(range(0, 361, 12))  # 从0开始，每12个月作为一个界限，直到360个月
car_age_labels = [f'{i//12}-{(i//12)+1} years' for i in range(0, 360, 12)]  # 标签形式为 "0-1 years", "1-2 years", ...

# 应用分组
train_data['car_age_group'] = pd.cut(train_data['age_of_car'], bins=car_age_bins, labels=car_age_labels, include_lowest=True)

# 用pd.crosstab创建年龄组与车龄组的交叉表，并且归一化行
age_car_crosstab = pd.crosstab(train_data['age_group'], train_data['car_age_group'], normalize='index')

# 输出不同年龄段拥有不同年龄车辆的比例
age_car_crosstab


car_age_group,0-1 years,1-2 years,2-3 years,3-4 years,4-5 years,5-6 years,6-7 years,7-8 years,8-9 years,9-10 years,10-11 years,11-12 years,12-13 years,13-14 years,14-15 years,15-16 years,16-17 years,20-21 years
age_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
18-29,0.27134,0.157498,0.155144,0.223061,0.075787,0.020854,0.009737,0.082765,0.001785,0.001298,0.000487,8.1e-05,0.000162,0.0,0.0,0.0,0.0,0.0
30-44,0.283266,0.143337,0.139318,0.23174,0.075076,0.020855,0.009512,0.093845,0.000966,0.001068,0.000458,0.000305,0.0,5.1e-05,0.000102,5.1e-05,5.1e-05,0.0
45-59,0.314375,0.140344,0.115794,0.227632,0.060693,0.018412,0.010638,0.106383,0.001909,0.002319,0.000818,0.000273,0.0,0.0,0.0,0.000136,0.000136,0.000136
60-74,0.29142,0.14645,0.137574,0.224852,0.060651,0.026627,0.017751,0.08284,0.004438,0.004438,0.001479,0.0,0.001479,0.0,0.0,0.0,0.0,0.0
75-89,0.5,0.0,0.0,0.166667,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [59]:
# 基于12个月为界限创建保单时长的分组
policy_tenure_bins = list(range(0, int(train_data['policy_tenure'].max()) + 12, 12))  # 确保包含最大值
policy_tenure_labels = [f'{i}-{i+11} months' for i in range(0, int(train_data['policy_tenure'].max()), 12)]

# 应用分组
train_data['policy_tenure_group'] = pd.cut(train_data['policy_tenure'], bins=policy_tenure_bins, labels=policy_tenure_labels, include_lowest=True)

# 用pd.crosstab创建年龄组与保单时长组的交叉表，并且归一化行
age_policy_tenure_crosstab = pd.crosstab(train_data['age_group'], train_data['policy_tenure_group'], normalize='index')

# 输出不同年龄段内不同保单时长的比例
age_policy_tenure_crosstab


policy_tenure_group,0-11 months,12-23 months,24-35 months,36-47 months,48-59 months,60-71 months,72-83 months,84-95 months,96-107 months,108-119 months
age_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
18-29,0.247059,0.127059,0.095578,0.108154,0.074077,0.064828,0.059473,0.141501,0.08211,0.000162
30-44,0.205748,0.111139,0.08708,0.100305,0.07294,0.066277,0.069329,0.184944,0.101984,0.000254
45-59,0.164735,0.093277,0.072549,0.088095,0.06873,0.063685,0.07364,0.233329,0.141688,0.000273
60-74,0.116864,0.050296,0.050296,0.069527,0.047337,0.050296,0.075444,0.352071,0.18787,0.0
75-89,0.166667,0.166667,0.0,0.0,0.0,0.0,0.0,0.333333,0.333333,0.0


In [60]:
# 使用pd.crosstab来创建年龄组与车辆段的交叉表，并且归一化行
age_segment_crosstab = pd.crosstab(train_data['age_group'], train_data['segment'], normalize='index')

# 使用pd.crosstab来创建年龄组与车型的交叉表，并且归一化行
age_model_crosstab = pd.crosstab(train_data['age_group'], train_data['model'], normalize='index')

# 输出不同年龄段内不同车辆段的比例
print("Vehicle Segment Distribution by Age Group:\n", age_segment_crosstab)

# 输出不同年龄段内不同车型的比例
print("Model Distribution by Age Group:\n", age_model_crosstab)


Vehicle Segment Distribution by Age Group:
 segment           A        B1        B2        C1        C2   Utility
age_group                                                            
18-29      0.271400  0.092657  0.301014  0.073671  0.235862  0.025396
30-44      0.302645  0.067192  0.309868  0.058291  0.241913  0.020092
45-59      0.330697  0.042411  0.339425  0.045820  0.226783  0.014864
60-74      0.201183  0.047337  0.417160  0.036982  0.286982  0.010355
75-89      0.166667  0.166667  0.333333  0.000000  0.333333  0.000000
Model Distribution by Age Group:
 model            M1       M10       M11        M2        M3        M4  \
age_group                                                               
18-29      0.227262  0.025396  0.006815  0.020527  0.044138  0.235862   
30-44      0.260173  0.020092  0.006612  0.017854  0.042472  0.241913   
45-59      0.299605  0.014864  0.004091  0.015137  0.031092  0.226783   
60-74      0.174556  0.010355  0.010355  0.007396  0.026627  0.2869

In [61]:
# 首先，提取扭矩的数值部分
train_data['max_torque_value'] = train_data['max_torque'].str.extract(r'(\d+\.?\d*)').astype(float)

# 然后根据需要创建扭矩的分组
torque_bins = [0, 100, 200, 300, 400, 500, 600, 700]  # 这些分组界限可能需要根据数据分布进行调整
torque_labels = ['0-99 Nm', '100-199 Nm', '200-299 Nm', '300-399 Nm', '400-499 Nm', '500-599 Nm', '600+ Nm']
train_data['torque_group'] = pd.cut(train_data['max_torque_value'], bins=torque_bins, labels=torque_labels, include_lowest=True)

# 计算年龄组与扭矩组的交叉表
age_torque_crosstab = pd.crosstab(train_data['age_group'], train_data['torque_group'], normalize='index')
# 对于is_parking_camera的分析可以保持不变
age_camera_crosstab = pd.crosstab(train_data['age_group'], train_data['is_parking_camera'], normalize='index')
# 对于is_parking_camera的分析可以保持不变
age_ncap_rating_crosstab = pd.crosstab(train_data['age_group'], train_data['ncap_rating'], normalize='index')
# 对于is_parking_camera的分析可以保持不变
age_fuel_type_crosstab = pd.crosstab(train_data['age_group'], train_data['fuel_type'], normalize='index')

age_area_cluster_crosstab = pd.crosstab(train_data['age_group'], train_data['area_cluster'], normalize='index')

age_airbags_crosstab = pd.crosstab(train_data['age_group'], train_data['airbags'], normalize='index')

age_is_esc_crosstab = pd.crosstab(train_data['age_group'], train_data['is_esc'], normalize='index')
# population_density is_esc
# 打印结果
print(age_ncap_rating_crosstab)
print(age_fuel_type_crosstab)
print("Torque Distribution by Age Group:\n", age_torque_crosstab)
print("\nParking Camera Availability by Age Group:\n", age_camera_crosstab)
print("\nArea Cluster Distribution by Age Group:\n", age_area_cluster_crosstab)
print("\nAirbags Distribution by Age Group:\n", age_airbags_crosstab)
print("\nESC Availability by Age Group:\n", age_is_esc_crosstab)

ncap_rating         0         2         3         4         5
age_group                                                    
18-29        0.307099  0.380122  0.235862  0.046329  0.030588
30-44        0.330214  0.356968  0.241913  0.033825  0.037080
45-59        0.358380  0.356880  0.226783  0.026592  0.031365
60-74        0.233728  0.439349  0.286982  0.019231  0.020710
75-89        0.166667  0.500000  0.333333  0.000000  0.000000
fuel_type       CNG    Diesel    Petrol
age_group                              
18-29      0.345314  0.305963  0.348722
30-44      0.347457  0.306205  0.346338
45-59      0.356880  0.280649  0.362471
60-74      0.232249  0.316568  0.451183
75-89      0.333333  0.333333  0.333333
Torque Distribution by Age Group:
 torque_group   0-99 Nm  100-199 Nm  200-299 Nm
age_group                                     
18-29         0.389452    0.374686    0.235862
30-44         0.389929    0.368159    0.241913
45-59         0.387972    0.385245    0.226783
60-74         0.

In [63]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

# 假设 train_data 已经被定义并加载数据

# 定义特征列（'policy_tenure'是数值特征，所以不需要进行OneHot编码）
# 定义特征列（'policy_tenure'是数值特征，所以不需要进行OneHot编码）
feature_columns = ['policy_tenure', 'is_claim']  # 这里只加入了'policy_tenure'作为数值特征的例子
categorical_features = ['car_age_group', 'area_cluster',]  # 这是需要编码的分类特征

# 创建一个预处理器，使用OneHotEncoder对分类特征进行编码
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'  # 对其余的列直接传递
)

# 创建一个包含预处理器和 SVM 回归模型的管道
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', SVR())
])

# 准备特征和目标变量
X = train_data[categorical_features + feature_columns]  # 加入分类特征和数值特征
y = train_data['age_of_policyholder']  # 目标变量

# 划分训练集和验证集
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 定义要搜索的参数网格
param_grid = {
    'model__C': [0.1, 1, 10, 100],  # SVM正则化参数
    'model__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  # 核函数类型
    'model__gamma': ['scale', 'auto'],  # 核系数
    'model__epsilon': [0.01, 0.1, 0.5, 1]  # Epsilon in the epsilon-SVR model
}

# 创建网格搜索对象，使用模型管道和参数网格
grid_search = GridSearchCV(
    estimator=model_pipeline, 
    param_grid=param_grid,
    cv=5,  # 5折交叉验证
    scoring='neg_mean_squared_error',  # 使用负均方误差作为评分标准
    verbose=2,  # 打印详细信息
    n_jobs=-1  # 使用所有可用的CPU核心
)

# 训练网格搜索模型
grid_search.fit(X_train, y_train)

# 打印最佳参数和对应的MSE
print("Best parameters found: ", grid_search.best_params_)
best_cv_score = -grid_search.best_score_
print("Best CV MSE: ", best_cv_score)

# 使用最佳参数模型在验证集上的表现
best_model = grid_search.best_estimator_
y_pred_val = best_model.predict(X_val)
best_val_mse = mean_squared_error(y_val, y_pred_val)
print("Validation MSE with Best SVM Model: ", best_val_mse)

Fitting 5 folds for each of 128 candidates, totalling 640 fits


Fatal Python error: init_sys_streams: can't initialize sys standard streams
Python runtime state: core initialized
OSError: [Errno 9] Bad file descriptor

Current thread 0x00007ff84868b040 (most recent call first):
  <no Python frame>
Fatal Python error: init_sys_streams: can't initialize sys standard streams
Python runtime state: core initialized
OSError: [Errno 9] Bad file descriptor

Current thread 0x00007ff84868b040 (most recent call first):
  <no Python frame>
Fatal Python error: init_sys_streams: can't initialize sys standard streams
Python runtime state: core initialized
OSError: [Errno 9] Bad file descriptor

Current thread 0x00007ff84868b040 (most recent call first):
  <no Python frame>
Fatal Python error: init_sys_streams: can't initialize sys standard streams
Python runtime state: core initialized
OSError: [Errno 9] Bad file descriptor

Current thread 0x00007ff84868b040 (most recent call first):
  <no Python frame>
Fatal Python error: init_sys_streams: can't initialize sys s

BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd

# 假设 train_data 已经被定义并加载数据

# 定义特征列（'policy_tenure'是数值特征，所以不需要进行OneHot编码）
feature_columns = ['policy_tenure']  # 这里只加入了'policy_tenure'作为数值特征的例子
categorical_features = ['car_age_group', 'area_cluster', 'segment', 'ncap_rating']  # 这是需要编码的分类特征

# 创建一个预处理器，使用OneHotEncoder对分类特征进行编码
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'  # 对其余的列直接传递
)

# 创建一个包含预处理器和模型的管道
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42))
])

# 准备特征和目标变量
X = train_data[categorical_features + feature_columns]  # 加入分类特征和数值特征
y = train_data['age_of_policyholder']  # 目标变量

# 划分训练集和验证集
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 训练管道
model_pipeline.fit(X_train, y_train)

# 使用交叉验证评估模型性能
scores = cross_val_score(model_pipeline, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

# 计算均方误差
mse_scores = -scores
print("Mean Squared Error (MSE) with Cross-Validation:", np.mean(mse_scores))

# 验证集上的均方误差
y_pred = model_pipeline.predict(X_val)
mse_val = mean_squared_error(y_val, y_pred)
print("Validation MSE:", mse_val)


In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd
from sklearn.svm import SVR

# 假设 train_data 已经被定义并加载数据

# 定义特征列（'policy_tenure'是数值特征，所以不需要进行OneHot编码）
feature_columns = ['policy_tenure','ncap_rating', 'is_claim']  # 这里只加入了'policy_tenure'作为数值特征的例子
categorical_features = ['car_age_group', 'area_cluster',]  # 这是需要编码的分类特征

# 创建一个预处理器，使用OneHotEncoder对分类特征进行编码
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'  # 对其余的列直接传递
)

# 创建一个包含预处理器和 SVM 回归模型的管道
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', SVR())
])

# 准备特征和目标变量
X = train_data[categorical_features + feature_columns]  # 加入分类特征和数值特征
y = train_data['age_of_policyholder']  # 目标变量

# 划分训练集和验证集
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 训练管道
model_pipeline.fit(X_train, y_train)

# 使用交叉验证评估模型性能
scores = cross_val_score(model_pipeline, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

# 计算均方误差
mse_scores = -scores
print("Mean Squared Error (MSE) with Cross-Validation using SVM:", np.mean(mse_scores))

# 验证集上的均方误差
y_pred = model_pipeline.predict(X_val)
mse_val = mean_squared_error(y_val, y_pred)
print("Validation MSE using SVM:", mse_val)

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

# 假设 train_data 已经被定义并加载数据

# 定义特征列（'policy_tenure'是数值特征，所以不需要进行OneHot编码）
# 定义特征列（'policy_tenure'是数值特征，所以不需要进行OneHot编码）
feature_columns = ['policy_tenure','ncap_rating', 'is_claim']  # 这里只加入了'policy_tenure'作为数值特征的例子
categorical_features = ['car_age_group', 'area_cluster',]  # 这是需要编码的分类特征

# 创建一个预处理器，使用OneHotEncoder对分类特征进行编码
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'  # 对其余的列直接传递
)

# 创建一个包含预处理器和 SVM 回归模型的管道
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', SVR())
])

# 准备特征和目标变量
X = train_data[categorical_features + feature_columns]  # 加入分类特征和数值特征
y = train_data['age_of_policyholder']  # 目标变量

# 划分训练集和验证集
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 定义要搜索的参数网格
param_grid = {
    'model__C': [0.1, 1, 10, 100],  # SVM正则化参数
    'model__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  # 核函数类型
    'model__gamma': ['scale', 'auto'],  # 核系数
    'model__epsilon': [0.01, 0.1, 0.5, 1]  # Epsilon in the epsilon-SVR model
}

# 创建网格搜索对象，使用模型管道和参数网格
grid_search = GridSearchCV(
    estimator=model_pipeline, 
    param_grid=param_grid,
    cv=5,  # 5折交叉验证
    scoring='neg_mean_squared_error',  # 使用负均方误差作为评分标准
    verbose=2,  # 打印详细信息
    n_jobs=-1  # 使用所有可用的CPU核心
)

# 训练网格搜索模型
grid_search.fit(X_train, y_train)

# 打印最佳参数和对应的MSE
print("Best parameters found: ", grid_search.best_params_)
best_cv_score = -grid_search.best_score_
print("Best CV MSE: ", best_cv_score)

# 使用最佳参数模型在验证集上的表现
best_model = grid_search.best_estimator_
y_pred_val = best_model.predict(X_val)
best_val_mse = mean_squared_error(y_val, y_pred_val)
print("Validation MSE with Best SVM Model: ", best_val_mse)

