In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import warnings
warnings.filterwarnings('ignore')

# Load the train and test datasets
train_file_path = r"C:\Users\11953\M2_ISDS\ML_PROJET\train.csv"
test_file_path = r"C:\Users\11953\M2_ISDS\ML_PROJET\test.csv"

train_data = pd.read_csv(train_file_path)
test_data = pd.read_csv(test_file_path)

# Display the first few rows of each dataset for initial inspection
train_head = train_data.head()
test_head = test_data.head()

In [23]:
train_head

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,Male,28.0,0,0,Yes,Private,Urban,79.53,31.1,never smoked,0
1,1,Male,33.0,0,0,Yes,Private,Rural,78.44,23.9,formerly smoked,0
2,2,Female,42.0,0,0,Yes,Private,Rural,103.0,40.3,Unknown,0
3,3,Male,56.0,0,0,Yes,Private,Urban,64.87,28.8,never smoked,0
4,4,Female,24.0,0,0,No,Private,Rural,73.36,28.8,never smoked,0


In [24]:
test_head

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,15304,Female,57.0,0,0,Yes,Private,Rural,82.54,33.4,Unknown
1,15305,Male,70.0,1,0,Yes,Private,Urban,72.06,28.5,Unknown
2,15306,Female,5.0,0,0,No,children,Urban,103.72,19.5,Unknown
3,15307,Female,56.0,0,0,Yes,Govt_job,Urban,69.24,41.4,smokes
4,15308,Male,32.0,0,0,Yes,Private,Rural,111.15,30.1,smokes


In [25]:
train_data.isnull().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [26]:
test_data.isnull().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
dtype: int64

In [27]:
train_data.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,15304.0,15304.0,15304.0,15304.0,15304.0,15304.0,15304.0
mean,7651.5,41.417708,0.049726,0.023327,89.039853,28.112721,0.041296
std,4418.028595,21.444673,0.217384,0.150946,25.476102,6.722315,0.198981
min,0.0,0.08,0.0,0.0,55.22,10.3,0.0
25%,3825.75,26.0,0.0,0.0,74.9,23.5,0.0
50%,7651.5,43.0,0.0,0.0,85.12,27.6,0.0
75%,11477.25,57.0,0.0,0.0,96.98,32.0,0.0
max,15303.0,82.0,1.0,1.0,267.6,80.1,1.0


In [28]:
test_data.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi
count,10204.0,10204.0,10204.0,10204.0,10204.0,10204.0
mean,20405.5,41.920713,0.05145,0.021756,89.197766,28.242351
std,2945.785407,21.097956,0.220925,0.145894,25.914364,6.733863
min,15304.0,0.08,0.0,0.0,55.12,11.5
25%,17854.75,26.0,0.0,0.0,75.06,23.7
50%,20405.5,43.5,0.0,0.0,85.07,27.7
75%,22956.25,57.0,0.0,0.0,96.75,32.0
max,25507.0,82.0,1.0,1.0,267.6,97.6


In [29]:
# Checking the distribution of the target variable 'stroke' in the train dataset
stroke_distribution = train_data['stroke'].value_counts(normalize=True)

stroke_distribution


0    0.958704
1    0.041296
Name: stroke, dtype: float64

In [30]:
# Correctly identifying and filling 'Unknown' values in 'smoking_status' column
np.random.seed(0)  # for reproducibility
train_data_corrected = train_data.copy()

# Identifying 'Unknown' values
unknown_indices = train_data_corrected[train_data_corrected['smoking_status'] == 'Unknown'].index

# Filtering out 'Unknown' values to get valid statuses
valid_smoking_statuses = train_data_corrected['smoking_status'].unique()
valid_smoking_statuses = valid_smoking_statuses[valid_smoking_statuses != 'Unknown']

# Randomly assigning a valid smoking status to each 'Unknown' entry
random_smoking_statuses = np.random.choice(valid_smoking_statuses, size=len(unknown_indices))
train_data_corrected.loc[unknown_indices, 'smoking_status'] = random_smoking_statuses

train_data=train_data_corrected
train_data['smoking_status'].value_counts()

never smoked       7784
formerly smoked    3875
smokes             3645
Name: smoking_status, dtype: int64

In [31]:
train_data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,Male,28.0,0,0,Yes,Private,Urban,79.53,31.1,never smoked,0
1,1,Male,33.0,0,0,Yes,Private,Rural,78.44,23.9,formerly smoked,0
2,2,Female,42.0,0,0,Yes,Private,Rural,103.0,40.3,never smoked,0
3,3,Male,56.0,0,0,Yes,Private,Urban,64.87,28.8,never smoked,0
4,4,Female,24.0,0,0,No,Private,Rural,73.36,28.8,never smoked,0


In [32]:
train_data['work_type'].value_counts()

Private          9752
children         2038
Self-employed    1939
Govt_job         1533
Never_worked       42
Name: work_type, dtype: int64

In [33]:
train_data['Residence_type'].value_counts()

Rural    7664
Urban    7640
Name: Residence_type, dtype: int64

In [34]:
train_data['ever_married'].value_counts()

Yes    10385
No      4919
Name: ever_married, dtype: int64

In [38]:
# 1. 独热编码
categorical_features = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
train_data_encoded = pd.get_dummies(train_data, columns=categorical_features)

# 2. 创建新的特征（这里是一个简单的例子，具体应根据你的数据和业务需求来定）
# 例如，我们可以创建一个基于年龄和健康指标的简单健康风险分数
def categorize_bmi(bmi):
    if bmi < 18.5:
        return 'Underweight'
    elif 18.5 <= bmi < 25:
        return 'Normal'
    elif 25 <= bmi < 30:
        return 'Overweight'
    else:
        return 'Obese'
def age_group(age):
    if age < 18:
        return 'Child'
    elif 18 <= age < 60:
        return 'Adult'
    else:
        return 'Senior'
    
train_data_encoded['health_risk_score'] = train_data['age'] / 50 + train_data['hypertension'] + train_data['heart_disease'] + train_data['avg_glucose_level'] / 200
train_data_encoded['bmi_category'] = train_data['bmi'].apply(categorize_bmi)
train_data_encoded['age_group'] = train_data['age'].apply(age_group)
train_data_encoded['total_health_score'] = train_data['hypertension'] + train_data['heart_disease'] + (train_data['avg_glucose_level'] / 100)
train_data_encoded['lifestyle_score'] = train_data_encoded['smoking_status_never smoked'] + train_data_encoded['ever_married_Yes'] + train_data_encoded['work_type_Private']

In [39]:
# 查看处理后的数据
train_data_encoded.head()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,...,Residence_type_Rural,Residence_type_Urban,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,health_risk_score,bmi_category,age_group,total_health_score,lifestyle_score
0,0,28.0,0,0,79.53,31.1,0,0,1,0,...,0,1,0,1,0,0.95765,Obese,Adult,0.7953,3
1,1,33.0,0,0,78.44,23.9,0,0,1,0,...,1,0,1,0,0,1.0522,Normal,Adult,0.7844,2
2,2,42.0,0,0,103.0,40.3,0,1,0,0,...,1,0,0,1,0,1.355,Obese,Adult,1.03,3
3,3,56.0,0,0,64.87,28.8,0,0,1,0,...,0,1,0,1,0,1.44435,Overweight,Adult,0.6487,3
4,4,24.0,0,0,73.36,28.8,0,1,0,0,...,1,0,0,1,0,0.8468,Overweight,Adult,0.7336,2


In [40]:
# 将新生成的分类特征添加到独热编码列表
new_categorical_features = ['bmi_category', 'age_group']

# 对所有分类特征进行独热编码
train_data_encoded = pd.get_dummies(train_data_encoded, columns=new_categorical_features)

# 查看处理后的数据
train_data_encoded.head()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,...,health_risk_score,total_health_score,lifestyle_score,bmi_category_Normal,bmi_category_Obese,bmi_category_Overweight,bmi_category_Underweight,age_group_Adult,age_group_Child,age_group_Senior
0,0,28.0,0,0,79.53,31.1,0,0,1,0,...,0.95765,0.7953,3,0,1,0,0,1,0,0
1,1,33.0,0,0,78.44,23.9,0,0,1,0,...,1.0522,0.7844,2,1,0,0,0,1,0,0
2,2,42.0,0,0,103.0,40.3,0,1,0,0,...,1.355,1.03,3,0,1,0,0,1,0,0
3,3,56.0,0,0,64.87,28.8,0,0,1,0,...,1.44435,0.6487,3,0,0,1,0,1,0,0
4,4,24.0,0,0,73.36,28.8,0,1,0,0,...,0.8468,0.7336,2,0,0,1,0,1,0,0


In [44]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# 定义特征和目标变量（假设 train_data_encoded 是你的训练数据）
X_train = train_data_encoded.drop('stroke', axis=1)
y_train = train_data_encoded['stroke']

# 初始化逻辑回归模型
log_reg = LogisticRegression(max_iter=1000)  # 增加迭代次数以确保收敛

# 训练模型
log_reg.fit(X_train, y_train)


# 使用交叉验证评估模型
scores = cross_val_score(log_reg, X_train, y_train, cv=5, scoring='accuracy')

print("Précision de la validation croisée : ", scores)
print("Précision moyenne : ", scores.mean())

Précision de la validation croisée :  [0.95655015 0.9581836  0.95720353 0.9581836  0.95915033]
Précision moyenne :  0.9578542404656515


In [46]:
from sklearn.tree import DecisionTreeClassifier

# 初始化决策树模型
decision_tree = DecisionTreeClassifier(random_state=42)

# 训练模型
decision_tree.fit(X_train, y_train)

# 使用交叉验证评估模型
decision_tree_scores = cross_val_score(decision_tree, X_train, y_train, cv=5, scoring='accuracy')
print("Précision de la validation croisée: ", decision_tree_scores)
print("Précision moyenne: ", decision_tree_scores.mean())

Précision de la validation croisée:  [0.6723293  0.90460634 0.92747468 0.93694871 0.94117647]
Précision moyenne:  0.8765071007167977


In [53]:
# 获取特征重要性
importances = decision_tree.feature_importances_

# 打印特征及其重要性
feature_names = X_train.columns
feature_importances = pd.Series(importances, index=feature_names)
print(feature_importances.sort_values(ascending=False))

id                                0.179642
age                               0.178535
avg_glucose_level                 0.151346
health_risk_score                 0.136407
bmi                               0.113690
total_health_score                0.093481
lifestyle_score                   0.020059
Residence_type_Urban              0.015502
bmi_category_Overweight           0.012527
gender_Female                     0.011071
smoking_status_formerly smoked    0.009982
smoking_status_smokes             0.009806
work_type_Self-employed           0.008676
gender_Male                       0.008344
smoking_status_never smoked       0.007894
hypertension                      0.006719
work_type_Private                 0.005826
Residence_type_Rural              0.005130
work_type_Govt_job                0.005101
age_group_Senior                  0.003753
ever_married_No                   0.003665
heart_disease                     0.003605
age_group_Adult                   0.003313
bmi_categor

In [54]:
from sklearn.model_selection import GridSearchCV

# 定义要调优的参数范围
param_grid = {
    'max_depth': [3, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 6],
    'min_samples_split': [2, 4, 6, 8],
    'criterion': ['gini', 'entropy']
}

# 初始化带参数网格的决策树模型
grid_search = GridSearchCV(estimator=decision_tree, param_grid=param_grid, cv=5, scoring='f1')

# 进行网格搜索
grid_search.fit(X_train, y_train)

# 输出最佳参数和相应的F1分数
print("最佳参数：", grid_search.best_params_)
print("最佳F1分数：", grid_search.best_score_)

最佳参数： {'criterion': 'gini', 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2}
最佳F1分数： 0.19429475928289536


In [48]:
from sklearn.ensemble import RandomForestClassifier

# 初始化随机森林模型
random_forest = RandomForestClassifier(random_state=42)

# 训练模型
random_forest.fit(X_train, y_train)

random_forest_scores = cross_val_score(random_forest, X_train, y_train, cv=5, scoring='accuracy')
print("Précision de la validation croisée: ", random_forest_scores)
print("Précision moyenne: ", random_forest_scores.mean())


Précision de la validation croisée:  [0.70271153 0.95785691 0.95622346 0.95753022 0.95882353]
Précision moyenne:  0.9066291292734017


In [51]:
# 训练随机森林模型
random_forest.fit(X_train, y_train)

# 获取特征重要性
importances = random_forest.feature_importances_

# 打印特征及其重要性
feature_names = X_train.columns
feature_importances = pd.Series(importances, index=feature_names)
print(feature_importances.sort_values(ascending=False))


health_risk_score                 1.548545e-01
avg_glucose_level                 1.342893e-01
total_health_score                1.331078e-01
id                                1.232141e-01
age                               1.211615e-01
bmi                               1.107539e-01
age_group_Senior                  2.437578e-02
lifestyle_score                   1.788584e-02
Residence_type_Rural              1.447345e-02
Residence_type_Urban              1.440940e-02
gender_Female                     1.422218e-02
gender_Male                       1.332175e-02
smoking_status_formerly smoked    1.239896e-02
smoking_status_smokes             1.171959e-02
smoking_status_never smoked       1.119521e-02
bmi_category_Overweight           1.092109e-02
bmi_category_Obese                9.927775e-03
work_type_Private                 9.760007e-03
work_type_Self-employed           9.602314e-03
hypertension                      8.790896e-03
work_type_Govt_job                7.413675e-03
bmi_category_

In [52]:
from sklearn.model_selection import GridSearchCV

# 定义要调优的参数范围
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [4, 6, 8],
    'criterion': ['gini', 'entropy']
}

# 初始化带参数网格的随机森林模型
grid_search = GridSearchCV(estimator=random_forest, param_grid=param_grid, cv=5, scoring='f1')

# 进行网格搜索
grid_search.fit(X_train, y_train)

# 输出最佳参数和相应的F1分数
print("最佳参数：", grid_search.best_params_)
print("最佳F1分数：", grid_search.best_score_)


最佳参数： {'criterion': 'entropy', 'max_depth': 8, 'max_features': 'auto', 'n_estimators': 100}
最佳F1分数： 0.07588574000787743


In [49]:
from sklearn.svm import SVC

# 初始化SVM模型
svc = SVC()

# 训练模型
svc.fit(X_train, y_train)

svc_scores = cross_val_score(svc, X_train, y_train, cv=5, scoring='accuracy')
print("Précision de la validation croisée: ", svc_scores)
print("Précision moyenne: ", svc_scores.mean())


Précision de la validation croisée:  [0.95883698 0.95883698 0.95851029 0.95851029 0.95882353]
Précision moyenne:  0.9587036147356688


In [None]:
from sklearn.model_selection import GridSearchCV

# 定义要调优的参数范围
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.1, 1, 10, 100],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
}

# 初始化带参数网格的SVM模型
grid_search = GridSearchCV(estimator=svc, param_grid=param_grid, cv=5, scoring='f1')

# 进行网格搜索
grid_search.fit(X_train, y_train)

# 输出最佳参数和相应的F1分数
print("最佳参数：", grid_search.best_params_)
print("最佳F1分数：", grid_search.best_score_)

In [50]:
from xgboost import XGBClassifier

# 初始化XGBoost模型
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# 训练模型
xgb.fit(X_train, y_train)
xgb_scores = cross_val_score(xgb, X_train, y_train, cv=5, scoring='accuracy')
print("Précision de la validation croisée: ", xgb_scores)
print("Précision moyenne: ", xgb_scores.mean())


Précision de la validation croisée:  [0.79810519 0.87749102 0.90558641 0.89121202 0.95882353]
Précision moyenne:  0.8862436343371064


In [None]:
# 获取特征重要性
importances = xgb_model.feature_importances_

# 打印特征及其重要性
feature_names = X_train.columns
feature_importances = pd.Series(importances, index=feature_names)
print(feature_importances.sort_values(ascending=False))


In [None]:
from sklearn.model_selection import GridSearchCV

# 定义要调优的参数范围
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.5],
    'max_depth': [3, 5, 7],
    'colsample_bytree': [0.5, 0.7, 1],
    'subsample': [0.6, 0.8, 1]
}

# 初始化带参数网格的XGBoost模型
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, scoring='f1')

# 进行网格搜索
grid_search.fit(X_train, y_train)

# 输出最佳参数和相应的F1分数
print("最佳参数：", grid_search.best_params_)
print("最佳F1分数：", grid_search.best_score_)


In [None]:
# 预处理测试数据
# 对分类变量进行独热编码
categorical_features = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
test_data_encoded = pd.get_dummies(test_data, columns=categorical_features)

test_data['health_risk_score'] = test_data['age'] / 50 + test_data['hypertension'] + test_data['heart_disease'] + test_data['avg_glucose_level'] / 200
test_data['bmi_category'] = test_data['bmi'].apply(categorize_bmi)
test_data['age_group'] = test_data['age'].apply(age_group)
test_data['total_health_score'] = test_data['hypertension'] + test_data['heart_disease'] + (test_data['avg_glucose_level'] / 100)

# 这里只是示例，你需要根据实际情况进行调整
test_data['lifestyle_score'] = (test_data['smoking_status'] == 'never smoked').astype(int) + \
                               (test_data['ever_married'] == 'Yes').astype(int) + \
                               (test_data['work_type'] == 'Private').astype(int)

# 现在进行独热编码
test_data_encoded = pd.get_dummies(test_data, columns=['bmi_category', 'age_group'])
test_data_encoded.head()

In [None]:
# 确保训练集和测试集有相同的列
for column in set(train_data_encoded.columns) - set(test_data_encoded.columns):
    test_data_encoded[column] = 0

# 确保测试数据集的列顺序与训练数据集的列顺序相同
test_data_encoded = test_data_encoded[train_data_encoded.columns.drop('stroke')]


In [None]:
# 确保训练集和测试集有相同的列，如果测试集中缺少任何列，我们需要添加这些列并用0填充
missing_cols = set(train_data_encoded.columns) - set(test_data_encoded.columns)
for c in missing_cols:
    test_data_encoded[c] = 0
# 确保列的顺序与训练集相同
test_data_encoded = test_data_encoded[train_data_encoded.columns]

# 删除'target'列和其他不需要的列
X_test = test_data_encoded.drop(['stroke'], axis=1)

# 使用之前训练好的SVM模型进行预测（这里假设svm模型已经被训练和保存好了）
# 这里需要你提供svm模型变量，如果模型尚未训练，请先运行之前提供的SVM训练代码
y_test_pred = svc.predict(X_test)

# 输出预测结果的前几行
y_test_pred[:5]

In [None]:
# 将预测结果保存到CSV文件
output = pd.DataFrame({'id': test_data['id'], 'stroke': y_test_pred})
output.to_csv(r"C:\Users\11953\M2_ISDS\ML_PROJET\prediction.csv", index=False)

print("预测结果已保存至 stroke_predictions_svm.csv")
