<a href="https://colab.research.google.com/github/SeanMuInCa/learn_python/blob/master/Assignment1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import kagglehub

# Fetch the latest version of the Titanic dataset through kagglehub and
# remember the local directory it was downloaded to.
dataset_handle = "shivadumnawar/titanic-dataset"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/shivadumnawar/titanic-dataset/versions/1


In [2]:
import pandas as pd
# Read the training CSV from the downloaded dataset directory and preview
# the first five rows.
train_csv = f"{path}/train.csv"
trainDF = pd.read_csv(train_csv)
trainDF.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Columns used as model inputs.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

# Split the frame into the feature matrix (X) and the target vector (y).
X = trainDF.loc[:, features]
y = trainDF.loc[:, 'Survived']

# Preview the first rows of the feature matrix.
X.head()


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S
4,3,male,35.0,0,0,8.05,S


In [4]:
# Impute missing values: Age with the column mean, Embarked with the most
# frequent value. fillna() with a mapping returns a new frame, so the
# frame selected from trainDF is never modified in place.
# NOTE(review): both fill values are computed on the full dataset, before
# the train/test split performed later, so held-out rows influence the
# imputation. Consider fitting the imputation on the training split only.
fill_values = {
    'Age': X['Age'].mean(),
    'Embarked': X['Embarked'].mode()[0],
}
X = X.fillna(fill_values)

# Confirm that no missing values remain in any feature column.
X.isna().sum()



Unnamed: 0,0
Pclass,0
Sex,0
Age,0
SibSp,0
Parch,0
Fare,0
Embarked,0


In [5]:
# One-hot encode the categorical columns. drop_first=True drops one level
# per column so the dummy columns are not perfectly collinear.
# NOTE: this cell overwrites X; re-running it without re-running the
# previous cells fails because 'Sex' and 'Embarked' no longer exist.
categorical_columns = ['Sex', 'Embarked']
X = pd.get_dummies(X, columns=categorical_columns, drop_first=True)

# Preview the encoded feature matrix.
X.head()


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,3,22.0,1,0,7.25,True,False,True
1,1,38.0,1,0,71.2833,False,False,False
2,3,26.0,0,0,7.925,False,False,True
3,1,35.0,1,0,53.1,False,False,True
4,3,35.0,0,0,8.05,True,False,True


In [6]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
test_fraction = 0.3
split_seed = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_fraction,
    random_state=split_seed,
)

# Show the shape of each resulting partition.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))


((623, 8), (268, 8), (623,), (268,))

In [7]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score

# Create the Gaussian Naive Bayes classifier and fit it on the training
# split (fit() returns the estimator itself).
model = GaussianNB().fit(X_train, y_train)

# Estimate generalisation accuracy with 5-fold cross-validation on the
# training split.
cv_scores = cross_val_score(estimator=model, X=X_train, y=y_train, cv=5)

# Report the per-fold scores and their mean.
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())


Cross-validation scores: [0.792      0.84       0.76       0.75       0.78225806]
Mean cross-validation score: 0.7848516129032259


In [8]:
# Score the fitted Naive Bayes model on the held-out test split
# (score() reports mean accuracy for classifiers).
test_accuracy = model.score(X=X_test, y=y_test)

# Report the test-set accuracy.
print("Test set accuracy:", test_accuracy)


Test set accuracy: 0.7947761194029851


In [9]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Base SVM estimator whose hyperparameters are tuned below.
svm = SVC()

# Hyperparameter grid: every C/gamma combination with an RBF kernel.
# NOTE(review): the features are not standardized before this search, and
# RBF SVMs are sensitive to feature scale. Consider adding a scaler.
param_grid = dict(
    C=[0.1, 1, 10],
    gamma=[0.01, 0.001],
    kernel=['rbf'],
)

# Exhaustive grid search scored by accuracy with 5-fold cross-validation
# (fit() returns the fitted search object).
grid_search = GridSearchCV(
    estimator=svm,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
).fit(X_train, y_train)

# Report the winning combination and its mean cross-validation score.
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)


Best parameters found:  {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
Best cross-validation score:  0.7447870967741935


In [10]:
# Take the best SVM straight from the grid search instead of re-typing its
# hyperparameters by hand. Hard-coded values silently go stale whenever the
# parameter grid, the data or the split changes. GridSearchCV refits the
# best parameter combination on the whole training split by default
# (refit=True), so best_estimator_ is already trained and no second fit()
# call is needed.
best_svm = grid_search.best_estimator_

# Evaluate the tuned SVM on the held-out test split.
svm_test_accuracy = best_svm.score(X_test, y_test)

# Report the test-set accuracy.
print("SVM Test set accuracy:", svm_test_accuracy)


SVM Test set accuracy: 0.753731343283582


In [11]:
from sklearn.metrics import classification_report

# Predict the test split with both fitted models.
nb_predictions = model.predict(X_test)
svm_predictions = best_svm.predict(X_test)

# Print a per-class precision/recall/F1 report for each model.
for report_title, predictions in (
    ("Naive Bayes Classification Report:", nb_predictions),
    ("SVM Classification Report:", svm_predictions),
):
    print(report_title)
    print(classification_report(y_test, predictions))


Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.83      0.83       157
           1       0.75      0.75      0.75       111

    accuracy                           0.79       268
   macro avg       0.79      0.79      0.79       268
weighted avg       0.79      0.79      0.79       268

SVM Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.85      0.80       157
           1       0.75      0.61      0.67       111

    accuracy                           0.75       268
   macro avg       0.75      0.73      0.74       268
weighted avg       0.75      0.75      0.75       268



结果分析
准确率：朴素贝叶斯分类器在整体准确率上表现更好。
召回率：朴素贝叶斯分类器在两个类别上的召回率更为平衡，而 SVM 模型在类别 1 上的召回率较低。
F1 分数：朴素贝叶斯分类器在两个类别上的 F1 分数更高，表明其在精确率和召回率之间的平衡更好。
讨论
朴素贝叶斯分类器：在这个数据集上表现更好，可能是因为其“特征之间条件独立”的假设在这个数据集上是一个合理的近似。
SVM 模型：虽然在类别 0 上的召回率较高，但在类别 1 上的表现较差。RBF 核对特征尺度敏感，而这里的特征（如 Age、Fare）没有经过标准化，这很可能是其表现不佳的原因之一；可以先对特征做标准化，再进一步调整超参数或进行特征选择。