In [230]:
# 导入需要的一些库
import pandas as pd 

In [231]:
# 读取数据集，进行一些基本的分析
data_raw = pd.read_csv('泰坦尼克号数据.csv')
data_raw.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [232]:
#读取列信息
data_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [233]:
# 数据整体情况
data_raw.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Braund, Mr. Owen Harris",male,,,,347082.0,,B96 B98,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


In [234]:
# 读取每一列缺失值的个数
data_raw.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

通过分析我们可以发现，含有空值的列数有三个Age(年龄)，Cabin(船舱)，Embarked(登船口)，其中age为int类型，而其他两者为str类型


## 数据处理

##### 缺失值处理

In [235]:
# 我们通过补中位数的方式，对age列进行补全
data_raw['Age'].fillna(data_raw['Age'].median(),inplace=True)

In [236]:
data_raw['Age'].isnull().sum()

0

In [237]:
# 我们通过补众数的方式来，对Embarked进行补全
data_raw['Embarked'] = data_raw['Embarked'].fillna(data_raw['Embarked'].mode().iloc[0])


mode函数会返回一个dataframe，所以需要使用iloc进行切片得到第一个众数

In [238]:
data_raw['Embarked'].isnull().sum()

0

In [239]:
# 由于cabin缺了大部分数据，所以我们直接用U代表Unkown对其进行填补
data_raw['Cabin'].fillna('U',inplace=True)
data_raw['Cabin'].isnull().sum()

0

In [240]:
#接下来再来验证以下缺失值是否填写完毕’
data_raw.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

#### 数据标准化，归一化

In [241]:
# 只需处理age 与 fare
data_raw['Age_std'] = data_raw['Age'].apply(lambda x:(x - data_raw['Age'].mean())/data_raw['Age'].std())

In [242]:
data_raw['Fare_nol'] = data_raw['Fare'].apply(lambda x:(x - data_raw['Fare'].min())/data_raw['Fare'].max() - data_raw['Fare'].min())


#### 独热编码


In [243]:
# 我们先按照年龄以及票价对船员进行分类以减少onehot编码的维度
# 先对年龄进行分组
data_raw['Age_bin'] = pd.cut(data_raw['Age'],5)
# 再对票价进行分组
data_raw['Fare_bin'] = pd.cut(data_raw['Fare'],5)

In [244]:
# 对性别进行独热编码
Sex_one_hot_encode = pd.get_dummies(data_raw['Sex'],dtype=int)


In [245]:
# 对年龄组进行独热编码
Age_bin_one_hot_encode = pd.get_dummies(data_raw['Age_bin'],dtype=int)
Age_bin_one_hot_encode.columns = [0,1,2,3,4]

In [246]:
# 对票价进行独热编码

Fare_bin_one_hot_encode = pd.get_dummies(data_raw['Fare_bin'],dtype=int)
Fare_bin_one_hot_encode.columns = [0,1,2,3,4]


In [247]:
# 对Embarked进行编码
Embarked_one_hot_encode = pd.get_dummies(data_raw['Embarked'],dtype=int)

In [248]:
data_raw.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Age_std', 'Fare_nol',
       'Age_bin', 'Fare_bin'],
      dtype='object')

## 进阶任务

#### 特征选择
我们首先先选取一系列的特征作为预测依据

In [249]:
# 标签
Target = ['Survived'] 
# 用于预测的特征
fetures = ['Pclass','Sex','Age_bin','Parch','SibSp','Fare_bin','Embarked'] 
# #将target与features相加得到整个向量
# data = Target + fetures
# 将data进行编码
data_dummy = pd.get_dummies(data_raw[fetures],dtype=int)


In [270]:
#定义数据集

import torch
from torch.utils.data import Dataset, DataLoader
class MyDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        sample = self.data[idx]
        label = self.labels[idx]
        return torch.tensor(sample, dtype=torch.float32), torch.tensor(label, dtype=torch.float32)


In [271]:
import numpy as np
def train_test_split(X, y, test_size=0.2, random_state=None):
    # 设置随机种子
    if random_state is not None:
        np.random.seed(random_state)
    
    # 确定测试集的大小（样本数）
    num_test = int(len(X) * test_size)
    
    # 生成随机索引
    indices = np.random.permutation(len(X))
    
    # 切分数据集
    X_train = X[indices[num_test:]]
    X_test = X[indices[:num_test]]
    y_train = y[indices[num_test:]]
    y_test = y[indices[:num_test]]
    
    return X_train, X_test, y_train, y_test

In [272]:
data_dummy.values # 将dataFrame转换为二维数组

array([[3, 0, 1, ..., 0, 0, 1],
       [1, 0, 1, ..., 1, 0, 0],
       [3, 0, 0, ..., 0, 0, 1],
       ...,
       [3, 2, 1, ..., 0, 0, 1],
       [1, 0, 0, ..., 1, 0, 0],
       [3, 0, 0, ..., 0, 1, 0]], dtype=int64)

In [273]:
# 将数据集随机切分为训练集，测试集
x_train, x_test, y_train, y_test = train_test_split(data_dummy.values,data_raw['Survived'],random_state=0)
print(x_train.shape) 
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(713, 18)
(178, 18)
(713,)
(178,)


In [276]:
# 制作训练集与测试集
train_set = MyDataset(x_train,y_train)
test_set = MyDataset(x_test,y_test)

#### 逻辑回归
别看它名字里面带有回归两个字，其实它是一个分类算法，处理的步骤如下：


In [380]:
# 构造dataloader
batch_size = 1
train_dataloader = DataLoader(dataset=train_set,
                              batch_size=batch_size,
                              num_workers=0,
                              shuffle=True,)
test_dataloader = DataLoader(dataset=test_set,
                              batch_size=batch_size,
                              num_workers=0,
                              shuffle=True,)

In [385]:

for batch_idx,(x,y) in enumerate(train_dataloader):
    print(f"Batch {batch_idx}:")
    print("Inputs:",x)
    print("Labels",y)
    print(x.shape,y.shape)

Batch 0:
Inputs: tensor([[2., 2., 1., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0.]])
Labels tensor([0.])
torch.Size([1, 18]) torch.Size([1])


KeyError: 231

In [387]:
# 打印数据集的长度
print(f"Length of train_set: {len(train_set)}")
print(f"Length of test_set: {len(test_set)}")

# 迭代数据加载器，查看索引问题
for batch_idx, (x, y) in enumerate(train_dataloader):
    try:
        print(f"Batch {batch_idx}:")
        print("Inputs:", x)
        print("Labels:", y)
        
        # 在这里添加适当的索引检查，确保不超出范围
        if batch_idx >= len(train_set):
            print(f"Index {batch_idx} exceeds dataset length {len(train_set)}")
            break
    except IndexError as e:
        print(f"IndexError: {e}")

Length of train_set: 713
Length of test_set: 178
Batch 0:
Inputs: tensor([[1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1.]])
Labels: tensor([1.])
Batch 1:
Inputs: tensor([[2., 2., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1.]])
Labels: tensor([1.])
Batch 2:
Inputs: tensor([[3., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1.]])
Labels: tensor([0.])


KeyError: 34

In [405]:
        import numpy as np 
        X = np.array([[1],[2],[3]])
        W = np.arange(-10,10)
        b = 0
        y_hat = 1/(1+np.exp(-W))
        print(W,y_hat)


        

[-10  -9  -8  -7  -6  -5  -4  -3  -2  -1   0   1   2   3   4   5   6   7
   8   9] [4.53978687e-05 1.23394576e-04 3.35350130e-04 9.11051194e-04
 2.47262316e-03 6.69285092e-03 1.79862100e-02 4.74258732e-02
 1.19202922e-01 2.68941421e-01 5.00000000e-01 7.31058579e-01
 8.80797078e-01 9.52574127e-01 9.82013790e-01 9.93307149e-01
 9.97527377e-01 9.99088949e-01 9.99664650e-01 9.99876605e-01]
