# Readme

### 利用que_3中的数据处理和特征化

In [None]:
import pandas as pd

# Load the training and test splits from disk.
train_df = pd.read_csv('./data4_3/train.csv')
test_df = pd.read_csv('./data4_3/test.csv')

# Peek at the first rows of the training data.
train_preview = train_df.head()
print(train_preview)

# Report the (rows, columns) shape of each split.
train_shape, test_shape = train_df.shape, test_df.shape
print(f"训练集形状: {train_shape}")
print(f"测试集形状: {test_shape}\n")

# List every column name of the training data.
column_names = train_df.columns.tolist()
print("特征列名: \n", column_names)

# DataFrame.info() writes the dtype / non-null summary itself and returns
# None, so the surrounding print additionally emits the text "None".
info_result = train_df.info()
print(info_result)
import numpy as np
from scipy import stats
# Replace extreme values in every numeric training column with the column mean.
#
# A value counts as extreme when it exceeds 4 standard deviations of its
# column (the same threshold rule as before). The mean and the threshold are
# computed once per column, before any replacement, so the result no longer
# depends on row order and the work is O(n) per column instead of O(n^2).
#
# The replacement is written back with a whole-column assignment; the former
# element-wise `train_df[col][i] = ...` is chained indexing, which triggers
# SettingWithCopyWarning and silently does nothing under pandas Copy-on-Write.
num_cols = train_df.select_dtypes(include=[np.number]).columns

for col in num_cols:
    col_mean = train_df[col].mean()
    col_limit = 4 * train_df[col].std()
    # NaN > limit evaluates to False, so missing values are left untouched
    # here and remain available for the later imputation step.
    train_df[col] = train_df[col].mask(train_df[col] > col_limit, col_mean)

# Split the '/'-separated Cabin string into three new columns on both splits.
cabin_parts = ['Cabin_Deck', 'Cabin_Number', 'Cabin_Side']
for frame in (train_df, test_df):
    frame[cabin_parts] = frame['Cabin'].str.split('/', expand=True)

# Number of missing values in each training column.
null_counts = train_df.isnull().sum()
print(null_counts)

# Summary statistics of the numeric columns
# (count, mean, std, min, quartiles, max).
print(train_df.describe())

# Non-null counts of the object-typed (non-numeric) columns.
object_frame = train_df.select_dtypes(include=['object'])
print(object_frame.count())
non_num_cols = list(object_frame.columns)

separator = "\n" + "-"*50 + "\n"
for col in non_num_cols:
    series = train_df[col]
    print(f"===== 特征：{col} =====")

    # Distinct values of the column; NaN, when present, is one of them.
    distinct = series.unique()
    print(f"唯一类别（共{len(distinct)}种）：{distinct}")

    # Frequency of every value in descending order; dropna=False keeps NaN
    # in the tally.
    print("类别出现次数：")
    print(series.value_counts(dropna=False))
    print(separator)

# Impute missing values in both splits.
# Raw columns: ['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Name', 'Transported']
#
# Fixes compared with the previous version:
# * Cabin_Number / Cabin_Side are already the split components of Cabin, so
#   they must not be indexed again with .str[1] / .str[2]. Doing so turned
#   Cabin_Side into all-NaN (then a constant after filling) and reduced
#   Cabin_Number to its second character.
# * Cabin_Number is converted to a numeric column so it is used as a single
#   numeric feature instead of being one-hot encoded as text later on.
# * Every fill value is computed on the training split and applied to both
#   splits, so the test data is transformed exactly like the training data.
mode_fill_cols = [
    'HomePlanet', 'CryoSleep', 'Cabin_Deck', 'Cabin_Number', 'Cabin_Side',
    'Destination', 'VIP',
]
mean_fill_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

for df in (train_df, test_df):
    # Unparseable cabin numbers become NaN and are imputed below.
    df['Cabin_Number'] = pd.to_numeric(df['Cabin_Number'], errors='coerce')

# mode() returns a Series (there may be ties); [0] picks the first mode.
fill_values = {col: train_df[col].mode()[0] for col in mode_fill_cols}
fill_values['Age'] = train_df['Age'].median()
fill_values.update({col: train_df[col].mean() for col in mean_fill_cols})
fill_values['Name'] = "somebody"  # placeholder; Name is dropped before modelling

for df in (train_df, test_df):
    for col, value in fill_values.items():
        df[col] = df[col].fillna(value)

# Missing-value count per training column at this point of the pipeline.
remaining_nulls = train_df.isnull().sum()
print(remaining_nulls)

# Remove the listed columns from both splits; errors='ignore' skips any
# column that is not present.
columns_to_drop = ['Name','PassengerId','Cabin']
train_df, test_df = [
    frame.drop(columns=columns_to_drop, errors='ignore')
    for frame in (train_df, test_df)
]

# The object-typed columns still present in the training split are the ones
# that get one-hot encoded; the same column list is applied to the test split.
object_columns = list(train_df.select_dtypes(include=['object']).columns)
train_df, test_df = [
    pd.get_dummies(frame, columns=object_columns)
    for frame in (train_df, test_df)
]

# Recode boolean training columns that also exist in the test split as 1 / 0.
for col in train_df.columns:
    shared_bool = col in test_df.columns and train_df[col].dtype == 'bool'
    if not shared_bool:
        continue
    train_df[col] = train_df[col].map({True: 1, False: 0})

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time

# Cast every remaining boolean column to integers (True -> 1, False -> 0),
# first in the training split and then in the test split.
for frame in (train_df, test_df):
    for col in frame.columns:
        if frame[col].dtype != bool:
            continue
        frame[col] = frame[col].astype(int)

# Every training column except the label is a candidate feature.
feature_cols = [name for name in train_df.columns if name != 'Transported']

# Keep only the features that exist in both splits, sorted by name so the
# column order of the two matrices is identical.
common_features = sorted(set(feature_cols).intersection(test_df.columns))

# Training feature matrix.
X_train = train_df[common_features].to_numpy()

# The labels are taken from freshly re-read copies of the CSV files.
train_df_ = pd.read_csv('./data4_3/train.csv')
test_df_ = pd.read_csv('./data4_3/test.csv')
y_train = np.ravel(train_df_['Transported'].to_numpy())  # 1-D label vector

# Show the first five training samples next to their labels.
for i in range(5):
    print(f"第{i}个样本的特征值: {X_train[i]}\n {y_train[i]} \n")

# Test feature matrix and its 1-D label vector.
X_test = test_df[common_features].to_numpy()
y_test = np.ravel(test_df_['Transported'].to_numpy())

  PassengerId HomePlanet CryoSleep  Cabin  Destination   Age    VIP  \
0     0001_01     Europa     False  B/0/P  TRAPPIST-1e  39.0  False   
1     0002_01      Earth     False  F/0/S  TRAPPIST-1e  24.0  False   
2     0003_01     Europa     False  A/0/S  TRAPPIST-1e  58.0   True   
3     0003_02     Europa     False  A/0/S  TRAPPIST-1e  33.0  False   
4     0004_01      Earth     False  F/1/S  TRAPPIST-1e  16.0  False   

   RoomService  FoodCourt  ShoppingMall     Spa  VRDeck               Name  \
0          0.0        0.0           0.0     0.0     0.0    Maham Ofracculy   
1        109.0        9.0          25.0   549.0    44.0       Juanna Vines   
2         43.0     3576.0           0.0  6715.0    49.0      Altark Susent   
3          0.0     1283.0         371.0  3329.0   193.0       Solam Susent   
4        303.0       70.0         151.0   565.0     2.0  Willy Santantines   

   Transported  
0        False  
1         True  
2        False  
3        False  
4         True  
训练

### 利用numpy实现逻辑回归

In [None]:
from sklearn.metrics import accuracy_score# 用与计算准确率

class LogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    Parameters
    ----------
    learning_rate : float
        Step size applied to every gradient update.
    num_iterations : int
        Number of gradient-descent iterations run by ``fit``.

    Attributes
    ----------
    weights : numpy.ndarray or None
        One coefficient per feature; ``None`` until ``fit`` is called.
    bias : float or None
        Intercept term; ``None`` until ``fit`` is called.
    """

    def __init__(self, learning_rate=0.00001, num_iterations=10000):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        """Fit weights and bias on ``X`` (n_samples, n_features) and labels ``y``.

        ``y`` may hold 0/1 or booleans; both inputs are converted to float
        arrays. The cross-entropy loss is printed every 1000 iterations.

        Raises
        ------
        ValueError
            If ``X`` contains no samples (the mean gradient is undefined).
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).ravel()
        num_samples, num_features = X.shape
        if num_samples == 0:
            raise ValueError("cannot fit on an empty training set")

        self.weights = np.zeros(num_features)
        self.bias = 0.0

        for iteration in range(self.num_iterations):
            y_predicted = self._sigmoid(X @ self.weights + self.bias)
            error = y_predicted - y
            # Gradients of the mean cross-entropy loss.
            dw = X.T @ error / num_samples
            db = np.sum(error) / num_samples
            if iteration % 1000 == 0:
                # Loss of the parameters *before* this iteration's update.
                print('loss:', self.cross_entropy_loss(y, y_predicted))

            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def _sigmoid(self, z):
        """Return the logistic function 1 / (1 + exp(-z)) without overflow.

        exp() is only evaluated on non-positive arguments, so large-magnitude
        logits cannot overflow; the two branches are algebraically identical.
        """
        z = np.asarray(z, dtype=float)
        exp_neg_abs = np.exp(-np.abs(z))  # always within (0, 1]
        return np.where(
            z >= 0,
            1.0 / (1.0 + exp_neg_abs),
            exp_neg_abs / (1.0 + exp_neg_abs),
        )

    def predict(self, X):
        """Return a list of 0/1 class labels; 1 when the probability exceeds 0.5."""
        X = np.asarray(X, dtype=float)
        y_predicted = self._sigmoid(X @ self.weights + self.bias)
        return (y_predicted > 0.5).astype(int).tolist()

    def cross_entropy_loss(self, y_true, y_pred):
        """Return the mean binary cross-entropy between labels and probabilities."""
        epsilon = 1e-15  # keeps log() away from 0
        y_true = np.asarray(y_true, dtype=float)
        y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
        return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
    

    
# Train the model on the training split and score it on the test split.
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Store the score under its own name: assigning it to `accuracy_score` would
# overwrite the imported sklearn function, and running this cell a second time
# would then fail with "TypeError: ... object is not callable".
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# loss = model.cross_entropy_loss(y_test, y_pred)
# print(f"Cross Entropy Loss: {loss}")



loss: 0.6931471805599453
loss: 0.585861743332071
loss: 0.5826594809928772
loss: 0.5821111661414096
loss: 0.5819468476055102
loss: 0.5818415355830882
loss: 0.5817456061436884
loss: 0.5816512768976051
loss: 0.581557314027715
loss: 0.5814635209501362
Accuracy: 0.75
