# 数据预处理

In [1]:
import pandas as pd
# Load the training split and print its column index to confirm the schema.
# NOTE(review): this is an absolute, machine-specific path; consider reading it
# from a configurable data directory so the notebook runs on other machines.
train_csv_path = r'D:\Projects\VsCode\Python\img_processing_system\practice\pytorch\data\introvert_extro\train.csv'
df = pd.read_csv(train_csv_path)
print(df.columns)


Index(['id', 'Time_spent_Alone', 'Stage_fear', 'Social_event_attendance',
       'Going_outside', 'Drained_after_socializing', 'Friends_circle_size',
       'Post_frequency', 'Personality'],
      dtype='object')


# 空值处理

In [2]:
import numpy as np
from sklearn.impute import SimpleImputer
# Missing values: report every column that contains at least one NaN.
# `.any()` is used instead of "more than one distinct isna() value" so that a
# column which is entirely NaN is reported too.
isna_df = df.isna()
cols = df.columns
for col in cols:
    if isna_df[col].any():
        print(col)

# Numeric features: replace NaN with the column mean.
# SimpleImputer computes its statistic independently per column, so a single
# fit over all five columns gives the same result as fitting them one by one.
numeric_cols = [
    "Time_spent_Alone",
    "Social_event_attendance",
    "Going_outside",
    "Friends_circle_size",
    "Post_frequency",
]
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
df[numeric_cols] = imp.fit_transform(df[numeric_cols])

# Yes/No features (Stage_fear, Drained_after_socializing): a missing value is
# filled from the row's Personality -- "No" for Extrovert, "Yes" for Introvert.
# Rows whose Personality is neither of those keep their NaN.
# (IterativeImputer would be an alternative, but the other columns it would
# rely on may contain missing values themselves.)
# NOTE(review): Personality is the column later encoded as the target y, so
# this fill leaks label information into the features and cannot be applied
# to unlabeled data -- confirm this is intended.
is_extrovert = df["Personality"] == "Extrovert"
is_introvert = df["Personality"] == "Introvert"
for binary_col in ["Stage_fear", "Drained_after_socializing"]:
    # Missing values are NaN in pandas, so they must be detected with isna(),
    # not with `is None`. Boolean masks update all affected rows at once
    # instead of visiting the frame row by row.
    is_missing = df[binary_col].isna()
    df.loc[is_missing & is_extrovert, binary_col] = "No"
    df.loc[is_missing & is_introvert, binary_col] = "Yes"

Time_spent_Alone
Stage_fear
Social_event_attendance
Going_outside
Drained_after_socializing
Friends_circle_size
Post_frequency


# 得到X与y

In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# One-hot encode the two Yes/No columns and place them next to the five
# numeric features to form the feature matrix X.
dummies = pd.get_dummies(df[["Stage_fear", "Drained_after_socializing"]])
X = pd.concat([
    df[["Time_spent_Alone", "Social_event_attendance", "Going_outside", "Friends_circle_size", "Post_frequency"]],
    dummies
], axis=1)
print(X.columns)

# Convert any boolean dummy columns to integers so the whole frame is numeric.
X_bool_to_int = X.copy()
bool_columns = X.select_dtypes(include=['bool']).columns
X_bool_to_int[bool_columns] = X_bool_to_int[bool_columns].astype(int)

# Encode the Personality labels as integers (LabelEncoder numbers the classes
# in sorted order).
y = LabelEncoder().fit_transform(df["Personality"])
print(type(y))

# Split the integer-converted frame. Previously X_bool_to_int was built and
# then ignored in favour of X; the selected rows and the resulting float32
# values are the same either way, but the conversion is no longer dead code.
X_train, X_test, y_train, y_test = train_test_split(X_bool_to_int, y, test_size=0.2, random_state=0)
X_train_np_arr = X_train.to_numpy(dtype=np.float32)
X_test_np_arr = X_test.to_numpy(dtype=np.float32)

Index(['Time_spent_Alone', 'Social_event_attendance', 'Going_outside',
       'Friends_circle_size', 'Post_frequency', 'Stage_fear_No',
       'Stage_fear_Yes', 'Drained_after_socializing_No',
       'Drained_after_socializing_Yes'],
      dtype='object')
<class 'numpy.ndarray'>


In [4]:
import torch
# Use the current accelerator backend when one is available, else the CPU.
if torch.accelerator.is_available():
    device = torch.accelerator.current_accelerator().type
else:
    device = "cpu"
print(f"Using {device} device")

# Features are already float32 NumPy arrays; build the tensors directly on the
# chosen device.
X_train_tensor = torch.tensor(X_train_np_arr, device=device)
X_test_tensor = torch.tensor(X_test_np_arr, device=device)

# Labels become column vectors. Training labels are cast to float32 (the loss
# is computed against float probabilities); test labels keep their dtype.
y_train_tensor = torch.tensor(y_train, device=device).reshape(-1, 1).to(torch.float32)
y_test_tensor = torch.tensor(y_test, device=device).reshape(-1, 1)

Using cuda device


In [5]:
# Sanity check: (number of training rows, number of features).
print(X_train_tensor.size())

torch.Size([14819, 9])


# 训练模型

In [6]:
from torch import nn
class Classifier(nn.Module):
    """Small fully connected network for binary classification.

    Layout: Linear(in_features -> hidden1_size) -> ReLU ->
    Linear(hidden1_size -> hidden2_size) -> ReLU ->
    Linear(hidden2_size -> 1) -> Sigmoid.

    Args:
        in_features: Number of input features per sample (default 9).
        hidden1_size: Width of the first hidden layer (default 12).
        hidden2_size: Width of the second hidden layer (default 9).

    The defaults reproduce the original fixed 9 -> 12 -> 9 -> 1 architecture,
    so ``Classifier()`` behaves exactly as before.
    """

    def __init__(self, in_features=9, hidden1_size=12, hidden2_size=9):
        super().__init__()
        # There is no separate "input layer" module: the input is simply the
        # feature tensor passed to forward(). `hidden1` is the first weighted
        # layer, mapping the raw features to the first hidden representation.
        self.hidden1 = nn.Linear(in_features, hidden1_size)
        self.act1 = nn.ReLU()
        self.hidden2 = nn.Linear(hidden1_size, hidden2_size)
        self.act2 = nn.ReLU()
        self.output = nn.Linear(hidden2_size, 1)
        # Sigmoid squashes the single output unit into (0, 1).
        self.act_output = nn.Sigmoid()

    def forward(self, x):
        """Map a (batch, in_features) tensor to (batch, 1) sigmoid outputs."""
        x = self.act1(self.hidden1(x))
        x = self.act2(self.hidden2(x))
        x = self.act_output(self.output(x))
        return x

# Build the network and move its parameters to the selected device;
# nn.Module.to() returns the module itself, so the two steps can be chained.
model = Classifier().to(device)
print(model)

Classifier(
  (hidden1): Linear(in_features=9, out_features=12, bias=True)
  (act1): ReLU()
  (hidden2): Linear(in_features=12, out_features=9, bias=True)
  (act2): ReLU()
  (output): Linear(in_features=9, out_features=1, bias=True)
  (act_output): Sigmoid()
)


In [7]:
import torch.optim as optim

# Binary cross-entropy computed on probabilities (the model ends in a sigmoid).
loss_fn = nn.BCELoss()
# Adam over all trainable parameters of the model.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# Number of passes over the training set. It was lowered from 100 to 64
# because the printed loss looked smallest around epoch 64.
# NOTE(review): that observation was based on the loss of the final mini-batch
# only, which is a noisy signal; the epoch mean printed below (or a validation
# metric) is a better basis for choosing this value.
n_epochs = 64
# Samples per optimizer step.
# NOTE(review): on the question of why training is slow even on the GPU --
# presumably because each step handles only 10 rows, so per-step Python and
# kernel-launch overhead dominates; a larger batch size usually helps. Measure
# before changing, since batch size also affects convergence.
batch_size = 10

n_samples = len(X_train_tensor)
for epoch in range(n_epochs):
    # Visit the samples in a fresh random order every epoch so mini-batches
    # are not composed of the same rows in the same sequence each time.
    perm = torch.randperm(n_samples, device=X_train_tensor.device)
    # Accumulated on-device to avoid a host sync on every step.
    epoch_loss_sum = torch.zeros((), device=X_train_tensor.device)
    for i in range(0, n_samples, batch_size):
        batch_idx = perm[i:i+batch_size]
        Xbatch = X_train_tensor[batch_idx]
        ybatch = y_train_tensor[batch_idx]
        y_train_pred = model(Xbatch)
        loss = loss_fn(y_train_pred, ybatch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # BCELoss averages over the batch; weight by batch length so the
        # shorter final batch does not skew the epoch mean.
        epoch_loss_sum += loss.detach() * len(batch_idx)
    mean_loss = epoch_loss_sum / n_samples
    print(f'Finished epoch {epoch}, latest loss {loss}, mean loss {mean_loss}')

tensor([[0.9569],
        [0.0276],
        [0.0235],
        [0.0299],
        [0.0198],
        [0.0193],
        [0.0229],
        [0.0106],
        [0.0129],
        [0.0257]], device='cuda:0', grad_fn=<SigmoidBackward0>)
tensor([[0.0136],
        [0.0195],
        [0.9123],
        [0.0265],
        [0.0329],
        [0.9217],
        [0.9654],
        [0.9221],
        [0.0271],
        [0.9376]], device='cuda:0', grad_fn=<SigmoidBackward0>)
tensor([[0.0222],
        [0.0185],
        [0.0262],
        [0.0395],
        [0.0285],
        [0.0369],
        [0.0098],
        [0.0137],
        [0.0343],
        [0.0122]], device='cuda:0', grad_fn=<SigmoidBackward0>)
tensor([[0.0293],
        [0.0227],
        [0.9411],
        [0.0143],
        [0.9351],
        [0.0284],
        [0.9767],
        [0.0427],
        [0.0292],
        [0.0426]], device='cuda:0', grad_fn=<SigmoidBackward0>)
tensor([[0.0111],
        [0.0182],
        [0.0166],
        [0.9340],
        [0.0376],
      

KeyboardInterrupt: 

# 评估模型

In [9]:
# Score the held-out split. Gradients are not needed for evaluation, so the
# forward pass and the metric are computed with autograd disabled.
with torch.no_grad():
    y_test_pred = model(X_test_tensor)
    # Round the sigmoid outputs to hard 0/1 labels and compare with the truth.
    predicted_labels = y_test_pred.round()
    accuracy = predicted_labels.eq(y_test_tensor).float().mean()
print(f"Accuracy {accuracy}")

Accuracy 0.9743590354919434
