# 数据预处理

In [1]:
import pandas as pd
# Load the training set and list its columns.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run elsewhere until it is pointed at a local copy of train.csv.
train_csv_path = r'D:\Projects\VsCode\Python\img_processing_system\practice\pytorch\data\introvert_extro\train.csv'
df = pd.read_csv(train_csv_path)
print(df.columns)

Index(['id', 'Time_spent_Alone', 'Stage_fear', 'Social_event_attendance',
       'Going_outside', 'Drained_after_socializing', 'Friends_circle_size',
       'Post_frequency', 'Personality'],
      dtype='object')


# 空值处理

In [2]:
import numpy as np
from sklearn.impute import SimpleImputer
# Missing-value check: print the name of every column that has at least one NaN.
isna_df = df.isna()
cols = df.columns
for col in cols:
    # .any() is True as soon as one entry is missing. Unlike counting the unique
    # True/False flags, it also reports a column in which every entry is missing.
    if isna_df[col].any():
        print(col)

# --- Numeric columns: replace missing values with the column mean -------------
# NOTE(review): the means are computed on the full frame, before the train/test
# split performed later in the notebook, so test rows influence the fill values.
numeric_cols = [
    "Time_spent_Alone",
    "Social_event_attendance",
    "Going_outside",
    "Friends_circle_size",
    "Post_frequency",
]
for col in numeric_cols:
    # A fresh imputer per column keeps each mean independent of the other columns.
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    # fit_transform returns an (n, 1) array; flatten it before assigning to the column.
    df[col] = imp.fit_transform(df[[col]]).ravel()

# --- Yes/No columns: infer missing answers from Personality -------------------
# Rule used here: Extrovert -> "No", Introvert -> "Yes". Rows whose Personality is
# neither value are left untouched (the mapped value is NaN for them).
# Missing entries are NaN in pandas, so they are detected with isna(), not `is None`.
# NOTE(review): Personality is also the prediction target later in this notebook.
# Filling features from it leaks the label and cannot be repeated on unlabeled data.
personality_to_answer = {"Extrovert": "No", "Introvert": "Yes"}
inferred_answer = df["Personality"].map(personality_to_answer)
for col in ("Stage_fear", "Drained_after_socializing"):
    missing = df[col].isna()
    df.loc[missing, col] = inferred_answer[missing]

Time_spent_Alone
Stage_fear
Social_event_attendance
Going_outside
Drained_after_socializing
Friends_circle_size
Post_frequency


# 得到X与y

In [9]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# One-hot encode the two Yes/No columns: one indicator column per category value.
dummies = pd.get_dummies(df[["Stage_fear", "Drained_after_socializing"]])
X = pd.concat([
    df[["Time_spent_Alone", "Social_event_attendance", "Going_outside", "Friends_circle_size", "Post_frequency"]],
    dummies
], axis=1)
print(X.columns)

# Cast boolean indicator columns to int so that every feature column is numeric.
X_bool_to_int = X.copy()
bool_columns = X.select_dtypes(include=['bool']).columns
X_bool_to_int[bool_columns] = X_bool_to_int[bool_columns].astype(int)

# Encode the Personality labels as integers. LabelEncoder orders classes by sorted
# label, so Extrovert -> 0 and Introvert -> 1 (assuming no other label values occur).
y = LabelEncoder().fit_transform(df["Personality"])
print(type(y))

# Split the converted frame. Previously X_bool_to_int was built but the unconverted
# X was passed here, which left the conversion above without any effect.
X_train, X_test, y_train, y_test = train_test_split(X_bool_to_int, y, test_size=0.2, random_state=0)

# float32 matches the default dtype of torch.nn.Linear parameters.
X_train_np_arr = X_train.to_numpy(dtype=np.float32)
X_test_np_arr = X_test.to_numpy(dtype=np.float32)

Index(['Time_spent_Alone', 'Social_event_attendance', 'Going_outside',
       'Friends_circle_size', 'Post_frequency', 'Stage_fear_No',
       'Stage_fear_Yes', 'Drained_after_socializing_No',
       'Drained_after_socializing_Yes'],
      dtype='object')
<class 'numpy.ndarray'>


In [7]:
import torch
# Copy the feature arrays into tensors; the dtype follows the source arrays.
X_train_tensor, X_test_tensor = (
    torch.tensor(features) for features in (X_train_np_arr, X_test_np_arr)
)
# Copy the label arrays into tensors, again keeping their dtype and 1-D shape.
# NOTE(review): a loss applied to the model's sigmoid output (shape (n, 1)) will
# presumably need float labels of the same shape; confirm in the training code.
y_train_tensor, y_test_tensor = (
    torch.tensor(labels) for labels in (y_train, y_test)
)

# 训练模型

In [None]:
from torch import nn
class Classifier(nn.Module):
    """Feed-forward binary classifier: two ReLU hidden layers and a sigmoid output.

    There is no separate "input layer" module. The input tensor itself plays that
    role, and ``hidden1`` maps its ``in_features`` columns to the first hidden layer.
    The hidden widths are hyperparameters with no single correct value; the defaults
    keep the sizes this notebook started with.

    Args:
        in_features: Number of feature columns in each input row. The default of 9
            matches the nine columns of the feature frame built in this notebook.
        hidden1_features: Width of the first hidden layer (default 12).
        hidden2_features: Width of the second hidden layer (default 8).
    """

    def __init__(self, in_features=9, hidden1_features=12, hidden2_features=8):
        super().__init__()
        self.hidden1 = nn.Linear(in_features, hidden1_features)
        self.act1 = nn.ReLU()
        self.hidden2 = nn.Linear(hidden1_features, hidden2_features)
        self.act2 = nn.ReLU()
        # A single output unit followed by a sigmoid gives one value in [0, 1] per row.
        self.output = nn.Linear(hidden2_features, 1)
        self.act_output = nn.Sigmoid()

    def forward(self, x):
        """Return a tensor of shape ``(..., 1)`` for input of shape ``(..., in_features)``."""
        x = self.act1(self.hidden1(x))
        x = self.act2(self.hidden2(x))
        x = self.act_output(self.output(x))
        return x

# Build the network with its default layer sizes and print its layer summary.
model = Classifier()
print(model)