<a href="https://colab.research.google.com/github/ShawnFromChineseTaipei/class_test/blob/main/%E5%B0%88%E6%A1%882ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [59]:
import os
from re import L
from time import asctime, localtime, time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split
import lightgbm as lgbm

x_train = pd.read_csv('/content/train.csv')
x_test = pd.read_csv('/content/test.csv')

submission = pd.DataFrame(
    columns=["PassengerId", "Transported"], data=x_test["PassengerId"])

y_train = x_train["Transported"]
x_train = x_train.drop(columns=["Transported", ])

float_features = ["Age", "RoomService",
                  "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "most_spent", "least_spent", "std_spent", "total_spent"]

label_encoders = ["FirstName",
                  "LastName",
                  "num", "GroupId", ]
onehot_encoders = ["HomePlanet", "CryoSleep",
                   "deck", "side", "Destination", "VIP"]


def fill_nulls(df):

    # 缺失值利用平均數來進行補值，年齡的部分則用0進行補值
    for i in float_features:
        if i != "Age":
            df[i] = df[i].fillna(0)
        else:
            df[i] = SimpleImputer(
                strategy="mean").fit_transform(df[[i]])

    # label encoding and one hot encoding
    for j in label_encoders:
        df[j] = LabelEncoder().fit_transform(df[j])
    for k in onehot_encoders:
        df[k] = OneHotEncoder().fit_transform(df[[i]]).toarray()
    return df


def feature_engineering(df):

    # 計算各特徵的統計資訊
    df["most_spent"] = df[["RoomService",
                           "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]].max(axis=1)
    df["least_spent"] = df[["RoomService",
                            "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]].min(axis=1)
    df["std_spent"] = df[["RoomService",
                          "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]].std(axis=1)
    df["total_spent"] = df[["RoomService",
                            "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]].sum(axis=1)

    # 將機艙分成三個特徵
    df[['deck', 'num', 'side']] = df['Cabin'].str.split('/', expand=True)
    df = df.drop(columns=["Cabin", ])

    # 如果他在睡眠中，或是年齡不及12歲，則假設為0
    df['total_spent'] = df.apply(
        lambda row: 0 if row["CryoSleep"] == True or row["Age"] <= 12 else row['total_spent'],
        axis=1
    )
    df['most_spent'] = df.apply(
        lambda row: 0 if row["CryoSleep"] == True or row["Age"] <= 12 else row['most_spent'],
        axis=1
    )
    df['least_spent'] = df.apply(
        lambda row: 0 if row["CryoSleep"] == True or row["Age"] <= 12 else row['least_spent'],
        axis=1
    )
    df['std_spent'] = df.apply(
        lambda row: 0 if row["CryoSleep"] == True or row["Age"] <= 12 else row['std_spent'],
        axis=1
    )

    df['FirstName'] = df['Name'].str.split(' ', expand=True)[0]
    df['LastName'] = df['Name'].str.split(' ', expand=True)[1]
    df.drop(columns=['Name'], inplace=True)

    df['GroupId'] = df['PassengerId'].str.split('_', expand=True)[
        0]
    return df


# 轉換測試集以及訓練集
x_train = feature_engineering(x_train)
x_train = fill_nulls(x_train)
x_train = x_train.drop(columns=['PassengerId'])

x_test = feature_engineering(x_test)
x_test = fill_nulls(x_test)
x_test = x_test.drop(columns=['PassengerId'])

feature_names = x_train.columns
print("Number of features: ", len(feature_names))

y_preds = []

skfold = StratifiedKFold(n_splits=5)
for fold, (train_id, test_id) in enumerate(skfold.split(x_train, y_train)):

    # split into the folds
    X_train = x_train.iloc[train_id]
    Y_train = y_train.iloc[train_id]
    X_test = x_train.iloc[test_id]
    Y_test = y_train.iloc[test_id]

    X_train = np.asarray(X_train).astype('float32')
    X_test = np.asarray(X_test).astype('float32')
    Y_train = np.asarray(Y_train).astype('float32')
    Y_test = np.asarray(Y_test).astype('float32')

    # # run the model on the fold
    model = RandomForestClassifier(n_estimators=500, max_depth=5)
    model.fit(X_train, Y_train)
    print(f"Model score: {model.score(X_test, Y_test)}")
    pred = model.predict(x_test)
    y_preds.append(pred)

pred = sum(y_preds) / len(y_preds)
submission['Transported'] = pred
submission['Transported'] = np.where(
    submission['Transported'] > 0.5, True, False)

os.makedirs('submissions/random_forests', exist_ok=True)
submission.to_csv('submissions/random_forests/out.csv', index=False)
plt.show()

Number of features:  20
Model score: 0.5227142035652674
Model score: 0.7607820586543991
Model score: 0.7590569292696953
Model score: 0.7871116225546605
Model score: 0.6052934407364787
