<a href="https://colab.research.google.com/github/RyutoYoda/wood_life_from_pytorch/blob/main/SBMC_wood_PyTorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive inside the Colab runtime so files stored there can be read.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Switch the working directory to the project folder on the mounted Drive.
import os

PROJECT_DIR = '/content/drive/MyDrive/SMBC'
os.chdir(PROJECT_DIR)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader
from torch import nn

In [None]:
# Load the sample submission file (no header row; first column used as the index),
# then the test and training tables.
sample_submit = pd.read_csv('sample_submission.csv', header=None, index_col=0)
test_df = pd.read_csv('test.csv')
train_df = pd.read_csv('train.csv')

In [None]:
# Keep the 'Unnamed: 0' column of both frames; the test ids are added back as a
# column when the submission file is built.
train_ids, test_ids = (frame['Unnamed: 0'] for frame in (train_df, test_df))

In [None]:
# Compare the number of test rows with the number of rows in the sample submission.
n_test_rows = len(test_df)
n_sample_rows = len(sample_submit)
print("テストデータの行数:", n_test_rows)
print("サンプル提出データの行数:", n_sample_rows)

テストデータの行数: 19702
サンプル提出データの行数: 19702


In [None]:
# Name of the target column to predict; it is split off from the features before training.
target: str = 'health'

In [None]:
# Split the columns into categorical and numerical feature lists.
# 'Unnamed: 0' is the row identifier that is saved separately for the submission;
# it is numeric, so it must be excluded explicitly or the model would train on it.
ID_COLUMN = 'Unnamed: 0'

# The date column is left out at this stage. Filtering (rather than list.remove)
# also avoids a ValueError if 'created_at' is not among the object-typed columns.
categorical_features = [
    col
    for col in train_df.select_dtypes(include=['object', 'category']).columns
    if col != 'created_at'
]

# Numeric features exclude both the prediction target and the row identifier.
numerical_features = [
    col
    for col in train_df.select_dtypes(include=['int64', 'float64']).columns
    if col not in (target, ID_COLUMN)
]

In [None]:
# Pipeline for numerical variables: fill missing values with the median, then standardize.
numerical_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numerical_pipeline = Pipeline(steps=numerical_steps)

# Pipeline for categorical variables: fill missing values with the most frequent
# value, then one-hot encode (categories not seen during fit are ignored).
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
]
categorical_pipeline = Pipeline(steps=categorical_steps)

In [None]:
# Use a ColumnTransformer to combine the numerical and categorical processing,
# routing each list of columns to its own pipeline.
column_transformers = [
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [None]:
# Preview the first five rows of the training data.
train_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,created_at,tree_dbh,curb_loc,health,steward,guards,sidewalk,user_type,problems,...,nta,nta_name,borocode,boro_ct,boroname,zip_city,cb_num,st_senate,st_assem,cncldist
0,0,2015-06-29,14,OnCurb,1,,,Damage,Volunteer,,...,QN45,Douglas Manor-Douglaston-Little Neck,4,4152901,Queens,Little Neck,411,11,25,23
1,1,2016-09-21,5,OnCurb,1,3or4,Helpful,NoDamage,Volunteer,,...,BX05,Bedford Park-Fordham North,2,2039901,Bronx,Bronx,207,33,78,15
2,2,2015-09-13,26,OnCurb,2,,,NoDamage,Volunteer,StonesBranchLights,...,SI01,Annadale-Huguenot-Prince's Bay-Eltingville,5,5017011,Staten Island,Staten Island,503,24,62,51
3,3,2016-05-09,15,OnCurb,0,,,Damage,NYC Parks Staff,,...,SI11,Charleston-Richmond Valley-Tottenville,5,5024401,Staten Island,Staten Island,503,24,62,51
4,4,2016-06-24,23,OnCurb,1,,,NoDamage,Volunteer,Stones,...,MN03,Central Harlem North-Polo Grounds,1,1022102,Manhattan,New York,110,30,70,9


In [None]:
# Preview the first five rows of the test data.
test_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,created_at,tree_dbh,curb_loc,health,steward,guards,sidewalk,user_type,problems,...,nta,nta_name,borocode,boro_ct,boroname,zip_city,cb_num,st_senate,st_assem,cncldist
0,0,2015-06-29,14,OnCurb,1,,,Damage,Volunteer,,...,QN45,Douglas Manor-Douglaston-Little Neck,4,4152901,Queens,Little Neck,411,11,25,23
1,1,2016-09-21,5,OnCurb,1,3or4,Helpful,NoDamage,Volunteer,,...,BX05,Bedford Park-Fordham North,2,2039901,Bronx,Bronx,207,33,78,15
2,2,2015-09-13,26,OnCurb,2,,,NoDamage,Volunteer,StonesBranchLights,...,SI01,Annadale-Huguenot-Prince's Bay-Eltingville,5,5017011,Staten Island,Staten Island,503,24,62,51
3,3,2016-05-09,15,OnCurb,0,,,Damage,NYC Parks Staff,,...,SI11,Charleston-Richmond Valley-Tottenville,5,5024401,Staten Island,Staten Island,503,24,62,51
4,4,2016-06-24,23,OnCurb,1,,,NoDamage,Volunteer,Stones,...,MN03,Central Harlem North-Polo Grounds,1,1022102,Manhattan,New York,110,30,70,9


In [None]:
# Separate the features from the target and hold out 20% of the rows for validation.
# `stratify=y` keeps the class proportions of the target the same in both splits,
# so the macro-F1 measured on the validation set is computed on a representative
# class mix instead of depending on how the random draw happened to fall.
X = train_df.drop(columns=[target])
y = train_df[target]
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Fit the preprocessing pipeline on the training split and transform it, then
# apply the already-fitted pipeline to the validation split.
X_train_processed = preprocessor.fit_transform(X=X_train)
X_val_processed = preprocessor.transform(X=X_val)

In [None]:
# Convert the preprocessed features and the labels into tensors for the datasets
# and data loaders.
def _features_to_tensor(matrix):
    """Return `matrix` as a dense float32 tensor.

    The ColumnTransformer output may be a SciPy sparse matrix or a dense NumPy
    array depending on how sparse the stacked result is, so `.toarray()` is only
    called when the object provides it.
    """
    dense = matrix.toarray() if hasattr(matrix, 'toarray') else np.asarray(matrix)
    return torch.tensor(dense.astype(np.float32))


X_train_tensor = _features_to_tensor(X_train_processed)
y_train_tensor = torch.tensor(y_train.values.astype(np.int64))
X_val_tensor = _features_to_tensor(X_val_processed)
y_val_tensor = torch.tensor(y_val.values.astype(np.int64))

In [None]:
# Mini-batch size shared by both loaders.
batch_size = 64

# Pair each feature tensor with its label tensor.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

# Training batches are shuffled; validation batches keep their order.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Neural network model definition
class NeuralNet(nn.Module):
    """Fully connected classifier with two hidden ReLU layers.

    Args:
        num_features: Size of each input feature vector.
        hidden_size: Width of both hidden layers. Defaults to 64.
        num_classes: Number of output logits. Defaults to 3.
    """

    def __init__(self, num_features, hidden_size=64, num_classes=3):
        super().__init__()
        self.layer1 = nn.Linear(num_features, hidden_size)
        self.layer2 = nn.Linear(hidden_size, hidden_size)
        self.output_layer = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the raw class logits for a batch `x`.

        No softmax is applied here; the output is one score per class and row.
        """
        x = torch.relu(self.layer1(x))
        x = torch.relu(self.layer2(x))
        return self.output_layer(x)

In [None]:
# Seed PyTorch's global random number generator before the model is built, so the
# random weight initialisation (and later draws from the same generator) repeat
# on a fresh Restart & Run All. 42 matches the random_state used for the data split.
torch.manual_seed(42)

# The input width is the number of columns produced by the preprocessing step.
model = NeuralNet(X_train_tensor.shape[1])

In [None]:
# Loss function and optimizer: cross-entropy on the model outputs, and Adam over
# all model parameters.
learning_rate = 0.001
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

In [None]:
from sklearn.metrics import f1_score

# 1. Train the model and evaluate it on the validation split after every epoch.
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    # Accumulate a sample-weighted sum of the batch losses so the value reported
    # for the epoch is the mean over the whole pass, not just the last mini-batch
    # (which is noisy and can make the loss appear to jump between epochs).
    running_loss = 0.0
    seen_samples = 0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_count = inputs.size(0)
        running_loss += loss.item() * batch_count
        seen_samples += batch_count

    # Guard against an empty loader instead of failing on an undefined `loss`.
    epoch_loss = running_loss / seen_samples if seen_samples else float('nan')

    # Validation: collect labels and predicted classes over the whole split.
    model.eval()
    all_labels = []
    all_predictions = []
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            # The predicted class is the index of the largest output per row.
            predicted = outputs.argmax(dim=1)
            all_labels.extend(labels.tolist())
            all_predictions.extend(predicted.tolist())

    val_f1 = f1_score(all_labels, all_predictions, average='macro')
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss}, Validation F1 Score (Macro): {val_f1}')

Epoch 1/10, Loss: 0.7279682755470276, Validation F1 Score (Macro): 0.2938295788442703
Epoch 2/10, Loss: 0.3397897183895111, Validation F1 Score (Macro): 0.2938295788442703
Epoch 3/10, Loss: 0.6515147089958191, Validation F1 Score (Macro): 0.2938295788442703
Epoch 4/10, Loss: 0.5735658407211304, Validation F1 Score (Macro): 0.2938295788442703
Epoch 5/10, Loss: 0.535921037197113, Validation F1 Score (Macro): 0.2938295788442703
Epoch 6/10, Loss: 0.40898531675338745, Validation F1 Score (Macro): 0.2935685615607206
Epoch 7/10, Loss: 0.6121312975883484, Validation F1 Score (Macro): 0.2938295788442703
Epoch 8/10, Loss: 0.44027993083000183, Validation F1 Score (Macro): 0.2950893900976466
Epoch 9/10, Loss: 0.668121337890625, Validation F1 Score (Macro): 0.301191229086203
Epoch 10/10, Loss: 0.5917460322380066, Validation F1 Score (Macro): 0.3085346918200726


In [None]:
# 2. Generate the model inputs for the test data using the already-fitted preprocessor.
X_test_processed = preprocessor.transform(test_df)

# The transformer output may be a SciPy sparse matrix or a dense NumPy array
# depending on how sparse the stacked result is; only densify when needed so a
# dense result does not fail on a missing `.toarray()`.
X_test_dense = (
    X_test_processed.toarray()
    if hasattr(X_test_processed, 'toarray')
    else np.asarray(X_test_processed)
)
X_test_tensor = torch.tensor(X_test_dense.astype(np.float32))

In [None]:
# The test tensor is already built in the previous cell, so it is validated here
# rather than rebuilt a second time: one row per test record, and the same
# number of feature columns as the training tensor the model was sized for.
assert X_test_tensor.shape[0] == len(test_df), "test tensor row count differs from test_df"
assert X_test_tensor.shape[1] == X_train_tensor.shape[1], "test and train feature widths differ"

In [None]:
# Predict on the test data: run the model in eval mode without gradient tracking
# and take, for each row, the index of the largest output as the predicted class.
model.eval()
with torch.no_grad():
    test_outputs = model(X_test_tensor)
    test_predictions = test_outputs.max(dim=1).indices

In [None]:
# Build the submission frame: the saved test ids next to the predicted classes.
submission_df = pd.DataFrame({
    'Id': test_ids,
    'Prediction': test_predictions.numpy()
})

# Check the row count against the sample submission file instead of assuming it.
assert len(submission_df) == len(sample_submit), (
    f"submission has {len(submission_df)} rows, sample file has {len(sample_submit)}"
)

# Save the submission file without the index and without a header row.
# `header=False` is the documented boolean form; `None` only worked by being falsy.
submission_df.to_csv('/content/drive/MyDrive/SMBC/submission2.csv', index=False, header=False)