In [3]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, f1_score, recall_score
import numpy as np

# Load the street tree census CSV.
# NOTE(review): this is an absolute, machine-specific path; it is kept unchanged
# here but hoisted into a constant so it can be adjusted in one place.
DATA_PATH = "/home/xlordplay/tree_state_classificator/tree_state_classification/data/2015-street-tree-census-tree-data.csv"
df = pd.read_csv(DATA_PATH)

# Encode the target as integer class ids. Values of `health` that are not in
# the mapping (including missing ones) become NaN and are dropped below.
df['health_encoded'] = df['health'].map({"Good": 2, "Fair": 1, "Poor": 0})

# Feature columns used by the model.
important_features = [
    'y_sp', 'x_sp', 'longitude', 'latitude', 'tree_dbh', 'census tract',
    'council district', 'trnk_other', 'brch_other', 'spc_latin'
]
# Subset of the features that is one-hot encoded; the rest is passed through.
categorical_features = ['council district', 'trnk_other', 'brch_other', 'spc_latin']

# Drop rows with a missing value in any feature or in the target.
df = df.dropna(subset=important_features + ['health_encoded'])

# Feature matrix and target vector.
X = df[important_features]
y = df['health_encoded']

# One-hot encode the categorical columns. `handle_unknown='ignore'` makes
# `transform` emit an all-zero group for a category that was not seen during
# `fit` instead of raising, so the fitted transformer can be reused on new
# data; the output of `fit_transform` on the training data is unaffected.
column_transformer = ColumnTransformer(
    transformers=[
        (
            'cat',
            OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
            categorical_features,
        ),
    ],
    remainder='passthrough'
)

X_encoded = column_transformer.fit_transform(X)

# Standardize every column (one-hot columns included).
# NOTE(review): the encoder and scaler are fitted on the full dataset, i.e.
# before any train/test split that happens later in the notebook.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)

# Convert to tensors: float32 features, int64 class labels.
X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
y_tensor = torch.tensor(y.to_numpy(), dtype=torch.long)

class ImprovedHealthModel(nn.Module):
    """Two-hidden-layer MLP classifier that outputs log-probabilities.

    Each hidden layer is Linear -> BatchNorm1d -> ReLU -> Dropout. The final
    layer is followed by ``LogSoftmax`` over dim 1, so the output is meant to
    be paired with ``nn.NLLLoss``.

    Args:
        input_dim: Number of input features. When ``None`` (the default, which
            keeps the original zero-argument construction working) it is read
            from the notebook-level ``X_scaled`` array.
        hidden1: Width of the first hidden layer.
        hidden2: Width of the second hidden layer.
        num_classes: Number of output classes.
        dropout: Dropout probability applied after each hidden layer.
    """

    def __init__(self, input_dim=None, hidden1=128, hidden2=64, num_classes=3, dropout=0.5):
        super().__init__()
        if input_dim is None:
            # Backward-compatible fallback: infer the width from the global
            # feature matrix, exactly as the original constructor did.
            input_dim = X_scaled.shape[1]

        self.fc1 = nn.Linear(input_dim, hidden1)
        self.bn1 = nn.BatchNorm1d(hidden1)
        self.dropout1 = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.bn2 = nn.BatchNorm1d(hidden2)
        self.dropout2 = nn.Dropout(dropout)
        self.fc3 = nn.Linear(hidden2, num_classes)
        self.relu = nn.ReLU()
        # Attribute name kept for compatibility; this is a *log*-softmax.
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Return per-class log-probabilities for a 2-D batch ``x``.

        ``x`` must have ``input_dim`` columns (it is fed to ``fc1``); the
        result has one row per input row and ``num_classes`` columns.
        """
        x = self.relu(self.bn1(self.fc1(x)))
        x = self.dropout1(x)
        x = self.relu(self.bn2(self.fc2(x)))
        x = self.dropout2(x)
        x = self.softmax(self.fc3(x))
        return x

# Loss function. NLLLoss expects log-probabilities, which is what the model's
# final LogSoftmax layer produces.
criterion = nn.NLLLoss()

# Number of full-batch gradient steps per fold.
N_EPOCHS = 50

# Seed torch so weight initialisation and dropout are reproducible.
torch.manual_seed(42)

# Cross-validation splitter.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

accuracies = []
f1_scores = []
recalls = []

# NOTE(review): X_tensor was built from features that were encoded and
# standardized on the whole dataset before splitting, so the held-out folds
# are not fully unseen by the preprocessing step.
for train_index, test_index in kf.split(X_tensor):
    X_train_tensor, X_test_tensor = X_tensor[train_index], X_tensor[test_index]
    y_train_tensor, y_test_tensor = y_tensor[train_index], y_tensor[test_index]

    # Build a fresh model and optimizer for every fold. The original code
    # referenced an undefined `HealthModel` and created a single model outside
    # the loop, so later folds were evaluated on rows the model had already
    # been trained on in earlier folds.
    model = ImprovedHealthModel()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Train on the whole training fold as one batch per epoch.
    model.train()
    for epoch in range(N_EPOCHS):
        optimizer.zero_grad()
        output = model(X_train_tensor)
        loss = criterion(output, y_train_tensor)

        loss.backward()
        optimizer.step()

    # Evaluate on the held-out fold.
    model.eval()
    with torch.no_grad():
        output_test = model(X_test_tensor)
        predicted = output_test.argmax(dim=1)

    # Hand plain NumPy arrays to the scikit-learn metrics.
    y_true = y_test_tensor.numpy()
    y_pred = predicted.numpy()

    accuracies.append(accuracy_score(y_true, y_pred))
    f1_scores.append(f1_score(y_true, y_pred, average='weighted'))
    recalls.append(recall_score(y_true, y_pred, average='weighted'))

# Report mean and standard deviation across folds.
print(f"Средняя точность: {np.mean(accuracies):.2f} ± {np.std(accuracies):.2f}")
print(f"Средняя F1-мера: {np.mean(f1_scores):.2f} ± {np.std(f1_scores):.2f}")
print(f"Средняя полнота: {np.mean(recalls):.2f} ± {np.std(recalls):.2f}")

Средняя точность: 0.81 ± 0.00
Средняя F1-мера: 0.73 ± 0.00
Средняя полнота: 0.81 ± 0.00


In [1]:
import pandas as pd

# Build a small hand-written example frame for inference.
# Column names must match the training feature names exactly, which use a
# space ('census tract', 'council district'), not an underscore.
inference_data = {
    'y_sp': [40.7128, 40.7128, 40.7129],
    'x_sp': [-74.0060, -74.0061, -74.0062],
    'longitude': [-74.0060, -74.0061, -74.0062],
    'latitude': [40.7128, 40.7129, 40.7130],
    'tree_dbh': [10.5, 12.0, 15.0],
    # Numeric values: 'census tract' is standardized as a numeric column in
    # the training pipeline. The CSV written below is the same as with strings.
    'census tract': [123456, 123456, 123456],
    'council district': [1, 1, 2],
    # NOTE(review): the training data drops rows where these two columns are
    # missing, so the fitted encoder has no category for a missing value -
    # confirm how missing values should be handled at inference time.
    'trnk_other': [None, None, None],
    'brch_other': [None, None, None],
    'spc_latin': ['Acer saccharum', 'Quercus rubra', 'Pinus sylvestris']
}

inference_df = pd.DataFrame(inference_data)
inference_df.to_csv("inference_data.csv", index=False)

In [2]:
import pandas as pd
import numpy as np

# Parameters
SEED = 42  # fixed seed so the generated file is reproducible
num_samples = 100  # number of rows to generate
species = ['Acer saccharum', 'Quercus rubra', 'Pinus sylvestris']  # example species
census_tracts = [123456, 123457, 123458]  # example census tracts
council_districts = [1, 2, 3]  # example council districts

rng = np.random.default_rng(SEED)

# Candidate values as object arrays. Without dtype=object NumPy coerces the
# list to a string array and the missing value becomes the literal string
# 'nan' instead of a real NaN.
# NOTE(review): 'value1'/'valueA' etc. are placeholders - confirm they match
# the categories present in the training data.
trnk_other_values = np.array([np.nan, 'value1', 'value2'], dtype=object)
brch_other_values = np.array([np.nan, 'valueA', 'valueB'], dtype=object)

# Generate random data.
# NOTE(review): x_sp / y_sp are sampled from the same ranges as longitude /
# latitude - confirm this matches the units of those columns in the training CSV.
data = {
    'y_sp': rng.uniform(40.5, 41.0, num_samples),
    'x_sp': rng.uniform(-75.0, -73.5, num_samples),
    'longitude': rng.uniform(-75.0, -73.5, num_samples),
    'latitude': rng.uniform(40.5, 41.0, num_samples),
    'tree_dbh': rng.uniform(10, 20, num_samples),  # tree diameter
    'census tract': rng.choice(census_tracts, num_samples),
    'council district': rng.choice(council_districts, num_samples),
    'trnk_other': rng.choice(trnk_other_values, num_samples),
    'brch_other': rng.choice(brch_other_values, num_samples),
    'spc_latin': rng.choice(species, num_samples)
}

# Build the DataFrame.
inference_df = pd.DataFrame(data)

# Save to CSV (missing values are written as empty fields).
inference_df.to_csv('inference_data.csv', index=False)

print("Данные успешно сгенерированы и сохранены в inference_data.csv!")

Данные успешно сгенерированы и сохранены в inference_data.csv!
