In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Seed NumPy's and PyTorch's global random number generators for reproducibility.

seed = 42
np.random.seed(seed)
# Left as the cell's final expression, so the notebook displays the returned torch Generator.
torch.manual_seed(seed)

<torch._C.Generator at 0x7a7ee236ff30>

In [None]:
# Location of the California Housing Prices CSV file
url = 'https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.csv'

# Download the CSV and parse it into a DataFrame
housing = pd.read_csv(url)

# Header line followed by the first rows of the table
print("Первые несколько строк датасета:", housing.head(), sep="\n")

# Distinct labels found in the categorical 'ocean_proximity' column
unique_values = housing['ocean_proximity'].unique()
print("\nУникальные значения в столбце 'ocean_proximity':", unique_values, sep="\n")

Первые несколько строк датасета:
   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  

Уникальные значения в столбце 'ocean_prox

In [None]:
# Prefer a CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

Using device: cpu


In [None]:
# Preview the first five rows of the dataset.
print(housing.head(n=5))

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  


In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the dataset's columns.
summary_stats = housing.describe()
print(summary_stats)

          longitude      latitude  housing_median_age   total_rooms  \
count  20640.000000  20640.000000        20640.000000  20640.000000   
mean    -119.569704     35.631861           28.639486   2635.763081   
std        2.003532      2.135952           12.585558   2181.615252   
min     -124.350000     32.540000            1.000000      2.000000   
25%     -121.800000     33.930000           18.000000   1447.750000   
50%     -118.490000     34.260000           29.000000   2127.000000   
75%     -118.010000     37.710000           37.000000   3148.000000   
max     -114.310000     41.950000           52.000000  39320.000000   

       total_bedrooms    population    households  median_income  \
count    20433.000000  20640.000000  20640.000000   20640.000000   
mean       537.870553   1425.476744    499.539680       3.870671   
std        421.385070   1132.462122    382.329753       1.899822   
min          1.000000      3.000000      1.000000       0.499900   
25%        296.00000

In [None]:
# Number of missing values in each column (isna is the same check as isnull).
print(housing.isna().sum())

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64


In [None]:
# Encode the categorical 'ocean_proximity' column as integers.
# Labels present in the data: ['NEAR BAY' '<1H OCEAN' 'INLAND' 'NEAR OCEAN' 'ISLAND']
# NOTE(review): integer codes impose an arbitrary order on a nominal feature;
# one-hot encoding may be a better fit for the model — confirm before changing.
ocean_proximity_mapping = {
    '<1H OCEAN': 0,
    'INLAND': 1,
    'NEAR OCEAN': 2,
    'NEAR BAY': 3,
    'ISLAND': 4
}

# Series.map turns every value that is not a key of the mapping into NaN.
# Encoding only while the column is still non-numeric keeps this cell idempotent:
# re-running it no longer maps the already-encoded integers to NaN.
if not pd.api.types.is_numeric_dtype(housing['ocean_proximity']):
    # Fail loudly on unexpected labels instead of letting them become NaN and
    # then be silently replaced by the median further down.
    unknown_categories = set(housing['ocean_proximity'].dropna().unique()) - set(ocean_proximity_mapping)
    if unknown_categories:
        raise ValueError(f"Unmapped ocean_proximity categories: {sorted(unknown_categories)}")
    housing['ocean_proximity'] = housing['ocean_proximity'].map(ocean_proximity_mapping)

# Missing values per column before imputation
print(housing.isnull().sum())

# Replace missing values with the median of their column (reassignment instead
# of inplace=True; numeric_only guards against any remaining non-numeric column).
# NOTE(review): the medians are computed over all rows, including the rows that
# later end up in the test split — confirm whether that is acceptable.
housing = housing.fillna(housing.median(numeric_only=True))

# Missing values per column after imputation
print("Проверка на наличие NaN значений после заполнения:")
print(housing.isnull().sum())

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64
Проверка на наличие NaN значений после заполнения:
longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64


In [None]:
# Separate the feature matrix from the target column.
features = housing.drop('median_house_value', axis=1).values
target = housing['median_house_value'].values

# Row indices of the training split. The arguments (test_size=0.2, random_state=42)
# must mirror the train_test_split call that later creates X_train / X_test:
# for a given number of rows, test_size and random_state, the same rows are selected.
train_idx, _ = train_test_split(np.arange(len(features)), test_size=0.2, random_state=42)

# Standardize the features with statistics estimated on the training rows only,
# so that no information from the test rows leaks into the preprocessing.
scaler = StandardScaler()
scaler.fit(features[train_idx])
X = scaler.transform(features)

# Standardize the target the same way; y_mean / y_std are kept so that
# predictions can be converted back to the original units.
y_mean = target[train_idx].mean()
y_std = target[train_idx].std()
y = (target - y_mean) / y_std

# Sanity check: the scaled arrays must not contain NaN
print("Проверка на наличие NaN значений после нормализации:")
print(np.isnan(X).sum())
print(np.isnan(y).sum())

Проверка на наличие NaN значений после нормализации:
0
0


In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert the NumPy arrays to float32 tensors; the targets become (N, 1) column vectors.
X_train, X_test = (torch.tensor(part, dtype=torch.float32) for part in (X_train, X_test))
y_train, y_test = (torch.tensor(part, dtype=torch.float32).view(-1, 1) for part in (y_train, y_test))

# 3. Build the regression network as a Sequential model:
#    two hidden layers (64 and 32 units) with ReLU, and a single linear output.
n_features = X_train.shape[1]
model = nn.Sequential(
    nn.Linear(n_features, 64),
    nn.ReLU(),
    nn.Linear(64, 32),
    nn.ReLU(),
    nn.Linear(32, 1),
)


In [None]:
# Put the model and the data on the device selected earlier in the notebook.
# Separate *_dev names are used so the original CPU tensors stay untouched.
model = model.to(device)
X_train_dev, y_train_dev = X_train.to(device), y_train.to(device)
X_test_dev, y_test_dev = X_test.to(device), y_test.to(device)

# Loss function and optimizer (created after the model has been moved to the device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.003)

# 4. Train the model on the training data (full-batch: one optimizer step per epoch)
num_epochs = 100
model.train()  # the mode does not change inside the loop, so set it once
for epoch in range(num_epochs):
    optimizer.zero_grad()

    # Forward pass
    outputs = model(X_train_dev)
    loss = criterion(outputs, y_train_dev)

    # Backward pass and parameter update
    loss.backward()
    optimizer.step()

    # Report the training loss every 10th epoch
    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# 5. Evaluate the model on the test data.
# The target was standardized earlier in the notebook, so this RMSE is expressed
# in standardized target units, not in the original units of median_house_value.
model.eval()
with torch.no_grad():
    predictions = model(X_test_dev)
    test_loss = criterion(predictions, y_test_dev)
    rmse = torch.sqrt(test_loss)
    print(f'Test RMSE: {rmse.item():.4f}')

# Cross-check the same metric with scikit-learn.
# .cpu() is required before .numpy() whenever the predictions live on a GPU.
predictions_np = predictions.cpu().numpy()
y_test_np = y_test.numpy()
mse = mean_squared_error(y_test_np, predictions_np)
rmse = np.sqrt(mse)
print(f'Test RMSE: {rmse:.4f}')


Epoch [10/100], Loss: 0.5611
Epoch [20/100], Loss: 0.4171
Epoch [30/100], Loss: 0.3419
Epoch [40/100], Loss: 0.3191
Epoch [50/100], Loss: 0.3031
Epoch [60/100], Loss: 0.2911
Epoch [70/100], Loss: 0.2811
Epoch [80/100], Loss: 0.2724
Epoch [90/100], Loss: 0.2649
Epoch [100/100], Loss: 0.2582
Test RMSE: 0.5205
Test RMSE: 0.5205
