In [1]:
import pandas as pd
import numpy as np
from sklearn import ensemble
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

df_read = pd.read_csv('nyc-rolling-sales.csv', encoding='iso-8859-1')

Удаление лишних столбцов

In [2]:
df = df_read.copy()

del df['BLOCK']
del df['LOT']
del df['ADDRESS']
del df['APARTMENT NUMBER']
del df['ZIP CODE']
del df['Unnamed: 0']

categorical_column = ["BUILDING CLASS AT TIME OF SALE", "TAX CLASS AT TIME OF SALE", "EASE-MENT","BUILDING CLASS AT PRESENT", "TAX CLASS AT PRESENT", "BUILDING CLASS CATEGORY", "NEIGHBORHOOD", "BOROUGH"]

num_column = ["SALE PRICE", "GROSS SQUARE FEET", "LAND SQUARE FEET", "TOTAL UNITS", "COMMERCIAL UNITS", "RESIDENTIAL UNITS"]

date_column = ["SALE DATE", "YEAR BUILT"]

df.info

<bound method DataFrame.info of        BOROUGH   NEIGHBORHOOD                      BUILDING CLASS CATEGORY  \
0            1  ALPHABET CITY  07 RENTALS - WALKUP APARTMENTS                
1            1  ALPHABET CITY  07 RENTALS - WALKUP APARTMENTS                
2            1  ALPHABET CITY  07 RENTALS - WALKUP APARTMENTS                
3            1  ALPHABET CITY  07 RENTALS - WALKUP APARTMENTS                
4            1  ALPHABET CITY  07 RENTALS - WALKUP APARTMENTS                
...        ...            ...                                          ...   
84543        5        WOODROW  02 TWO FAMILY DWELLINGS                       
84544        5        WOODROW  02 TWO FAMILY DWELLINGS                       
84545        5        WOODROW  02 TWO FAMILY DWELLINGS                       
84546        5        WOODROW  22 STORE BUILDINGS                            
84547        5        WOODROW  35 INDOOR PUBLIC AND CULTURAL FACILITIES      

      TAX CLASS AT PRESENT EASE

Форматирование данных

In [3]:
df_form = df.copy()


df_form.replace({' - ': np.nan, '\\N': np.nan, 'NaN': np.nan}, inplace=True)
df_form = df_form.dropna()

# Факторизация категориальных столбцов
encoder = OneHotEncoder(sparse_output=False)
encoded_df_list = []

for name in categorical_column:
    # Применение OneHotEncoder к целевому столбцу
    encoded_columns = encoder.fit_transform(df_form[[name]])
    # Преобразование закодированных данных в DataFrame
    encoded_df = pd.DataFrame(encoded_columns, columns=encoder.get_feature_names_out([name]))
    encoded_df_list.append(encoded_df)

# Объединение закодированных столбцов с исходным DataFrame (удалив при этом исходные столбцы)
df_form = df_form.drop(columns=categorical_column)
df_form = pd.concat([df_form] + encoded_df_list, axis=1)

# Приведение столбцов с датами, к виду отрезка времени до сегодняшней даты
# Преобразование столбца 'SALE DATE' в формат datetime
df_form['SALE DATE'] = pd.to_datetime(df_form['SALE DATE'])

# Извлечение года из столбца 'SALE DATE'
df_form['SALE YEAR'] = df_form['SALE DATE'].dt.year

df_form['YEARS DIFFERENCE'] = df_form['SALE YEAR'] - df_form['YEAR BUILT']
df_form.drop(columns=['YEAR BUILT', 'SALE DATE', 'SALE YEAR'], inplace=True)

#Приведение всех числовых столбцов к типу int
for column in num_column :
    df_form[column] = pd.to_numeric(df_form[column], errors='coerce')

df_form = df_form.dropna()

Создание тестового и тренировочного датасетов

In [4]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

Y = df_form['SALE PRICE']
X = df_form.drop(['SALE PRICE'], axis = 1)
X = scaler.fit_transform(X)
train_points, test_points, train_values, test_values = train_test_split(X, Y, test_size = 0.2)

Создание модели

In [5]:

# Определение и компиляция модели
nn_model = Sequential()
nn_model.add(Dense(17, activation='relu', input_shape=(train_points.shape[1],)))
nn_model.add(Dense(15, activation='relu'))
nn_model.add(Dense(15, activation='relu'))
nn_model.add(Dense(1))

nn_model.compile(loss='mean_absolute_error', optimizer='adam')

# Обучение модели
results = nn_model.fit(
    train_points, train_values,
    epochs=5,
    batch_size=100,
    validation_data=(test_points, test_values)
)

# Предсказание
nn_predict = nn_model.predict(test_points)

# Удаление NaN и бесконечных значений перед расчетом mean_absolute_error
test_values_clean = test_values[~np.isnan(nn_predict).flatten() & ~np.isinf(nn_predict).flatten()]
nn_predict_clean = nn_predict[~np.isnan(nn_predict).flatten() & ~np.isinf(nn_predict).flatten()]

# Расчет mean_absolute_error
print(mean_absolute_error(test_values_clean, nn_predict_clean))

Epoch 1/5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m386/386[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 1322006.2500 - val_loss: 1069307.6250
Epoch 2/5
[1m386/386[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 1149811.6250 - val_loss: 947153.9375
Epoch 3/5
[1m386/386[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 1121754.6250 - val_loss: 897799.2500
Epoch 4/5
[1m386/386[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 968326.8750 - val_loss: 876338.1875
Epoch 5/5
[1m386/386[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 934508.6250 - val_loss: 862617.3750
[1m302/302[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 912us/step
862617.5208734794


In [6]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(train_points, train_values)
lr_predict = model.predict(test_points)
print(mean_absolute_error(test_values, lr_predict))

5.557537518842349e+18


In [7]:
import xgboost as xgb
xg_reg = xgb.XGBRegressor(objective = 'reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 100)
xg_reg.fit(train_points, train_values)
xgb_predict = xg_reg.predict(test_points)
print(mean_absolute_error(test_values, xgb_predict))

849064.600479972
