## Машинное обучение на больших данных. Регрессия
Используемый датасет: https://www.kaggle.com/datasets/hm-land-registry/uk-housing-prices-paid

В датасете содержатся данные о продаже недвижимости в Великобритании в период с 1995 г. по 2017 г.

Был использован датасет, получившийся в результате выполнения л/р №1.

## Импорты
Общие модули, которые будут использованы для работы с данными

In [1]:
# Import other modules not related to PySpark
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)
import os
import sys
import pandas as pd
from pandas import DataFrame
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
import math
from IPython.core.interactiveshell import InteractiveShell
from datetime import *
import statistics as stats
# Echo the value of every top-level expression in a cell (not only the last
# one), so results are shown without explicitly calling 'print'.
InteractiveShell.ast_node_interactivity = "all" 
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Do not cap the number of columns when pandas displays a DataFrame.
pd.set_option("display.max_columns", None)

Модули библиотеки PySpark, служащие для взаимодействия с движком Spark. Настройка Spark и запуск сессии

In [2]:
# Import PySpark related modules
import pyspark
from pyspark.rdd import RDD

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer, MinMaxScaler, OneHotEncoder
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator, RegressionEvaluator

from pyspark.sql import Row
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql import functions as f
from pyspark.sql.functions import lit, desc, col, size, array_contains\
, isnan, udf, hour, array_min, array_max, countDistinct
from pyspark.sql.types import *

# Memory budget requested for both the executor and the driver.
MAX_MEMORY = '14G'

# Settings for a local Spark run (master "local[*]").
# The timeout values are passed without a unit suffix, exactly as written here.
# NOTE(review): Spark applies a per-property default unit to bare numbers —
# confirm these are the intended durations.
_spark_settings = [
    ('spark.executor.heartbeatInterval', 10000),
    ('spark.network.timeout', 10000),
    ("spark.core.connection.ack.wait.timeout", "3600"),
    ("spark.executor.memory", MAX_MEMORY),
    ("spark.driver.memory", MAX_MEMORY),
]
conf = pyspark.SparkConf().setMaster("local[*]").setAll(_spark_settings)
def init_spark():
    """Return the SparkSession used by this notebook.

    The session is built with the application name "Pyspark guide" and the
    module-level ``conf`` object. ``getOrCreate`` returns an already running
    session when one exists instead of starting a new one.
    """
    builder = SparkSession.builder.appName("Pyspark guide").config(conf=conf)
    return builder.getOrCreate()

# Start (or reuse) the session; the cells below read data through `spark`.
spark = init_spark()

## Подготовка данных
Чтение подготовленного датасета из файла

In [3]:
# Load the prepared dataset: the first line is a header and column types are
# inferred from the data.
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv("data/prepered.csv")
# Preview: only the first 10 rows are converted to pandas for display.
df.limit(10).toPandas()

Unnamed: 0,Transaction unique identifier,Price,Date of Transfer,Town/City,District,County,additional_entry,freehold,new,property_type,terraced,semi_detached,detached,flats,other
0,{81B82214-7FBC-4129-9F6B-4956B4A663AD},25000,1995-08-18,OLDHAM,OLDHAM,GREATER MANCHESTER,0,1,0,terraced,1,0,0,0,0
1,{8046EC72-1466-42D6-A753-4956BF7CD8A2},42500,1995-08-09,GRAYS,THURROCK,THURROCK,0,1,0,semi_detached,0,1,0,0,0
2,{278D581A-5BF3-4FCE-AF62-4956D87691E6},45000,1995-06-30,HIGHBRIDGE,SEDGEMOOR,SOMERSET,0,1,0,terraced,1,0,0,0,0
3,{1D861C06-A416-4865-973C-4956DB12CD12},43150,1995-11-24,BEDFORD,NORTH BEDFORDSHIRE,BEDFORDSHIRE,0,1,0,terraced,1,0,0,0,0
4,{DD8645FD-A815-43A6-A7BA-4956E58F1874},18899,1995-06-23,WAKEFIELD,LEEDS,WEST YORKSHIRE,0,1,0,semi_detached,0,1,0,0,0
5,{895E4E63-203F-476A-9AA9-42389DD0AE5C},81750,1995-05-19,SALISBURY,SALISBURY,WILTSHIRE,0,1,0,semi_detached,0,1,0,0,0
6,{FB195C27-E790-45FD-847A-4238BC94546A},56000,1995-03-10,WITNEY,WEST OXFORDSHIRE,OXFORDSHIRE,0,1,0,semi_detached,0,1,0,0,0
7,{1D6B01EC-DC33-4147-8A21-4238BEB2D4C1},31000,1995-03-02,ST. AUSTELL,RESTORMEL,CORNWALL,0,1,0,semi_detached,0,1,0,0,0
8,{B8D0F817-4553-448D-A2C1-4238BF81C6FA},82000,1995-06-16,GREENFORD,EALING,GREATER LONDON,0,1,0,semi_detached,0,1,0,0,0
9,{6DD27423-CC39-4B31-A848-4238D58268D4},10000,1995-05-17,FERNDALE,RHONDDA,MID GLAMORGAN,0,1,0,terraced,1,0,0,0,0


Добавим в датасет числовые признаки, которые получим, упорядочив графства, районы и города по средней цене в них и проиндексировав их.

In [4]:
def translate(dictionary):
    """Build a Spark UDF that maps a column value through ``dictionary``.

    The UDF is declared with an ``IntegerType`` return type. A value that is
    not a key of ``dictionary`` yields ``None`` from ``dict.get``.
    """
    @udf(returnType=IntegerType())
    def translate_(key):
        return dictionary.get(key)

    return translate_

def index_in_order(dataframe, input_col, output_col):
    """Add ``output_col``: the rank of each ``input_col`` value by mean price.

    Rows are grouped by ``input_col`` and the groups are sorted by ascending
    average "Price"; the group with the lowest average gets index 0.
    The ordered names are collected to the driver, turned into a
    name -> index dict and applied to ``input_col`` through ``translate``.
    """
    averaged = dataframe.select(input_col, "Price").groupBy(input_col).avg()
    ranked = averaged.orderBy("avg(Price)").select(input_col)
    # Each collected row holds a single field: the group name.
    ordered_names = [row[0] for row in ranked.collect()]
    mapping = dict(zip(ordered_names, range(len(ordered_names))))

    return dataframe.withColumn(output_col, translate(mapping)(input_col))

In [5]:
# (source column, new index column) pairs, processed in this order.
# NOTE(review): the indices are derived from "Price" over the whole dataset,
# i.e. before the train/test split made further down — confirm this is intended.
location_columns = (
    ("County", "county_index"),
    ("District", "district_index"),
    ("Town/City", "city_index"),
)
for source_col, index_col in location_columns:
    df = index_in_order(df, source_col, index_col)
# Preview the first 10 rows with the new index columns.
df.limit(10).toPandas()

Unnamed: 0,Transaction unique identifier,Price,Date of Transfer,Town/City,District,County,additional_entry,freehold,new,property_type,terraced,semi_detached,detached,flats,other,county_index,district_index,city_index
0,{81B82214-7FBC-4129-9F6B-4956B4A663AD},25000,1995-08-18,OLDHAM,OLDHAM,GREATER MANCHESTER,0,1,0,terraced,1,0,0,0,0,43,87,74
1,{8046EC72-1466-42D6-A753-4956BF7CD8A2},42500,1995-08-09,GRAYS,THURROCK,THURROCK,0,1,0,semi_detached,0,1,0,0,0,75,246,542
2,{278D581A-5BF3-4FCE-AF62-4956D87691E6},45000,1995-06-30,HIGHBRIDGE,SEDGEMOOR,SOMERSET,0,1,0,terraced,1,0,0,0,0,84,249,544
3,{1D861C06-A416-4865-973C-4956DB12CD12},43150,1995-11-24,BEDFORD,NORTH BEDFORDSHIRE,BEDFORDSHIRE,0,1,0,terraced,1,0,0,0,0,68,54,697
4,{DD8645FD-A815-43A6-A7BA-4956E58F1874},18899,1995-06-23,WAKEFIELD,LEEDS,WEST YORKSHIRE,0,1,0,semi_detached,0,1,0,0,0,41,206,255
5,{895E4E63-203F-476A-9AA9-42389DD0AE5C},81750,1995-05-19,SALISBURY,SALISBURY,WILTSHIRE,0,1,0,semi_detached,0,1,0,0,0,103,297,873
6,{FB195C27-E790-45FD-847A-4238BC94546A},56000,1995-03-10,WITNEY,WEST OXFORDSHIRE,OXFORDSHIRE,0,1,0,semi_detached,0,1,0,0,0,115,384,908
7,{1D6B01EC-DC33-4147-8A21-4238BEB2D4C1},31000,1995-03-02,ST. AUSTELL,RESTORMEL,CORNWALL,0,1,0,semi_detached,0,1,0,0,0,89,185,257
8,{B8D0F817-4553-448D-A2C1-4238BF81C6FA},82000,1995-06-16,GREENFORD,EALING,GREATER LONDON,0,1,0,semi_detached,0,1,0,0,0,125,425,865
9,{6DD27423-CC39-4B31-A848-4238D58268D4},10000,1995-05-17,FERNDALE,RHONDDA,MID GLAMORGAN,0,1,0,terraced,1,0,0,0,0,1,0,2


Разделение датасета на тренировочную и тестовую часть

In [6]:
# Fixed seed: without it every run produces a different partition, so the row
# counts and the metric computed later are not reproducible.
SPLIT_SEED = 42
# The weights give roughly 95% of the rows to training and 5% to testing.
splits = df.randomSplit([0.95, 0.05], seed=SPLIT_SEED)
train = splits[0]
# In the test part the label is kept under the name `true_price`; the
# evaluation cell reads it under that name.
test = splits[1].withColumnRenamed("Price", "true_price")
train_rows = train.count()
test_rows = test.count()
print("Training Rows:", train_rows, " Testing Rows:", test_rows)

Training Rows: 21366174  Testing Rows: 1122494


## Регрессия

**Градиентный бустинг** — метод машинного обучения, который создает решающую модель прогнозирования в виде ансамбля слабых моделей прогнозирования, обычно деревьев решений. Он строит модель поэтапно, позволяя оптимизировать произвольную дифференцируемую функцию потерь.

В PySpark ML присутствует класс **GBTRegressor** (Gradient-Boosted Tree Regressor), осуществляющий обучение с использованием градиентного бустинга на основе ансамбля деревьев решений.

Построим пайплайн для осуществления обучения и предсказания. 

In [7]:
# 0/1 indicator columns, fed through VectorIndexer as categorical features.
flag_columns = ["additional_entry", "freehold", "new", "terraced", "semi_detached", "detached", "flats", "other"]
# Price-ordered location indices, rescaled with MinMaxScaler.
rank_columns = ["county_index", "district_index", "city_index"]

assemble_flags = VectorAssembler(inputCols=flag_columns, outputCol="catFeatures")
index_flags = VectorIndexer(inputCol="catFeatures", outputCol="idxCatFeatures")
assemble_ranks = VectorAssembler(inputCols=rank_columns, outputCol="numFeatures")
scale_ranks = MinMaxScaler(inputCol="numFeatures", outputCol="normFeatures")
# Join both feature groups into the single vector the regressor consumes.
merge_features = VectorAssembler(inputCols=["idxCatFeatures", "normFeatures"], outputCol="features")
# Gradient-boosted trees trained to predict "Price".
booster = GBTRegressor(featuresCol="features", labelCol="Price")

regression_pipeline = Pipeline(stages=[
    assemble_flags,
    index_flags,
    assemble_ranks,
    scale_ranks,
    merge_features,
    booster,
])

Проведем обучение модели при помощи пайплайна

In [8]:
# Fit all pipeline stages on the training split; the fitted model is used for
# prediction in the next cell.
regression_pipline_model = regression_pipeline.fit(train)

Получим предсказания обученной модели

In [9]:
# Run the fitted pipeline on the test split; the result carries the
# intermediate feature columns and a `prediction` column.
regression_prediction = regression_pipline_model.transform(test)

Посмотрим на первые строки получившегося датасета

In [10]:
# Convert only the first 10 prediction rows to pandas for display.
regression_prediction.limit(10).toPandas()

Unnamed: 0,Transaction unique identifier,true_price,Date of Transfer,Town/City,District,County,additional_entry,freehold,new,property_type,terraced,semi_detached,detached,flats,other,county_index,district_index,city_index,catFeatures,idxCatFeatures,numFeatures,normFeatures,features,prediction
0,{000030F3-AC07-48F5-9B8F-42C4222AB501},127000,1995-06-28,LONDON,WANDSWORTH,GREATER LONDON,0,1,0,semi_detached,0,1,0,0,0,125,444,1134,"(0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","[125.0, 444.0, 1134.0]","[0.992063492063492, 0.9801324503311258, 0.9700...","(0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.992...",899992.008833
1,{0000DBDA-B01A-4D21-BFE1-417EEFC2E601},59500,1995-02-13,BOURNEMOUTH,BOURNEMOUTH,BOURNEMOUTH,0,1,0,detached,0,0,1,0,0,91,291,647,"(0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0)","[91.0, 291.0, 647.0]","[0.7222222222222222, 0.6423841059602649, 0.553...","(0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.722...",215658.823824
2,{0000E5F7-05E7-4739-A2AE-18458717F15B},255000,1995-02-22,LONDON,HARINGEY,GREATER LONDON,0,1,0,terraced,1,0,0,0,0,125,424,1134,"(0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","[125.0, 424.0, 1134.0]","[0.992063492063492, 0.9359823399558499, 0.9700...","(0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.992...",313855.941126
3,{000224B0-8C1E-4CBE-8FD8-AAFA4A7EE7F5},38600,1995-08-31,LIVERPOOL,KNOWSLEY,MERSEYSIDE,0,1,1,semi_detached,0,1,0,0,0,30,90,173,"(0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0)","[30.0, 90.0, 173.0]","[0.23809523809523808, 0.1986754966887417, 0.14...","(0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.238...",107686.259272
4,{00023543-ED62-49E1-95CA-5F648DC2AE4C},62000,1995-06-02,LUTON,LUTON,LUTON,0,1,0,semi_detached,0,1,0,0,0,52,181,326,"(0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","[52.0, 181.0, 326.0]","[0.4126984126984127, 0.3995584988962472, 0.278...","(0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.412...",113452.058179
5,{000266A5-12BE-4485-AAA8-4AD26CCE9493},60000,1995-03-17,COALVILLE,NORTH WEST LEICESTERSHIRE,LEICESTERSHIRE,0,1,0,detached,0,0,1,0,0,78,227,285,"(0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0)","[78.0, 227.0, 285.0]","[0.6190476190476191, 0.5011037527593819, 0.243...","(0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.619...",164321.728075
6,{0002886E-D128-458D-BA06-6D3A4E550C41},56250,1995-06-16,REIGATE,REIGATE AND BANSTEAD,SURREY,0,1,0,semi_detached,0,1,0,0,0,124,417,1097,"(0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","[124.0, 417.0, 1097.0]","[0.9841269841269841, 0.9205298013245033, 0.938...","(0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.984...",254479.057598
7,{0005CF39-C4C0-4B46-B900-BEC04DAA057A},37500,1995-08-22,BURY,BURY,GREATER MANCHESTER,0,0,0,semi_detached,0,1,0,0,0,43,133,211,"(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0)","[43.0, 133.0, 211.0]","[0.3412698412698413, 0.293598233995585, 0.1804...","(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.341...",107015.335973
8,{000645D5-D783-4AE8-885F-4658672A84BB},75000,1995-05-10,LONDON,WANDSWORTH,GREATER LONDON,0,0,0,flats,0,0,0,1,0,125,444,1134,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0)","[125.0, 444.0, 1134.0]","[0.992063492063492, 0.9801324503311258, 0.9700...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.992...",383226.971552
9,{0006C9C6-11C2-4F90-94D0-922AA25D2CC6},51000,1996-12-05,AYLESBURY,AYLESBURY VALE,BUCKINGHAMSHIRE,0,1,0,terraced,1,0,0,0,0,123,366,850,"(0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","[123.0, 366.0, 850.0]","[0.9761904761904762, 0.8079470198675497, 0.727...","(0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.976...",181102.186931


Проведем оценку полученной модели, используя метрику $R^2$

In [11]:
# Score the predictions with the "r2" metric: the `prediction` column is
# compared against the test label stored in `true_price`.
regression_evaluator = RegressionEvaluator(
    metricName="r2",
    predictionCol="prediction",
    labelCol="true_price",
)
# Kept as the final expression so the notebook displays the score.
regression_evaluator.evaluate(regression_prediction)

0.17428210174653347