# Connect to Hive

In [1]:
from pyspark.sql import SparkSession

# Add here your team number teamx
team = 19

# location of your Hive database in HDFS
# warehouse = "project/hive/warehouse"
warehouse = "/user/team19/project/warehouse"

spark = SparkSession.builder\
        .appName("{} - spark ML".format(team))\
        .master("yarn")\
        .config("hive.metastore.uris", "thrift://hadoop-02.uni.innopolis.ru:9883")\
        .config("spark.sql.warehouse.dir", warehouse)\
        .config("spark.sql.avro.compression.codec", "snappy")\
        .enableHiveSupport()\
        .getOrCreate()

sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/05 20:45:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/05/05 20:45:14 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/05/05 20:45:14 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
25/05/05 20:45:15 WARN DomainSocketFactory: The short-circuit local reads feature cannot be used because libhadoop cannot be loaded.
25/05/05 20:45:15 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [2]:
# pip freeze
!java -version

openjdk version "1.8.0_402"
OpenJDK Runtime Environment (build 1.8.0_402-b06)
OpenJDK 64-Bit Server VM (build 25.402-b06, mixed mode)


In [3]:
spark

# list Hive databases

In [4]:
# print(spark.catalog.listDatabases())
# spark.sql("SHOW DATABASES;").show()

# Specify the input and output features

In [5]:
spark.sql("SHOW DATABASES").show()

+--------------------+
|           namespace|
+--------------------+
|             default|
|             retake1|
|             root_db|
|                show|
|     team0_projectdb|
|    team11_projectdb|
|           team12_db|
|team12_hive_proje...|
|    team12_projectdb|
|    team13_projectdb|
|    team14_projectdb|
|    team15_projectdb|
|    team16_projectdb|
|    team17_projectdb|
|    team18_projectdb|
|    team19_projectdb|
|     team1_projectdb|
|    team20_projectdb|
| team21_projectdb_v2|
| team21_projectdb_v3|
+--------------------+
only showing top 20 rows



# Read hive tables

In [6]:
spark.sql("USE team19_projectdb").show()
spark.sql("SHOW TABLES").show()
# spark.sql("SELECT * FROM <db_name>.<table_name>").show()

++
||
++
++

+----------------+--------------------+-----------+
|       namespace|           tableName|isTemporary|
+----------------+--------------------+-----------+
|team19_projectdb|      hosts_bucketed|      false|
|team19_projectdb|listings_partitioned|      false|
|team19_projectdb|          q1_results|      false|
|team19_projectdb|          q2_results|      false|
|team19_projectdb|          q3_results|      false|
|team19_projectdb|          q4_results|      false|
|team19_projectdb|          q5_results|      false|
|team19_projectdb|          q6_results|      false|
|team19_projectdb|review_scores_buc...|      false|
+----------------+--------------------+-----------+



In [7]:
hosts_bucketed = spark.read.format("avro").table('team19_projectdb.hosts_bucketed')
listings_partitioned = spark.read.format("avro").table('team19_projectdb.listings_partitioned')
review_scores_bucketed = spark.read.format("avro").table('team19_projectdb.review_scores_bucketed')

In [8]:
# hosts_bucketed.show()
hosts_bucketed.columns

['host_id',
 'host_url',
 'host_name',
 'host_since',
 'host_location',
 'host_about',
 'host_response_time',
 'host_response_rate',
 'host_acceptance_rate',
 'host_listings_count',
 'host_total_listings_count',
 'host_verifications']

In [9]:
hosts_bucketed.show(1)

25/05/05 20:45:38 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
[Stage 0:>                                                          (0 + 1) / 1]

+----------------+--------------------+---------+----------+-------------+----------+------------------+------------------+--------------------+-------------------+-------------------------+-------------------+
|         host_id|            host_url|host_name|host_since|host_location|host_about|host_response_time|host_response_rate|host_acceptance_rate|host_listings_count|host_total_listings_count| host_verifications|
+----------------+--------------------+---------+----------+-------------+----------+------------------+------------------+--------------------+-------------------+-------------------------+-------------------+
|71592872.0000000|https://www.airbn...|      Ian|2016-05-12|         NULL|      NULL|within a few hours|       100.0000000|                NULL|          1.0000000|                1.0000000|email,phone,reviews|
+----------------+--------------------+---------+----------+-------------+----------+------------------+------------------+--------------------+------------

                                                                                

In [10]:
listings_partitioned.columns

['id',
 'listing_url',
 'scrape_id',
 'last_scraped',
 'name',
 'host_id',
 'street',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'price',
 'number_of_reviews',
 'reviews_per_month',
 'city',
 'country']

In [11]:
listings_partitioned.show(1)

[Stage 1:>                                                          (0 + 1) / 1]

+-------+--------------------+--------------+------------+--------------------+----------------+--------------------+-------------+---------------+------------+---------+---------+---------+-----------+-----------------+-----------------+---------+-----------+
|     id|         listing_url|     scrape_id|last_scraped|                name|         host_id|              street|property_type|      room_type|accommodates|bathrooms| bedrooms|     beds|      price|number_of_reviews|reviews_per_month|     city|    country|
+-------+--------------------+--------------+------------+--------------------+----------------+--------------------+-------------+---------------+------------+---------+---------+---------+-----------+-----------------+-----------------+---------+-----------+
|4008728|https://www.airbn...|20170402075052|  2017-04-02|Luxurious 3 bedro...|20786453.0000000|Oud-West, Amsterd...|    Apartment|Entire home/apt|   6.0000000|1.0000000|3.0000000|3.0000000|600.0000000|       31.00000

                                                                                

In [12]:
review_scores_bucketed.columns

['listing_id',
 'review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location',
 'review_scores_value']

In [13]:
review_scores_bucketed.show(1)

+----------+--------------------+----------------------+-------------------------+---------------------+---------------------------+----------------------+-------------------+
|listing_id|review_scores_rating|review_scores_accuracy|review_scores_cleanliness|review_scores_checkin|review_scores_communication|review_scores_location|review_scores_value|
+----------+--------------------+----------------------+-------------------------+---------------------+---------------------------+----------------------+-------------------+
|  17895559|                NULL|                  NULL|                     NULL|                 NULL|                       NULL|                  NULL|               NULL|
+----------+--------------------+----------------------+-------------------------+---------------------+---------------------------+----------------------+-------------------+
only showing top 1 row



In [14]:
from pyspark.sql.functions import col

# Основное соединение
result_df = (
    listings_partitioned
    .join(
        hosts_bucketed,
        on="host_id",       # Ключ из listings -> hosts
        how="left"          # Все записи listings, даже без host
    )
    .join(
        review_scores_bucketed,
        on=col("id") == col("listing_id"),  # Ключ listings.id -> reviews.listing_id
        how="left"                          # Все записи listings, даже без reviews
    )
)


In [15]:
result_df.columns

['host_id',
 'id',
 'listing_url',
 'scrape_id',
 'last_scraped',
 'name',
 'street',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'price',
 'number_of_reviews',
 'reviews_per_month',
 'city',
 'country',
 'host_url',
 'host_name',
 'host_since',
 'host_location',
 'host_about',
 'host_response_time',
 'host_response_rate',
 'host_acceptance_rate',
 'host_listings_count',
 'host_total_listings_count',
 'host_verifications',
 'listing_id',
 'review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location',
 'review_scores_value']

In [16]:
total_rows = result_df.count()
print(f"Total rows: {total_rows}")



Total rows: 247476


                                                                                

In [17]:
from pyspark.sql.functions import col, count, when

null_counts = result_df.agg(
    *[count(when(col(c).isNull(), c)).alias(c) for c in result_df.columns]
).show(vertical=True)

25/05/05 20:45:58 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 18:>                                                         (0 + 2) / 2]

-RECORD 0-----------------------------
 host_id                     | 0      
 id                          | 0      
 listing_url                 | 0      
 scrape_id                   | 0      
 last_scraped                | 0      
 name                        | 223    
 street                      | 0      
 property_type               | 4      
 room_type                   | 0      
 accommodates                | 39     
 bathrooms                   | 764    
 bedrooms                    | 301    
 beds                        | 476    
 price                       | 4076   
 number_of_reviews           | 0      
 reviews_per_month           | 60968  
 city                        | 235    
 country                     | 0      
 host_url                    | 0      
 host_name                   | 260    
 host_since                  | 259    
 host_location               | 1143   
 host_about                  | 98959  
 host_response_time          | 57987  
 host_response_rate      

                                                                                

In [18]:
features = ['last_scraped', 'street', 'property_type', 'room_type', 'accommodates','bathrooms', 'bedrooms', 'beds', 'number_of_reviews', 'reviews_per_month', 'city',
           'country', 'host_since', 'host_location', 'host_response_time','host_response_rate','host_listings_count', 'host_total_listings_count', 'host_verifications', 'review_scores_rating',
           'review_scores_accuracy','review_scores_cleanliness','review_scores_checkin','review_scores_communication','review_scores_location','review_scores_value']
label = 'price'

In [19]:
result_df = result_df.select(features + [label]).na.drop()
# emps = emps.withColumn("ename_job", F.concat(F.col('ename'), F.lit("_"), F.col('job')))
result_df = result_df.withColumnRenamed("price","label")

result_df.show()

[Stage 30:>                                                         (0 + 2) / 2]

+------------+--------------------+-------------+---------------+------------+---------+---------+---------+-----------------+-----------------+-------------+--------------+----------+--------------------+------------------+------------------+-------------------+-------------------------+--------------------+--------------------+----------------------+-------------------------+---------------------+---------------------------+----------------------+-------------------+-----------+
|last_scraped|              street|property_type|      room_type|accommodates|bathrooms| bedrooms|     beds|number_of_reviews|reviews_per_month|         city|       country|host_since|       host_location|host_response_time|host_response_rate|host_listings_count|host_total_listings_count|  host_verifications|review_scores_rating|review_scores_accuracy|review_scores_cleanliness|review_scores_checkin|review_scores_communication|review_scores_location|review_scores_value|      label|
+------------+--------------

                                                                                

In [20]:
total_rows = result_df.count()
print(f"Total rows: {total_rows}")

[Stage 41:>                                                         (0 + 2) / 2]

Total rows: 151055


                                                                                

In [21]:
result_df.columns

['last_scraped',
 'street',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'number_of_reviews',
 'reviews_per_month',
 'city',
 'country',
 'host_since',
 'host_location',
 'host_response_time',
 'host_response_rate',
 'host_listings_count',
 'host_total_listings_count',
 'host_verifications',
 'review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location',
 'review_scores_value',
 'label']

In [22]:
from pyspark.sql.functions import to_date, year, month, dayofmonth

df_with_dates = result_df.withColumn(
    "date_parsed",
    to_date("last_scraped", format="yyyy-MM-dd")
).withColumn(
    "last_scraped_year", year("date_parsed")
).withColumn(
    "month", month("date_parsed")
).withColumn(
    "day", dayofmonth("date_parsed")
)

In [23]:
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCols
from pyspark.sql.functions import sin, cos, pi
import math

class CyclicDateEncoder(Transformer, HasInputCol, HasOutputCols):
    def __init__(self, inputCol=None, outputCols=None):
        super(CyclicDateEncoder, self).__init__()
        self._set(inputCol=inputCol, outputCols=outputCols)
        
    def _transform(self, df):
        input_col = self.getInputCol()
        output_cols = self.getOutputCols()
        if input_col == "month":
            return df.withColumn(
                output_cols[0],  # Например, "month_sin"
                sin(2 * pi() * col(input_col) / 12)  # Нормировка для месяца
            ).withColumn(
                output_cols[1],  # Например, "month_cos"
                cos(2 * pi() * col(input_col) / 12)
            )
        elif input_col == "day":
            return df.withColumn(
                output_cols[0],  # Например, "month_sin"
                sin(2 * pi() * col(input_col) / 31)  # Нормировка для месяца
            ).withColumn(
                output_cols[1],  # Например, "month_cos"
                cos(2 * pi() * col(input_col) / 31)
            )

In [24]:
# Для месяца
month_encoder = CyclicDateEncoder(
    inputCol="month",
    outputCols=["last_scrapped_month_sin", "last_scrapped_month_cos"]
)

# Для дня (аналогично, нормировка на 31)
day_encoder = CyclicDateEncoder(
    inputCol="day",
    outputCols=["last_scrapped_day_sin", "last_scrapped_day_cos"]
)

# Применяем трансформеры
df_encoded = month_encoder.transform(df_with_dates)
df_encoded = day_encoder.transform(df_encoded)

# Результат
# df_encoded.select("last_scraped_year", "month", "day", "month_sin", "month_cos", "day_sin", "day_cos").show(5)

In [25]:
df_encoded.columns

['last_scraped',
 'street',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'number_of_reviews',
 'reviews_per_month',
 'city',
 'country',
 'host_since',
 'host_location',
 'host_response_time',
 'host_response_rate',
 'host_listings_count',
 'host_total_listings_count',
 'host_verifications',
 'review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location',
 'review_scores_value',
 'label',
 'date_parsed',
 'last_scraped_year',
 'month',
 'day',
 'last_scrapped_month_sin',
 'last_scrapped_month_cos',
 'last_scrapped_day_sin',
 'last_scrapped_day_cos']

# Feature selection

In [26]:
df_encoded = df_encoded.drop("last_scraped", "date_parsed","day","month")
df_encoded.columns

['street',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'number_of_reviews',
 'reviews_per_month',
 'city',
 'country',
 'host_since',
 'host_location',
 'host_response_time',
 'host_response_rate',
 'host_listings_count',
 'host_total_listings_count',
 'host_verifications',
 'review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location',
 'review_scores_value',
 'label',
 'last_scraped_year',
 'last_scrapped_month_sin',
 'last_scrapped_month_cos',
 'last_scrapped_day_sin',
 'last_scrapped_day_cos']

In [27]:
from pyspark.sql.functions import to_date, year, month, dayofmonth

df_with_dates = df_encoded.withColumn(
    "date_parsed",
    to_date("host_since", format="yyyy-MM-dd")  # Уточните формат вашей даты!
).withColumn(
    "year_host_since", year("date_parsed")
).withColumn(
    "month", month("date_parsed")
).withColumn(
    "day", dayofmonth("date_parsed")
)

In [28]:
df_with_dates.columns

['street',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'number_of_reviews',
 'reviews_per_month',
 'city',
 'country',
 'host_since',
 'host_location',
 'host_response_time',
 'host_response_rate',
 'host_listings_count',
 'host_total_listings_count',
 'host_verifications',
 'review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location',
 'review_scores_value',
 'label',
 'last_scraped_year',
 'last_scrapped_month_sin',
 'last_scrapped_month_cos',
 'last_scrapped_day_sin',
 'last_scrapped_day_cos',
 'date_parsed',
 'year_host_since',
 'month',
 'day']

In [29]:
# Для месяца
month_encoder = CyclicDateEncoder(
    inputCol="month",
    outputCols=["host_since_month_sin", "host_since_month_cos"]
)

# Для дня (аналогично, нормировка на 31)
day_encoder = CyclicDateEncoder(
    inputCol="day",
    outputCols=["host_since_day_sin", "host_since_day_cos"]
)

# Применяем трансформеры
df_encoded = month_encoder.transform(df_with_dates)
df_encoded = day_encoder.transform(df_encoded)


In [30]:
# df_encoded.columns
df_encoded = df_encoded.drop("month", "day","date_parsed","host_since")
df_encoded.columns

['street',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'number_of_reviews',
 'reviews_per_month',
 'city',
 'country',
 'host_location',
 'host_response_time',
 'host_response_rate',
 'host_listings_count',
 'host_total_listings_count',
 'host_verifications',
 'review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location',
 'review_scores_value',
 'label',
 'last_scraped_year',
 'last_scrapped_month_sin',
 'last_scrapped_month_cos',
 'last_scrapped_day_sin',
 'last_scrapped_day_cos',
 'year_host_since',
 'host_since_month_sin',
 'host_since_month_cos',
 'host_since_day_sin',
 'host_since_day_cos']

In [31]:
df_encoded.columns

['street',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'number_of_reviews',
 'reviews_per_month',
 'city',
 'country',
 'host_location',
 'host_response_time',
 'host_response_rate',
 'host_listings_count',
 'host_total_listings_count',
 'host_verifications',
 'review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location',
 'review_scores_value',
 'label',
 'last_scraped_year',
 'last_scrapped_month_sin',
 'last_scrapped_month_cos',
 'last_scrapped_day_sin',
 'last_scrapped_day_cos',
 'year_host_since',
 'host_since_month_sin',
 'host_since_month_cos',
 'host_since_day_sin',
 'host_since_day_cos']

In [32]:
df_encoded = df_encoded.drop("last_scraped")

In [33]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, Word2Vec, Tokenizer, RegexTokenizer
from pyspark.sql.functions import col

categoricalCols = ['city','country','host_location','host_verifications', 'property_type','room_type', 'host_response_time']
textCols = ['street']
others = ['accommodates','bathrooms','bedrooms','beds','number_of_reviews','reviews_per_month',
          'host_response_rate','host_listings_count', 'host_total_listings_count','review_scores_rating','review_scores_accuracy',
          'review_scores_cleanliness', 'review_scores_checkin','review_scores_communication', 'review_scores_location', 'review_scores_value',
         'last_scraped_year','host_since_month_sin','host_since_month_cos','host_since_day_sin','host_since_day_cos','year_host_since']

In [34]:
df_encoded.select('street').show(5)
# df_encoded.select('city').show(5)
# df_encoded.select('country').show(5)
# df_encoded.select('host_location').show(5)
# df_encoded.select('host_verifications').show(5)

[Stage 58:>                                                         (0 + 2) / 2]

+--------------------+
|              street|
+--------------------+
|Mission District,...|
|New York, NY 1002...|
|Ficial District, ...|
|Panthéon, Paris, ...|
|Copenhagen, Capit...|
+--------------------+
only showing top 5 rows



                                                                                

In [35]:
# unique_count = df_encoded.select("city").distinct().count()
# print(unique_count)

In [36]:
# unique_count = df_encoded.select("host_location").distinct().count()
# print(unique_count)

In [37]:
# unique_count = df_encoded.select("host_verifications").distinct().count()
# print(unique_count)

In [38]:
# unique_count = df_encoded.select("country").distinct().count()
# print(unique_count)

In [39]:
# unique_count = df_encoded.select("street").distinct().count()
# print(unique_count)

In [40]:
from pyspark.ml.feature import RegexTokenizer, Word2Vec
from pyspark.sql.functions import col

tokenizer = RegexTokenizer(
    inputCol="street", 
    outputCol="city_tokens",
    pattern="[,\s]+"
)

# df_encoded_tok = tokenizer.transform(df_encoded)
# df_encoded_tok.select('city_tokens').show(1)


In [41]:
word2Vec = Word2Vec(
    vectorSize=10,
    minCount=1,
    windowSize=5,
    inputCol="city_tokens",
    outputCol="city_vec"
)
# word2VecModel = word2Vec.fit(df_encoded_tok)
# print(word2VecModel)

# df_encoded_tok = word2VecModel.transform(df_encoded_tok)
# # df_encoded_tok.show()

# # Adding the encoded ename_job to the list of other columns
# others += [city_vec]


In [42]:
indexers = [ StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c)).setHandleInvalid("skip") for c in categoricalCols ]
encoders = [ OneHotEncoder(inputCol=indexer.getOutputCol(), outputCol="{0}_encoded".format(indexer.getOutputCol())) for indexer in indexers ]
assembler = VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders] + others, outputCol= "features")

In [43]:
# df_encoded_tok.show(1)

In [44]:
# You can create a pipeline to use only a single fit and transform on the data.
pipeline = Pipeline(stages=[tokenizer, word2Vec] + indexers + encoders + [assembler])


# Fit the pipeline ==> This will call the fit functions for all transformers if exist
model=pipeline.fit(df_encoded)
# Fit the pipeline ==> This will call the transform functions for all transformers
data = model.transform(df_encoded)

# data.show()

# We delete all features and keep only the features and label columns
data = data.select(["features", "label"])


from pyspark.ml.feature import VectorIndexer

# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 4
# distinct values are treated as continuous.
featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)
transformed = featureIndexer.transform(data)

# Display the output Spark DataFrame
# transformed.show()


25/05/05 20:49:10 WARN DAGScheduler: Broadcasting large task binary with size 1584.1 KiB
                                                                                

# Feature extraction

# Split the dataset

In [45]:
transformed = transformed.withColumn("label", col("label").cast("double"))

In [46]:
#  split the data into 60% training and 40% test (it is not stratified)
(train_data, test_data) = transformed.randomSplit([0.4, 0.6], seed = 10)

def run(command):
    import os
    return os.popen(command).read()

train_data.select("features", "label")\
    .coalesce(1)\
    .write\
    .mode("overwrite")\
    .format("json")\
    .save("project/data/train")

# Run it from root directory of the repository
run("hdfs dfs -cat project/data/train/*.json > ../data/train.json")

test_data.select("features", "label")\
    .coalesce(1)\
    .write\
    .mode("overwrite")\
    .format("json")\
    .save("project/data/test")

# Run it from root directory of the repository
run("hdfs dfs -cat project/data/test/*.json > ../data/test.json")

25/05/05 20:49:46 WARN DAGScheduler: Broadcasting large task binary with size 3.3 MiB
25/05/05 20:50:08 WARN DAGScheduler: Broadcasting large task binary with size 3.3 MiB
                                                                                

''

In [47]:
optimal_partitions = 12
train_data = train_data.repartition(optimal_partitions)

In [48]:
train_data.rdd.getNumPartitions()

25/05/05 20:50:29 WARN DAGScheduler: Broadcasting large task binary with size 3.0 MiB

12

# First model

## Build a model

In [49]:
from pyspark.ml.regression import LinearRegression
# Create Linear Regression Model
lr = LinearRegression(maxIter=30)

# Fit the data to the pipeline stages
model_lr = lr.fit(train_data)

25/05/05 20:50:45 WARN DAGScheduler: Broadcasting large task binary with size 3.0 MiB
25/05/05 20:51:00 WARN DAGScheduler: Broadcasting large task binary with size 3.0 MiB
25/05/05 20:51:02 WARN DAGScheduler: Broadcasting large task binary with size 3.0 MiB
25/05/05 20:51:03 WARN DAGScheduler: Broadcasting large task binary with size 3.0 MiB
25/05/05 20:51:06 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/05/05 20:51:06 WARN DAGScheduler: Broadcasting large task binary with size 3.0 MiB
25/05/05 20:51:06 WARN DAGScheduler: Broadcasting large task binary with size 3.0 MiB
25/05/05 20:51:07 WARN DAGScheduler: Broadcasting large task binary with size 3.0 MiB
25/05/05 20:51:07 WARN DAGScheduler: Broadcasting large task binary with size 3.0 MiB
25/05/05 20:51:08 WARN DAGScheduler: Broadcasting large task binary with size 3.0 MiB
25/05/05 20:51:08 WARN DAGScheduler: Broadcasting large task binary with size 3.0 MiB
25/05/05 20:51:08 WARN DAGSchedul

## Predict for test data

In [50]:
predictions = model_lr.transform(test_data)
predictions.show()

25/05/05 20:52:00 WARN DAGScheduler: Broadcasting large task binary with size 3.1 MiB
[Stage 534:>                                                        (0 + 1) / 1]

+--------------------+-----+--------------------+------------------+
|            features|label|     indexedFeatures|        prediction|
+--------------------+-----+--------------------+------------------+
|(9436,[0,3082,309...| 56.0|(9436,[0,3082,309...| 76.48293773257774|
|(9436,[0,3082,309...| 35.0|(9436,[0,3082,309...| 59.71153674780908|
|(9436,[0,3082,309...| 65.0|(9436,[0,3082,309...| 94.83769864920032|
|(9436,[0,3082,309...| 99.0|(9436,[0,3082,309...|  97.2498026751291|
|(9436,[0,3082,309...| 55.0|(9436,[0,3082,309...| 68.96146220938499|
|(9436,[0,3082,309...|175.0|(9436,[0,3082,309...| 98.94145036682039|
|(9436,[0,3082,309...| 70.0|(9436,[0,3082,309...| 97.42505646392692|
|(9436,[0,3082,309...| 98.0|(9436,[0,3082,309...|107.52511472300466|
|(9436,[0,3082,309...| 80.0|(9436,[0,3082,309...|100.47217419403933|
|(9436,[0,3082,309...| 75.0|(9436,[0,3082,309...| 97.75201057293452|
|(9436,[0,3082,309...| 80.0|(9436,[0,3082,309...|102.51445763252195|
|(9436,[0,3082,309...| 85.0|(9436,

                                                                                

## Evaluate the model

In [51]:
from pyspark.ml.evaluation import RegressionEvaluator 

# Evaluate the performance of the model
evaluator1_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
evaluator1_r2 = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

rmse = evaluator1_rmse.evaluate(predictions)
r2 = evaluator1_r2.evaluate(predictions)

print("Root Mean Squared Error (RMSE) on test data = {}".format(rmse))
print("R^2 on test data = {}".format(r2))

25/05/05 20:52:14 WARN DAGScheduler: Broadcasting large task binary with size 3.1 MiB
25/05/05 20:52:30 WARN DAGScheduler: Broadcasting large task binary with size 3.1 MiB
[Stage 556:>                                                        (0 + 2) / 2]

Root Mean Squared Error (RMSE) on test data = 74.3531944947312
R^2 on test data = 0.7198554400810679


                                                                                

## Hyperparameter optimization

In [52]:
model_lr.params

[Param(parent='LinearRegression_0d67ccc2ed51', name='aggregationDepth', doc='suggested depth for treeAggregate (>= 2).'),
 Param(parent='LinearRegression_0d67ccc2ed51', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'),
 Param(parent='LinearRegression_0d67ccc2ed51', name='epsilon', doc='The shape parameter to control the amount of robustness. Must be > 1.0. Only valid when loss is huber'),
 Param(parent='LinearRegression_0d67ccc2ed51', name='featuresCol', doc='features column name.'),
 Param(parent='LinearRegression_0d67ccc2ed51', name='fitIntercept', doc='whether to fit an intercept term.'),
 Param(parent='LinearRegression_0d67ccc2ed51', name='labelCol', doc='label column name.'),
 Param(parent='LinearRegression_0d67ccc2ed51', name='loss', doc='The loss function to be optimized. Supported options: squaredError, huber.'),
 Param(parent='LinearRegression_0d67ccc2ed51', name='m

In [53]:
train_data.cache()
# sample_data = train_data.sample(False, 0.001, seed=42)
# sample_data.cache()


DataFrame[features: vector, label: double, indexedFeatures: vector]

In [54]:
# sample_data.rdd.getNumPartitions()

In [55]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator 

import numpy as np
# Подготовка данных
# train_data.cache()
# sample_data = train_data.sample(False, 0.001, seed=42)
# sample_data.cache()
grid = ParamGridBuilder()\
    .addGrid(model_lr.regParam, np.logspace(-2, -1, 2))\
    .addGrid(lr.elasticNetParam, [0.2, 0.6])\
    .build()

cv = CrossValidator(estimator = lr, 
                    estimatorParamMaps = grid, 
                    evaluator = evaluator1_rmse,
                    parallelism = 5,
                    numFolds=3)

cvModel = cv.fit(train_data)
bestModel = cvModel.bestModel
bestModel

25/05/05 20:52:45 WARN DAGScheduler: Broadcasting large task binary with size 3.0 MiB
25/05/05 20:52:48 WARN DAGScheduler: Broadcasting large task binary with size 1588.4 KiB
25/05/05 20:52:48 WARN DAGScheduler: Broadcasting large task binary with size 1608.5 KiB
25/05/05 20:52:48 WARN DAGScheduler: Broadcasting large task binary with size 1608.5 KiB
25/05/05 20:52:48 WARN DAGScheduler: Broadcasting large task binary with size 1608.5 KiB
25/05/05 20:52:48 WARN DAGScheduler: Broadcasting large task binary with size 1608.5 KiB
25/05/05 20:52:49 WARN DAGScheduler: Broadcasting large task binary with size 1652.3 KiB
25/05/05 20:52:49 WARN DAGScheduler: Broadcasting large task binary with size 1652.3 KiB
25/05/05 20:52:49 WARN DAGScheduler: Broadcasting large task binary with size 1652.3 KiB
25/05/05 20:52:49 WARN DAGScheduler: Broadcasting large task binary with size 1652.3 KiB
25/05/05 20:52:49 WARN DAGScheduler: Broadcasting large task binary with size 1653.5 KiB
25/05/05 20:52:50 WARN D

LinearRegressionModel: uid=LinearRegression_0d67ccc2ed51, numFeatures=9436

## Best model 1


In [56]:
from pprint import pprint
model1 = bestModel
pprint(model1.extractParamMap())

{Param(parent='LinearRegression_0d67ccc2ed51', name='labelCol', doc='label column name.'): 'label',
 Param(parent='LinearRegression_0d67ccc2ed51', name='fitIntercept', doc='whether to fit an intercept term.'): True,
 Param(parent='LinearRegression_0d67ccc2ed51', name='epsilon', doc='The shape parameter to control the amount of robustness. Must be > 1.0. Only valid when loss is huber'): 1.35,
 Param(parent='LinearRegression_0d67ccc2ed51', name='regParam', doc='regularization parameter (>= 0).'): 0.1,
 Param(parent='LinearRegression_0d67ccc2ed51', name='featuresCol', doc='features column name.'): 'features',
 Param(parent='LinearRegression_0d67ccc2ed51', name='maxIter', doc='max number of iterations (>= 0).'): 30,
 Param(parent='LinearRegression_0d67ccc2ed51', name='tol', doc='the convergence tolerance for iterative algorithms (>= 0).'): 1e-06,
 Param(parent='LinearRegression_0d67ccc2ed51', name='solver', doc='The solver algorithm for optimization. Supported options: auto, normal, l-bfgs

## Save the model to HDFS

In [57]:
model1.write().overwrite().save("project/big_data_project/models/model1")

# Run it from root directory of the repository
run("hdfs dfs -get project/big_data_project/models/model1 model1")

get: `model1/model1/data/_SUCCESS': File exists                                 
get: `model1/model1/metadata/_SUCCESS': File exists
get: `model1/model1/metadata/part-00000': File exists


''

In [58]:
# run("hdfs dfs -ls project/big_data_project/models/model1")

## Predict for test data using best model1

In [59]:
predictions = model1.transform(test_data)
predictions.show()

25/05/05 20:55:27 WARN DAGScheduler: Broadcasting large task binary with size 3.1 MiB
[Stage 1996:>                                                       (0 + 1) / 1]

+--------------------+-----+--------------------+------------------+
|            features|label|     indexedFeatures|        prediction|
+--------------------+-----+--------------------+------------------+
|(9436,[0,3082,309...| 56.0|(9436,[0,3082,309...| 75.48453010114736|
|(9436,[0,3082,309...| 35.0|(9436,[0,3082,309...|59.208723893013484|
|(9436,[0,3082,309...| 65.0|(9436,[0,3082,309...|  93.7936765696968|
|(9436,[0,3082,309...| 99.0|(9436,[0,3082,309...| 96.83784538840973|
|(9436,[0,3082,309...| 55.0|(9436,[0,3082,309...|   65.514911412728|
|(9436,[0,3082,309...|175.0|(9436,[0,3082,309...| 98.32958577716545|
|(9436,[0,3082,309...| 70.0|(9436,[0,3082,309...| 96.92847238517697|
|(9436,[0,3082,309...| 98.0|(9436,[0,3082,309...|105.23352105140384|
|(9436,[0,3082,309...| 80.0|(9436,[0,3082,309...|100.63568541619861|
|(9436,[0,3082,309...| 75.0|(9436,[0,3082,309...| 96.06447718111394|
|(9436,[0,3082,309...| 80.0|(9436,[0,3082,309...|101.76763617829147|
|(9436,[0,3082,309...| 85.0|(9436,

                                                                                

In [60]:
predictions.select("label", "prediction")\
    .coalesce(1)\
    .write\
    .mode("overwrite")\
    .format("csv")\
    .option("sep", ",")\
    .option("header","true")\
    .save("project/big_data_project/output/model1_predictions.csv")

# Run it from root directory of the repository
run("hdfs dfs -cat project/big_data_project/output/model1_predictions.csv/*.csv > ../output/model1_predictions.csv")

25/05/05 20:55:42 WARN DAGScheduler: Broadcasting large task binary with size 3.3 MiB
                                                                                

''

## Evaluate the best model1

In [61]:
from pyspark.ml.evaluation import RegressionEvaluator 

# Evaluate the performance of the model
evaluator1_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
evaluator1_r2 = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

rmse1 = evaluator1_rmse.evaluate(predictions)
r21 = evaluator1_r2.evaluate(predictions)

print("Root Mean Squared Error (RMSE) on test data = {}".format(rmse1))
print("R^2 on test data = {}".format(r21))

25/05/05 20:56:02 WARN DAGScheduler: Broadcasting large task binary with size 3.1 MiB
25/05/05 20:56:17 WARN DAGScheduler: Broadcasting large task binary with size 3.1 MiB
[Stage 2029:>                                                       (0 + 2) / 2]

Root Mean Squared Error (RMSE) on test data = 73.69420253157189
R^2 on test data = 0.7247992734344332


                                                                                

# Second model

## Build a model

In [62]:
from pyspark.ml.regression import GBTRegressor

# Create Linear Regression Model
gbt = GBTRegressor(maxIter=10)

# Fit the data to the pipeline stages
model_gbt = gbt.fit(train_data)

25/05/05 20:56:20 WARN DAGScheduler: Broadcasting large task binary with size 1630.7 KiB
25/05/05 20:56:20 WARN DAGScheduler: Broadcasting large task binary with size 1630.8 KiB
25/05/05 20:56:20 WARN DAGScheduler: Broadcasting large task binary with size 1634.9 KiB
25/05/05 20:56:21 WARN DAGScheduler: Broadcasting large task binary with size 1895.2 KiB
25/05/05 20:56:30 WARN DAGScheduler: Broadcasting large task binary with size 1896.1 KiB
25/05/05 20:56:34 WARN DAGScheduler: Broadcasting large task binary with size 1896.8 KiB
25/05/05 20:56:37 WARN DAGScheduler: Broadcasting large task binary with size 1898.0 KiB
25/05/05 20:56:41 WARN DAGScheduler: Broadcasting large task binary with size 1900.4 KiB
25/05/05 20:56:45 WARN DAGScheduler: Broadcasting large task binary with size 1907.8 KiB
25/05/05 20:56:49 WARN DAGScheduler: Broadcasting large task binary with size 1908.3 KiB
25/05/05 20:56:52 WARN DAGScheduler: Broadcasting large task binary with size 1908.9 KiB
25/05/05 20:56:55 WAR

## Predict for test data

In [63]:
predictions = model_gbt.transform(test_data)
predictions.show()

25/05/05 20:59:45 WARN DAGScheduler: Broadcasting large task binary with size 3.1 MiB
[Stage 2409:>                                                       (0 + 1) / 1]

+--------------------+-----+--------------------+-----------------+
|            features|label|     indexedFeatures|       prediction|
+--------------------+-----+--------------------+-----------------+
|(9436,[0,3082,309...| 56.0|(9436,[0,3082,309...|88.00206916226688|
|(9436,[0,3082,309...| 35.0|(9436,[0,3082,309...|76.01269335997698|
|(9436,[0,3082,309...| 65.0|(9436,[0,3082,309...|88.00206916226688|
|(9436,[0,3082,309...| 99.0|(9436,[0,3082,309...|89.86170069017531|
|(9436,[0,3082,309...| 55.0|(9436,[0,3082,309...|83.48651310211898|
|(9436,[0,3082,309...|175.0|(9436,[0,3082,309...|89.86170069017531|
|(9436,[0,3082,309...| 70.0|(9436,[0,3082,309...|89.86170069017531|
|(9436,[0,3082,309...| 98.0|(9436,[0,3082,309...|89.86170069017531|
|(9436,[0,3082,309...| 80.0|(9436,[0,3082,309...|89.86170069017531|
|(9436,[0,3082,309...| 75.0|(9436,[0,3082,309...|83.48651310211898|
|(9436,[0,3082,309...| 80.0|(9436,[0,3082,309...|89.86170069017531|
|(9436,[0,3082,309...| 85.0|(9436,[0,3082,309...

                                                                                

## Evaluate the model

In [64]:
from pyspark.ml.evaluation import RegressionEvaluator 

# Evaluate the performance of the model
evaluator2_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
evaluator2_r2 = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

rmse2 = evaluator2_rmse.evaluate(predictions)
r22 = evaluator2_r2.evaluate(predictions)

print("Root Mean Squared Error (RMSE) on test data = {}".format(rmse2))
print("R^2 on test data = {}".format(r22))

25/05/05 21:00:00 WARN DAGScheduler: Broadcasting large task binary with size 3.1 MiB
25/05/05 21:00:16 WARN DAGScheduler: Broadcasting large task binary with size 3.1 MiB
[Stage 2431:>                                                       (0 + 2) / 2]

Root Mean Squared Error (RMSE) on test data = 70.8211085495895
R^2 on test data = 0.7458393164580558


                                                                                

## Hyperparameter optimization

In [65]:
model_gbt.params

[Param(parent='GBTRegressor_e7007ff84291', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.'),
 Param(parent='GBTRegressor_e7007ff84291', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext.'),
 Param(parent='GBTRegressor_e7007ff84291', name='featureSubsetStrategy', doc="The number of features to consider for splits at each tree node. Supported options: 'auto' (choose automatically for task: If numTrees == 1, set to 'all'. If numTrees > 1 (forest), set to 'sqrt' for classification and to 'onethird' for regression), 'a

In [66]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator 

import numpy as np
# grid = ParamGridBuilder()
# grid = grid.addGrid(model_gbt.regParam, np.logspace(-2, -1, 3)).build()

grid = ParamGridBuilder()\
    .addGrid(gbt.maxDepth, [3, 5])\
    .addGrid(gbt.minInstancesPerNode, [1, 2])\
    .build()
cv = CrossValidator(estimator = gbt, 
                    estimatorParamMaps = grid, 
                    evaluator = evaluator2_rmse,
                    parallelism = 5,
                    numFolds=3)

cvModel = cv.fit(train_data)
bestModel = cvModel.bestModel
bestModel

25/05/05 21:00:18 WARN DAGScheduler: Broadcasting large task binary with size 1608.5 KiB
25/05/05 21:00:18 WARN DAGScheduler: Broadcasting large task binary with size 1608.5 KiB
25/05/05 21:00:18 WARN DAGScheduler: Broadcasting large task binary with size 1608.5 KiB
25/05/05 21:00:18 WARN DAGScheduler: Broadcasting large task binary with size 1608.5 KiB
25/05/05 21:00:19 WARN DAGScheduler: Broadcasting large task binary with size 1648.4 KiB
25/05/05 21:00:19 WARN DAGScheduler: Broadcasting large task binary with size 1648.4 KiB
25/05/05 21:00:19 WARN DAGScheduler: Broadcasting large task binary with size 1648.4 KiB
25/05/05 21:00:19 WARN DAGScheduler: Broadcasting large task binary with size 1648.4 KiB
25/05/05 21:00:19 WARN DAGScheduler: Broadcasting large task binary with size 1648.4 KiB
25/05/05 21:00:19 WARN DAGScheduler: Broadcasting large task binary with size 1648.4 KiB
25/05/05 21:00:20 WARN DAGScheduler: Broadcasting large task binary with size 1648.4 KiB
25/05/05 21:00:20 WAR

GBTRegressionModel: uid=GBTRegressor_e7007ff84291, numTrees=10, numFeatures=9436

## Best model 2


In [67]:
from pprint import pprint
model2 = bestModel
pprint(model2.extractParamMap())

{Param(parent='GBTRegressor_e7007ff84291', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. Must be in range [0, 30].'): 5,
 Param(parent='GBTRegressor_e7007ff84291', name='maxBins', doc='Max number of bins for discretizing continuous features.  Must be >=2 and >= number of categories for any categorical feature.'): 32,
 Param(parent='GBTRegressor_e7007ff84291', name='stepSize', doc='Step size (a.k.a. learning rate) in interval (0, 1] for shrinking the contribution of each estimator.'): 0.1,
 Param(parent='GBTRegressor_e7007ff84291', name='featuresCol', doc='features column name.'): 'features',
 Param(parent='GBTRegressor_e7007ff84291', name='maxIter', doc='max number of iterations (>= 0).'): 10,
 Param(parent='GBTRegressor_e7007ff84291', name='minInfoGain', doc='Minimum information gain for a split to be considered at a tree node.'): 0.0,
 Param(parent='GBTRegressor_e7007ff84291', name='validationTol

## Save the model to HDFS

In [68]:
model2.write().overwrite().save("project/big_data_project/models/model2")

# Run it from root directory of the repository
run("hdfs dfs -get project/big_data_project/models/model1 model2")

get: `model2/model1/data/_SUCCESS': File exists
get: `model2/model1/metadata/_SUCCESS': File exists
get: `model2/model1/metadata/part-00000': File exists


''

## Predict for test data using best model2

In [69]:
predictions = model2.transform(test_data)
predictions.show()

25/05/05 21:27:38 WARN DAGScheduler: Broadcasting large task binary with size 3.1 MiB
[Stage 3893:>                                                       (0 + 1) / 1]

+--------------------+-----+--------------------+-----------------+
|            features|label|     indexedFeatures|       prediction|
+--------------------+-----+--------------------+-----------------+
|(9436,[0,3082,309...| 56.0|(9436,[0,3082,309...|88.87672922731338|
|(9436,[0,3082,309...| 35.0|(9436,[0,3082,309...|77.02098142809005|
|(9436,[0,3082,309...| 65.0|(9436,[0,3082,309...|89.37869939027341|
|(9436,[0,3082,309...| 99.0|(9436,[0,3082,309...|90.48910757060489|
|(9436,[0,3082,309...| 55.0|(9436,[0,3082,309...|82.66544892380604|
|(9436,[0,3082,309...|175.0|(9436,[0,3082,309...|90.48910757060489|
|(9436,[0,3082,309...| 70.0|(9436,[0,3082,309...|90.48910757060489|
|(9436,[0,3082,309...| 98.0|(9436,[0,3082,309...|94.74653543497664|
|(9436,[0,3082,309...| 80.0|(9436,[0,3082,309...|90.99107773356492|
|(9436,[0,3082,309...| 75.0|(9436,[0,3082,309...|82.66544892380604|
|(9436,[0,3082,309...| 80.0|(9436,[0,3082,309...|90.99107773356492|
|(9436,[0,3082,309...| 85.0|(9436,[0,3082,309...

                                                                                

In [70]:
predictions.select("label", "prediction")\
    .coalesce(1)\
    .write\
    .mode("overwrite")\
    .format("csv")\
    .option("sep", ",")\
    .option("header","true")\
    .save("project/big_data_project/output/model2_predictions.csv")

# Run it from root directory of the repository
run("hdfs dfs -cat project/big_data_project/output/model2_predictions.csv/*.csv > ../output/model2_predictions.csv")

25/05/05 21:27:54 WARN DAGScheduler: Broadcasting large task binary with size 3.3 MiB
                                                                                

''

## Evaluate the best model2

In [71]:
from pyspark.ml.evaluation import RegressionEvaluator 

# Evaluate the performance of the model
evaluator2_rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
evaluator2_r2 = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")

rmse2 = evaluator2_rmse.evaluate(predictions)
r22 = evaluator2_r2.evaluate(predictions)

print("Root Mean Squared Error (RMSE) on test data = {}".format(rmse2))
print("R^2 on test data = {}".format(r22))

25/05/05 21:28:14 WARN DAGScheduler: Broadcasting large task binary with size 3.1 MiB
25/05/05 21:28:31 WARN DAGScheduler: Broadcasting large task binary with size 3.1 MiB
[Stage 3926:>                                                       (0 + 2) / 2]

Root Mean Squared Error (RMSE) on test data = 70.7189766537025
R^2 on test data = 0.7465718436919963


                                                                                

# Compare best models

In [72]:
models = [[str(model1),rmse1, r21], [str(model2),rmse2, r22]]

df = spark.createDataFrame(models, ["model", "RMSE", "R2"])
df.show(truncate=False)

[Stage 3928:>                                                       (0 + 1) / 1]

+--------------------------------------------------------------------------------+-----------------+------------------+
|model                                                                           |RMSE             |R2                |
+--------------------------------------------------------------------------------+-----------------+------------------+
|LinearRegressionModel: uid=LinearRegression_0d67ccc2ed51, numFeatures=9436      |73.69420253157189|0.7247992734344332|
|GBTRegressionModel: uid=GBTRegressor_e7007ff84291, numTrees=10, numFeatures=9436|70.7189766537025 |0.7465718436919963|
+--------------------------------------------------------------------------------+-----------------+------------------+



                                                                                

In [73]:
df.coalesce(1)\
    .write\
    .mode("overwrite")\
    .format("csv")\
    .option("sep", ",")\
    .option("header","true")\
    .save("project/big_data_project/output/evaluation.csv")
# Run it from root directory of the repository
run("hdfs dfs -cat project/big_data_project/output/evaluation.csv/*.csv > ../output/evaluation.csv")

''

In [74]:
train_data.count()

25/05/05 21:28:39 WARN DAGScheduler: Broadcasting large task binary with size 1595.0 KiB


60340