In [1]:
# findspark has to be initialised before pyspark is imported.
import findspark

findspark.init()

from pyspark.sql import SparkSession

# Reuse the active session if one exists, otherwise create one named "CIAN".
spark = (
    SparkSession.builder
    .appName("CIAN")
    .getOrCreate()
)
spark

23/01/13 13:31:34 WARN Utils: Your hostname, spark-server resolves to a loopback address: 127.0.1.1; using 10.0.0.31 instead (on interface eth0)
23/01/13 13:31:34 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/01/13 13:31:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/01/13 13:31:36 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/01/13 13:31:36 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [2]:
# Read the CSV with a header row, inferred column types, support for records
# that span several lines, and '"' as the escape character. The '_c0' column
# is dropped right after loading.
csv_reader = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("multiline", "true")
    .option("escape", '"')
)
cian = csv_reader.csv("cian_phase-0.csv").drop('_c0')

In [3]:
# Preview the first 20 rows, truncating long cell values (the show() defaults).
cian.show(n=20, truncate=True)

+-------+-------+--------+---------+-------+---------------+------------+----------+---------------+
|area_m2|parking|bathroom|rent_cost|rooms:Q|metro_name_code|total_floors|house_type|renovation_type|
+-------+-------+--------+---------+-------+---------------+------------+----------+---------------+
|   58.0|      0|       1| 225000.0|      2|       190000.0|          26|   43000.0|        55000.0|
|   98.0|      1|       1| 250000.0|      3|       170000.0|           4|   70000.0|        55000.0|
|  120.0|      0|       0| 130000.0|      3|       170000.0|          10|   95000.0|        55000.0|
|   90.0|      0|       1| 210000.0|      4|       190000.0|           7|   95000.0|        55000.0|
|  170.0|      1|       1| 290000.0|      4|       170000.0|           7|   49000.0|        55000.0|
|   80.0|      0|       0| 100000.0|      3|       170000.0|           8|   53000.0|        55000.0|
|  100.0|      0|       1| 200000.0|      3|       170000.0|           4|   53000.0|       

In [4]:
# Print the column names and the types Spark inferred while reading the CSV.
cian.printSchema()

root
 |-- area_m2: double (nullable = true)
 |-- parking: integer (nullable = true)
 |-- bathroom: integer (nullable = true)
 |-- rent_cost: double (nullable = true)
 |-- rooms:Q: integer (nullable = true)
 |-- metro_name_code: double (nullable = true)
 |-- total_floors: integer (nullable = true)
 |-- house_type: double (nullable = true)
 |-- renovation_type: double (nullable = true)



In [5]:
from pyspark.ml.linalg import Vectors

def transformToLabeledPoint(row):
    """Turn one input row into a ``(label, features)`` pair.

    Args:
        row: Mapping-like record indexable by column name.

    Returns:
        A tuple whose first element is the row's ``rent_cost`` and whose
        second element is a dense vector built, in this order, from
        ``area_m2``, ``parking``, ``bathroom``, ``metro_name_code``,
        ``total_floors``, ``house_type`` and ``renovation_type``.
    """
    feature_columns = (
        "area_m2",
        "parking",
        "bathroom",
        "metro_name_code",
        "total_floors",
        "house_type",
        "renovation_type",
    )
    label = row["rent_cost"]
    features = Vectors.dense([row[column] for column in feature_columns])
    return (label, features)
    
# Map every row to a (label, features) tuple, then wrap the result in a
# two-column DataFrame.
Lp = cian.rdd.map(transformToLabeledPoint)
DF = spark.createDataFrame(Lp, schema=["label", "features"])

# Show the first 10 labelled rows.
DF.select("label", "features").show(n=10)

                                                                                

+--------+--------------------+
|   label|            features|
+--------+--------------------+
|225000.0|[58.0,0.0,1.0,190...|
|250000.0|[98.0,1.0,1.0,170...|
|130000.0|[120.0,0.0,0.0,17...|
|210000.0|[90.0,0.0,1.0,190...|
|290000.0|[170.0,1.0,1.0,17...|
|100000.0|[80.0,0.0,0.0,170...|
|200000.0|[100.0,0.0,1.0,17...|
|120000.0|[30.0,0.0,0.0,190...|
|180000.0|[150.0,0.0,1.0,18...|
|190000.0|[201.0,0.0,1.0,19...|
+--------+--------------------+
only showing top 10 rows



In [6]:
# 80/20 train/test split. The seed is fixed so that the sampling does not
# change from one run to the next; without it every run fits and evaluates
# the model on different rows and the reported metrics are not comparable.
trainingData, testData = DF.randomSplit([0.8, 0.2], seed=42)
trainingData.count()

[Stage 5:>                                                          (0 + 1) / 1]

15237

                                                                                

In [7]:
from pyspark.ml.regression import LinearRegression

# Fit a linear regression on the training split, capped at 10 iterations.
lr = LinearRegression(maxIter=10)
lrModel = lr.fit(trainingData)

# Report the fitted coefficients and the intercept.
print(f"Coefficients: {lrModel.coefficients!s}")
print(f"Intercept: {lrModel.intercept!s}")

23/01/13 13:32:26 WARN Instrumentation: [a094617e] regParam is zero, which might cause numerical instability and overfitting.
23/01/13 13:32:26 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/01/13 13:32:26 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
23/01/13 13:32:27 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
[Stage 10:>                                                         (0 + 1) / 1]

Coefficients: [638.1813924487893,17627.44516588717,23911.10340750069,0.5594312622882817,128.2818486988835,0.09128495727650487,0.4115414015360142]
Intercept: -37455.49746347456


                                                                                

In [8]:
# Apply the fitted model to the test split; transform() adds a "prediction" column.
predictions = lrModel.transform(testData)

# Show predictions next to the true labels (first 20 rows, the show() default).
predictions.select("prediction", "label", "features").show(n=20, truncate=True)

+------------------+-------+--------------------+
|        prediction|  label|            features|
+------------------+-------+--------------------+
|171398.38937020028| 5000.0|[160.0,1.0,1.0,72...|
| 396452.4272870311| 9800.0|[500.0,1.0,1.0,75...|
| 382133.9041900095|10000.0|[371.0,0.0,1.0,20...|
|342513.86999052984|10000.0|[400.0,1.0,1.0,60...|
| 26660.88166454623|18000.0|[36.0,0.0,0.0,380...|
| 50824.61325037223|20000.0|[20.0,0.0,1.0,475...|
|21803.612502389784|20000.0|[30.0,0.0,0.0,330...|
| 29391.04866287748|22000.0|[38.0,0.0,0.0,330...|
| 30863.38485697357|22000.0|[38.9,0.0,0.0,330...|
| 39158.73174476143|22000.0|[40.0,0.0,0.0,500...|
|30948.060556260665|22000.0|[42.0,0.0,0.0,342...|
|13355.931385962613|23000.0|[12.0,0.0,0.0,370...|
|18267.345826981786|23000.0|[20.5,0.0,0.0,370...|
|  18249.7099536858|23000.0|[27.0,0.0,0.0,330...|
|27686.656628974975|23000.0|[30.0,0.0,0.0,430...|
| 27121.25383951972|23000.0|[35.0,0.0,0.0,370...|
|30285.793752724043|23000.0|[39.0,0.0,0.0,330...|


In [9]:
# R2 of the linear regression, computed on the test-set predictions.
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="label",
    metricName="r2",
)
evaluator.evaluate(predictions)

0.7691334272740756