In [1]:
import findspark
# Run before any pyspark import: findspark locates the local Spark installation
# and makes the pyspark package importable from this kernel.
findspark.init()

In [2]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [3]:
# Build (or reuse) the session through the builder so this cell is safe to re-run:
# constructing SparkContext(...) directly raises ValueError when a context is
# already active in the kernel.
spark = (
    SparkSession.builder
    .master('local')
    .appName('Chapter 6')
    .getOrCreate()
)
# Keep `sc` bound to the underlying SparkContext, as the original cell did.
sc = spark.sparkContext

In [4]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.functions import corr
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [31]:
# Load the flights CSV: the first row supplies the column names and the column
# types are inferred from the data.
# NOTE(review): the printed schema reports 'delay' as string -- presumably the file
# uses a non-numeric token for missing delays; confirm before using it as a number.
data = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('./flights.csv')
)

In [32]:
# Inspect the inferred column names and types.
data.printSchema()

root
 |-- mon: integer (nullable = true)
 |-- dom: integer (nullable = true)
 |-- dow: integer (nullable = true)
 |-- carrier: string (nullable = true)
 |-- flight: integer (nullable = true)
 |-- org: string (nullable = true)
 |-- mile: integer (nullable = true)
 |-- depart: double (nullable = true)
 |-- duration: integer (nullable = true)
 |-- delay: string (nullable = true)



In [33]:
# Preview the first five rows. Limit on the Spark side first so only those rows
# are collected to the driver, instead of converting the whole DataFrame to pandas
# and then discarding all but five rows.
data.limit(5).toPandas()

Unnamed: 0,mon,dom,dow,carrier,flight,org,mile,depart,duration,delay
0,11,20,6,US,19,JFK,2153,9.48,351,
1,0,22,2,UA,1107,ORD,316,16.33,82,30.0
2,2,20,4,UA,226,SFO,337,6.17,82,-8.0
3,9,13,1,AA,419,ORD,1236,10.33,195,-5.0
4,4,2,5,AA,325,ORD,258,8.92,65,


In [None]:
# Model setup: 'mile' is the input feature, 'duration' is the target (label).

In [34]:
# List the available column names before choosing the model inputs.
data.columns

['mon',
 'dom',
 'dow',
 'carrier',
 'flight',
 'org',
 'mile',
 'depart',
 'duration',
 'delay']

In [37]:
# Pack the single predictor column ('mile') into a vector column named 'features'.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=['mile'],
)

In [38]:
# Apply the assembler: returns the input rows with the 'features' vector column added.
data_pre = assembler.transform(data)

In [11]:
# Peek at the first two assembled feature vectors.
# NOTE(review): the saved output under this cell carries an older execution count
# and values that do not match the 'mile' column; re-run the cell to refresh it.
features_only = data_pre.select('features')
features_only.show(2)

+--------------------+
|            features|
+--------------------+
|[34.4972677251122...|
|[31.9262720263601...|
+--------------------+
only showing top 2 rows



In [39]:
# Keep only what the regression needs: the feature vector and the 'duration' label.
final_data = data_pre.select('features','duration')

In [40]:
# Show two rows; truncate=False prints full cell contents instead of shortening them.
final_data.show(2,truncate=False)

+--------+--------+
|features|duration|
+--------+--------+
|[2153.0]|351     |
|[316.0] |82      |
+--------+--------+
only showing top 2 rows



In [44]:
# 80/20 train/test split. A fixed seed keeps the split -- and every metric
# computed from it below -- repeatable across runs on the same input data.
train_data, test_data = final_data.randomSplit([0.8, 0.2], seed=42)

In [45]:
# Summary statistics for the training split. As the output shows, only 'duration'
# is summarized; the 'features' vector column is not included.
train_data.describe().show()

+-------+-----------------+
|summary|         duration|
+-------+-----------------+
|  count|            39984|
|   mean|152.1970288115246|
| stddev|87.20878068195009|
|    min|               30|
|    max|              560|
+-------+-----------------+



In [46]:
# Same summary for the test split, to check it resembles the training split.
test_data.describe().show()

+-------+------------------+
|summary|          duration|
+-------+------------------+
|  count|             10016|
|   mean|150.04442891373802|
| stddev| 86.37133732348227|
|    min|                30|
|    max|               556|
+-------+------------------+



In [47]:
# Configure the estimator: read inputs from the assembled 'features' vector,
# use 'duration' as the label, and write model output to a 'prediction' column.
lr = LinearRegression(featuresCol='features',
                      labelCol='duration',
                      predictionCol='prediction')

In [48]:
# Fit on the training split only; the test split stays unseen for evaluation.
lrModel = lr.fit(train_data)

In [49]:
# Report the fitted slope (one coefficient per feature) and the intercept.
fitted_params = (lrModel.coefficients, lrModel.intercept)
print('coefficients: {} intercept: {}'.format(*fitted_params))

coefficients: [0.12183220307321005] intercept: 44.37086714518754


In [50]:
# Evaluate on the held-out test split; the returned summary exposes the residuals
# and error metrics inspected in the following cells.
test_results = lrModel.evaluate(test_data)

In [51]:
# First five residuals (actual duration minus predicted duration).
test_results.residuals.show(5)

+-----------------+
|        residuals|
+-----------------+
|-9.53362475109261|
|-8.53362475109261|
|-8.53362475109261|
|-6.53362475109261|
|-6.53362475109261|
+-----------------+
only showing top 5 rows



In [52]:
# Check the error metrics on the test set (RMSE, MSE, R^2)

In [53]:
# Root mean squared error on the test split (same units as 'duration').
test_results.rootMeanSquaredError

16.7512478656199

In [54]:
# Mean squared error on the test split (the square of the RMSE above).
test_results.meanSquaredError

280.60430505543525

In [55]:
# R^2 (coefficient of determination) on the test split.
test_results.r2

0.9623817659706799

In [56]:
# Predict on the test set

In [57]:
# Score the test rows: transform() returns them with a 'prediction' column added.
test_model = lrModel.transform(test_data)

In [58]:
# Compare predicted and actual durations side by side (show() prints the first 20 rows).
test_model.select('prediction','duration').show()

+-----------------+--------+
|       prediction|duration|
+-----------------+--------+
|52.53362475109261|      43|
|52.53362475109261|      44|
|52.53362475109261|      44|
|52.53362475109261|      46|
|52.53362475109261|      46|
|52.53362475109261|      47|
|52.53362475109261|      47|
|52.53362475109261|      47|
|52.53362475109261|      47|
|52.53362475109261|      48|
|52.53362475109261|      48|
|52.53362475109261|      48|
|52.53362475109261|      48|
|52.53362475109261|      48|
|52.53362475109261|      48|
|52.53362475109261|      48|
|52.53362475109261|      49|
|52.53362475109261|      49|
|52.53362475109261|      49|
|52.53362475109261|      50|
+-----------------+--------+
only showing top 20 rows



In [29]:
# Save and reload the model

In [60]:
# Persist the fitted model. write().overwrite() replaces an existing directory;
# plain save() raises when 'Fligh_duration' already exists, e.g. on a re-run.
lrModel.write().overwrite().save('Fligh_duration')

In [61]:
from pyspark.ml.regression import LinearRegressionModel

In [62]:
# Reload the persisted model from disk into a separate variable.
lrModel2 = LinearRegressionModel.load('Fligh_duration')

In [63]:
# Predict on unlabeled data with the reloaded model

In [64]:
# Drop the label to simulate unlabeled input: keep only the feature vector.
unlabel_data = test_data.select('features')

In [65]:
# Score the unlabeled rows with the reloaded model.
predictions = lrModel2.transform(unlabel_data)

In [66]:
# Display the first five predictions.
predictions.show(5)

+--------+-----------------+
|features|       prediction|
+--------+-----------------+
|  [67.0]|52.53362475109261|
|  [67.0]|52.53362475109261|
|  [67.0]|52.53362475109261|
|  [67.0]|52.53362475109261|
|  [67.0]|52.53362475109261|
+--------+-----------------+
only showing top 5 rows

