In [61]:
import findspark
findspark.init()
import pyspark

In [62]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Ltz's Homework").master('local').getOrCreate()

In [3]:
spark

## Logistic Regression

In [5]:
from pyspark.sql.types import *
schema_sdf = StructType([
        StructField('Year', IntegerType(), True),
        StructField('Month', IntegerType(), True),
        StructField('DayofMonth', IntegerType(), True),
        StructField('DayOfWeek', IntegerType(), True),
        StructField('DepTime', DoubleType(), True),
        StructField('CRSDepTime', DoubleType(), True),
        StructField('ArrTime', DoubleType(), True),
        StructField('CRSArrTime', DoubleType(), True),
        StructField('UniqueCarrier', StringType(), True),
        StructField('FlightNum', StringType(), True),
        StructField('TailNum', StringType(), True),
        StructField('ActualElapsedTime', DoubleType(), True),
        StructField('CRSElapsedTime',  DoubleType(), True),
        StructField('AirTime',  DoubleType(), True),
        StructField('ArrDelay',  DoubleType(), True),
        StructField('DepDelay',  DoubleType(), True),
        StructField('Origin', StringType(), True),
        StructField('Dest',  StringType(), True),
        StructField('Distance',  DoubleType(), True),
        StructField('TaxiIn',  DoubleType(), True),
        StructField('TaxiOut',  DoubleType(), True),
        StructField('Cancelled',  IntegerType(), True),
        StructField('CancellationCode',  StringType(), True),
        StructField('Diverted',  IntegerType(), True),
        StructField('CarrierDelay', DoubleType(), True),
        StructField('WeatherDelay',  DoubleType(), True),
        StructField('NASDelay',  DoubleType(), True),
        StructField('SecurityDelay',  DoubleType(), True),
        StructField('LateAircraftDelay',  DoubleType(), True)
    ])
air = spark.read.options(header='true').schema(schema_sdf).csv("/lifeng/student/liutuozhen/airdelay_small.csv")

In [59]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import StringIndexer, VectorIndexer, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [12]:
air.describe(['ArrDelay']).show()



+-------+------------------+
|summary|          ArrDelay|
+-------+------------------+
|  count|           5432958|
|   mean|  6.97897995898367|
| stddev|30.191156753519532|
|    min|           -1238.0|
|    max|            1779.0|
+-------+------------------+



                                                                                

In [18]:
air.head(1)

21/12/15 19:04:19 WARN Utils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.debug.maxToStringFields' in SparkEnv.conf.


[Row(Year=2006, Month=7, DayofMonth=6, DayOfWeek=4, DepTime=2055.0, CRSDepTime=2055.0, ArrTime=2150.0, CRSArrTime=2148.0, UniqueCarrier='XE', FlightNum='2619', TailNum='N11526', ActualElapsedTime=55.0, CRSElapsedTime=53.0, AirTime=25.0, ArrDelay=2.0, DepDelay=0.0, Origin='IAH', Dest='LCH', Distance=127.0, TaxiIn=8.0, TaxiOut=22.0, Cancelled=0, CancellationCode=None, Diverted=0, CarrierDelay=0.0, WeatherDelay=0.0, NASDelay=0.0, SecurityDelay=0.0, LateAircraftDelay=0.0)]

In [19]:
assembler = VectorAssembler(
        inputCols=["DepDelay", "Distance"],
        outputCol="features"
)

In [47]:
data_ = assembler.transform(air)

In [27]:
data_

DataFrame[Year: int, Month: int, DayofMonth: int, DayOfWeek: int, DepTime: double, CRSDepTime: double, ArrTime: double, CRSArrTime: double, UniqueCarrier: string, FlightNum: string, TailNum: string, ActualElapsedTime: double, CRSElapsedTime: double, AirTime: double, ArrDelay: double, DepDelay: double, Origin: string, Dest: string, Distance: double, TaxiIn: double, TaxiOut: double, Cancelled: int, CancellationCode: string, Diverted: int, CarrierDelay: double, WeatherDelay: double, NASDelay: double, SecurityDelay: double, LateAircraftDelay: double, features: vector]

In [50]:
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

In [51]:
lr = LogisticRegression(maxIter=100, regParam=0.01)

In [94]:
model1 = lr.fit(trainingData)

In [5]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

sentenceData = spark.createDataFrame([
    (0.0, "Hi I heard about Spark"),
    (0.0, "I wish Java could use case classes"),
    (1.0, "Logistic regression models are neat")
], ["label", "sentence"])

In [6]:
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
tokenizer

Tokenizer_1edf0b17acf2

In [7]:
wordsData = tokenizer.transform(sentenceData)
wordsData.show()

+-----+--------------------+--------------------+
|label|            sentence|               words|
+-----+--------------------+--------------------+
|  0.0|Hi I heard about ...|[hi, i, heard, ab...|
|  0.0|I wish Java could...|[i, wish, java, c...|
|  1.0|Logistic regressi...|[logistic, regres...|
+-----+--------------------+--------------------+



                                                                                

In [10]:
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=2)
featurizedData = hashingTF.transform(wordsData)
featurizedData.show()

+-----+--------------------+--------------------+-------------------+
|label|            sentence|               words|        rawFeatures|
+-----+--------------------+--------------------+-------------------+
|  0.0|Hi I heard about ...|[hi, i, heard, ab...|(2,[0,1],[1.0,4.0])|
|  0.0|I wish Java could...|[i, wish, java, c...|(2,[0,1],[1.0,6.0])|
|  1.0|Logistic regressi...|[logistic, regres...|(2,[0,1],[3.0,2.0])|
+-----+--------------------+--------------------+-------------------+



In [9]:
# alternatively, CountVectorizer can also be used to get term frequency vectors
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.select("label", "features").show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(20,[0,5,9,17],[0...|
|  0.0|(20,[2,7,9,13,15]...|
|  1.0|(20,[4,6,13,15,18...|
+-----+--------------------+



## LDA

In [4]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

In [5]:
from pyspark.sql.types import *
schema_sdf = StructType([
        StructField('date', StringType(), True),
        StructField('rating', IntegerType(), True),
        StructField('review', StringType(), True)
    ])
review = spark.read.options(header='true').schema(schema_sdf).csv("file:///home/student/student/liutuozhen/spark/avengers_review.csv")
review = review.dropna()

In [27]:
review.show()

+---------+------+--------------------+
|     date|rating|              review|
+---------+------+--------------------+
|29-Apr-19|    10|I only have One w...|
|29-Apr-19|     9|A very decent end...|
|18-Sep-19|    10|"I have to say In...|
|28-Apr-19|    10|Man this film goe...|
|27-Apr-19|    10|More than 10 year...|
|22-Jun-19|    10|Just magnificentl...|
|10-Jul-19|    10|Avengers: Endgame...|
|27-May-19|    10|Reading through s...|
|27-Apr-19|    10|"This movie was t...|
|28-Dec-19|     4|"Infinity War was...|
|25-Apr-19|    10|This is what all ...|
|14-Apr-20|     1|If one Avengers p...|
|23-Jun-19|    10|This movie was so...|
|30-Apr-19|     5|A messed up past,...|
|28-Apr-19|     4|Infinity wars fel...|
| 8-May-19|     1|Far from what one...|
| 5-May-19|     9|"There is a weird...|
|26-Apr-19|    10|No words. What an...|
|28-Apr-19|    10|Loved the jokes. ...|
|27-Apr-19|    10|Simply amazing fr...|
+---------+------+--------------------+
only showing top 20 rows



In [63]:
tokenizer = Tokenizer(inputCol="review", outputCol="words")
tokenizer

Tokenizer_f83adb6ef6dd

In [7]:
review_words = tokenizer.transform(review)

In [30]:
review_words.show()

+---------+------+--------------------+--------------------+
|     date|rating|              review|               words|
+---------+------+--------------------+--------------------+
|29-Apr-19|    10|I only have One w...|[i, only, have, o...|
|29-Apr-19|     9|A very decent end...|[a, very, decent,...|
|18-Sep-19|    10|"I have to say In...|["i, have, to, sa...|
|28-Apr-19|    10|Man this film goe...|[man, this, film,...|
|27-Apr-19|    10|More than 10 year...|[more, than, 10, ...|
|22-Jun-19|    10|Just magnificentl...|[just, magnificen...|
|10-Jul-19|    10|Avengers: Endgame...|[avengers:, endga...|
|27-May-19|    10|Reading through s...|[reading, through...|
|27-Apr-19|    10|"This movie was t...|["this, movie, wa...|
|28-Dec-19|     4|"Infinity War was...|["infinity, war, ...|
|25-Apr-19|    10|This is what all ...|[this, is, what, ...|
|14-Apr-20|     1|If one Avengers p...|[if, one, avenger...|
|23-Jun-19|    10|This movie was so...|[this, movie, was...|
|30-Apr-19|     5|A mess

In [71]:
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=100)
featurizedData = hashingTF.transform(review_words)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

data = rescaledData.select("rating", "features")
data.show()

+------+--------------------+
|rating|            features|
+------+--------------------+
|    10|(100,[3,4,5,10,13...|
|     9|(100,[6,10,18,23,...|
|    10|(100,[2,5,10,11,1...|
|    10|(100,[3,8,10,24,3...|
|    10|(100,[0,4,5,7,8,1...|
|    10|(100,[3,5,7,10,16...|
|    10|(100,[70,80,82],[...|
|    10|(100,[0,1,3,5,8,1...|
|    10|(100,[0,2,3,4,5,9...|
|     4|(100,[0,1,2,4,5,6...|
|    10|(100,[12,14,16,19...|
|     1|(100,[1,2,3,6,7,1...|
|    10|(100,[5,7,10,13,1...|
|     5|(100,[3,7,10,11,1...|
|     4|(100,[4,7,10,15,2...|
|     1|(100,[0,3,5,6,7,1...|
|     9|(100,[0,3,5,6,7,1...|
|    10|(100,[2,5,10,15,1...|
|    10|(100,[4,6,10,15,1...|
|    10|(100,[0,1,2,3,4,8...|
+------+--------------------+
only showing top 20 rows



In [87]:
from pyspark.ml.clustering import LDA
lda = LDA(k=2, maxIter=10)
model = lda.fit(data)

                                                                                

In [88]:
# Describe topics.
topics = model.describeTopics(3)
print("The topics described by their top-weighted terms:")
topics.show(truncate=False)

The topics described by their top-weighted terms:
+-----+------------+------------------------------------------------------------------+
|topic|termIndices |termWeights                                                       |
+-----+------------+------------------------------------------------------------------+
|0    |[25, 81, 63]|[0.02267453318781094, 0.018067881190574377, 0.016521117328933677] |
|1    |[29, 34, 10]|[0.017304295666922084, 0.017041683961121077, 0.016887362881921438]|
+-----+------------+------------------------------------------------------------------+



In [89]:
transformed = model.transform(data)
transformed.select("rating", "topicDistribution").show(truncate=False)

+------+----------------------------------------+
|rating|topicDistribution                       |
+------+----------------------------------------+
|10    |[0.21331872856513448,0.7866812714348655]|
|9     |[0.858526604447247,0.14147339555275304] |
|10    |[0.9016935807101404,0.09830641928985952]|
|10    |[0.05039997211340099,0.949600027886599] |
|10    |[0.44083058592262064,0.5591694140773794]|
|10    |[0.7898507840379052,0.21014921596209485]|
|10    |[0.21348695456463335,0.7865130454353666]|
|10    |[0.6167794721858191,0.383220527814181]  |
|10    |[0.0232296312137377,0.9767703687862622] |
|4     |[0.38852605722918426,0.6114739427708157]|
|10    |[0.13327275349533668,0.8667272465046633]|
|1     |[0.7641206540065714,0.23587934599342858]|
|10    |[0.3957236585774921,0.6042763414225079] |
|5     |[0.18754884879382588,0.8124511512061741]|
|4     |[0.051068691642869986,0.94893130835713] |
|1     |[0.7205224590943771,0.27947754090562305]|
|9     |[0.29865243586235646,0.7013475641376435]|


In [90]:
import pyspark.sql.functions as fn
vectorToColumn = fn.udf(lambda vec: vec[0].item(), DoubleType())

In [91]:
transformed_probs = transformed.withColumn("topic_prob", vectorToColumn(transformed["topicDistribution"])).select("rating", "topic_prob")

In [92]:
transformed_probs.show()

[Stage 111:>                                                        (0 + 1) / 1]

+------+--------------------+
|rating|          topic_prob|
+------+--------------------+
|    10| 0.21331872856513448|
|     9|   0.858526604447247|
|    10|  0.9016935807101404|
|    10| 0.05039997211340099|
|    10| 0.44083058592262064|
|    10|  0.7898507840379052|
|    10| 0.21348695456463335|
|    10|  0.6167794721858191|
|    10|  0.0232296312137377|
|     4| 0.38852605722918426|
|    10| 0.13327275349533668|
|     1|  0.7641206540065714|
|    10|  0.3957236585774921|
|     5| 0.18754884879382588|
|     4|0.051068691642869986|
|     1|  0.7205224590943771|
|     9| 0.29865243586235646|
|    10| 0.16368643562463409|
|    10| 0.31141499347922996|
|    10|  0.5282397850895073|
+------+--------------------+
only showing top 20 rows



                                                                                

In [93]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation

assembler = VectorAssembler(
    inputCols=["rating", "topic_prob"],
    outputCol="features")

df_for_cor = assembler.transform(transformed_probs)
r1 = Correlation.corr(df_for_cor, "features").head()
print("Pearson correlation matrix:\n" + str(r1[0]))

Traceback (most recent call last):
  File "/usr/lib/spark-current/python/lib/pyspark.zip/pyspark/daemon.py", line 170, in manager
  File "/usr/lib/spark-current/python/lib/pyspark.zip/pyspark/daemon.py", line 73, in worker
  File "/usr/lib/spark-current/python/lib/pyspark.zip/pyspark/worker.py", line 402, in main
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
  File "/usr/lib/spark-current/python/lib/pyspark.zip/pyspark/serializers.py", line 724, in read_int
    raise EOFError
EOFError
Traceback (most recent call last):                                              
  File "/usr/lib/spark-current/python/lib/pyspark.zip/pyspark/daemon.py", line 170, in manager
  File "/usr/lib/spark-current/python/lib/pyspark.zip/pyspark/daemon.py", line 73, in worker
  File "/usr/lib/spark-current/python/lib/pyspark.zip/pyspark/worker.py", line 402, in main
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
  File "/usr/lib/spark-current/python/lib/pyspark.zip/pyspark/serializers.py", li

Pearson correlation matrix:
DenseMatrix([[1.        , 0.10818118],
             [0.10818118, 1.        ]])


                                                                                