In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkFiles
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF

In [2]:
# Start the Spark session for this notebook; getOrCreate() hands back an
# already-running session instead of building a second one.
spark = (
    SparkSession.builder
    .appName('idf')
    .getOrCreate()
)

# Register the remote CSV with the Spark context, then read the copy Spark
# fetched by looking up its local path via SparkFiles.
airlines_url = "https://s3.amazonaws.com/zepl-trilogy-test/airlines.csv"
spark.sparkContext.addFile(airlines_url)
local_csv_path = SparkFiles.get("airlines.csv")
df = spark.read.csv(local_csv_path, sep=",", header=True)

# Widen the truncation limit so most of each tweet is readable.
df.show(truncate=100)

+----------------------------------------------------------------------------------------------------+
|                                                                                      Airline Tweets|
+----------------------------------------------------------------------------------------------------+
|                            @VirginAmerica plus you've added commercials to the experience... tacky.|
|@VirginAmerica seriously would pay $30 a flight for seats that didn't have this playing. it's rea...|
|                             @VirginAmerica do you miss me? Don't worry we'll be together very soon.|
|       @VirginAmerica Are the hours of operation for the Club at SFO that are posted online current?|
|@VirginAmerica awaiting my return phone call, just would prefer to use your online self-service o...|
+----------------------------------------------------------------------------------------------------+



In [3]:
# Split every tweet into word tokens stored in a new "words" column.
# As the displayed result shows, tokens come out lowercased and punctuation
# stays attached to its word (e.g. "me?").
tokened = Tokenizer(
    inputCol="Airline Tweets",
    outputCol="words",
)
tokened_transformed = tokened.transform(df)
tokened_transformed.show(truncate=50)

+--------------------------------------------------+--------------------------------------------------+
|                                    Airline Tweets|                                             words|
+--------------------------------------------------+--------------------------------------------------+
|@VirginAmerica plus you've added commercials to...|[@virginamerica, plus, you've, added, commercia...|
|@VirginAmerica seriously would pay $30 a flight...|[@virginamerica, seriously, would, pay, $30, a,...|
|@VirginAmerica do you miss me? Don't worry we'l...|[@virginamerica, do, you, miss, me?, don't, wor...|
|@VirginAmerica Are the hours of operation for t...|[@virginamerica, are, the, hours, of, operation...|
|@VirginAmerica awaiting my return phone call, j...|[@virginamerica, awaiting, my, return, phone, c...|
+--------------------------------------------------+--------------------------------------------------+



In [4]:
# Strip the airline handle and the "$30" token from each tweet.
# Tokens arrive lowercased from the tokenizer, so the lowercase handle entry
# already covers them; the mixed-case entry is kept as-is.
# NOTE(review): supplying stopWords replaces the remover's default word list,
# which is why common words such as "a" and "the" remain in "filtered" —
# confirm that is intended.
stop_list = ["@VirginAmerica", "$30", "@virginamerica"]
remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
    stopWords=stop_list,
)
removed_frame = remover.transform(tokened_transformed)
removed_frame.show(truncate=33)

+---------------------------------+---------------------------------+---------------------------------+
|                   Airline Tweets|                            words|                         filtered|
+---------------------------------+---------------------------------+---------------------------------+
|@VirginAmerica plus you've add...|[@virginamerica, plus, you've,...|[plus, you've, added, commerci...|
|@VirginAmerica seriously would...|[@virginamerica, seriously, wo...|[seriously, would, pay, a, fli...|
|@VirginAmerica do you miss me?...|[@virginamerica, do, you, miss...|[do, you, miss, me?, don't, wo...|
|@VirginAmerica Are the hours o...|[@virginamerica, are, the, hou...|[are, the, hours, of, operatio...|
|@VirginAmerica awaiting my ret...|[@virginamerica, awaiting, my,...|[awaiting, my, return, phone, ...|
+---------------------------------+---------------------------------+---------------------------------+



In [5]:
# Hash each filtered token into one of 2**4 = 16 buckets and count the hits
# per tweet, producing a sparse term-frequency vector in "hashedValues".
# NOTE(review): 16 buckets is presumably chosen to keep the printed vectors
# short; with so few buckets, different words can land in the same slot.
hashing = HashingTF(
    inputCol="filtered",
    outputCol="hashedValues",
    numFeatures=2 ** 4,
)

# Append the term-frequency vectors to the cleaned frame and preview it.
hashed_df = hashing.transform(removed_frame)
hashed_df.show()

+--------------------+--------------------+--------------------+--------------------+
|      Airline Tweets|               words|            filtered|        hashedValues|
+--------------------+--------------------+--------------------+--------------------+
|@VirginAmerica pl...|[@virginamerica, ...|[plus, you've, ad...|(16,[3,4,5,7,8,9,...|
|@VirginAmerica se...|[@virginamerica, ...|[seriously, would...|(16,[0,1,2,3,4,9,...|
|@VirginAmerica do...|[@virginamerica, ...|[do, you, miss, m...|(16,[0,1,8,10,11,...|
|@VirginAmerica Ar...|[@virginamerica, ...|[are, the, hours,...|(16,[0,1,2,4,7,9,...|
|@VirginAmerica aw...|[@virginamerica, ...|[awaiting, my, re...|(16,[0,3,4,6,7,8,...|
+--------------------+--------------------+--------------------+--------------------+



In [6]:
# Learn inverse-document-frequency weights from the hashed term counts ...
idf = IDF(
    inputCol="hashedValues",
    outputCol="features",
)
idfModel = idf.fit(hashed_df)

# ... then apply them to the same frame, adding the weighted vectors as
# a "features" column.
rescaledData = idfModel.transform(hashed_df)

In [7]:
# Show the cleaned tokens side by side with their IDF-weighted vectors.
tfidf_preview = rescaledData.select("filtered", "features")
tfidf_preview.show(truncate=50)

+--------------------------------------------------+--------------------------------------------------+
|                                          filtered|                                          features|
+--------------------------------------------------+--------------------------------------------------+
|[plus, you've, added, commercials, to, the, exp...|(16,[3,4,5,7,8,9,12,14],[0.4054651081081644,0.1...|
|[seriously, would, pay, a, flight, for, seats, ...|(16,[0,1,2,3,4,9,11,12,13,14],[0.36464311358790...|
|[do, you, miss, me?, don't, worry, we'll, be, t...|(16,[0,1,8,10,11,12,14,15],[0.1823215567939546,...|
|[are, the, hours, of, operation, for, the, club...|(16,[0,1,2,4,7,9,11,12,14,15],[0.54696467038186...|
|[awaiting, my, return, phone, call,, just, woul...|(16,[0,3,4,6,7,8,9,12,13,14,15],[0.364643113587...|
+--------------------------------------------------+--------------------------------------------------+

