# Part 2: Datasets/DataFrames

In [1]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import ChiSqSelector, RegexTokenizer, CountVectorizer, StringIndexer, StopWordsRemover, HashingTF, IDF
from pyspark.sql import SparkSession

In [2]:
input_path = "hdfs:///user/dic23_shared/amazon-reviews/full/reviews_devset.json"
# input_path = "hdfs:///user/e11809642/reviews/reduced_devset.json"

Create Spark Session  

In [3]:
spark = SparkSession.builder.appName("ChiSquaredPipeline").getOrCreate()
sc = spark.sparkContext

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/usr/lib/spark/jars/slf4j-log4j12-1.7.30.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/lib/hadoop/lib/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
23/05/24 18:03:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/05/24 18:03:23 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/05/24 18:03:23 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
23/05/24 18:03:23 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
23/05/24 18:03:23 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
23/05/24 18:03:2

Read dataset into DataFrame

In [4]:
input_df = spark.read.json(input_path)

                                                                                

tokenize

In [5]:
pattern = "[^a-zA-Z<>^|]+"
tokenizer = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern=pattern, gaps=True)
df = tokenizer.transform(df)

stopword removal (using local `stopwords.txt`)

In [6]:
stopwords = sc.textFile("hdfs:///user/e11717659/stopwords.txt").collect()
remover = StsopWordsRemover(inputCol="words", outputCol="filtered", stopWords=stopwords)
df = remover.transform(df)

split dataset

In [None]:
training_data, test_data, validation_data = df.randomSplit([0.7, 0.2, 0.1], seed=69)

count term frequencies

In [7]:
# Convert the category column to a numeric type
indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")

# Compute TF-IDF
hashing_tf = HashingTF(inputCol="filtered", outputCol="rawFeatures")

idf = IDF(inputCol="rawFeatures", outputCol="features")

df.select("features", "category").show()

23/05/24 18:04:14 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB


+--------------------+--------------------+
|            features|            category|
+--------------------+--------------------+
|(262144,[7527,758...|Patio_Lawn_and_Garde|
|(262144,[1968,232...|Patio_Lawn_and_Garde|
|(262144,[29241,29...|Patio_Lawn_and_Garde|
|(262144,[8804,230...|Patio_Lawn_and_Garde|
|(262144,[1797,243...|Patio_Lawn_and_Garde|
|(262144,[2325,892...|Patio_Lawn_and_Garde|
|(262144,[10631,17...|Patio_Lawn_and_Garde|
|(262144,[3613,150...|Patio_Lawn_and_Garde|
|(262144,[5030,196...|Patio_Lawn_and_Garde|
|(262144,[22716,33...|Patio_Lawn_and_Garde|
|(262144,[16928,39...|Patio_Lawn_and_Garde|
|(262144,[1546,389...|Patio_Lawn_and_Garde|
|(262144,[13790,19...|Patio_Lawn_and_Garde|
|(262144,[7150,310...|Patio_Lawn_and_Garde|
|(262144,[2701,635...|Patio_Lawn_and_Garde|
|(262144,[991,2325...|Patio_Lawn_and_Garde|
|(262144,[3121,578...|Patio_Lawn_and_Garde|
|(262144,[121517,1...|Patio_Lawn_and_Garde|
|(262144,[3898,223...|Patio_Lawn_and_Garde|
|(262144,[140904,1...|Patio_Lawn

feature selection w/ Chi Square (top 2000 features)

In [8]:
css = ChiSqSelector(featuresCol="features", outputCol="selectedFeatures", labelCol="categoryIndex", numTopFeatures=2000)
css_model = css.fit(df)
df = css_model.transform(df)

23/05/24 18:04:15 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/05/24 18:04:15 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/05/24 18:04:25 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
                                                                                

In [15]:
selected_features = css_model.selectedFeatures
feature_names = [df.select("features").features[i] for i in selected_features]

Create pipeline combining all steps

# Part 3: Text Classification

In [11]:
# Normalize each Vector using $L^1$ norm.
normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)
df = normalizer.transform(dfs)

NameError: name 'SomeTransformer' is not defined

In [10]:
pipeline = Pipeline(stages=[tokenizer, remover, indexer, hashing_tf, idf, css, normalizer])
pipeline.fit(training_data)

In [None]:
spark.stop()