In [2]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import ChiSqSelector, RegexTokenizer, CountVectorizer, StringIndexer, StopWordsRemover
from pyspark.sql import SparkSession

# Fit a Spark ML pipeline over the review data set: tokenize the review text,
# drop stop words, index the category label, build binary bag-of-words vectors
# and keep the 2000 terms chosen by a chi-squared test against the category.
# The names of the selected terms end up in `feature_names`.

# Define the path to the input file
input_path = "hdfs:///user/dic23_shared/amazon-reviews/full/reviews_devset.json"
# input_path = "hdfs:///user/e11809642/reviews/reduced_devset.json"

# Create or retrieve a SparkSession
spark = SparkSession.builder.appName("ChiSquaredPipeline").getOrCreate()
sc = spark.sparkContext

try:
    # Read the input file as a DataFrame
    df = spark.read.json(input_path)

    # Delimiter pattern: any run of characters other than ASCII letters and
    # the symbols < > ^ | separates two tokens.
    pattern = "[^a-zA-Z<>^|]+"

    # Tokenize the review text; gaps=True means the pattern matches the
    # separators between tokens rather than the tokens themselves.
    tokenizer = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern=pattern, gaps=True)

    # Filter out stopwords from the tokens; the list is read from HDFS and
    # collected to the driver (one entry per line of the file).
    stopwords = sc.textFile("hdfs:///user/e11809642/stopwords.txt").collect()
    remover = StopWordsRemover(inputCol="words", outputCol="filtered", stopWords=stopwords)

    # Convert the category column to a numeric label index
    indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")

    # Encode each review as a binary term-presence vector: binary=True clamps
    # every non-zero term count to 1.
    cv = CountVectorizer(inputCol="filtered", outputCol="features", binary=True)

    # Create a ChiSqSelector to select the top 2000 features
    css = ChiSqSelector(featuresCol="features", outputCol="selectedFeatures", labelCol="categoryIndex", numTopFeatures=2000)

    # Fit all stages in one pass through the Pipeline instead of fitting and
    # transforming each stage by hand, so the stage order is declared once.
    pipeline = Pipeline(stages=[tokenizer, remover, indexer, cv, css])
    pipeline_model = pipeline.fit(df)
    df = pipeline_model.transform(df)

    # The fitted stages come back in declaration order; the last two are the
    # CountVectorizer model and the ChiSqSelector model.
    cv_model, css_model = pipeline_model.stages[-2:]

    # Indices (into the CountVectorizer vocabulary) of the selected features
    selected_features = css_model.selectedFeatures

    # Map the selected indices back to the actual terms
    feature_names = [cv_model.vocabulary[i] for i in selected_features]
finally:
    # Always release the session, even when reading or fitting fails.
    spark.stop()


SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/usr/lib/spark/jars/slf4j-log4j12-1.7.30.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/lib/hadoop/lib/slf4j-reload4j-1.7.36.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
23/05/09 13:49:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/05/09 13:49:23 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/05/09 13:49:23 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
23/05/09 13:49:23 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
23/05/09 13:49:23 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
23/05/09 13:49:2