In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from numpy.testing import assert_equal

# Problem 2

For this problem, you will be working with the [Humor Detection from Product Question Answering Systems](https://registry.opendata.aws/humor-detection/) data set.

You will perform the tasks for this problem on Jojie and answer them directly in this notebook. You must work directly with the data on S3. Do **not** download it. You may **only** use Apache Spark and the Python Standard Library. You **cannot** use numpy, scipy, pandas or scikit-learn. Do **not** print or display large amounts of results. You will get deductions if you make the browser unresponsive.

## Problem 2a [5 pts]

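Create a function `compute_pcc` that returns the proportional chance criterion (PCC) for `label`, i.e. $\mathrm{PCC} = \sum_i p_i^2$, where $p_i$ is the proportion of rows belonging to class $i$. Exclude rows whose `label` is null.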

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count

# Build (or reuse) the notebook's Spark session.
# - Runs locally on all available cores ("local[*]").
# - Registers the S3A filesystem and an anonymous credentials provider, so the
#   s3a:// paths read below need no AWS keys configured here.
# - Pulls the hadoop-aws / AWS SDK jars that provide the S3A classes.
spark = (
    SparkSession.builder
    .appName("BDCCFinalExam")
    .master("local[*]")
    .config(
        "spark.hadoop.fs.s3a.impl",
        "org.apache.hadoop.fs.s3a.S3AFileSystem",
    )
    .config(
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider",
    )
    .config(
        "spark.jars.packages",
        "org.apache.hadoop:hadoop-aws:3.2.0,"
        "com.amazonaws:aws-java-sdk-bundle:1.11.375",
    )
    .getOrCreate()
)

In [3]:
# Glob matching every CSV object in the humor-detection bucket.
s3_path = "s3a://humor-detection-pds/*.csv"

# Read the CSVs straight from S3 (first row is the header, column types are
# inferred) and keep only the rows that actually carry a label.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(s3_path)
    .where(col("label").isNotNull())
)
df.printSchema()

25/06/07 16:54:50 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties

root
 |-- question: string (nullable = true)
 |-- product_description: string (nullable = true)
 |-- image_url: string (nullable = true)
 |-- label: integer (nullable = true)



                                                                                

In [4]:
# Class distribution of `label`, computed inside Spark. Only the aggregated
# rows (one per label) are shown, so the full data set is never collected to
# the driver and pandas is not needed. Ordered by frequency, largest first.
df.groupBy("label").count().orderBy("count", ascending=False).show()

                                                                                

label
0    19138
1     9567
Name: count, dtype: int64

In [5]:
def compute_pcc(data=None):
    """Return the proportional chance criterion (PCC) of the `label` column.

    The PCC is the sum of the squared class proportions, ``sum(p_i ** 2)``,
    where ``p_i`` is the share of rows whose label is class ``i``.  Rows with
    a null `label` are excluded.  The aggregation runs in Spark; only the
    per-class counts (one row per distinct label) are brought to the driver,
    and the arithmetic uses plain Python, so pandas/numpy are not required.

    Parameters
    ----------
    data : DataFrame, optional
        Spark DataFrame with a `label` column.  When omitted, the
        notebook-level ``df`` is used, which keeps the original zero-argument
        call ``compute_pcc()`` working.

    Returns
    -------
    float
        The PCC, a value in (0, 1].

    Raises
    ------
    ValueError
        If there is no row with a non-null label, since the PCC is undefined.
    """
    if data is None:
        data = df  # fall back to the notebook-level DataFrame

    per_class_rows = (
        data.filter("label IS NOT NULL")  # SQL-expression form of the null check
            .groupBy("label")
            .count()
            .collect()
    )
    # Index by name: a Row is a tuple, so `row.count` would be tuple.count.
    class_sizes = [row["count"] for row in per_class_rows]

    total = sum(class_sizes)
    if total == 0:
        raise ValueError("Cannot compute PCC: no rows with a non-null label.")

    return sum((size / total) ** 2 for size in class_sizes)

In [6]:
from numpy.testing import assert_almost_equal
# Compute the PCC of `label`; the value is printed in the following cell.
pcc = compute_pcc()

                                                                                

In [7]:
# Report the PCC formatted to four decimal places.
print(f"Proportional Chance Criterion (PCC): {pcc:.4f}")

Proportional Chance Criterion (PCC): 0.5556


## Problem 2b [15 pts]

Using `question` as a feature, create a trained MLlib supervised ML model for predicting `label` that uses hashing as a preprocessing step. Test accuracy should be at least 1.25 * PCC. Make sure that the steps are clearly documented and the value of test accuracy is explicitly stated.

In [8]:
from pyspark.ml.feature import Tokenizer, HashingTF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import col
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder

# Experiment settings, gathered in one place so they are easy to find and tune.
NUM_HASH_FEATURES = 10000      # width of the hashed term-frequency vector
LR_MAX_ITER = 20               # iteration cap for logistic regression
SPLIT_WEIGHTS = [0.8, 0.2]     # train / test proportions
SPLIT_SEED = 42                # fixed seed for the random split
REQUIRED_LIFT = 1.25           # test accuracy must reach REQUIRED_LIFT * PCC

# Step 1: Tokenize the question (lower-cases and splits on whitespace)
tokenizer = Tokenizer(inputCol="question", outputCol="tokens")

# Step 2: HashingTF maps each token to one of NUM_HASH_FEATURES buckets via the
# hashing trick and emits a term-frequency vector (the required hashing step)
hashing_tf = HashingTF(
    inputCol="tokens", outputCol="features", numFeatures=NUM_HASH_FEATURES
)

# Step 3: Logistic regression classifier on the hashed features
lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=LR_MAX_ITER)

# Step 4: Assemble pipeline so the same preprocessing is applied at fit and
# at prediction time
pipeline = Pipeline(stages=[tokenizer, hashing_tf, lr])

# Step 5: Train/Test split
train_df, test_df = df.randomSplit(SPLIT_WEIGHTS, seed=SPLIT_SEED)

# Step 6: Fit the model on the training portion only
model = pipeline.fit(train_df)

# Step 7: Predict on the held-out test portion
predictions = model.transform(test_df)

# Step 8: Evaluate accuracy on the test predictions
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy"
)
test_accuracy = evaluator.evaluate(predictions)

# Step 9: Compute PCC and the accuracy threshold derived from it
pcc = compute_pcc()
accuracy_threshold = REQUIRED_LIFT * pcc

# Step 10: Print and assert accuracy threshold
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"PCC: {pcc:.4f}")
print(f"1.25 * PCC: {accuracy_threshold:.4f}")

assert test_accuracy >= accuracy_threshold, "Accuracy is below the required 1.25 * PCC threshold."

25/06/07 16:55:16 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS

Test Accuracy: 0.7776
PCC: 0.5556
1.25 * PCC: 0.6945


                                                                                