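## Text Classification with Naive Bayes

This notebook builds a PySpark ML pipeline (Tokenizer → HashingTF → IDF → NaiveBayes) on a small in-line example dataset and then scores two held-out sentences with the fitted model.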

In [0]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.ml.classification import NaiveBayes

In [0]:
# Obtain the Spark session. On Databricks a session is already running,
# so this simply hands back the existing one.
spark = (
    SparkSession.builder
    .appName("TextClassification")
    .getOrCreate()
)

# Toy training set of (label, text) rows: label 0.0 marks the sentences
# about Spark / big data, label 1.0 the sentences about ML / AI.
data = spark.createDataFrame(
    [
        (0.0, "spark is amazing"),
        (1.0, "machine learning is cool"),
        (0.0, "spark is fast"),
        (1.0, "deep learning with neural networks"),
        (0.0, "big data with spark"),
        (1.0, "AI and ML are related"),
    ],
    schema=["label", "text"],
)

# Stage 1 - Tokenizer: reads the "text" column and writes the split
# words to a new "words" column.
tokenizer = Tokenizer(
    inputCol="text",
    outputCol="words",
)

# Stage 2 - HashingTF: hashes each word into one of `numFeatures` buckets
# and emits a fixed-length term-frequency vector in "rawFeatures".
# NOTE(review): 20 buckets is very small, so distinct words can share a
# bucket; kept as-is so the recorded predictions remain reproducible.
hashingTF = HashingTF(
    inputCol="words",
    outputCol="rawFeatures",
    numFeatures=20,
)

# Stage 3 - IDF: rescales the raw term frequencies into TF-IDF weights,
# written to "features" (the column the classifier consumes).
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)



In [0]:
# Classifier stage: Naive Bayes trained on the TF-IDF vectors in "features"
# against the numeric "label" column. Every other parameter is left at the
# library default.
nb = NaiveBayes(
    labelCol="label",
    featuresCol="features",
)



In [0]:
# Chain the stages in execution order; each stage reads the column
# produced by the one before it.
pipeline = Pipeline(
    stages=[
        tokenizer,   # text        -> words
        hashingTF,   # words       -> rawFeatures
        idf,         # rawFeatures -> features
        nb,          # features    -> prediction / probability
    ]
)


In [0]:

# Train all pipeline stages on the example data
model = pipeline.fit(data)

# Two held-out sentences, each paired with its expected label
test = spark.createDataFrame(
    [
        (0.0, "spark is great for big data"),
        (1.0, "neural networks and deep learning are powerful"),
    ],
    schema=["label", "text"],
)

# Run the held-out rows through the fitted pipeline
predictions = model.transform(test)

# Print the input text, true label, predicted label and per-class
# probabilities; truncate=False keeps the long cells fully visible.
(
    predictions
    .select("text", "label", "prediction", "probability")
    .show(truncate=False)
)

+----------------------------------------------+-----+----------+-----------------------------------------+
|text                                          |label|prediction|probability                              |
+----------------------------------------------+-----+----------+-----------------------------------------+
|spark is great for big data                   |0.0  |0.0       |[0.9616138566524584,0.03838614334754158] |
|neural networks and deep learning are powerful|1.0  |1.0       |[0.029304485312308653,0.9706955146876913]|
+----------------------------------------------+-----+----------+-----------------------------------------+

