In [25]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.types import ArrayType, StringType
from pyspark.ml.feature import CountVectorizer, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [26]:
spark = SparkSession.builder.appName("TextClassification")
    .getOrCreate()

IndentationError: unexpected indent (591800348.py, line 2)

In [27]:
# Sample data (text, label)
data = [
    ("Cats are playing in the garden", "animal"),
    ("Dogs are barking loudly", "animal"),
    ("Stocks rose in early trading", "finance"),
    ("Investors are watching markets closely", "finance"),
    ("I love reading books", "hobby"),
    ("He enjoys painting and drawing", "hobby")
]

In [28]:
df = spark.createDataFrame(data, ['text', 'label'])
df.show()

+--------------------+-------+
|                text|  label|
+--------------------+-------+
|Cats are playing ...| animal|
|Dogs are barking ...| animal|
|Stocks rose in ea...|finance|
|Investors are wat...|finance|
|I love reading books|  hobby|
|He enjoys paintin...|  hobby|
+--------------------+-------+



In [32]:
stemmer = PorterStemmer()

def stem_tokens(text):
    tokens = word_tokenize(text.lower())
    stems = [stemmer.stem(token) for token in tokens]
    return stems

### Register UDF
stem_udf = udf(stem_tokens, ArrayType(StringType()))

### Apply Stemming

df_stemmed = df.withColumn("stemmed_words" , stem_udf(col("text")))
df_stemmed.show()



+--------------------+-------+--------------------+
|                text|  label|       stemmed_words|
+--------------------+-------+--------------------+
|Cats are playing ...| animal|[cat, are, play, ...|
|Dogs are barking ...| animal|[dog, are, bark, ...|
|Stocks rose in ea...|finance|[stock, rose, in,...|
|Investors are wat...|finance|[investor, are, w...|
|I love reading books|  hobby|[i, love, read, b...|
|He enjoys paintin...|  hobby|[he, enjoy, paint...|
+--------------------+-------+--------------------+



                                                                                

In [35]:
### Featuer enginnering
cv = CountVectorizer(inputCol="stemmed_words", outputCol="features")

#### Label Encoding
indexer = StringIndexer(inputCol="label", outputCol="labelIndex")

### Classification
lr = LogisticRegression(featuresCol="features", labelCol="labelIndex", maxIter=10)



##build pipeline
pipeline = Pipeline(stages=[cv, indexer, lr])

## train model
model = pipeline.fit(df_stemmed)


25/05/24 12:24:57 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
                                                                                

In [39]:
predictions = model.transform(df_stemmed)
predictions.select("text", "label", "prediction").show(truncate=False)



+--------------------------------------+-------+----------+
|text                                  |label  |prediction|
+--------------------------------------+-------+----------+
|Cats are playing in the garden        |animal |0.0       |
|Dogs are barking loudly               |animal |0.0       |
|Stocks rose in early trading          |finance|1.0       |
|Investors are watching markets closely|finance|1.0       |
|I love reading books                  |hobby  |2.0       |
|He enjoys painting and drawing        |hobby  |2.0       |
+--------------------------------------+-------+----------+



                                                                                