#Simple text classification in pyspark

In [80]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [81]:
import pyspark

In [82]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("TextClassifierwithPySpark").getOrCreate()

In [83]:
!gdown --fuzzy https://drive.google.com/file/d/134oSNkuKlrbmkP8dPXQtA60kioMsx2IU/view?usp=share_link

Downloading...
From: https://drive.google.com/uc?id=134oSNkuKlrbmkP8dPXQtA60kioMsx2IU
To: /content/udemy_courses_clean.csv
  0% 0.00/870k [00:00<?, ?B/s]100% 870k/870k [00:00<00:00, 145MB/s]


In [84]:
df = spark.read.csv("/content/udemy_courses_clean.csv",header=True,inferSchema=True)
df.show()

+---+---------+--------------------+--------------------+-------+-----+---------------+-----------+------------+------------------+----------------+--------------------+----------------+--------------------+
|_c0|course_id|        course_title|                 url|is_paid|price|num_subscribers|num_reviews|num_lectures|             level|content_duration| published_timestamp|         subject|  clean_course_title|
+---+---------+--------------------+--------------------+-------+-----+---------------+-----------+------------+------------------+----------------+--------------------+----------------+--------------------+
|  0|  1070968|Ultimate Investme...|https://www.udemy...|   True|  200|           2147|         23|          51|        All Levels|       1.5 hours|2017-01-18T20:58:58Z|Business Finance|Ultimate Investme...|
|  1|  1113822|Complete GST Cour...|https://www.udemy...|   True|   75|           2792|        923|         274|        All Levels|        39 hours|2017-03-09T16:34:20Z

our target is subject column

In [85]:
df = df.select('course_title','subject')

In [86]:
df.show()

+--------------------+----------------+
|        course_title|         subject|
+--------------------+----------------+
|Ultimate Investme...|Business Finance|
|Complete GST Cour...|Business Finance|
|Financial Modelin...|Business Finance|
|Beginner to Pro -...|Business Finance|
|How To Maximize Y...|Business Finance|
|Trading Penny Sto...|Business Finance|
|Investing And Tra...|Business Finance|
|Trading Stock Cha...|Business Finance|
|Options Trading 3...|Business Finance|
|The Only Investme...|Business Finance|
|Forex Trading Sec...|Business Finance|
|Trading Options W...|Business Finance|
|Financial Managem...|Business Finance|
|Forex Trading Cou...|Business Finance|
|Python Algo Tradi...|Business Finance|
|Short Selling: Le...|Business Finance|
|Basic Technical A...|Business Finance|
|The Complete Char...|Business Finance|
|7 Deadly Mistakes...|Business Finance|
|Financial Stateme...|Business Finance|
+--------------------+----------------+
only showing top 20 rows



In [87]:
df.groupBy("subject").count().show()

+--------------------+-----+
|             subject|count|
+--------------------+-----+
|play Electric Gui...|    1|
|Multiply returns ...|    1|
|                null|    6|
|    Business Finance| 1198|
|Introduction Guit...|    1|
|Learn Play Fernan...|    1|
|      Graphic Design|  603|
|Aprende tocar el ...|    1|
|     Web Development| 1200|
|Learn Classical G...|    1|
| Musical Instruments|  676|
+--------------------+-----+



In [88]:
df.toPandas()['subject'].isnull().sum()

6

In [89]:
df = df.dropna(subset = ("subject"))

Feature Extraction

In [90]:
from pyspark.ml.feature import Tokenizer,StopWordsRemover,CountVectorizer,IDF
from pyspark.ml.feature import StringIndexer

In [91]:
tokenizer = Tokenizer(inputCol = "course_title", outputCol = "course_tokens")
stopwords_remover = StopWordsRemover(inputCol='course_tokens',outputCol='removed_tokens')
vectorizer = CountVectorizer(inputCol='removed_tokens',outputCol='count_features')
idf = IDF(inputCol='count_features',outputCol='vectorized_features')

In [92]:
labelEncoder = StringIndexer(inputCol='subject',outputCol='label').fit(df)

In [93]:
label_dict = {'Web Development':0.0,
 'Business Finance':1.0,
 'Musical Instruments':2.0,
 'Graphic Design':3.0}

In [94]:
labelEncoder.transform(df).show(5)


+--------------------+----------------+-----+
|        course_title|         subject|label|
+--------------------+----------------+-----+
|Ultimate Investme...|Business Finance|  1.0|
|Complete GST Cour...|Business Finance|  1.0|
|Financial Modelin...|Business Finance|  1.0|
|Beginner to Pro -...|Business Finance|  1.0|
|How To Maximize Y...|Business Finance|  1.0|
+--------------------+----------------+-----+
only showing top 5 rows



In [95]:
df = labelEncoder.transform(df)

In [96]:
train,test = df.randomSplit((0.7,0.3),seed=36)

In [97]:
from pyspark.ml.classification import LogisticRegression
logreg = LogisticRegression(featuresCol='vectorized_features',labelCol='label')

Lets build pipeline

In [98]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[tokenizer,stopwords_remover,vectorizer,idf, logreg])

In [99]:
model = pipeline.fit(train)

In [102]:
predictions = model.transform(test)

In [103]:
predictions.columns

['course_title',
 'subject',
 'label',
 'course_tokens',
 'removed_tokens',
 'count_features',
 'vectorized_features',
 'rawPrediction',
 'probability',
 'prediction']

In [104]:
predictions.select('rawPrediction','probability','subject','label','prediction').show(10)

+--------------------+--------------------+-------------------+-----+----------+
|       rawPrediction|         probability|            subject|label|prediction|
+--------------------+--------------------+-------------------+-----+----------+
|[-9.3258336335251...|[1.14287965839483...|Musical Instruments|  2.0|       2.0|
|[-10.486766865887...|[4.77381253460093...|Musical Instruments|  2.0|       2.0|
|[-0.5938203645487...|[2.29480525645599...|   Business Finance|  1.0|       1.0|
|[-7.2137634002143...|[1.76553600884570...|Musical Instruments|  2.0|       1.0|
|[-12.614757376412...|[7.23675199517615...|Musical Instruments|  2.0|       2.0|
|[32.0854736051830...|[0.99999999999999...|    Web Development|  0.0|       0.0|
|[-0.0323419718631...|[1.38129177988177...|   Business Finance|  1.0|       1.0|
|[-7.3974249564886...|[2.45828508847594...|   Business Finance|  1.0|       1.0|
|[7.74873830431251...|[0.88418654797933...|     Graphic Design|  3.0|       0.0|
|[8.64465012986057...|[1.557

Evaluation

In [105]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol='label',predictionCol='prediction',metricName='accuracy')
accuracy = evaluator.evaluate(predictions)
accuracy

0.9387568555758684

In [106]:
from pyspark.mllib.evaluation import MulticlassMetrics
model_metric= MulticlassMetrics(predictions['label','prediction'].rdd)



In [107]:
print("Accuracy:",model_metric.accuracy)
print("Precision:",model_metric.precision(1.0))
print("Recall:",model_metric.recall(1.0))
print("F1Score:",model_metric.fMeasure(1.0))

Accuracy: 0.9387568555758684
Precision: 0.9811827956989247
Recall: 0.8924205378973105
F1Score: 0.9346991037131882
