# MACHINE LEARNING

## SETUP

In [2]:
# Setup - Run only once per Kernel App
%conda install openjdk -y

# install PySpark
%pip install pyspark==3.4.0

# install spark-nlp
%pip install spark-nlp==5.1.3

# restart kernel
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

Retrieving notices: ...working... done
Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 23.3.1
  latest version: 23.10.0

Please update conda by running

    $ conda update -n base -c defaults conda

Or to minimize the number of packages updated during conda update use

     conda install conda=23.10.0



## Package Plan ##

  environment location: /opt/conda

  added / updated specs:
    - openjdk


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2023.08.22 |       h06a4308_0         123 KB
    certifi-2023.11.17         |  py310h06a4308_0         158 KB
    openjdk-11.0.13            |       h87a67e3_0       341.0 MB
    ------------------------------------------------------------
                                           Total:       341.3 MB

The following NEW packages will be INSTALLED:

  openjdk            pk

## MODEL 1

In [4]:
# Build (or reuse) the local Spark session used by every later cell
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Spark NLP")
    .master("local[*]")
    # Memory / core sizing
    .config("spark.driver.memory", "16G")
    .config("spark.executor.memory", "12g")
    .config("spark.executor.cores", "3")
    # Jars resolved at start-up: Spark NLP and the hadoop-aws (S3A) connector
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.3,org.apache.hadoop:hadoop-aws:3.2.2")
    # Credentials provider class used for s3a:// paths
    .config(
        "fs.s3a.aws.credentials.provider",
        "com.amazonaws.auth.ContainerCredentialsProvider",
    )
    .getOrCreate()
)

print(spark.version)

3.4.0


In [12]:
import sagemaker
from pyspark.sql.functions import lower, regexp_replace, col, concat_ws
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline
from sparknlp.base import Finisher, DocumentAssembler
from pyspark.sql.functions import length

### Import Data

In [19]:
%%time
bucket = "project-group34"
session = sagemaker.Session()
output_prefix_data_submissions = "project/submissions/yyyy=*"
s3_path = f"s3a://{bucket}/{output_prefix_data_submissions}"
print(f"reading comments from {s3_path}")
submissions = spark.read.parquet(s3_path, header=True)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
reading comments from s3a://project-group34/project/submissions/yyyy=*
CPU times: user 27.6 ms, sys: 0 ns, total: 27.6 ms
Wall time: 968 ms


In [7]:
# Print the column names and types of the loaded submissions DataFrame
submissions.printSchema()

root
 |-- adserver_click_url: string (nullable = true)
 |-- adserver_imp_pixel: string (nullable = true)
 |-- archived: boolean (nullable = true)
 |-- author: string (nullable = true)
 |-- author_cakeday: boolean (nullable = true)
 |-- author_flair_css_class: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- author_id: string (nullable = true)
 |-- brand_safe: boolean (nullable = true)
 |-- contest_mode: boolean (nullable = true)
 |-- created_utc: timestamp (nullable = true)
 |-- crosspost_parent: string (nullable = true)
 |-- crosspost_parent_list: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- approved_at_utc: string (nullable = true)
 |    |    |-- approved_by: string (nullable = true)
 |    |    |-- archived: boolean (nullable = true)
 |    |    |-- author: string (nullable = true)
 |    |    |-- author_flair_css_class: string (nullable = true)
 |    |    |-- author_flair_text: string (nullable = true)
 |    |    

### Data Processing

In [8]:
# Report the dataset dimensions; count() runs a Spark job over the data
n_rows = submissions.count()
n_cols = len(submissions.columns)
print(f"Shape of the submissions data is: {n_rows} x {n_cols}")



Shape of the submissions data is: 875969 x 68


                                                                                

In [20]:
# Keep only the columns needed for feature engineering
kept_columns = [
    "subreddit", "title", "selftext", "score", "num_comments",
    "over_18", "is_self", "is_video", "domain", "created_utc",
    "author", "author_flair_text", "media",
]
submissions = submissions.select(*kept_columns)

In [21]:
# post_length = number of characters in the title plus number of characters in selftext.
# NOTE(review): a null title or selftext would presumably make post_length null -
# the missing-value check further down reports none, but confirm for new data.
submissions = submissions.withColumn('post_length', length(submissions.title) + length(submissions.selftext))

In [22]:
from pyspark.sql import functions as F

# Derive calendar features from the post timestamp and flag posts carrying media.
# dayofweek() numbers the days 1 (Sunday) to 7 (Saturday); the CASE expression
# turns that number into the day name.
submissions = (
    submissions
    .withColumn('created_utc', F.to_timestamp('created_utc'))
    .withColumn('hour_of_day', F.hour('created_utc'))
    .withColumn('day_of_week', F.dayofweek('created_utc'))
    .withColumn('day_of_week_str', F.expr("""
    CASE day_of_week 
        WHEN 1 THEN 'Sunday'
        WHEN 2 THEN 'Monday'
        WHEN 3 THEN 'Tuesday'
        WHEN 4 THEN 'Wednesday'
        WHEN 5 THEN 'Thursday'
        WHEN 6 THEN 'Friday'
        WHEN 7 THEN 'Saturday'
    END
"""))
    .withColumn('day_of_month', F.dayofmonth('created_utc'))
    .withColumn('month', F.month('created_utc'))
    .withColumn('year', F.year('created_utc'))
    # True when the `media` column is populated
    .withColumn('has_media', F.col('media').isNotNull())
)

In [23]:
# `media` has been summarised into `has_media`; the raw column is dropped
submissions = submissions.drop("media")

In [26]:
# Final column set and order: label, text, post attributes, then time features
model_columns = [
    'subreddit',
    'title', 'selftext',
    'score', 'num_comments',
    'over_18', 'is_self', 'is_video',
    'domain',
    'post_length',
    'hour_of_day', 'day_of_week', 'day_of_week_str',
    'day_of_month', 'month', 'year',
    'has_media',
]
submissions = submissions.select(*model_columns)

In [28]:
# Join title and selftext with a single space into one text column, `body`
body_text = concat_ws(" ", "title", "selftext")
submissions = submissions.withColumn("body", body_text)


In [29]:
# The two source text columns are now merged into `body`
submissions = submissions.drop("title", "selftext")

In [30]:
# Preview the first five rows of the engineered DataFrame
submissions.show(5)

+----------+-----+------------+-------+-------+--------+---------------+-----------+-----------+-----------+---------------+------------+-----+----+---------+--------------------+
| subreddit|score|num_comments|over_18|is_self|is_video|         domain|post_length|hour_of_day|day_of_week|day_of_week_str|day_of_month|month|year|has_media|                body|
+----------+-----+------------+-------+-------+--------+---------------+-----------+-----------+-----------+---------------+------------+-----+----+---------+--------------------+
|television|    0|           9|  false|   true|   false|self.television|        605|         22|          4|      Wednesday|          27|    1|2021|    false|Is there a websit...|
|     anime|    0|           3|  false|  false|   false|      i.redd.it|         50|         22|          4|      Wednesday|          27|    1|2021|    false|Does anyone know ...|
|television|    4|          11|  false|  false|   false|   deadline.com|         86|         22|    

In [31]:
from pyspark.sql.functions import col, count, when

# One aggregate per column: `when` yields a value only for null cells and
# count() skips nulls, so each aggregate is that column's missing-value tally.
null_counters = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in submissions.columns
]
missing_vals = submissions.select(null_counters)

In [32]:
# Display the per-column null counts computed above
missing_vals.show()



+---------+-----+------------+-------+-------+--------+------+-----------+-----------+-----------+---------------+------------+-----+----+---------+----+
|subreddit|score|num_comments|over_18|is_self|is_video|domain|post_length|hour_of_day|day_of_week|day_of_week_str|day_of_month|month|year|has_media|body|
+---------+-----+------------+-------+-------+--------+------+-----------+-----------+-----------+---------------+------------+-----+----+---------+----+
|        0|    0|           0|      0|      0|       0|  8002|          0|          0|          0|              0|           0|    0|   0|        0|   0|
+---------+-----+------------+-------+-------+--------+------+-----------+-----------+-----------+---------------+------------+-----+----+---------+----+



                                                                                

In [33]:
# Discard rows whose `domain` is null
submissions = submissions.dropna(subset=["domain"])

In [34]:
from pyspark.sql.functions import lower, regexp_replace

# Lower-case the text so later token matching is case-insensitive
submissions = submissions.withColumn("body", lower(col("body")))

# Replace newline characters with spaces
submissions = submissions.withColumn("body", regexp_replace(col("body"), "\n", " "))

# Remove every character that is not an ASCII letter, digit or whitespace.
# The pattern is a raw string so that `\s` reaches the regex engine as written;
# in a normal literal `\s` is an invalid Python escape and triggers a warning.
submissions = submissions.withColumn("body", regexp_replace(col("body"), r"[^a-zA-Z0-9\s]", ""))


In [35]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF

# Text feature stages: body -> words -> filtered_words -> rawFeatures -> features
tokenizer = Tokenizer(inputCol="body", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
hashingTF = HashingTF(inputCol="filtered_words", outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="features")

# Apply the stateless stages one after another
tokenized_df = tokenizer.transform(submissions)
df_no_stopwords = remover.transform(tokenized_df)
featurizedData = hashingTF.transform(df_no_stopwords)

# IDF must be fitted before it can rescale the term-frequency vectors.
# NOTE(review): the fit uses the full dataset, which is only split into
# train/test/validation further down - the IDF weights therefore see the
# held-out rows. Consider fitting inside the training pipeline instead.
idf_model = idf.fit(featurizedData)
rescaledData = idf_model.transform(featurizedData)


                                                                                

In [39]:
# Drop the day-name column and the intermediate text-processing columns
rescaledData = rescaledData.drop("day_of_week_str", "words", "filtered_words", "rawFeatures")

In [40]:
# Check the schema after the intermediate columns were dropped
rescaledData.printSchema()

root
 |-- subreddit: string (nullable = true)
 |-- score: long (nullable = true)
 |-- num_comments: long (nullable = true)
 |-- over_18: boolean (nullable = true)
 |-- is_self: boolean (nullable = true)
 |-- is_video: boolean (nullable = true)
 |-- domain: string (nullable = true)
 |-- post_length: integer (nullable = true)
 |-- hour_of_day: integer (nullable = true)
 |-- day_of_week: integer (nullable = true)
 |-- day_of_month: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- has_media: boolean (nullable = false)
 |-- body: string (nullable = false)
 |-- features: vector (nullable = true)



In [41]:
# Preview the first rows, including the `features` vector column
rescaledData.show()

23/11/27 18:07:59 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB


+----------+-----+------------+-------+-------+--------+---------------+-----------+-----------+-----------+------------+-----+----+---------+--------------------+--------------------+
| subreddit|score|num_comments|over_18|is_self|is_video|         domain|post_length|hour_of_day|day_of_week|day_of_month|month|year|has_media|                body|            features|
+----------+-----+------------+-------+-------+--------+---------------+-----------+-----------+-----------+------------+-----+----+---------+--------------------+--------------------+
|television|    0|           9|  false|   true|   false|self.television|        605|         22|          4|          27|    1|2021|    false|is there a websit...|(262144,[1546,158...|
|     anime|    0|           3|  false|  false|   false|      i.redd.it|         50|         22|          4|          27|    1|2021|    false|does anyone know ...|(262144,[101370,1...|
|television|    4|          11|  false|  false|   false|   deadline.com|   

In [38]:
# Print the `features` vector of a single row without truncation
rescaledData.select("features").show(1, truncate=False)

23/11/27 18:02:36 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB


+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

### ML Preprocessing

In [62]:
from pyspark.sql.functions import col

# Cast the four boolean flag columns to strings; they are passed to
# StringIndexer stages later in the notebook.
for flag_column in ("over_18", "is_self", "is_video", "has_media"):
    rescaledData = rescaledData.withColumn(flag_column, col(flag_column).cast("string"))


In [59]:
# Seeded 80 / 18 / 2 split into training, testing and validation sets
train_data, test_data, val_data = rescaledData.randomSplit([0.8, 0.18, 0.02], seed=1220)

# Mark the training split for caching before the first action. Without this,
# the count below and every pass of the later pipeline fit recompute the whole
# lineage (S3 read, cleaning, tokenising, TF-IDF) from scratch.
train_data.cache()

# Print the number of records in each dataset (the first count fills the cache)
print(f"Number of training records: {train_data.count()}")
print(f"Number of testing records: {test_data.count()}")
print(f"Number of validation records: {val_data.count()}")

23/11/27 19:01:14 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/11/27 19:04:59 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB


Number of training records: 694195


23/11/27 19:08:27 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB


Number of testing records: 156531




Number of validation records: 17241


                                                                                

DataFrame[subreddit: string, score: bigint, num_comments: bigint, over_18: boolean, is_self: boolean, is_video: boolean, domain: string, post_length: int, hour_of_day: int, day_of_week: int, day_of_month: int, month: int, year: int, has_media: boolean, body: string, features: vector]

In [45]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, IndexToString, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier, MultilayerPerceptronClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml import Pipeline, Model

In [66]:
def _make_indexer(column):
    """Return a StringIndexer that reads `column` and writes `<column>_ix`."""
    return StringIndexer(inputCol=column, outputCol=f"{column}_ix")


# One indexer per categorical input, plus one for the label column (subreddit)
stringIndexer_over_18 = _make_indexer("over_18")
stringIndexer_is_self = _make_indexer("is_self")
stringIndexer_is_video = _make_indexer("is_video")
stringIndexer_has_media = _make_indexer("has_media")
stringIndexer_subreddit = _make_indexer("subreddit")

In [47]:
# Fit the subreddit StringIndexer on the submissions DataFrame so that the
# string labels it learned can be inspected.
stringIndexerModel_subreddit = stringIndexer_subreddit.fit(submissions)

# Learned labels; a label's position in this list is its numeric index
labels = stringIndexerModel_subreddit.labels

                                                                                

In [48]:
# Show the subreddit labels learned by the indexer
print(labels)

['anime', 'movies', 'television']


In [67]:
def _make_onehot(name):
    """Return a OneHotEncoder that reads `<name>_ix` and writes `<name>_vec`."""
    return OneHotEncoder(inputCol=f"{name}_ix", outputCol=f"{name}_vec")


# One encoder per indexed flag column
onehot_over_18 = _make_onehot("over_18")
onehot_is_self = _make_onehot("is_self")
onehot_is_video = _make_onehot("is_video")
onehot_has_media = _make_onehot("has_media")

In [68]:
# Confirm the flag columns are now strings before the pipeline is assembled
rescaledData.printSchema()

root
 |-- subreddit: string (nullable = true)
 |-- score: long (nullable = true)
 |-- num_comments: long (nullable = true)
 |-- over_18: string (nullable = true)
 |-- is_self: string (nullable = true)
 |-- is_video: string (nullable = true)
 |-- domain: string (nullable = true)
 |-- post_length: integer (nullable = true)
 |-- hour_of_day: integer (nullable = true)
 |-- day_of_week: integer (nullable = true)
 |-- day_of_month: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- has_media: string (nullable = false)
 |-- body: string (nullable = false)
 |-- features: vector (nullable = true)



In [69]:
# Vector-valued inputs: the TF-IDF text vector and the one-hot encoded flags
vector_inputs = ["features", "over_18_vec", "is_self_vec", "is_video_vec", "has_media_vec"]
# Plain numeric inputs
numeric_inputs = ["score", "num_comments", "post_length", "hour_of_day",
                  "day_of_week", "day_of_month", "month", "year"]

# Concatenate everything, in this order, into one model input vector
vectorAssembler_features = VectorAssembler(
    inputCols=vector_inputs + numeric_inputs,
    outputCol="combined_features",
)


In [70]:
# Random forest with 30 trees predicting the indexed subreddit label from the
# assembled feature vector. The seed is fixed (same value as the data split)
# so that the trained model is reproducible across kernel restarts.
rf_classifier = RandomForestClassifier(
    labelCol="subreddit_ix",
    featuresCol="combined_features",
    numTrees=30,
    seed=1220,
)

In [71]:
# Translate numeric predictions back into subreddit names.
# The labels come from the StringIndexer model fitted earlier in the notebook,
# so no extra pass over the full dataset is needed just to read them.
# NOTE(review): the pipeline refits its own subreddit indexer on the training
# split; this assumes it learns the same label order - verify after fitting.
labelConverter = IndexToString(inputCol="prediction",
                               outputCol="predictedSubreddit",
                               labels=stringIndexerModel_subreddit.labels)

                                                                                

In [72]:
# Stage groups, in execution order
indexing_stages = [
    stringIndexer_subreddit,
    stringIndexer_over_18,
    stringIndexer_is_self,
    stringIndexer_is_video,
    stringIndexer_has_media,
]
encoding_stages = [
    onehot_over_18,
    onehot_is_self,
    onehot_is_video,
    onehot_has_media,
]
modelling_stages = [vectorAssembler_features, rf_classifier, labelConverter]

# index -> one-hot encode -> assemble -> classify -> convert prediction to label
pipeline = Pipeline(stages=indexing_stages + encoding_stages + modelling_stages)


In [73]:
# Check the training split's schema before fitting the pipeline on it
train_data.printSchema()

root
 |-- subreddit: string (nullable = true)
 |-- score: long (nullable = true)
 |-- num_comments: long (nullable = true)
 |-- over_18: string (nullable = true)
 |-- is_self: string (nullable = true)
 |-- is_video: string (nullable = true)
 |-- domain: string (nullable = true)
 |-- post_length: integer (nullable = true)
 |-- hour_of_day: integer (nullable = true)
 |-- day_of_week: integer (nullable = true)
 |-- day_of_month: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- has_media: string (nullable = false)
 |-- body: string (nullable = false)
 |-- features: vector (nullable = true)



In [74]:
# Fit all pipeline stages (indexers, encoders, assembler, random forest) on the training split.
# NOTE(review): the output recorded under this cell reports a missing `subreddit_vec`
# column, yet the assembler defined in this notebook does not list that column -
# presumably stale output from an earlier version of a cell; re-run to confirm.
rf_model = pipeline.fit(train_data)

23/11/27 19:24:13 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/11/27 19:27:43 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/11/27 19:27:43 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/11/27 19:31:05 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/11/27 19:31:05 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/11/27 19:34:32 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/11/27 19:34:33 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/11/27 19:38:00 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/11/27 19:38:01 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
23/11/27 19:41:17 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
                                                                                

IllegalArgumentException: subreddit_vec does not exist. Available: subreddit, score, num_comments, over_18, is_self, is_video, domain, post_length, hour_of_day, day_of_week, day_of_month, month, year, has_media, body, features, subreddit_ix, over_18_ix, is_self_ix, is_video_ix, has_media_ix, over_18_vec, is_self_vec, is_video_vec, has_media_vec

In [None]:
# Apply the fitted pipeline model to the training split.
# NOTE(review): these are predictions on the rows the model was trained on;
# presumably test_data / val_data are scored elsewhere - confirm.
transformed_train_data = rf_model.transform(train_data)

In [None]:
# Display the first rows of the transformed training data
transformed_train_data.show()