In [None]:
bucket_name = "sk2224-projectdata"
!aws s3 mb s3://{bucket_name}

In [None]:
# Setup - Run only once per Kernel App
%conda install openjdk -y

# install PySpark
%pip install pyspark==3.4.0

# install spark-nlp
%pip install spark-nlp==5.1.3

# restart kernel
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

### Starting Spark Session

In [2]:
# Import pyspark and build Spark session
from pyspark.sql import SparkSession

# Import pyspark and build Spark session
spark = SparkSession.builder \
    .appName("Spark NLP")\
    .master("local[*]")\
    .config("spark.driver.memory","16G")\
    .config("spark.driver.maxResultSize", "0") \
    .config("spark.kryoserializer.buffer.max", "2000M")\
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.3,org.apache.hadoop:hadoop-aws:3.2.2")\
    .config(
            "fs.s3a.aws.credentials.provider",
            "com.amazonaws.auth.ContainerCredentialsProvider",
    )\
    .getOrCreate()



:: loading settings :: url = jar:file:/opt/conda/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-44e4036b-aa68-4a54-9d07-ccdb0aeaabac;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;5.1.3 in central
	found com.typesafe#config;1.4.2 in central
	found org.rocksdb#rocksdbjni;6.29.5 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.828 in central
	found com.github.universal-automata#liblevenshtein;3.0.0 in central
	found com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central
	found com.google.protobuf#protobuf-java;3.0.0-beta-3 in central
	found com.google.code.gson#gson;2.3 in central
	found it.unimi.dsi#fastutil;7.0.12 in central
	found org.projectlombok#lombok;1.16.8 in central
	found com.google.cloud#google-cloud-storage;2.20.1 in central
	found com.google.guava#guava;31.1-jre in c

### Importing necessary Libraries

In [3]:
import sagemaker
from pyspark.sql.functions import lower, regexp_replace, col, concat_ws
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline
from sparknlp.base import Finisher, DocumentAssembler

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


### Reading Sumbmissions Data

In [4]:
%%time
bucket = "sk2224-projectdata"
session = sagemaker.Session()
output_prefix_data_comments = "submissions/suggestions/yyyy=*"
s3_path = f"s3a://{bucket}/{output_prefix_data_comments}"
print(f"reading submissions from {s3_path}")
submissions = spark.read.parquet(s3_path, header=True)


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
reading submissions from s3a://sk2224-projectdata/submissions/suggestions/yyyy=*


23/11/21 01:33:53 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

CPU times: user 267 ms, sys: 15.4 ms, total: 283 ms
Wall time: 8.51 s


23/11/21 01:34:00 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [5]:
submissions.printSchema()

root
 |-- adserver_click_url: string (nullable = true)
 |-- adserver_imp_pixel: string (nullable = true)
 |-- archived: boolean (nullable = true)
 |-- author: string (nullable = true)
 |-- author_cakeday: boolean (nullable = true)
 |-- author_flair_css_class: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- author_id: string (nullable = true)
 |-- brand_safe: boolean (nullable = true)
 |-- contest_mode: boolean (nullable = true)
 |-- created_utc: timestamp (nullable = true)
 |-- crosspost_parent: string (nullable = true)
 |-- crosspost_parent_list: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- approved_at_utc: string (nullable = true)
 |    |    |-- approved_by: string (nullable = true)
 |    |    |-- archived: boolean (nullable = true)
 |    |    |-- author: string (nullable = true)
 |    |    |-- author_flair_css_class: string (nullable = true)
 |    |    |-- author_flair_text: string (nullable = true)
 |    |    

#### Removing unwanted rows, and columns

In [6]:
# Filter out rows where 'text' or 'author' is '[deleted]'
submissions_filtered = submissions.filter((submissions.selftext != '[deleted]') & (submissions.selftext != '[removed]') & (submissions.author != '[deleted]') & (submissions.author != '[removed]')& (submissions.title != '[deleted]') &  (submissions.title != '[removed]')) 

# Show the filtered DataFrame
submissions_filtered = submissions_filtered.select("subreddit", "author", "title", "selftext",
                             "created_utc", "num_comments", "score", 
                             "over_18", "media", "pinned", "locked", 
                             "disable_comments", "domain", "hidden", 
                             "distinguished", "hide_score")

In [7]:
submissions_filtered.show(5)

[Stage 1:>                                                          (0 + 1) / 1]

+----------------+-----------------+--------------------+--------------------+-------------------+------------+-----+-------+-----+------+------+----------------+--------------------+------+-------------+----------+
|       subreddit|           author|               title|            selftext|        created_utc|num_comments|score|over_18|media|pinned|locked|disable_comments|              domain|hidden|distinguished|hide_score|
+----------------+-----------------+--------------------+--------------------+-------------------+------------+-----+-------+-----+------+------+----------------+--------------------+------+-------------+----------+
|    Animesuggest|      RektoriusYT|Never watched mec...|So basically for ...|2021-02-18 15:46:14|          12|    3|  false| null| false| false|            null|   self.Animesuggest| false|         null|     false|
|    Animesuggest|           bff_op|Anime like Highsc...|Hello I need sugg...|2021-02-18 15:50:06|           6|    1|  false| null| fals

                                                                                

#### Combining 'title' and 'selftext' columns into a new column 'RedditText'

In [8]:
from pyspark.sql.functions import concat_ws

# Combine 'title' and 'selftext' columns into a new column 'RedditText'
submissions_combined = submissions_filtered.withColumn(
    "RedditText", concat_ws(" ", "title", "selftext")
)

In [9]:
# Show the first 5 rows of the DataFrame with the new column
submissions_combined.select("subreddit", "author", "RedditText", "created_utc", "num_comments", "score").show(5)

+----------------+-----------------+--------------------+-------------------+------------+-----+
|       subreddit|           author|          RedditText|        created_utc|num_comments|score|
+----------------+-----------------+--------------------+-------------------+------------+-----+
|    Animesuggest|      RektoriusYT|Never watched mec...|2021-02-18 15:46:14|          12|    3|
|    Animesuggest|           bff_op|Anime like Highsc...|2021-02-18 15:50:06|           6|    1|
|MovieSuggestions|    scottymac0707|Blockbusters Drop...|2021-02-18 15:50:53|           8|    3|
|MovieSuggestions|       stone78221|Family Movies lik...|2021-02-18 15:51:41|           4|    1|
|MovieSuggestions|Mighty_Dragon_001|Looking for movie...|2021-02-18 15:54:03|           5|    4|
+----------------+-----------------+--------------------+-------------------+------------+-----+
only showing top 5 rows



In [10]:
submissions_combined = submissions_combined.select("subreddit", "author", "RedditText", "created_utc", "num_comments", "score")

### Creating Pipeline for text Cleaning on the data

Setting up  a Spark NLP pipeline for text preprocessing:

DocumentAssembler:

Gathers input text data into a Spark DataFrame.
Input Column: "RedditText"
Output Column: "document"
Tokenizer:

Breaks down documents into individual words.
Input Column: "document"
Output Column: "token"
Normalizer:

Converts text to lowercase.
Input Column: "token"
Output Column: "normalized"
Lemmatizer:

Performs lemmatization on normalized words.
Input Column: "normalized"
Output Column: "lemma"
StopWordsCleaner:

Removes common English stopwords.
There is no library in spark nlp helce using nltk stopwords
Input Column: "lemma"
Output Column: "clean_lemma"
Finisher:

Converts processed tokens into human-readable output.
Input Column: "clean_lemma"
Pipeline:

Defines the sequence of stages in the NLP pipeline.
The resulting "clean_lemma" column contains preprocessed text ready for further analysis or machine learning tasks.

In [11]:
from sparknlp.annotator import (Tokenizer, Normalizer,
                                LemmatizerModel, StopWordsCleaner)

In [12]:
documentAssembler = DocumentAssembler()\
    .setInputCol("RedditText")\
    .setOutputCol("document")

In [13]:
# Regex Tokenizer to break words
tokenizer = Tokenizer() \
     .setInputCols(['document']) \
     .setOutputCol('token')

In [14]:
# Normalizing and setting case insensitive to be true
normalizer = Normalizer() \
     .setInputCols(['token']) \
     .setOutputCol('normalized') \
     .setLowercase(True)

In [15]:
!pip install nltk

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [16]:
import nltk
from nltk.corpus import stopwords

# Download stopwords data
nltk.download('stopwords')

# Now you can use stopwords from nltk.corpus
stopwords_list = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
# Remove Stopwords
stopwords_cleaner = StopWordsCleaner() \
     .setInputCols(['lemma']) \
     .setOutputCol('clean_lemma') \
     .setCaseSensitive(False) \
     .setStopWords(stopwords_list)

In [18]:
# finisher converts tokens to human-readable output
finisher = Finisher() \
     .setInputCols(['clean_lemma']) \
     .setCleanAnnotations(False)

In [19]:
# Lemmatizing
lemmatizer = LemmatizerModel.pretrained() \
     .setInputCols(['normalized']) \
     .setOutputCol('lemma')

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[ | ]lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
Download done! Loading the resource.
[ / ]

                                                                                

[OK!]


In [20]:
pipeline = Pipeline() \
    .setStages([
        documentAssembler,
        tokenizer,
        normalizer,
        lemmatizer,  # Add the lemmatizer stage here
        stopwords_cleaner.setInputCols(['lemma']),
        finisher
    ])


In [21]:
from pyspark.ml import PipelineModel

# Create an empty DataFrame with the same schema as your actual data
empty_df = spark.createDataFrame([], schema=submissions_combined.schema)

In [22]:
# Fit the pipeline model on the empty DataFrame
pipeline_model = pipeline.fit(empty_df)



In [23]:
# Transform your actual DataFrame using the fitted pipeline model
submission_clean = pipeline_model.transform(submissions_combined)

In [24]:
submission_clean.select(submission_clean.finished_clean_lemma).show(5)

[Stage 5:>                                                          (0 + 1) / 1]

+--------------------+
|finished_clean_lemma|
+--------------------+
|[never, watch, me...|
|[anime, like, hig...|
|[blockbuster, dro...|
|[family, movie, l...|
|[look, movie, lik...|
+--------------------+
only showing top 5 rows



                                                                                

In [25]:
submission_clean.columns

['subreddit',
 'author',
 'RedditText',
 'created_utc',
 'num_comments',
 'score',
 'document',
 'token',
 'normalized',
 'lemma',
 'clean_lemma',
 'finished_clean_lemma']

In [26]:
# Remove 'lemma', 'clean_lemma' from the DataFrame
submission_clean = submission_clean.drop('lemma', 'clean_lemma',
 'document',
 'token',
 'normalized',)

# Rename 'finished_clean_lemma' to 'clean_RedditText'
submission_clean = submission_clean.withColumnRenamed('finished_clean_lemma', 'clean_RedditText')


In [27]:
submission_clean.show(5)

+----------------+-----------------+--------------------+-------------------+------------+-----+--------------------+
|       subreddit|           author|          RedditText|        created_utc|num_comments|score|    clean_RedditText|
+----------------+-----------------+--------------------+-------------------+------------+-----+--------------------+
|    Animesuggest|      RektoriusYT|Never watched mec...|2021-02-18 15:46:14|          12|    3|[never, watch, me...|
|    Animesuggest|           bff_op|Anime like Highsc...|2021-02-18 15:50:06|           6|    1|[anime, like, hig...|
|MovieSuggestions|    scottymac0707|Blockbusters Drop...|2021-02-18 15:50:53|           8|    3|[blockbuster, dro...|
|MovieSuggestions|       stone78221|Family Movies lik...|2021-02-18 15:51:41|           4|    1|[family, movie, l...|
|MovieSuggestions|Mighty_Dragon_001|Looking for movie...|2021-02-18 15:54:03|           5|    4|[look, movie, lik...|
+----------------+-----------------+--------------------

                                                                                

### Finding the most used words in the data for Movie and Anime Subreddits

Conversion: The code first converts the array of strings in the 'clean_RedditText' column into a single string, using a space as the separator.

Split and Explode: It then splits the string into individual words and explodes the resulting array, creating a new row for each word.

Grouping and Counting: After that, it groups the DataFrame by the 'word' column and counts the occurrences of each word.

Sorting: Finally, it sorts the result by the count of occurrences in descending order.

In [35]:
# Assuming 'subreddit' is a column in your DataFrame
submission_clean_movie = submission_clean.filter(submission_clean['subreddit'] == 'MovieSuggestions')


In [36]:
submission_clean_movie.show(5)

+----------------+--------------------+--------------------+-------------------+------------+-----+--------------------+
|       subreddit|              author|          RedditText|        created_utc|num_comments|score|    clean_RedditText|
+----------------+--------------------+--------------------+-------------------+------------+-----+--------------------+
|MovieSuggestions|       scottymac0707|Blockbusters Drop...|2021-02-18 15:50:53|           8|    3|[blockbuster, dro...|
|MovieSuggestions|          stone78221|Family Movies lik...|2021-02-18 15:51:41|           4|    1|[family, movie, l...|
|MovieSuggestions|   Mighty_Dragon_001|Looking for movie...|2021-02-18 15:54:03|           5|    4|[look, movie, lik...|
|MovieSuggestions|  theRealestAintReal|Detective movies ...|2021-06-30 22:21:40|          13|    3|[detective, movie...|
|MovieSuggestions|Lazy-Paleontologist9|Larger than life ...|2021-06-30 22:24:27|          11|    2|[large, life, fil...|
+----------------+--------------

                                                                                

In [37]:
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.functions import explode, split
from pyspark.sql.functions import desc

#'clean_RedditText' is the column containing tokenized and cleaned words
words_column = 'clean_RedditText'

# Convert array of strings to a single string with space as a separator
submission_clean_movie = submission_clean_movie.withColumn('clean_text', concat_ws(' ', words_column))

# Split the words and explode the array to create a new row for each word
word_count = submission_clean_movie.select(explode(split('clean_text', ' ')).alias('word'))

# Group by word and count occurrences
word_count = word_count.groupBy('word').count()

# Sort by count in descending order
word_count = word_count.sort(desc('count'))


In [38]:
# Show the top words
word_count.show(10)



+---------+-----+
|     word|count|
+---------+-----+
|    movie|72187|
|     like|27869|
|     look|17926|
|    watch|17677|
|       im|15842|
|     film|14465|
|     good|13604|
|      see|10841|
|     love| 9366|
|something| 8858|
+---------+-----+
only showing top 10 rows



                                                                                

In [32]:
# Assuming 'subreddit' is a column in your DataFrame
submission_clean_anime = submission_clean.filter(submission_clean['subreddit'] == 'Animesuggest')


In [33]:
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.functions import explode, split
from pyspark.sql.functions import desc

#'clean_RedditText' is the column containing tokenized and cleaned words
words_column = 'clean_RedditText'

# Convert array of strings to a single string with space as a separator
submission_clean_anime = submission_clean_anime.withColumn('clean_text', concat_ws(' ', words_column))

# Split the words and explode the array to create a new row for each word
word_count_a = submission_clean_anime.select(explode(split('clean_text', ' ')).alias('word'))

# Group by word and count occurrences
word_count_a = word_count_a.groupBy('word').count()

# Sort by count in descending order
word_count_a = word_count_a.sort(desc('count'))


In [39]:
word_count_a.show(10)



+---------+-----+
|     word|count|
+---------+-----+
|    anime|73632|
|     like|45544|
|    watch|34677|
|     look|23720|
|       im|23468|
|     good|19125|
|      one|17144|
|something|16997|
|       mc|15713|
|character|14876|
+---------+-----+
only showing top 10 rows



                                                                                

Looking at the words people use the most, it seems like people on Reddit really enjoy chatting about their favorite anime and movies. Now, we're going to dig into the comments and pick out the names of the movies and anime that everyone is talking about the most. This way, we can find out which ones are super popular among the community.

### TF-IDF to find important words

In [45]:
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType
import pandas as pd

In [41]:
# Combine the tokenized words into sentences for TF-IDF vectorization
concat_udf = udf(lambda x: ' '.join(x), StringType())
submission_clean_movie = submission_clean_movie.withColumn("concatenated_text", concat_udf(submission_clean_movie["clean_RedditText"]))

# Tokenization
tokenizer = Tokenizer(inputCol="concatenated_text", outputCol="words")
submission_clean_movie = tokenizer.transform(submission_clean_movie)

In [None]:
# TF
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
submission_clean_movie = hashingTF.transform(submission_clean_movie)

# IDF
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(submission_clean_movie)
submission_clean_movie = idfModel.transform(submission_clean_movie)

                                                                                

In [43]:
idf_scores = idfModel.idf.toArray()


In [44]:
submission_clean_movie.select("features", "words").show(truncate=False)

[Stage 23:>                                                         (0 + 1) / 1]

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/daemon.py", line 193, in manager
  File "/opt/conda/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/daemon.py", line 74, in worker
  File "/opt/conda/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 874, in main
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
  File "/opt/conda/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 596, in read_int
    raise EOFError
EOFError
                                                                                

TF-IDF vectorization on a DataFrame containing cleaned and tokenized Reddit comments related to movies. The process begins by concatenating the tokenized words into a single text column using a User Defined Function (UDF). Subsequently, a Tokenizer is employed to split the concatenated text into individual words. The term frequencies of these words are then calculated using HashingTF, creating a sparse feature vector. The code further computes the Inverse Document Frequency (IDF) to weigh down common words and highlight distinctive ones. Finally, the TF-IDF features are obtained, and the resulting DataFrame displays both the TF-IDF features and the original words. This representation provides a numerical foundation for the textual data, enabling applications in natural language processing tasks such as text classification or clustering within the context of movie-related Reddit discussions. Fine-tuning parameters or preprocessing steps may be required based on the specific analytical goals and characteristics of the data.

While this approach offers valuable insights into key words and their numerical representations through TF-IDF, its utility for our specific analysis is limited. The TF-IDF vectorization is designed to capture the significance of words in individual documents relative to the entire dataset. However, in our context of extracting movie and anime names from Reddit comments, this numerical representation may not be the most effective. Instead, we will explore alternative methods to identify and extract specific movie and anime references from the comments, allowing us to delve deeper into the most mentioned or discussed titles. This decision is rooted in the recognition that TF-IDF, while beneficial for certain tasks, may not be the optimal choice for our current objective of extracting and analyzing movie and anime references from the Reddit data.