# NATURAL LANGUAGE PROCESSING

## Setup

In [27]:
# Setup - run only once per Kernel App.
# Installs a JDK (presumably the Java runtime Spark needs) before the Python packages.
%conda install openjdk -y

# Install PySpark, pinned so the notebook is reproducible
%pip install pyspark==3.4.0

# Install Spark NLP, pinned to the same version as the spark.jars.packages coordinate used below
%pip install spark-nlp==5.1.3

# Restart the kernel so the packages installed above are the ones that get imported.
# The HTML object must stay the last expression of the cell so the front end renders the script.
# NOTE(review): relies on the classic-notebook `Jupyter` JavaScript object - confirm it works in this front end.
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 23.3.1
  latest version: 23.10.0

Please update conda by running

    $ conda update -n base -c defaults conda

Or to minimize the number of packages updated during conda update use

     conda install conda=23.10.0



# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.
Collecting pyspark==3.4.0
  Using cached pyspark-3.4.0-py2.py3-none-any.whl
Collecting py4j==0.10.9.7 (from pyspark==3.4.0)
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.5/200.5 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: py4j, pyspark
  Attempting uninstall: py4j
    Found existing installation: py4j 0.10.9.2
    Uninstalling py4j-0.10.9.2:
      Successfully uninstalled py4j-0.10.9.2
  Attempting uninstall: pyspark
    Fou

In [28]:
from pyspark.sql import SparkSession

# Create (or reuse) a local Spark session that uses every core and pulls in the
# Spark NLP jar; the options are applied in the same order as before.
spark = (
    SparkSession.builder
    .appName("Spark NLP")
    .master("local[*]")
    .config("spark.driver.memory", "16G")
    .config("spark.driver.maxResultSize", "0")
    .config("spark.kryoserializer.buffer.max", "2000M")
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.3")
    .getOrCreate()
)

# Report the version of the running Spark session
print(spark.version)

3.2.0


In [37]:
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline
from sparknlp.base import Finisher, DocumentAssembler

## Import Libraries

In [30]:
import sagemaker
from pyspark.sql.functions import lower, regexp_replace, col, concat_ws
from pyspark.ml.feature import Tokenizer, StopWordsRemover

In [31]:
# Report which Spark and Spark NLP versions this kernel actually loaded
print(f"Spark version: {spark.version}")
print(f"sparknlp version: {sparknlp.version()}")

Spark version: 3.2.0
sparknlp version: 5.1.4


## Import Data

In [16]:
%%time
bucket = "project-group34"
session = sagemaker.Session()
output_prefix_data_comments = "project/comments/yyyy=*"
s3_path = f"s3a://{bucket}/{output_prefix_data_comments}"
print(f"reading comments from {s3_path}")
comments = spark.read.parquet(s3_path, header=True)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
reading comments from s3a://project-group34/project/comments/yyyy=*


23/11/20 16:06:06 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

CPU times: user 273 ms, sys: 22.2 ms, total: 295 ms
Wall time: 8.12 s


In [6]:
# Mark the DataFrame for caching so later actions can reuse it instead of re-reading S3
comments = comments.cache()

In [17]:
# Print column names, types and nullability of the comments dataset
comments.printSchema()

root
 |-- author: string (nullable = true)
 |-- author_cakeday: boolean (nullable = true)
 |-- author_flair_css_class: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- body: string (nullable = true)
 |-- can_gild: boolean (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- created_utc: timestamp (nullable = true)
 |-- distinguished: string (nullable = true)
 |-- edited: string (nullable = true)
 |-- gilded: long (nullable = true)
 |-- id: string (nullable = true)
 |-- is_submitter: boolean (nullable = true)
 |-- link_id: string (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- permalink: string (nullable = true)
 |-- retrieved_on: timestamp (nullable = true)
 |-- score: long (nullable = true)
 |-- stickied: boolean (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- subreddit_id: string (nullable = true)



In [29]:
# Preview the first rows for the subset of columns used in the rest of the notebook
comments.select("subreddit", "author", "body", "parent_id", "id", "created_utc", "score", "controversiality").show()

+----------------+---------------+--------------------+----------+-------+-------------------+-----+----------------+
|       subreddit|         author|                body| parent_id|     id|        created_utc|score|controversiality|
+----------------+---------------+--------------------+----------+-------+-------------------+-----+----------------+
|    Animesuggest|        Athenza|{Now and Then, He...| t3_m3ygv3|gqscelh|2021-03-13 10:15:52|    2|               0|
|    Animesuggest|       Roboragi|**Ima, Soko ni Ir...|t1_gqscelh|gqscf1z|2021-03-13 10:16:05|    1|               0|
|    Animesuggest|      [deleted]|           [deleted]| t3_m3vnjl|gqscjse|2021-03-13 10:18:25|    1|               0|
|MovieSuggestions|      katnip_fl|       Jacobs Ladder| t3_m3rw47|gqscl5i|2021-03-13 10:19:07|    2|               0|
|    Animesuggest|        Athenza|{Kino no Tabi: Th...| t3_m3xpu6|gqscnqz|2021-03-13 10:20:26|    1|               0|
|    Animesuggest|    Dropsoftime|Try Mahouka kouko...| 

In [18]:
# Drop comments whose text or author is the '[deleted]' placeholder, then keep
# only the columns the analysis needs.
comments_filtered = (
    comments
    .filter((comments["body"] != '[deleted]') & (comments["author"] != '[deleted]'))
    .select(
        "subreddit",
        "author",
        "body",
        "parent_id",
        "id",
        "created_utc",
        "score",
        "controversiality",
    )
)

In [9]:
# Preview the cleaned comments (rows with '[deleted]' body or author removed)
comments_filtered.show()

[Stage 1:>                                                          (0 + 1) / 1]

+----------------+---------------+--------------------+----------+-------+-------------------+-----+----------------+
|       subreddit|         author|                body| parent_id|     id|        created_utc|score|controversiality|
+----------------+---------------+--------------------+----------+-------+-------------------+-----+----------------+
|    Animesuggest|        Athenza|{Now and Then, He...| t3_m3ygv3|gqscelh|2021-03-13 10:15:52|    2|               0|
|    Animesuggest|       Roboragi|**Ima, Soko ni Ir...|t1_gqscelh|gqscf1z|2021-03-13 10:16:05|    1|               0|
|MovieSuggestions|      katnip_fl|       Jacobs Ladder| t3_m3rw47|gqscl5i|2021-03-13 10:19:07|    2|               0|
|    Animesuggest|        Athenza|{Kino no Tabi: Th...| t3_m3xpu6|gqscnqz|2021-03-13 10:20:26|    1|               0|
|    Animesuggest|    Dropsoftime|Try Mahouka kouko...| t3_m43dco|gqscnr8|2021-03-13 10:20:26|    3|               0|
|MovieSuggestions|   alienstabler|[Holes (2003)](ht...| 

                                                                                

In [89]:
# import spacy
# import re
# from pyspark.sql.functions import udf
# from pyspark.sql.types import ArrayType, StringType

# # Load the NLP model once
# nlp = spacy.load("en_core_web_sm")

# def extract_movie_names(text):
    
#     doc = nlp(text)

#     movie_positions = []
#     movie_names = []
    
#     # Extract named entities from the context window
#     for ent in doc.ents:
#         if ent.label_ == "ORG" or ent.label_ == "PERSON": 
#             movie_positions.append((ent.start_char, ent.end_char))
#             movie_names.append(ent.text)
            
#     return movie_positions, movie_names

# def remove_movie_names(text, movie_positions):
#     # Sort positions in reverse order to avoid shifting positions during removal
#     movie_positions = sorted(movie_positions, key=lambda x: x[0], reverse=True)

#     # Remove movie names from the text
#     for start, end in movie_positions:
#         text = text[:start] + ' ' + text[end:]

#     # Remove extra spaces
#     text = ' '.join(text.split())

#     return text

# def extract_movie_names_regex(text, movie_names):
#     # Define a regex pattern for movie names
#     movie_name_pattern = r"(?:'([^']+)'|\"([^\"]+)\"|([A-Z][a-z]*(?: [A-Z0-9][a-z0-9]*)*)(?: \(\d{4}\))?)"

#     # Find matches in the text using the regex pattern
#     movie_matches = re.findall(movie_name_pattern, text)

#     # Combine the results from the capturing groups
#     movies = [match[0] or match[1] or match[2] for match in movie_matches]
#     movie_names = movie_names + movies

#     return movie_names

# # Create a UDF
# extract_movie_names_udf = udf(lambda text: extract_movie_names(text, nlp), ArrayType(StringType()))

# # Applying the UDF to your DataFrame
# df_with_movie_names = comments_filtered.withColumn("movie_names", extract_movie_names_udf(comments_filtered["body"]))
# df_with_movie_names.show()


In [8]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, ArrayType, StructType, StructField

import spacy
import re

# Load the spaCy model once on the driver; the UDFs below capture it in their closure.
nlp = spacy.load("en_core_web_sm")

# Schema of the struct returned by extract_movie_names_udf.
# Character offsets are declared as strings (kept so the column type does not change);
# remove_movie_names_udf converts them back with int().
movie_schema = StructType([
    StructField("movie_positions", ArrayType(ArrayType(StringType()))),
    StructField("movie_names", ArrayType(StringType()))
])


@udf(movie_schema)
def extract_movie_names_udf(text):
    """Find ORG / PERSON entities in a comment with spaCy.

    Args:
        text: comment body; may be None or empty.

    Returns:
        A (movie_positions, movie_names) tuple matching `movie_schema`:
        [start_char, end_char] offset pairs (as strings) and the entity texts.
        Both lists are empty when `text` is None or empty.
    """
    if not text:
        # `body` is nullable in the schema; spaCy cannot process None.
        return ([], [])

    movie_positions = []
    movie_names = []
    for ent in nlp(text).ents:
        if ent.label_ in ("ORG", "PERSON"):
            # Stringify explicitly to match the declared ArrayType(StringType()).
            movie_positions.append([str(ent.start_char), str(ent.end_char)])
            movie_names.append(ent.text)

    return (movie_positions, movie_names)


@udf(StringType())
def remove_movie_names_udf(text, movie_positions):
    """Blank out the given character spans and collapse whitespace.

    Args:
        text: comment body; may be None.
        movie_positions: list of [start, end] offsets (strings or ints); may be None/empty.

    Returns:
        `text` unchanged when there is nothing to remove, otherwise the text with
        every span replaced by a space and runs of whitespace collapsed.
    """
    if not text or not movie_positions:
        return text

    # Remove from the right-most span first so earlier offsets stay valid.
    spans = sorted(
        ((int(start), int(end)) for start, end in movie_positions),
        key=lambda span: span[0],
        reverse=True,
    )
    for start, end in spans:
        text = text[:start] + ' ' + text[end:]

    return ' '.join(text.split())


@udf(ArrayType(StringType()))
def extract_movie_names_regex_udf(text, movie_names):
    """Append regex-matched title candidates to the entity-based ones.

    The pattern has exactly two capture groups: (1) double-quoted text, or
    (2) a run of capitalised words optionally joined by lowercase words. A
    trailing " (YYYY)" is consumed but not captured.

    Args:
        text: comment body with the entity spans already removed; may be None.
        movie_names: names found by extract_movie_names_udf; may be None.

    Returns:
        `movie_names` followed by the regex matches, in order of appearance.
    """
    movie_name_pattern = r'(?:\"([^\"]+)\"|([A-Z][a-z]*(?:\s+(?:[a-z]+\s+)*[A-Z][a-z]*)*)(?: \(\d{4}\))?)'

    found = list(movie_names or [])
    if not text:
        return found

    movie_matches = re.findall(movie_name_pattern, text)
    # Exactly one of the two groups is non-empty per match (the original indexed a
    # non-existent third group).
    movies = [quoted or capitalised for quoted, capitalised in movie_matches]
    return found + movies


# Applying the UDFs to the DataFrame; every step reads columns from the frame it transforms.
df_with_movie_data = comments_filtered.withColumn(
    "movie_data", extract_movie_names_udf(comments_filtered["body"])
)
df_removed_movie_names = df_with_movie_data.withColumn(
    "body_no_movies",
    remove_movie_names_udf(df_with_movie_data["body"], df_with_movie_data["movie_data.movie_positions"]),
)
df_final = df_removed_movie_names.withColumn(
    "movie_names",
    extract_movie_names_regex_udf(
        df_removed_movie_names["body_no_movies"], df_removed_movie_names["movie_data.movie_names"]
    ),
)

In [13]:
# Fetch the first row (this runs the UDF pipeline) and print it as a dictionary
# so every derived column can be inspected without truncation
first_row_dict = df_final.first().asDict()
print(first_row_dict)

[Stage 2:>                                                          (0 + 1) / 1]

{'subreddit': 'Animesuggest', 'author': 'Athenza', 'body': '{Now and Then, Here and There}', 'parent_id': 't3_m3ygv3', 'id': 'gqscelh', 'created_utc': datetime.datetime(2021, 3, 13, 10, 15, 52), 'score': 2, 'controversiality': 0, 'movie_data': Row(movie_positions=[], movie_names=[]), 'body_no_movies': '{Now and Then, Here and There}', 'movie_names': ['Now and Then', 'Here and There']}


Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/daemon.py", line 186, in manager
  File "/opt/conda/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/daemon.py", line 74, in worker
  File "/opt/conda/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 663, in main
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
  File "/opt/conda/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 564, in read_int
    raise EOFError
EOFError
                                                                                

In [92]:
# Preview the original columns next to the extracted movie_names list
df_final.select("subreddit", "author", "body", "parent_id", "id", "created_utc", "score", "controversiality", "movie_names").show()

[Stage 3:>                                                          (0 + 1) / 1]

+----------------+---------------+--------------------+----------+-------+-------------------+-----+----------------+--------------------+
|       subreddit|         author|                body| parent_id|     id|        created_utc|score|controversiality|         movie_names|
+----------------+---------------+--------------------+----------+-------+-------------------+-----+----------------+--------------------+
|    Animesuggest|        Athenza|{Now and Then, He...| t3_m3ygv3|gqscelh|2021-03-13 10:15:52|    2|               0|[Now and Then, He...|
|    Animesuggest|       Roboragi|**Ima, Soko ni Ir...|t1_gqscelh|gqscf1z|2021-03-13 10:16:05|    1|               0|[Genres, Sci-Fi, ...|
|MovieSuggestions|      katnip_fl|       Jacobs Ladder| t3_m3rw47|gqscl5i|2021-03-13 10:19:07|    2|               0|     [Jacobs Ladder]|
|    Animesuggest|        Athenza|{Kino no Tabi: Th...| t3_m3xpu6|gqscnqz|2021-03-13 10:20:26|    1|               0|[{Ping Pong the A...|
|    Animesuggest|    Drops

Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/daemon.py", line 186, in manager
  File "/opt/conda/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/daemon.py", line 74, in worker
  File "/opt/conda/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 663, in main
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
  File "/opt/conda/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 564, in read_int
    raise EOFError
EOFError
                                                                                

In [10]:
# Mark df_final for caching so the Python UDF columns are not recomputed by every later action
df_final = df_final.cache()

23/11/19 21:50:02 WARN CacheManager: Asked to cache already cached data.


In [11]:
# Example stop words
stop_words = {
    "a", "an", "the", "and", "or", "but",
    "is", "are", "was", "were", "be", "been", "being",
    "there", "he", "she",
}

In [12]:
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

def remove_stop_word_from_movie_names(movies):
    """Drop single-word candidates that are stop words.

    The regex extractor picks up any capitalised word, so sentence-initial words
    such as "The" or "He" can land anywhere in the list, not only at index 0.
    Every element is therefore checked (the original only looked at the first).

    Args:
        movies: list of candidate titles; may be None or empty.

    Returns:
        `movies` unchanged when it is None/empty, otherwise a new list without
        the one-word entries whose lowercase form is in `stop_words`.
    """
    if not movies:
        return movies

    return [
        name
        for name in movies
        if not (name and len(name.split()) == 1 and name.lower() in stop_words)
    ]


# Spark wrapper: takes and returns an array<string> column
remove_stop_word_udf = udf(remove_stop_word_from_movie_names, ArrayType(StringType()))

In [13]:
# Overwrite movie_names with the output of remove_stop_word_udf
df_final = df_final.withColumn("movie_names", remove_stop_word_udf(df_final["movie_names"]))

In [14]:
# Keep at most 100 rows for the exploratory frequency count below
df_final_sample = df_final.limit(100)

In [15]:
from pyspark.sql.functions import explode, col, count

# One output row per extracted title (rows whose list is empty or null are dropped by explode)
df_flattened = df_final_sample.withColumn("movie_name", explode("movie_names"))

# Number of rows per title, exposed under the column name "frequency"
df_movie_frequency = (
    df_flattened
    .groupBy("movie_name")
    .count()
    .withColumnRenamed("count", "frequency")
)

In [None]:
# Collect the frequency table to the driver as a pandas DataFrame.
# This is the action that actually runs the UDF pipeline and the aggregation.
df_movie_frequency_pd = df_movie_frequency.toPandas()

[Stage 1:>                                                        (0 + 4) / 241]
KeyboardInterrupt



In [None]:
# Shut down the local Spark session before handing the work to a SageMaker processing job
spark.stop()

In [29]:
# import sagemaker
# import boto3

# # Define your bucket and script path
# bucket = 'project-group34'
# script_key = 'scripts/install_spacy.sh'

# # Create a boto3 S3 client
# s3_client = boto3.client('s3')

# # Path to your local script
# local_script_path = './code/install_spacy.sh'

# # Upload the script
# s3_client.upload_file(Filename=local_script_path, Bucket=bucket, Key=script_key)

In [2]:
# Make sure the ./code directory exists (-p: no error if it is already there)
!mkdir -p ./code

In [4]:
%%writefile ./code/suggestion_extract_process.py

import subprocess
import sys

import sys
print(f"Python version: {sys.version}")
print(f"Python executable: {sys.executable}")
print(sys.path)

import os
import logging
import argparse

# Import pyspark and build Spark session
from pyspark.sql.functions import *
from pyspark.sql.types import (
    DoubleType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType
import re
from pyspark.sql.functions import explode, count

spacy_version = "3.7.2"  
thinc_version = "8.2.1" 
pydantic_version = "1.8.0" 

# Install the packages using pip
subprocess.check_call([sys.executable, "-m", "pip", "install", f"spacy=={spacy_version}"])
subprocess.check_call([sys.executable, "-m", "pip", "install", f"thinc=={thinc_version}"])
subprocess.check_call([sys.executable, "-m", "pip", "install", f"pydantic=={pydantic_version}"])
subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
import spacy

logging.basicConfig(format='%(asctime)s,%(levelname)s,%(module)s,%(filename)s,%(lineno)d,%(message)s', level=logging.DEBUG)
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler(sys.stdout))

def main():
    
    parser = argparse.ArgumentParser(description="app inputs and outputs")
    parser.add_argument("--s3_dataset_path", type=str, help="Path of dataset in S3")    
    parser.add_argument("--col_name_for_filtering", type=str, help="Name of the column to filter")
    args = parser.parse_args()

    spark = SparkSession.builder.appName("PySparkApp").getOrCreate()
    logger.info(f"spark version = {spark.version}")
    
    # This is needed to save RDDs which is the only way to write nested Dataframes into CSV format
    sc = spark.sparkContext
    sc._jsc.hadoopConfiguration().set(
        "mapred.output.committer.class", "org.apache.hadoop.mapred.FileOutputCommitter"
    )

    # Downloading the data from S3 into a Dataframe
    logger.info(f"going to read {args.s3_dataset_path} for r/{args.col_name_for_filtering}")
    df = spark.read.parquet(args.s3_dataset_path, header=True)
    vals = [args.col_name_for_filtering]
    df_filtered = df.where(col("subreddit").isin(vals))
    logger.info(f"finished reading files...")
    
    # DATA CLEANING
    comments_filtered = df_filtered.filter((df.body != '[deleted]') & (df.author != '[deleted]'))

    # Load spaCy model
    nlp = spacy.load("en_core_web_sm")

    # Define schema for the UDF output
    movie_schema = StructType([
        StructField("movie_positions", ArrayType(ArrayType(StringType()))),
        StructField("movie_names", ArrayType(StringType()))
    ])

    # UDF to extract movie names
    @udf(movie_schema)
    def extract_movie_names_udf(text):
        doc = nlp(text)
        movie_positions = []
        movie_names = []

        for ent in doc.ents:
            if ent.label_ == "ORG" or ent.label_ == "PERSON":
                movie_positions.append([ent.start_char, ent.end_char])
                movie_names.append(ent.text)

        return (movie_positions, movie_names)

    # UDF to remove movie names
    @udf(StringType())
    def remove_movie_names_udf(text, movie_positions):
        # Reverse sort positions to avoid shifting positions
        if movie_positions:
            movie_positions = sorted([(int(start), int(end)) for start, end in movie_positions], key=lambda x: x[0], reverse=True)

            for start, end in movie_positions:
                text = text[:start] + ' ' + text[end:]

            return ' '.join(text.split())

        else:
            return text

    # UDF to extract movie names using regex
    @udf(ArrayType(StringType()))
    def extract_movie_names_regex_udf(text, movie_names):
        movie_name_pattern = r'(?:\"([^\"]+)\"|([A-Z][a-z]*(?:\s+(?:[a-z]+\s+)*[A-Z][a-z]*)*)(?: \(\d{4}\))?)'

        movie_matches = re.findall(movie_name_pattern, text)
        movies = [m for match in matches for m in match if m]
        return movie_names + movies
    
    def remove_stop_word_from_movie_names(suggestion_list):
        
        if suggestion_list and len(suggestion_list[0].split()) == 1 and suggestion_list[0].lower() in stop_words:
            return suggestion_list[1:]
        
        return suggestion_list

    # Applying the UDFs to the DataFrame
    df_with_suggestions = comments_filtered.withColumn("movie_data", extract_movie_names_udf(comments_filtered["body"]))
    df_removed_suggestions_names = df_with_suggestions.withColumn("body_no_movies", remove_movie_names_udf(comments["body"], df_with_movie_data["movie_data.movie_positions"]))
    df_final = df_removed_suggestions_names.withColumn("suggestion_list", extract_movie_names_regex_udf(df_removed_movie_names["body_no_movies"], df_removed_movie_names["movie_data.movie_names"]))
    
    df_final = df_final.select("subreddit", "author", "body", "parent_id", "id", "created_utc", "score", "controversiality", "suggestion_list")
    
    stop_words = set(["a", "an", "the", "and", "or", "but", "is", "are", "was", "were", "be", "been", "being", "there", "he", "she"])  # stop words
    
    df_final = df_final.withColumn("suggestion_lists", remove_stop_word_udf(df_final["suggestion_list"]))
    
    remove_stop_word_udf = udf(remove_stop_word_from_movie_names, ArrayType(StringType()))

    # Flatten the movie_names column
    df_flattened = df_final_sample.withColumn("suggestions", explode(col("suggestion_lists")))

    # Group by movie_name and count the occurrences
    df_suggestion_frequency = df_flattened.groupBy("suggestions").agg(count("*").alias("frequency"))

    df_suggestion_frequency.write.mode("overwrite").parquet("{s3_path}/nlp/")
    
    logger.info(f"all done...")
    
if __name__ == "__main__":
    main()

Overwriting ./code/suggestion_extract_process.py


In [5]:
%%time
import time
import sagemaker
from sagemaker.spark.processing import PySparkProcessor

# Setup the PySpark processor to run the job. Note the instance type and instance count parameters. SageMaker will create these many instances of this type for the spark job.
role = sagemaker.get_execution_role()
spark_processor = PySparkProcessor(
    base_job_name="sm-spark-project",
    framework_version="3.3",
    role=role,
    instance_count=8,
    instance_type="ml.m5.xlarge",
    max_runtime_in_seconds=3600,
)

# # S3 URI of the initialization script
# s3_uri_init_script = f's3://{bucket}/{script_key}'

# s3 paths
session = sagemaker.Session()
output_prefix_logs = f"spark_logs"

configuration = [
    {
        "Classification": "spark-defaults",
        "Properties": {"spark.executor.memory": "12g", "spark.executor.cores": "4"},
    }
]

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
CPU times: user 2.45 s, sys: 409 ms, total: 2.85 s
Wall time: 2.6 s


In [6]:
%%time
subreddit_list = ["MovieSuggestions", "televisionsuggestions", "Animesuggest"]
for subreddit in subreddit_list:
    print(f"going to extract suggestions data for subreddit={subreddit}")
    bucket = "project-group34"
    output_prefix_data_comments = "project/comments/yyyy=*"
    s3_path = f"s3a://{bucket}/{output_prefix_data_comments}"
    col_name_for_filtering = subreddit

    # run the job now, the arguments array is provided as command line to the Python script (Spark code in this case).
    spark_processor.run(
        submit_app="./code/suggestion_extract_process.py",
        arguments=[
            "--s3_dataset_path",
            s3_path,
            "--col_name_for_filtering",
            col_name_for_filtering,
        ],
        spark_event_logs_s3_uri="s3://{}/{}/spark_event_logs".format(bucket, output_prefix_logs),
        logs=False,
        configuration=configuration
    )
    # give some time for resources from this iterations to get cleaned up
    # if we start the job immediately we could get insufficient resources error
    time.sleep(60)

going to extract suggestions data for subreddit=MovieSuggestions


INFO:sagemaker:Creating processing-job with name sm-spark-project-2023-11-20-04-39-35-458


........................................................................*

UnexpectedStatusException: Error for Processing job sm-spark-project-2023-11-20-04-39-35-458: Failed. Reason: AlgorithmError: See job logs for more information