# NATURAL LANGUAGE PROCESSING

## Setup

In [2]:
# Setup - run only once per Kernel App.
# A JDK is installed into the conda environment first (Spark needs a JVM).
%conda install openjdk -y

# Install PySpark, pinned to 3.4.0.
%pip install pyspark==3.4.0

# Install the Spark NLP Python package, pinned to 5.1.3 (the same version as
# the spark-nlp jar requested when the Spark session is built).
%pip install spark-nlp==5.1.3

# Restart the kernel so the freshly installed packages are picked up.
# NOTE(review): `Jupyter.notebook` is the classic-Notebook JavaScript API;
# presumably this does nothing in JupyterLab-based front ends - confirm, or
# restart the kernel manually.
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 23.3.1
  latest version: 23.10.0

Please update conda by running

    $ conda update -n base -c defaults conda

Or to minimize the number of packages updated during conda update use

     conda install conda=23.10.0



# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --

In [4]:
# # Import pyspark and build Spark session
# from pyspark.sql import SparkSession

# # Import pyspark and build Spark session
# spark = SparkSession.builder \
#     .appName("Spark NLP")\
#     .master("local[*]")\
#     .config("spark.driver.memory","16G")\
#     .config("spark.driver.maxResultSize", "0") \
#     .config("spark.kryoserializer.buffer.max", "2000M")\
#     .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.3")\
#     .getOrCreate()

# print(spark.version)

# Import pyspark and build Spark session
from pyspark.sql import SparkSession

# Build the local Spark session used by the rest of the notebook.
# The jar list pulls in Spark NLP plus hadoop-aws (needed for s3a:// paths),
# and S3 credentials are resolved through the container credentials provider.
spark = (
    SparkSession.builder
    .appName("Spark NLP")
    .master("local[*]")
    .config("spark.driver.memory", "16G")
    .config("spark.driver.maxResultSize", "0")
    .config("spark.kryoserializer.buffer.max", "2000M")
    .config(
        "spark.jars.packages",
        "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.3,org.apache.hadoop:hadoop-aws:3.2.2",
    )
    .config(
        "fs.s3a.aws.credentials.provider",
        "com.amazonaws.auth.ContainerCredentialsProvider",
    )
    .getOrCreate()
)

# Earlier S3-only variant of the session, kept for reference:
# spark = (
#     SparkSession.builder.appName("PySparkApp")
#     .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.2.2")
#     .config(
#         "fs.s3a.aws.credentials.provider",
#         "com.amazonaws.auth.ContainerCredentialsProvider",
#     )
#     .getOrCreate()
# )

# Report the Spark version of the running session.
print(spark.version)

3.4.0


## Import Libraries

In [5]:
import sagemaker
from pyspark.sql.functions import lower, regexp_replace, col, concat_ws
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline
from sparknlp.base import Finisher, DocumentAssembler

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [6]:
# Report the versions of the running Spark session and of the installed
# Spark NLP Python package.
print(f"Spark version: {spark.version}")
print(f"sparknlp version: {sparknlp.version()}")

Spark version: 3.4.0
sparknlp version: 5.1.3


## Import Data

In [7]:
%%time
# S3 location of the comments dataset. The `yyyy=*` glob matches every
# `yyyy=<value>` directory under project/comments/.
bucket = "project-group34"
# NOTE(review): `session` is not used by the read below.
session = sagemaker.Session()
output_prefix_data_comments = "project/comments/yyyy=*"
s3_path = f"s3a://{bucket}/{output_prefix_data_comments}"
print(f"reading comments from {s3_path}")
# `header` is a CSV reader option and has no meaning for Parquet, which
# carries its own schema, so it is deliberately not passed here.
comments = spark.read.parquet(s3_path)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
reading comments from s3a://project-group34/project/comments/yyyy=*


23/11/20 20:32:00 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

CPU times: user 270 ms, sys: 9.62 ms, total: 280 ms
Wall time: 8.3 s


In [6]:
# comments = comments.cache()

In [7]:
# Print the column names, types and nullability of the loaded comments.
comments.printSchema()

root
 |-- author: string (nullable = true)
 |-- author_cakeday: boolean (nullable = true)
 |-- author_flair_css_class: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- body: string (nullable = true)
 |-- can_gild: boolean (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- created_utc: timestamp (nullable = true)
 |-- distinguished: string (nullable = true)
 |-- edited: string (nullable = true)
 |-- gilded: long (nullable = true)
 |-- id: string (nullable = true)
 |-- is_submitter: boolean (nullable = true)
 |-- link_id: string (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- permalink: string (nullable = true)
 |-- retrieved_on: timestamp (nullable = true)
 |-- score: long (nullable = true)
 |-- stickied: boolean (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- subreddit_id: string (nullable = true)



In [8]:
# Preview the columns used by the rest of the notebook (show() prints the
# first 20 rows and truncates long strings).
comments.select("subreddit", "author", "body", "parent_id", "id", "created_utc", "score", "controversiality").show()

[Stage 1:>                                                          (0 + 1) / 1]

+----------------+---------------+--------------------+----------+-------+-------------------+-----+----------------+
|       subreddit|         author|                body| parent_id|     id|        created_utc|score|controversiality|
+----------------+---------------+--------------------+----------+-------+-------------------+-----+----------------+
|    Animesuggest|        Athenza|{Now and Then, He...| t3_m3ygv3|gqscelh|2021-03-13 10:15:52|    2|               0|
|    Animesuggest|       Roboragi|**Ima, Soko ni Ir...|t1_gqscelh|gqscf1z|2021-03-13 10:16:05|    1|               0|
|    Animesuggest|      [deleted]|           [deleted]| t3_m3vnjl|gqscjse|2021-03-13 10:18:25|    1|               0|
|MovieSuggestions|      katnip_fl|       Jacobs Ladder| t3_m3rw47|gqscl5i|2021-03-13 10:19:07|    2|               0|
|    Animesuggest|        Athenza|{Kino no Tabi: Th...| t3_m3xpu6|gqscnqz|2021-03-13 10:20:26|    1|               0|
|    Animesuggest|    Dropsoftime|Try Mahouka kouko...| 

                                                                                

In [8]:
# Drop comments whose body or author is the literal '[deleted]' placeholder,
# then keep only the columns needed downstream.
comments_filtered = (
    comments
    .filter((col("body") != '[deleted]') & (col("author") != '[deleted]'))
    .select(
        "subreddit",
        "author",
        "body",
        "parent_id",
        "id",
        "created_utc",
        "score",
        "controversiality",
    )
)

In [10]:
# Preview the filtered comments (first 20 rows).
comments_filtered.show()

+----------------+---------------+--------------------+----------+-------+-------------------+-----+----------------+
|       subreddit|         author|                body| parent_id|     id|        created_utc|score|controversiality|
+----------------+---------------+--------------------+----------+-------+-------------------+-----+----------------+
|    Animesuggest|        Athenza|{Now and Then, He...| t3_m3ygv3|gqscelh|2021-03-13 10:15:52|    2|               0|
|    Animesuggest|       Roboragi|**Ima, Soko ni Ir...|t1_gqscelh|gqscf1z|2021-03-13 10:16:05|    1|               0|
|MovieSuggestions|      katnip_fl|       Jacobs Ladder| t3_m3rw47|gqscl5i|2021-03-13 10:19:07|    2|               0|
|    Animesuggest|        Athenza|{Kino no Tabi: Th...| t3_m3xpu6|gqscnqz|2021-03-13 10:20:26|    1|               0|
|    Animesuggest|    Dropsoftime|Try Mahouka kouko...| t3_m43dco|gqscnr8|2021-03-13 10:20:26|    3|               0|
|MovieSuggestions|   alienstabler|[Holes (2003)](ht...| 

                                                                                

In [9]:
# Restrict the cleaned comments to the MovieSuggestions subreddit.
comments_filtered_movies = comments_filtered.filter(col("subreddit") == "MovieSuggestions")

In [11]:
# Spark NLP named-entity-recognition pipeline over the comment `body` column.

# Stage 1: wrap the raw text as a `document` annotation.
document_assembler = (
    DocumentAssembler()
    .setInputCol("body")
    .setOutputCol("document")
)

# Stage 2: split each document into sentences.
sentence_detector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Stage 3: tokenize each sentence.
# NOTE(review): presumably this resolves to Spark NLP's Tokenizer (imported by
# the sparknlp star-import after pyspark.ml.feature.Tokenizer) - confirm.
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Stage 4: pretrained BERT token embeddings.
embeddings = (
    BertEmbeddings.pretrained("bert_base_cased", "en")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# Stage 5: pretrained NER tagger that consumes the embeddings.
ner_model = (
    NerDLModel.pretrained("ner_dl_bert", "en")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

# Stage 6: merge the per-token tags into entity chunks.
ner_converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
)

# Assemble the stages in execution order.
pipeline_stages = [
    document_assembler,
    sentence_detector,
    tokenizer,
    embeddings,
    ner_model,
    ner_converter,
]
nlp_pipeline = Pipeline(stages=pipeline_stages)

# Fit on, then annotate, the MovieSuggestions comments.
model = nlp_pipeline.fit(comments_filtered_movies)
result = model.transform(comments_filtered_movies)

bert_base_cased download started this may take some time.
Approximate size to download 384.9 MB
[OK!]
ner_dl_bert download started this may take some time.
Approximate size to download 15.4 MB
[OK!]


In [12]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType, ArrayType
import pyspark.sql.functions as F

# Define a UDF to filter and extract movie names
def extract_movies(chunks):
    """Return the text of every NER chunk tagged as a person or organisation.

    Args:
        chunks: iterable of annotation rows exposing ``result`` (the chunk
            text) and ``metadata`` (a mapping holding the ``'entity'`` label),
            or ``None`` when the row has no ``ner_chunk`` value.

    Returns:
        list[str]: chunk texts, in input order, whose entity label is
        ``'PERSON'`` or ``'ORG'``; an empty list for ``None`` / empty input.
    """
    # A null array column reaches the UDF as None; treat it like "no chunks".
    if not chunks:
        return []

    # NOTE(review): ner_dl_bert is presumably a CoNLL-2003 model emitting
    # PER/ORG/LOC/MISC, in which case 'PERSON' never matches - confirm against
    # the model card before relying on person matches.
    wanted_entities = ('PERSON', 'ORG')

    movie_names = []
    for chunk in chunks:
        # Tolerate a chunk with missing metadata or without an 'entity' key.
        metadata = chunk.metadata or {}
        if metadata.get('entity') in wanted_entities:
            movie_names.append(chunk.result)
    return movie_names

# Wrap the Python function as a Spark UDF that returns an array of strings.
extract_movie_names_udf = udf(extract_movies, ArrayType(StringType()))

# Add a `movie_names` column computed from the `ner_chunk` annotations.
movies_df = result.withColumn("movie_names", extract_movie_names_udf(F.col("ner_chunk")))

# Display the comment text next to the names extracted from it.
movies_df.select("body", "movie_names").show()

[Stage 7:>                                                          (0 + 1) / 1]

+--------------------+--------------------+
|                body|         movie_names|
+--------------------+--------------------+
|       Jacobs Ladder|     [Jacobs Ladder]|
|[Holes (2003)](ht...|                  []|
|V for Vendetta\n\...|                  []|
|Synchronic was co...|                  []|
|Climax!!!!! Sound...|       [Climax!!!!!]|
|• Sorry To Bother...|                  []|
|I would also sugg...|                  []|
|Midsommar. Was no...|         [Midsommar]|
|Reservoir Dogs\n\...|                  []|
|             Hoodlum|           [Hoodlum]|
|Crime/Thriller:\n...|[Asura, Slice, Sl...|
|Don't Breathe\n\n...|                  []|
|This is a new one...|                  []|
|Just realized the...|                  []|
|     Children of Men|                  []|
|So many, thank yo...|[Grand Budapest H...|
|        Pacific Rim.|       [Pacific Rim]|
|Be careful with i...|                  []|
|Wolf Warrior inst...|                  []|
|The anime? Wow, h...|          

                                                                                

In [13]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, ArrayType, StructType, StructField

# import spacy
import re

# # Load spaCy model
# nlp = spacy.load("en_core_web_sm")

# # Define schema for the UDF output
# movie_schema = StructType([
#     StructField("movie_positions", ArrayType(ArrayType(StringType()))),
#     StructField("movie_names", ArrayType(StringType()))
# ])

# # UDF to extract movie names
# @udf(movie_schema)
# def extract_movie_names_udf(text):
#     doc = nlp(text)
#     movie_positions = []
#     movie_names = []

#     for ent in doc.ents:
#         if ent.label_ == "ORG" or ent.label_ == "PERSON":
#             movie_positions.append([ent.start_char, ent.end_char])
#             movie_names.append(ent.text)

#     return (movie_positions, movie_names)

# UDF to remove movie names
@udf(StringType())
def remove_movie_names_udf(text, movie_names):
    """Blank out every known movie name in ``text``.

    Args:
        text: comment body, or ``None`` for a null column value.
        movie_names: names to remove, or ``None`` / empty when there are none.

    Returns:
        ``text`` unchanged when it is ``None`` or there are no names;
        otherwise the text with each name replaced by a space and all
        whitespace runs collapsed to single spaces.
    """
    if text is None or not movie_names:
        return text

    for name in movie_names:
        # Skip None/"" entries: str.replace('', ' ') would insert a space
        # between every character of the text.
        if name:
            text = text.replace(name, ' ')

    # Collapse the gaps left behind by the removals.
    return ' '.join(text.split())

# UDF to extract movie names using regex
@udf(ArrayType(StringType()))
def extract_movie_names_regex_udf(text, movie_names):
    """Append regex-detected title candidates to the already known names.

    Args:
        text: text to scan, or ``None`` for a null column value.
        movie_names: names found earlier, or ``None``.

    Returns:
        list[str]: ``movie_names`` followed by every regex match found in
        ``text``, in order of appearance.
    """
    # Two alternatives, each with ONE capturing group:
    #   1. a double-quoted string (captured without the quotes);
    #   2. a run of capitalised words, optionally joined by lowercase words
    #      (e.g. "V for Vendetta"); a trailing " (2003)"-style year is
    #      consumed but not captured.
    movie_name_pattern = r'(?:\"([^\"]+)\"|([A-Z][a-z]*(?:\s+(?:[a-z]+\s+)*[A-Z][a-z]*)*)(?: \(\d{4}\))?)'

    known_names = list(movie_names or [])
    if not text:
        return known_names

    # findall yields (quoted, capitalised) pairs; exactly one side is
    # non-empty for every match. There is no third group to index.
    movie_matches = re.findall(movie_name_pattern, text)
    movies = [quoted or capitalised for quoted, capitalised in movie_matches]
    return known_names + movies

# Strip the NER-detected names out of `body`, so the regex pass below only
# sees the text that is left.
df_removed_movie_names = movies_df.withColumn(
    "body_no_movies",
    remove_movie_names_udf(col("body"), col("movie_names")),
)

# Supplement the NER result: `additional_movie_names` holds the NER names
# followed by the regex matches found in `body_no_movies`.
df_final = df_removed_movie_names.withColumn(
    "additional_movie_names",
    extract_movie_names_regex_udf(col("body_no_movies"), col("movie_names")),
)


In [17]:
# Compare the NER-only names with the combined NER + regex names.
df_final.select("body", "movie_names", "additional_movie_names").show()

[Stage 11:>                                                         (0 + 1) / 1]

+--------------------+--------------------+----------------------+
|                body|         movie_names|additional_movie_names|
+--------------------+--------------------+----------------------+
|       Jacobs Ladder|     [Jacobs Ladder]|       [Jacobs Ladder]|
|[Holes (2003)](ht...|                  []|               [Holes]|
|V for Vendetta\n\...|                  []|  [V for Vendetta\n...|
|Synchronic was co...|                  []|          [Synchronic]|
|Climax!!!!! Sound...|       [Climax!!!!!]|  [Climax!!!!!, Sou...|
|• Sorry To Bother...|                  []|  [Sorry To Bother ...|
|I would also sugg...|                  []|  [I would also sug...|
|Midsommar. Was no...|         [Midsommar]|  [Midsommar, Was n...|
|Reservoir Dogs\n\...|                  []|  [Reservoir Dogs\n...|
|             Hoodlum|           [Hoodlum]|             [Hoodlum]|
|Crime/Thriller:\n...|[Asura, Slice, Sl...|  [Asura, Slice, Sl...|
|Don't Breathe\n\n...|                  []|  [Don, Breathe\n\n

                                                                                

In [14]:
import nltk
# Fetch the NLTK stop-word corpus (skipped when already up to date) and load
# the English list; it is used below to drop stop words from the name lists.
nltk.download('stopwords')
eng_stopwords = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
# stop_words = set(["a", "an", "the", "and", "or", "but", "is", "are", "was", "were", "be", "been", "being", "there", "he", "she"])  # Example stop words

In [16]:
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

def remove_stop_word_from_movie_names(movies):
    """Drop the first entry of ``movies`` when it is a lone English stop word.

    Only the first element is inspected; later entries are never removed.

    Args:
        movies: list of candidate names (may be empty or ``None``).

    Returns:
        ``movies[1:]`` when the first entry is a single word found in
        ``eng_stopwords`` (case-insensitive); otherwise ``movies`` itself.
    """
    # Nothing to inspect for an empty or missing list.
    if not movies:
        return movies

    first_name = movies[0]
    is_single_word = len(first_name.split()) == 1
    if is_single_word and first_name.lower() in eng_stopwords:
        return movies[1:]
    return movies

# Wrap the helper as a Spark UDF that returns an array of strings.
remove_stop_word_udf = udf(remove_stop_word_from_movie_names, ArrayType(StringType()))

In [17]:
# Add `movie_names_final`: the combined name list with a leading stop word
# removed. Re-running this cell recomputes the same column.
df_final = df_final.withColumn("movie_names_final", remove_stop_word_udf(df_final["additional_movie_names"]))

In [24]:
from pyspark.sql.functions import explode, col, count

# One output row per extracted name: explode() expands `movie_names_final`
# and yields no row for comments whose list is empty or null.
df_flattened = df_final.withColumn(
    "movie_name",
    explode(col("movie_names_final")),
)

# Count how many exploded rows carry each name.
df_movie_frequency = (
    df_flattened
    .groupBy("movie_name")
    .agg(count("*").alias("frequency"))
)

In [22]:
from pyspark.sql.functions import desc

# Sort by frequency in descending order and keep the 500 most frequent names.
df_top_500_movies = df_movie_frequency.orderBy(desc("frequency")).limit(500)

In [24]:
# df_top_500_movies.show()

In [29]:
# import sagemaker
# import boto3

# # Define your bucket and script path
# bucket = 'project-group34'
# script_key = 'scripts/install_spacy.sh'

# # Create a boto3 S3 client
# s3_client = boto3.client('s3')

# # Path to your local script
# local_script_path = './code/install_spacy.sh'

# # Upload the script
# s3_client.upload_file(Filename=local_script_path, Bucket=bucket, Key=script_key)

In [2]:
# Create the directory that receives the processing script written by the
# next cell; -p makes this a no-op when it already exists.
!mkdir -p ./code

In [26]:
%%writefile ./code/suggestion_extract_process.py

import subprocess
import sys

# Log interpreter details so the job logs show which Python is in use.
print(f"Python version: {sys.version}")
print(f"Python executable: {sys.executable}")
print(sys.path)

# Install the Python-side dependencies before they are imported below.
# - spark-nlp is pinned to 5.1.3 so the Python package matches the
#   com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.3 jar requested in main().
# - nltk is imported further down but was never installed by this script.
#   NOTE(review): presumably it is not preinstalled in the processing image -
#   confirm; pip leaves it alone if it is already present.
subprocess.check_call(
    [sys.executable, "-m", "pip", "install", "spark-nlp==5.1.3", "nltk"]
)

import os
import logging
import argparse

# Import pyspark and build Spark session
from pyspark.sql.functions import *
from pyspark.sql.types import (
    DoubleType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import udf
import pyspark.sql.functions as F
from pyspark.sql.types import ArrayType
import re
from pyspark.sql.functions import explode, count
import sagemaker
from pyspark.sql.functions import lower, regexp_replace, col, concat_ws
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline
from sparknlp.base import Finisher, DocumentAssembler

from pyspark.sql.functions import desc

import nltk
# Download the NLTK stop-word corpus at job start and load the English list,
# which main() uses when cleaning the extracted name lists.
nltk.download('stopwords')
eng_stopwords = nltk.corpus.stopwords.words('english')

# Send formatted log records to stdout through a single root handler.
# Calling basicConfig() and then attaching a second StreamHandler would emit
# every record twice: once formatted on stderr and once bare on stdout.
logging.basicConfig(
    format='%(asctime)s,%(levelname)s,%(module)s,%(filename)s,%(lineno)d,%(message)s',
    level=logging.DEBUG,
    stream=sys.stdout,
)
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

def main():
    """Extract the most frequently mentioned titles for one subreddit.

    Command-line arguments:
        --s3_dataset_path: S3 path (glob allowed) of the comments Parquet data.
        --col_name_for_filtering: despite its name, the *value* matched
            against the ``subreddit`` column, i.e. the subreddit to process.

    Steps: read the comments, keep the requested subreddit, drop rows whose
    body or author is '[deleted]', run the Spark NLP NER pipeline, supplement
    the NER names with a regex pass, drop a leading stop word, count how often
    each name occurs, and write the 1000 most frequent names as Parquet to a
    per-subreddit directory under ``project/comments/movie_suggestions/``.
    """
    parser = argparse.ArgumentParser(description="app inputs and outputs")
    # Both arguments are needed; fail at parse time instead of passing None on.
    parser.add_argument("--s3_dataset_path", type=str, required=True, help="Path of dataset in S3")
    parser.add_argument("--col_name_for_filtering", type=str, required=True, help="Name of the column to filter")
    args = parser.parse_args()
    subreddit = args.col_name_for_filtering

    spark = (
        SparkSession.builder
        .appName("Spark NLP")
        .config("spark.driver.memory", "16G")
        .config("spark.driver.maxResultSize", "0")
        .config("spark.kryoserializer.buffer.max", "2000M")
        .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.3")
        .getOrCreate()
    )

    logger.info(f"Spark version: {spark.version}")
    logger.info(f"sparknlp version: {sparknlp.version()}")

    # This is needed to save RDDs which is the only way to write nested Dataframes into CSV format
    sc = spark.sparkContext
    sc._jsc.hadoopConfiguration().set(
        "mapred.output.committer.class", "org.apache.hadoop.mapred.FileOutputCommitter"
    )

    # Read the dataset from S3. `header` is a CSV-only option, so it is not
    # passed to the Parquet reader.
    logger.info(f"going to read {args.s3_dataset_path} for r/{subreddit}")
    df = spark.read.parquet(args.s3_dataset_path)
    df_filtered = df.where(col("subreddit") == subreddit)
    logger.info("finished reading files...")

    # DATA CLEANING: drop comments whose body or author is '[deleted]'.
    # The columns are referenced by name so the predicate binds to the frame
    # being filtered rather than to its parent.
    comments_filtered = df_filtered.filter(
        (col("body") != '[deleted]') & (col("author") != '[deleted]')
    )

    # Define the pipeline stages
    document_assembler = (
        DocumentAssembler()
        .setInputCol("body")
        .setOutputCol("document")
    )

    sentence_detector = (
        SentenceDetector()
        .setInputCols(["document"])
        .setOutputCol("sentence")
    )

    tokenizer = (
        Tokenizer()
        .setInputCols(["sentence"])
        .setOutputCol("token")
    )

    # Pretrained BERT token embeddings feeding the NER tagger.
    embeddings = (
        BertEmbeddings.pretrained("bert_base_cased", "en")
        .setInputCols(["sentence", "token"])
        .setOutputCol("embeddings")
    )

    ner_model = (
        NerDLModel.pretrained("ner_dl_bert", "en")
        .setInputCols(["sentence", "token", "embeddings"])
        .setOutputCol("ner")
    )

    ner_converter = (
        NerConverter()
        .setInputCols(["sentence", "token", "ner"])
        .setOutputCol("ner_chunk")
    )

    # Build the pipeline
    nlp_pipeline = Pipeline(stages=[
        document_assembler,
        sentence_detector,
        tokenizer,
        embeddings,
        ner_model,
        ner_converter,
    ])

    # Apply the pipeline to the cleaned comments. The script previously
    # referenced `comments_filtered_movies`, a name that exists only in the
    # notebook and raised NameError here.
    model = nlp_pipeline.fit(comments_filtered)
    result = model.transform(comments_filtered)

    def extract_movies(chunks):
        """Return the text of NER chunks labelled 'PERSON' or 'ORG'."""
        # A null array column reaches the UDF as None.
        if not chunks:
            return []
        # NOTE(review): ner_dl_bert presumably emits PER/ORG/LOC/MISC, in
        # which case 'PERSON' never matches - confirm against the model card.
        return [
            chunk.result
            for chunk in chunks
            if (chunk.metadata or {}).get('entity') in ('PERSON', 'ORG')
        ]

    extract_movie_names_udf = udf(extract_movies, ArrayType(StringType()))

    # Apply the UDF to the DataFrame
    movies_df = result.withColumn("movie_names", extract_movie_names_udf(F.col("ner_chunk")))

    @udf(StringType())
    def remove_movie_names_udf(text, movie_names):
        """Replace each known name in ``text`` by a space and collapse whitespace."""
        if text is None or not movie_names:
            return text
        for name in movie_names:
            # Skip None/"" entries: replacing '' would insert a space between
            # every character of the text.
            if name:
                text = text.replace(name, ' ')
        return ' '.join(text.split())

    # UDF to extract movie names using regex
    @udf(ArrayType(StringType()))
    def extract_movie_names_regex_udf(text, movie_names):
        """Return ``movie_names`` followed by the regex matches found in ``text``."""
        # Group 1: a double-quoted string. Group 2: a run of capitalised
        # words, optionally joined by lowercase words. A trailing " (YYYY)"
        # is consumed but not captured.
        movie_name_pattern = r'(?:\"([^\"]+)\"|([A-Z][a-z]*(?:\s+(?:[a-z]+\s+)*[A-Z][a-z]*)*)(?: \(\d{4}\))?)'

        known_names = list(movie_names or [])
        if not text:
            return known_names

        # findall yields (quoted, capitalised) pairs; there is no third group.
        movie_matches = re.findall(movie_name_pattern, text)
        movies = [quoted or capitalised for quoted, capitalised in movie_matches]
        return known_names + movies

    # Set lookup instead of scanning the stop-word list for every row.
    stop_words = frozenset(eng_stopwords)

    def remove_stop_word_from_movie_names(movies):
        """Drop the first entry when it is a single English stop word."""
        if movies and len(movies[0].split()) == 1 and movies[0].lower() in stop_words:
            return movies[1:]
        return movies

    # Remove movie names from the 'body' text
    df_removed_movie_names = movies_df.withColumn(
        "body_no_movies",
        remove_movie_names_udf(movies_df["body"], movies_df["movie_names"]),
    )

    # Supplement the NER extraction with the regex matches.
    df_final = df_removed_movie_names.withColumn(
        "additional_movie_names",
        extract_movie_names_regex_udf(
            df_removed_movie_names["body_no_movies"],
            df_removed_movie_names["movie_names"],
        ),
    )

    df_final = df_final.select("subreddit", "author", "body", "parent_id", "id", "created_utc", "score", "controversiality", "additional_movie_names")

    remove_stop_word_udf = udf(remove_stop_word_from_movie_names, ArrayType(StringType()))

    df_final = df_final.withColumn("movie_names_final", remove_stop_word_udf(df_final["additional_movie_names"]))

    # Flatten the movie_names column
    df_flattened = df_final.withColumn("movie_name", explode(col("movie_names_final")))

    # Group by movie_name and count the occurrences
    df_frequency = df_flattened.groupBy("movie_name").agg(count("*").alias("frequency"))

    # Sort the DataFrame by frequency in descending order and take the top 1000
    df_top_1000_movies = df_frequency.orderBy(desc("frequency")).limit(1000)

    bucket = "project-group34"
    output_prefix_data_comments = "project/comments"
    s3_path = f"s3a://{bucket}/{output_prefix_data_comments}"

    # One `subreddit=<name>` sub-directory per job. Writing every subreddit
    # to the same path with mode("overwrite") left only the last job's
    # output. Reading the parent directory still works and exposes
    # `subreddit` as a partition column.
    output_path = f"{s3_path}/movie_suggestions/subreddit={subreddit}/"
    df_top_1000_movies.write.mode("overwrite").parquet(output_path)
    logger.info(f"wrote results to {output_path}")

    logger.info("all done...")
    
# Run the job only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

Overwriting ./code/suggestion_extract_process.py


In [27]:
%%time
import time
import sagemaker
from sagemaker.spark.processing import PySparkProcessor

# Configure the PySpark processor that runs the job. SageMaker creates
# `instance_count` instances of `instance_type` for each Spark job submitted
# through it.
role = sagemaker.get_execution_role()
spark_processor = PySparkProcessor(
    role=role,
    base_job_name="sm-spark-project",
    framework_version="3.3",
    instance_type="ml.m5.xlarge",
    instance_count=8,
    max_runtime_in_seconds=3600,
)

# # S3 URI of the initialization script
# s3_uri_init_script = f's3://{bucket}/{script_key}'

# Key prefix under which the Spark event logs are stored.
session = sagemaker.Session()
output_prefix_logs = "spark_logs"

# Executor sizing passed to Spark via the `spark-defaults` classification.
configuration = [
    {
        "Classification": "spark-defaults",
        "Properties": {
            "spark.executor.memory": "12g",
            "spark.executor.cores": "4",
        },
    },
]

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
CPU times: user 107 ms, sys: 4.68 ms, total: 112 ms
Wall time: 187 ms


In [28]:
%%time
subreddit_list = ["MovieSuggestions", "televisionsuggestions", "Animesuggest"]

# Inputs shared by every job; computed once instead of on each iteration.
bucket = "project-group34"
output_prefix_data_comments = "project/comments/yyyy=*"
s3_path = f"s3a://{bucket}/{output_prefix_data_comments}"

# Submit one processing job per subreddit, one after the other.
for job_index, subreddit in enumerate(subreddit_list):
    print(f"going to extract suggestions data for subreddit={subreddit}")
    col_name_for_filtering = subreddit

    # run the job now, the arguments array is provided as command line to the Python script (Spark code in this case).
    spark_processor.run(
        submit_app="./code/suggestion_extract_process.py",
        arguments=[
            "--s3_dataset_path",
            s3_path,
            "--col_name_for_filtering",
            col_name_for_filtering,
        ],
        spark_event_logs_s3_uri="s3://{}/{}/spark_event_logs".format(bucket, output_prefix_logs),
        logs=False,
        configuration=configuration
    )

    # Give the resources of this job time to be released before the next one
    # starts; starting immediately could hit an insufficient-resources error.
    # Nothing follows the last job, so no wait is needed after it.
    if job_index < len(subreddit_list) - 1:
        time.sleep(60)

going to extract suggestions data for subreddit=MovieSuggestions


INFO:sagemaker:Creating processing-job with name sm-spark-project-2023-11-20-21-13-28-201


..............................................................................*

UnexpectedStatusException: Error for Processing job sm-spark-project-2023-11-20-21-13-28-201: Failed. Reason: AlgorithmError: See job logs for more information