In [0]:
pip install nltk

Python interpreter will be restarted.
Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
Collecting tqdm
  Downloading tqdm-4.65.0-py3-none-any.whl (77 kB)
Collecting regex>=2021.8.3
  Downloading regex-2023.3.23-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (768 kB)
Installing collected packages: tqdm, regex, nltk
Successfully installed nltk-3.8.1 regex-2023.3.23 tqdm-4.65.0
Python interpreter will be restarted.


In [0]:
pip install praw

Python interpreter will be restarted.
Collecting praw
  Downloading praw-7.7.0-py3-none-any.whl (189 kB)
Collecting update-checker>=0.18
  Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Collecting prawcore<3,>=2.1
  Downloading prawcore-2.3.0-py3-none-any.whl (16 kB)
Collecting websocket-client>=0.54.0
  Downloading websocket_client-1.5.1-py3-none-any.whl (55 kB)
Installing collected packages: websocket-client, update-checker, prawcore, praw
Successfully installed praw-7.7.0 prawcore-2.3.0 update-checker-0.18.0 websocket-client-1.5.1
Python interpreter will be restarted.


In [0]:
# Author: Jash

import praw
import pandas as pd
import datetime

def scrape_subreddits(subreddit_names):
    """
    Scrapes the top 100 posts of each subreddit in subreddit_names from the past month

    Args:
        subreddit_names (list of str): List of subreddit names

    Returns:
        (list of lists): List of posts, where each post is a list of [subreddit, title, score, num_comments, created_str]
    """
    reddit = praw.Reddit(
        client_id="7Ld-hmmFLnuBRncnLLmDTg",
        client_secret="rb3VXiZyK0DzY_Ih2ZnoJ464Ik1w4g",
        user_agent="Fair_Tomorrow_5835"
    )

    posts = []
    for name in subreddit_names:
        subreddit = reddit.subreddit(name)
        for post in subreddit.top(limit=100, time_filter='month'):
            created_datetime = datetime.datetime.fromtimestamp(post.created)
            created_str = created_datetime.strftime('%Y-%m-%d %H:%M:%S')
            posts.append([post.subreddit, post.title, post.score, post.num_comments, created_str])

    return posts

def get_posts():
    """
    Main function to retrieve posts from subreddits and convert them to a Pandas dataframe

    Returns:
        (Pandas DataFrame): Dataframe of scraped subreddit posts with columns [Subreddit, Title, Score, Num_comments, Date]
    """
    subreddit_names = ['microsoft', 'google', 'youtube', 'intel', 'aws', 'azure', 'apple', 'raspberry_pi', 'android', 'amazon', 'openai', 'bing']
    print('Scraping subreddits: ', subreddit_names)

    posts = scrape_subreddits(subreddit_names)
    posts = pd.DataFrame(posts, columns=['Subreddit', 'Title', 'Score', 'Num_comments', 'Date'])
    posts['Subreddit'] = posts['Subreddit'].astype(str)
    return posts
    # posts.to_csv('data/reddit_data.csv')


posts = get_posts()

Scraping subreddits:  ['microsoft', 'google', 'youtube', 'intel', 'aws', 'azure', 'apple', 'raspberry_pi', 'android', 'amazon', 'openai', 'bing']


In [0]:
# Author: Jash

import pyspark
import nltk
import string
from pyspark.sql.functions import split, col, regexp_replace, concat_ws, lower, udf
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.types import ArrayType, StringType
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Download stopwords and initialize SnowballStemmer
nltk.download('stopwords')
stopwords = stopwords.words('english')
snowball = SnowballStemmer('english')

def preprocess(df):
    """
    Preprocesses the data by removing stopwords, punctuations, and stemming the words

    Args:
        df (Pandas DataFrame): DataFrame containing the data

    Returns:
        (Spark DataFrame): DataFrame containing the processed data
    """
    # Create a Spark session
    SparkContext = pyspark.SparkContext
    sc = SparkContext.getOrCreate()
    sc._conf.set("spark.sql.execution.arrow.pyspark.enabled", "false") # Disable Arrow optimization
    spark = pyspark.sql.SparkSession(sc)

    # Create a Spark DataFrame from the Pandas DataFrame
    spark_df = spark.createDataFrame(df)

    # Preprocess the data
    processed_df = data_preprocess(spark_df)

    return processed_df

def data_preprocess(df, stopwords=stopwords):
    """
    Processes the data by removing stopwords, punctuations, and stemming the words

    Args:
        df (Spark DataFrame): DataFrame containing the data
        stopwords (list of str): List of stopwords to remove

    Returns:
        (Spark DataFrame): DataFrame containing the processed data
    """

    # Use a regular expression to remove all non-alphabetic characters and punctuations
    df = df.withColumn("Title_clean", regexp_replace(col("Title"), r'[^a-zA-Z\s]', ''))
    # Convert all words to lowercase
    df = df.withColumn("Title", lower(col("Title")))
    # Split the "Title_clean" column into an array of words
    df = df.withColumn("Title_words", split(col("Title_clean"), " "))
    # Stem the array of words in the "Title_words" column
    df = df.withColumn("Title_words", udf(lambda x: [snowball.stem(word) for word in x], ArrayType(StringType()))(col("Title_words")))
    # Remove the stopwords case insensitively from the "Title" column
    remover = StopWordsRemover(inputCol="Title_words", outputCol="Title_filtered", stopWords=stopwords, caseSensitive=False)
    filtered_df = remover.transform(df)
    # Join the words back into a sentence
    filtered_df = filtered_df.withColumn("Title_filtered", concat_ws(" ", col("Title_filtered")))
    filtered_df = filtered_df.drop("Title_clean", "Title_words")
    # Lower the case of the Title_filtered column
    filtered_df = filtered_df.withColumn("Title_filtered", lower(col("Title_filtered")))

    return filtered_df


# Preprocess the data
preprocessed_posts = preprocess(posts)

# Show the first 10 rows of the processed data
preprocessed_posts.show(10)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
+---------+--------------------+-----+------------+-------------------+--------------------+
|Subreddit|               Title|Score|Num_comments|               Date|      Title_filtered|
+---------+--------------------+-----+------------+-------------------+--------------------+
|microsoft|bing gaining new ...|  392|          91|2023-03-24 16:43:23|bing gain new use...|
|microsoft|get off my deskto...|  324|          56|2023-03-28 17:26:55|get desktop windo...|
|microsoft|microsoft 365 cop...|  231|          64|2023-03-16 16:19:05|  microsoft  copilot|
|microsoft|why is the ms hom...|  192|          62|2023-03-19 23:49:03|whi ms homepag fi...|
|microsoft|i switched to edg...|  178|          43|2023-04-02 07:56:21|switch edg  year ...|
|microsoft|bing offers free ...|  157|          13|2023-03-22 10:29:53|bing offer free i...|
|microsoft|microsoft ranked ...|  150|      