In [1]:
import pyspark
import pandas as pd

In [2]:
# Spark Session created
#from pyspark.sql import SparkSession
#spark = SparkSession.builder.getOrCreate()
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("topic_modeling") \
    .config("spark.driver.memory", "4g") \
    .getOrCreate()


In [3]:
#spark.stop()

In [4]:
df = pd.read_csv("tweets.csv")
df.head(1)

Unnamed: 0,TweetID,UserID,Username,Timestamp,Text,State,County,City,Sentiment,COVID-related,AgeGroup,Age_Confidence,Gender,Gender_Confidence,Org_Confidence,Retweet
0,1240790697504051203,724718492205875203,lionray98,Fri Mar 20 00:02:09 +0000 2020,We learned today that two Lakers players have ...,Louisiana,,,0.7783,1,19-29,0.9404,male,0.9999,0.0,1


In [5]:
# Convert Pandas DataFrame to Spark DataFrame
spark_df = spark.createDataFrame(df)

# Show the Spark DataFrame
spark_df.show(1)

+-------------------+------------------+---------+--------------------+--------------------+---------+------+----+---------+-------------+--------+--------------+------+-----------------+--------------+-------+
|            TweetID|            UserID| Username|           Timestamp|                Text|    State|County|City|Sentiment|COVID-related|AgeGroup|Age_Confidence|Gender|Gender_Confidence|Org_Confidence|Retweet|
+-------------------+------------------+---------+--------------------+--------------------+---------+------+----+---------+-------------+--------+--------------+------+-----------------+--------------+-------+
|1240790697504051203|724718492205875203|lionray98|Fri Mar 20 00:02:...|We learned today ...|Louisiana|   NaN| NaN|   0.7783|            1|   19-29|        0.9404|  male|           0.9999|           0.0|      1|
+-------------------+------------------+---------+--------------------+--------------------+---------+------+----+---------+-------------+--------+---------

In [6]:
# Get the number of rows
num_rows = spark_df.count()
print(f"Number of rows: {num_rows}")
# Get the number of columns
num_columns = len(spark_df.columns)
print(f"Number of columns: {num_columns}")

Number of rows: 314534
Number of columns: 16


In [7]:
## Data Preprocessing 

from pyspark.ml.feature import Tokenizer, StopWordsRemover

# Tokenize the text
tokenizer = Tokenizer(inputCol="Text", outputCol="words")
tweets_words = tokenizer.transform(spark_df)

# Remove stop words
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
tweets_filtered = remover.transform(tweets_words)

# Show the preprocessed DataFrame
tweets_filtered.select("Text", "filtered_words").show(truncate=False)


+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Text                                                                                                                                                                                                                                                                                                                |filtered_words                                                                                    

In [8]:
## Data Preprocessing 

from pyspark.ml.feature import CountVectorizer

# Create a CountVectorizer model
cv = CountVectorizer(inputCol="filtered_words", outputCol="features")

# Fit the model to the DataFrame and transform it
cv_model = cv.fit(tweets_filtered)
tweets_dtm = cv_model.transform(tweets_filtered)

# Show the DTM
tweets_dtm.select("Text", "features").show(truncate=False)


+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Text                                                                                                                                                                                                                                                                                                                |features                                                                                    

In [9]:
from pyspark.ml.clustering import LDA

# Train the LDA model
lda = LDA(k=5, maxIter=10)  # k is the number of topics
lda_model = lda.fit(tweets_dtm)

# Describe topics
topics = lda_model.describeTopics()
vocab = cv_model.vocabulary
for topic in topics.collect():
    print("Topic {}: {}".format(topic[0], [vocab[idx] for idx in topic[1]]))


Topic 0: ['', 'pandemic', '&amp;', 'coronavirus', '#covid19', 'lie', 'system', 'health', 'it.', 'must']
Topic 1: ['', 'still', 'please', 'like', 'mask', 'retweet', '#covid19', 'see', 'much', 'wearing']
Topic 2: ['', 'mask', 'quarantine', 'wear', 'day', 'coronavirus', 'like', 'people', 'got', '&amp;']
Topic 3: ['', 'coronavirus', 'people', 'trump', 'covid-19', '&amp;', 'virus', 'pandemic', '-', '#covid19']
Topic 4: ['quarantine', 'people', '', 'trump', 'want', 'like', 'coronavirus', 'wear', 'covid', 'covid-19']


In [10]:
# Assign topics to documents
topic_distributions = lda_model.transform(tweets_dtm)

# Show the topic distribution for each document
topic_distributions.select("Text", "topicDistribution").show(1, truncate=False)


+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------+
|Text                                                                                                                                                                                                     |topicDistribution                                                                                     |
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------+
|We learned today that two Lakers players have tested positive for COVID-19. Bo

In [1]:
import nbformat

# Load the notebook file
with open('topic_modeling.ipynb', 'r', encoding='utf-8') as f:
    notebook = nbformat.read(f, as_version=4)

# Count the lines of code in all code cells
code_lines = 0
for cell in notebook['cells']:
    if cell['cell_type'] == 'code':
        code_lines += len(cell['source'].splitlines())

print(f"Total lines of code: {code_lines}")


Total lines of code: 68


In [1]:
import json

def count_spark_operations(notebook_path):
    # Open the notebook file with utf-8 encoding
    with open(notebook_path, 'r', encoding='utf-8') as f:
        notebook = json.load(f)

    spark_operation_count = 0
    for cell in notebook['cells']:
        if cell['cell_type'] == 'code':
            code = cell['source']
            # Check for common Spark operation keywords
            for line in code:
                if any(op in line for op in ['filter', 'select', 'groupBy', 'join', 'map', 'reduce', 'show', 'collect']):
                    spark_operation_count += 1

    return spark_operation_count

# Example usage
notebook_path = 'topic_modeling.ipynb'  # Update this with your notebook path
operation_count = count_spark_operations(notebook_path)
print(f"Number of Spark Operations Found: {operation_count}")


Number of Spark Operations Found: 10
