# NLP Sentiment Analysis

In [1]:
spark

StatementMeta(80dcc4b2-bc50-4e81-91e5-397b7f13252e, 13, 6, Finished, Available)

In [15]:
%%configure -f \
{"conf": {"spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.2"}}

StatementMeta(80dcc4b2-bc50-4e81-91e5-397b7f13252e, 15, -1, Finished, Available)

Unrecognized options: 

In [16]:
!pip install spark-nlp

StatementMeta(80dcc4b2-bc50-4e81-91e5-397b7f13252e, 15, 6, Finished, Available)

Collecting spark-nlp
  Downloading spark_nlp-5.1.4-py2.py3-none-any.whl (540 kB)
[K     |████████████████████████████████| 540 kB 9.3 MB/s eta 0:00:01
[?25hInstalling collected packages: spark-nlp
Successfully installed spark-nlp-5.1.4


In [17]:
workspace_default_storage_account = "group10astorage46582e02e"
workspace_default_container = "azureml-blobstore-e8a18b52-3288-4d1f-9f32-d5a9249c2c0e"
workspace_wasbs_base_url = (f"wasbs://{workspace_default_container}@{workspace_default_storage_account}.blob.core.windows.net/")
comment_load = spark.read.parquet(f"{workspace_wasbs_base_url}/mbti_comments.parquet")
comment_load.printSchema()

StatementMeta(80dcc4b2-bc50-4e81-91e5-397b7f13252e, 15, 7, Finished, Available)

root
 |-- sub_id: string (nullable = true)
 |-- comment_author: string (nullable = true)
 |-- comment_text: string (nullable = true)
 |-- link_id: string (nullable = true)
 |-- comment_score: long (nullable = true)
 |-- comment_controversiality: long (nullable = true)
 |-- reply_to: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)



In [18]:
# Cache the dataset
comment_load.cache()

StatementMeta(80dcc4b2-bc50-4e81-91e5-397b7f13252e, 15, 8, Finished, Available)

DataFrame[sub_id: string, comment_author: string, comment_text: string, link_id: string, comment_score: bigint, comment_controversiality: bigint, reply_to: string, year: int, month: int]

In [29]:
from pyspark.sql import functions as F
# Find the range of comment_score
min_score, max_score = comment_load.agg(
    F.min("comment_score"), 
    F.max("comment_score")
).first()

print(f"Range of comment_score: {min_score} to {max_score}")

StatementMeta(80dcc4b2-bc50-4e81-91e5-397b7f13252e, 12, 10, Finished, Available)

Range of comment_score: -126 to 1259


In [19]:
# Calculate the 25th, 50th, and 75th percentiles
quantiles = comment_load.stat.approxQuantile("comment_score", [0.25, 0.5, 0.75], 0.0)

print(f"25th percentile: {quantiles[0]}")
print(f"50th percentile (median): {quantiles[1]}")
print(f"75th percentile: {quantiles[2]}")

StatementMeta(80dcc4b2-bc50-4e81-91e5-397b7f13252e, 15, 9, Finished, Available)

25th percentile: 1.0
50th percentile (median): 2.0
75th percentile: 3.0


In [31]:
comment_score_summary = comment_load.describe(['comment_score'])
comment_score_summary.show()

StatementMeta(80dcc4b2-bc50-4e81-91e5-397b7f13252e, 12, 12, Finished, Available)

+-------+------------------+
|summary|     comment_score|
+-------+------------------+
|  count|           1834140|
|   mean| 4.352661192711571|
| stddev|13.546681999290229|
|    min|              -126|
|    max|              1259|
+-------+------------------+



In [32]:
comment_score_summary.toPandas().to_csv("Users/ml2078/fall-2023-reddit-project-team-10/data/csv/comment_score_summary.csv",index=False)

StatementMeta(80dcc4b2-bc50-4e81-91e5-397b7f13252e, 12, 13, Finished, Available)

In [20]:
from pyspark.sql import functions as F
# Create a new categorical column based on comment_score division
def score_category(score):
    if score <= quantiles[0]:
        return 'Low'
    elif score <= quantiles[1]:
        return 'Medium'
    elif score <= quantiles[2]:
        return 'High'
    else:
        return 'Very High'

score_category_udf = F.udf(score_category)

comment_load = comment_load.withColumn("score_category", score_category_udf("comment_score"))

# View the schema to confirm the new column addition
comment_load.printSchema()

StatementMeta(80dcc4b2-bc50-4e81-91e5-397b7f13252e, 15, 10, Finished, Available)

root
 |-- sub_id: string (nullable = true)
 |-- comment_author: string (nullable = true)
 |-- comment_text: string (nullable = true)
 |-- link_id: string (nullable = true)
 |-- comment_score: long (nullable = true)
 |-- comment_controversiality: long (nullable = true)
 |-- reply_to: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- score_category: string (nullable = true)



In [34]:
from pyspark.sql import functions as F

# Assuming your DataFrame with the score_category column is named 'results'
category_counts = comment_load.groupBy("score_category","reply_to").count()

# Show the results
category_counts.orderBy("score_category","reply_to").show()


StatementMeta(80dcc4b2-bc50-4e81-91e5-397b7f13252e, 12, 15, Finished, Available)

+--------------+--------+------+
|score_category|reply_to| count|
+--------------+--------+------+
|          High|      t1|101867|
|          High|      t3| 89412|
|           Low|      t1|430248|
|           Low|      t3|380668|
|        Medium|      t1|236557|
|        Medium|      t3|179771|
|     Very High|      t1|206793|
|     Very High|      t3|208824|
+--------------+--------+------+



In [35]:
from pyspark.sql import functions as F

# Calculate the total counts for each reply_to category
total_counts = category_counts.groupBy("reply_to").agg(F.sum("count").alias("total_count"))

# Join the total counts back to the original DataFrame
combined_data = category_counts.join(total_counts, "reply_to")

# Calculate the percentage
combined_data_with_percentage = combined_data.withColumn(
    "percentage",
    F.round((F.col("count") / F.col("total_count")),2),
).orderBy("reply_to","score_category").select("reply_to","score_category","count","percentage")


StatementMeta(80dcc4b2-bc50-4e81-91e5-397b7f13252e, 12, 16, Finished, Available)

In [37]:
combined_data_with_percentage.toPandas().to_csv("Users/ml2078/fall-2023-reddit-project-team-10/data/csv/comment_score_percentage.csv",index=False)

StatementMeta(80dcc4b2-bc50-4e81-91e5-397b7f13252e, 12, 18, Finished, Available)

In [36]:
# Show the results
combined_data_with_percentage.show()

StatementMeta(80dcc4b2-bc50-4e81-91e5-397b7f13252e, 12, 17, Finished, Available)

+--------+--------------+------+----------+
|reply_to|score_category| count|percentage|
+--------+--------------+------+----------+
|      t1|          High|101867|       0.1|
|      t1|           Low|430248|      0.44|
|      t1|        Medium|236557|      0.24|
|      t1|     Very High|206793|      0.21|
|      t3|          High| 89412|       0.1|
|      t3|           Low|380668|      0.44|
|      t3|        Medium|179771|      0.21|
|      t3|     Very High|208824|      0.24|
+--------+--------------+------+----------+



In [21]:
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import UniversalSentenceEncoder, SentimentDLModel
from pyspark.ml import Pipeline

# Define the name of the SentimentDLModel
MODEL_NAME = "sentimentdl_use_twitter"  # Replace with the model name you intend to use

# Configure the Document Assembler
documentAssembler = DocumentAssembler()\
    .setInputCol("comment_text")\
    .setOutputCol("document")

# Configure the Universal Sentence Encoder
use = UniversalSentenceEncoder.pretrained(name="tfhub_use", lang="en")\
    .setInputCols(["document"])\
    .setOutputCol("sentence_embeddings")

# Configure the SentimentDLModel
sentimentdl = SentimentDLModel.pretrained(name=MODEL_NAME, lang="en")\
    .setInputCols(["sentence_embeddings"])\
    .setOutputCol("sentiment")

# Set up the NLP Pipeline
nlpPipeline = Pipeline(
    stages=[
        documentAssembler,
        use,
        sentimentdl
    ])

# Apply the Pipeline to your DataFrame
pipelineModel = nlpPipeline.fit(comment_load)
results = pipelineModel.transform(comment_load)


StatementMeta(80dcc4b2-bc50-4e81-91e5-397b7f13252e, 15, 11, Finished, Available)

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]
sentimentdl_use_twitter download started this may take some time.
Approximate size to download 11.4 MB
[OK!]


In [39]:
results.printSchema()

StatementMeta(80dcc4b2-bc50-4e81-91e5-397b7f13252e, 12, 20, Finished, Available)

root
 |-- sub_id: string (nullable = true)
 |-- comment_author: string (nullable = true)
 |-- comment_text: string (nullable = true)
 |-- link_id: string (nullable = true)
 |-- comment_score: long (nullable = true)
 |-- comment_controversiality: long (nullable = true)
 |-- reply_to: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- score_category: string (nullable = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- sentence_embeddings: array (nu

In [22]:
result_df = results.select("comment_text","comment_controversiality","reply_to","score_category",F.explode("sentiment.result").alias("sentiment"))

StatementMeta(80dcc4b2-bc50-4e81-91e5-397b7f13252e, 15, 12, Finished, Available)

In [42]:
result_df.show(10)

StatementMeta(80dcc4b2-bc50-4e81-91e5-397b7f13252e, 12, 23, Finished, Available)

+--------------------+------------------------+--------+--------------+---------+
|        comment_text|comment_controversiality|reply_to|score_category|sentiment|
+--------------------+------------------------+--------+--------------+---------+
|yes it feels like...|                       0|      t3|     Very High| positive|
|   Hahaha! What?????|                       0|      t1|           Low| positive|
|           [deleted]|                       0|      t1|        Medium| negative|
|I'd photo my frie...|                       0|      t3|           Low| positive|
|I think you may b...|                       0|      t1|           Low| positive|
|&gt;they're reall...|                       0|      t1|        Medium| positive|
|Interesting why d...|                       0|      t1|           Low| positive|
|I did some creepy...|                       0|      t3|           Low| negative|
|I see Shrek using...|                       0|      t1|           Low| negative|
|               

In [24]:
result_df.limit(5).toPandas().to_csv("Users/ml2078/fall-2023-reddit-project-team-10/data/csv/sentiment_result_limit5.csv",index=False)

StatementMeta(80dcc4b2-bc50-4e81-91e5-397b7f13252e, 15, 14, Finished, Available)

In [23]:
from pyspark.sql import functions as F

# Group by score_category and sentiment, then count the occurrences
category_sentiment_counts = result_df.groupBy("score_category", "sentiment").count()
category_sentiment_counts.show()

StatementMeta(80dcc4b2-bc50-4e81-91e5-397b7f13252e, 15, 13, Finished, Available)

+--------------+---------+------+
|score_category|sentiment| count|
+--------------+---------+------+
|           Low|  neutral| 43229|
|           Low| negative|257820|
|        Medium| positive|288381|
|     Very High| positive|274252|
|           Low| positive|509866|
|          High| negative| 50260|
|        Medium| negative|105302|
|     Very High| negative|118009|
|        Medium|  neutral| 22645|
|     Very High|  neutral| 23356|
|          High|  neutral| 10537|
|          High| positive|130481|
+--------------+---------+------+



In [25]:
category_sentiment_counts.toPandas().to_csv("Users/ml2078/fall-2023-reddit-project-team-10/data/csv/sentiment_counts.csv",index=False)

StatementMeta(80dcc4b2-bc50-4e81-91e5-397b7f13252e, 15, 15, Finished, Available)

In [26]:
pandas_df = category_sentiment_counts.toPandas()


StatementMeta(80dcc4b2-bc50-4e81-91e5-397b7f13252e, 15, 16, Finished, Available)

In [40]:
# Reshape the data for heatmap plotting
heatmap_data = pandas_df.pivot("score_category", "sentiment", "count")

StatementMeta(80dcc4b2-bc50-4e81-91e5-397b7f13252e, 15, 30, Finished, Available)

In [41]:
import pandas as pd
# Convert the 'score_category' to a categorical type with the desired order
ordered_categories = ['Low','Medium','High','Very High']
heatmap_data.index = pd.CategoricalIndex(heatmap_data.index, categories=ordered_categories, ordered=True)

# Sort the DataFrame by the 'score_category' index to ensure the order is applied
heatmap_data.sort_index(level='score_category', ascending=False, inplace=True)

StatementMeta(80dcc4b2-bc50-4e81-91e5-397b7f13252e, 15, 31, Finished, Available)

In [42]:
heatmap_data.head()

StatementMeta(80dcc4b2-bc50-4e81-91e5-397b7f13252e, 15, 32, Finished, Available)

sentiment,negative,neutral,positive
score_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Very High,118009,23356,274252
High,50260,10537,130481
Medium,105302,22645,288381
Low,257820,43229,509866


In [44]:
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 8))
sentiment_heatmap = sns.heatmap(heatmap_data, annot=True, fmt=".2f", cmap="YlGnBu")
plt.title('Heatmap of Sentiment Percentage by Score Category')
plt.ylabel('Score Category')
plt.xlabel('Sentiment')

plt.savefig('Users/ml2078/fall-2023-reddit-project-team-10/plots/csv/heatmap.png', dpi=300, bbox_inches='tight')
plt.show()


StatementMeta(80dcc4b2-bc50-4e81-91e5-397b7f13252e, 15, 34, Finished, Available)

FileNotFoundError: [Errno 2] No such file or directory: 'Users/ml2078/fall-2023-reddit-project-team-10/plots/csv/heatmap.png'