In [1]:
# Environment bootstrap: install Java 8, fetch and unpack Spark 3.2.0, install findspark.
# The apt-get output is redirected to /dev/null to keep the cell output quiet.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Download the Spark 3.2.0 / Hadoop 3.2 tarball and unpack it into the current working directory.
!wget -q https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz
!tar xf spark-3.2.0-bin-hadoop3.2.tgz
!pip install -q findspark

import os
# Both variables are set before findspark.init() is called below.
# NOTE(review): SPARK_HOME is a relative path, so this presumably relies on the kernel's
# working directory being the directory where the tarball was unpacked — confirm.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "spark-3.2.0-bin-hadoop3.2"

import findspark
# Presumably locates the Spark distribution via SPARK_HOME and makes pyspark importable —
# see the findspark documentation.
findspark.init()

from pyspark.sql import SparkSession
# Create (or reuse) a session with the "local[*]" master, i.e. Spark running locally
# rather than against a cluster.
spark = SparkSession.builder.master("local[*]").getOrCreate()
# Handle to the underlying SparkContext; the bare `sc` on the last line displays it as
# the cell's output.
sc = spark.sparkContext
sc

In [16]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, when, col
from pyspark.sql.types import IntegerType
import string

In [17]:
!unzip /content/moviereviews.zip

Archive:  /content/moviereviews.zip
replace moviereviews/cv000_29416.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: moviereviews/cv000_29416.txt  
replace moviereviews/cv000_29590.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: moviereviews/cv000_29590.txt  
  inflating: moviereviews/cv001_18431.txt  
  inflating: moviereviews/cv001_19502.txt  
  inflating: moviereviews/cv002_15918.txt  
  inflating: moviereviews/cv002_17424.txt  
  inflating: moviereviews/cv003_11664.txt  
  inflating: moviereviews/cv003_12683.txt  
  inflating: moviereviews/cv004_11636.txt  
  inflating: moviereviews/cv004_12641.txt  
  inflating: moviereviews/cv005_29357.txt  
  inflating: moviereviews/cv005_29443.txt  
  inflating: moviereviews/cv006_15448.txt  
  inflating: moviereviews/cv006_17022.txt  
  inflating: moviereviews/cv007_4968.txt  
  inflating: moviereviews/cv007_4992.txt  
  inflating: moviereviews/cv008_29326.txt  
  inflating: moviereviews/cv008_29435.txt  
  inflating: movie

In [18]:
# Step 1: Load the sentiment word lists
# Each text file is read into a single-column DataFrame; the first (only) field of every
# collected row is pulled out so both names end up as plain Python lists on the driver.
positive_words = [row[0] for row in spark.read.text("/content/pos.txt").collect()]
negative_words = [row[0] for row in spark.read.text("/content/neg.txt").collect()]

In [19]:
from pyspark.sql.functions import col, when,substring_index
# Score a single review against the positive and negative word lists
def calculate_sentiment(review, positive_lexicon=None, negative_lexicon=None):
    """Count lexicon hits in a review and label its overall sentiment.

    The review is split on whitespace and each token is compared verbatim
    (case-sensitive, no punctuation stripping) against the two lexicons.

    Args:
        review: Review text. ``None`` or an empty string is treated as a
            review with no words.
        positive_lexicon: Iterable of positive words. Defaults to the
            notebook-level ``positive_words``.
        negative_lexicon: Iterable of negative words. Defaults to the
            notebook-level ``negative_words``.

    Returns:
        A tuple ``(positive_count, negative_count, sentiment)`` where
        ``sentiment`` is ``"positive"``, ``"negative"`` or ``"neutral"``
        (the latter when both counts are equal).
    """
    if positive_lexicon is None:
        positive_lexicon = positive_words
    if negative_lexicon is None:
        negative_lexicon = negative_words

    # Membership tests against a list scan the whole list for every token;
    # hashing the lexicons once per call makes each lookup constant time.
    if not isinstance(positive_lexicon, (set, frozenset)):
        positive_lexicon = frozenset(positive_lexicon)
    if not isinstance(negative_lexicon, (set, frozenset)):
        negative_lexicon = frozenset(negative_lexicon)

    # A missing review contributes no tokens instead of raising on .split()
    words = review.split() if review else []

    # Every occurrence counts, so a repeated word is counted each time it appears
    positive_count = sum(1 for word in words if word in positive_lexicon)
    negative_count = sum(1 for word in words if word in negative_lexicon)

    # Ties (including "no hits at all") are reported as neutral
    if positive_count > negative_count:
        sentiment = "positive"
    elif positive_count < negative_count:
        sentiment = "negative"
    else:
        sentiment = "neutral"

    return (positive_count, negative_count, sentiment)

# Read every file in the reviews directory as a (path, contents) pair
reviews = sc.wholeTextFiles("/content/moviereviews/")

# Score each review, keeping the source path as the first field of the record
sentiment_scores = reviews.map(
    lambda path_and_text: (path_and_text[0],) + calculate_sentiment(path_and_text[1])
)

# Name the columns, then cut "File" down to the part after the last "/"
sentiment_df = sentiment_scores.toDF(
    ["File", "Positive_Count", "Negative_Count", "Sentiment"]
).withColumn("File", substring_index(col("File"), "/", -1))

# Print the first rows of the per-review sentiment table
sentiment_df.show()

+---------------+--------------+--------------+---------+
|           File|Positive_Count|Negative_Count|Sentiment|
+---------------+--------------+--------------+---------+
|cv755_23616.txt|            23|            13| positive|
|cv774_15488.txt|            20|            24| negative|
|cv700_21947.txt|            55|            36| positive|
|cv901_11934.txt|            12|             8| positive|
|cv960_29007.txt|            26|            41| negative|
|cv112_11193.txt|            35|            39| negative|
|cv137_17020.txt|            23|            34| negative|
|cv883_27751.txt|            17|             9| positive|
| cv020_9234.txt|            20|            32| negative|
|cv399_28593.txt|            18|            25| negative|
| cv803_8584.txt|            35|            45| negative|
|cv391_10802.txt|            35|            37| negative|
|cv262_12649.txt|            37|            47| negative|
|cv412_25254.txt|            53|            60| negative|
|cv181_14401.t