In [1]:
#SENTIMENT ANALYSIS

In [2]:
#TOKENAIZATION
#STOP WORDS REMOVAL
#LEMAATIZATION
#VECTORIZATION

In [3]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, StringIndexer, HashingTF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [4]:
spark = SparkSession.builder.master("local[*]").appName("sentiment_analysis_app_2").config("spark.executor.memory", "8g").config("spark.driver.memory", "8g").config("spark.executor.cores", "4").getOrCreate()

In [5]:
df=spark.read.csv(r"C:\Users\ACER\OneDrive\Desktop\spark_lab\data\IMDB Dataset.csv",header=True,inferSchema=True)

In [6]:
df.show(10)

+--------------------+--------------------+
|              review|           sentiment|
+--------------------+--------------------+
|One of the other ...|            positive|
|"A wonderful litt...| not only is it w...|
|"I thought this w...| but spirited you...|
|Basically there's...|            negative|
|"Petter Mattei's ...| power and succes...|
|"Probably my all-...| but that only ma...|
|I sure would like...|            positive|
|This show was an ...|            negative|
|Encouraged by the...|            negative|
|If you like origi...|            positive|
+--------------------+--------------------+
only showing top 10 rows



In [7]:
df=df.withColumnRenamed("sentiment","label").withColumnRenamed("review","text")

In [8]:
df = df.filter(df["label"].isNotNull()).withColumn("label", df["label"].cast("string"))

In [9]:
df.show(5)

+--------------------+--------------------+
|                text|               label|
+--------------------+--------------------+
|One of the other ...|            positive|
|"A wonderful litt...| not only is it w...|
|"I thought this w...| but spirited you...|
|Basically there's...|            negative|
|"Petter Mattei's ...| power and succes...|
+--------------------+--------------------+
only showing top 5 rows



In [10]:
df = df.sample(fraction=0.4)

In [11]:
label_indexer = StringIndexer(inputCol="label", outputCol="label_index", handleInvalid="keep")
# Tokenization
tokenizer = Tokenizer(inputCol="text", outputCol="words")

# Remove stopwords
stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
vectorizer = HashingTF(inputCol="filtered_words", outputCol="features", numFeatures=800)
logistic = LogisticRegression(featuresCol="features", labelCol="label_index")

In [12]:
# Build Pipeline - setup stages one by one
pipeline = Pipeline(stages=[label_indexer, tokenizer, stopwords_remover, vectorizer, logistic])

In [13]:
# split the data into training and testing
# we train on 80% of data and testing on 20% of data
train_df, test_df = df.randomSplit([0.7, 0.3])

In [14]:
model = pipeline.fit(train_df)

In [15]:
predictions = model.transform(test_df)

In [35]:
predictions.show(5)

+--------------------+--------------------+-----------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|                text|               label|label_index|               words|      filtered_words|            features|       rawPrediction|         probability|prediction|
+--------------------+--------------------+-----------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|"""Ardh Satya"" i...| this one is the ...|     5874.0|["""ardh, satya""...|["""ardh, satya""...|(800,[68,144,163,...|[232.461188080935...|[0.53233122645475...|       0.0|
|"""Coconut Fred's...| and every episod...|     5874.0|["""coconut, fred...|["""coconut, fred...|(800,[65,167,195,...|[82.5389409833174...|[1.64774223025627...|      83.0|
|"""Crossfire"" is...| which is anti-Se...|     5874.0|["""crossfire"", ...|["""crossfire"", ...|(800,[31,134,172,...|[39.4436920322885...|[

In [38]:
# Filter for negative sentiments
negative_df = predictions.filter(predictions.prediction == 0.0)  # Assuming 0.0 represents negative
print("Negative Sentiments:")
negative_df.select("text").show(truncate=False)


Negative Sentiments:
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [39]:
# Filter for neutral sentiments
neutral_df = predictions.filter(predictions.prediction == 1.0)  # Assuming 1.0 represents neutral
print("Neutral Sentiments:")
neutral_df.select("text").show(truncate=False)


Neutral Sentiments:
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [40]:
# Filter for positive sentiments
positive_df = predictions.filter(predictions.prediction == 2.0)  # Assuming 2.0 represents positive
print("Positive Sentiments:")
positive_df.select("text").show(truncate=False)


Positive Sentiments:
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------