In [3]:
pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317130 sha256=877f59c8fb932f907fb8214a795e07795bc47db5b3df664ed35911ce009ca5fb
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


In [4]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [5]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [6]:
df1 = pd.read_csv('/content/gdrive/MyDrive/data_neww.csv')

In [None]:
df = df1[['overall','reviewText']]

In [None]:
df['overall'] = df['overall'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['overall'] = df['overall'].astype(int)


In [None]:
df.dtypes

overall        int64
reviewText    object
dtype: object

In [None]:
import pandas as pd
import numpy as np

# Get the indices of the rows to drop
indices_to_drop = df[df['overall'] == 5].index.tolist()
np.random.shuffle(indices_to_drop)
indices_to_drop = indices_to_drop[:len(indices_to_drop)//2]

# Drop the rows
df = df.drop(indices_to_drop)

In [None]:
df

Unnamed: 0,overall,reviewText
1,5,My 11 y.o. loved this...and so do I (you know ...
2,5,"The pictures are great , I've done one and gav..."
5,5,MY HUSBAND LOVED IT. HE IS IN TO DRAGONS.
6,5,love it
7,4,cool
...,...,...
1827444,1,Not at all what I expected. Found identical it...
1827445,5,I put these in clear soasp and gave out as par...
1827448,5,"Small, cheap, exactly what we needed! Not a t..."
1827450,1,Ours didn't work! Only 3 worked out of the wh...


In [None]:
df['overall'].value_counts()

5    651633
4    272072
3    130480
1     63396
2     58240
Name: overall, dtype: int64

In [None]:
csv_path = '/content/gdrive/MyDrive/data_neww.csv'
df.to_csv(csv_path,index=False)

In [None]:
df1

Unnamed: 0,overall,reviewText
0,5,My 11 y.o. loved this...and so do I (you know ...
1,5,"The pictures are great , I've done one and gav..."
2,5,MY HUSBAND LOVED IT. HE IS IN TO DRAGONS.
3,5,love it
4,4,cool
...,...,...
1175816,1,Not at all what I expected. Found identical it...
1175817,5,I put these in clear soasp and gave out as par...
1175818,5,"Small, cheap, exactly what we needed! Not a t..."
1175819,1,Ours didn't work! Only 3 worked out of the wh...


In [7]:
# Create a SparkSession
spark = SparkSession.builder \
    .appName("Sessiontn") \
    .master("local[1]") \
    .config("spark.driver.memory", "64g") \
    .getOrCreate()
    # .config("spark.driver.memory", "32g") \

In [8]:
sparkDF=spark.createDataFrame(df1) 
sparkDF.show()

+-------+--------------------+
|overall|          reviewText|
+-------+--------------------+
|      5|My 11 y.o. loved ...|
|      5|The pictures are ...|
|      5|MY HUSBAND LOVED ...|
|      5|             love it|
|      4|                cool|
|      5|Exactly as descri...|
|      5|Sometimes you nee...|
|      2|This is indeed a ...|
|      4|I bought several ...|
|      1|total waste of mo...|
|      3|This is pretty mu...|
|      4|its a cute little...|
|      3|This is a tiny bo...|
|      2|They were ok but ...|
|      5|Great fun for my ...|
|      4|This was a stocki...|
|      5|            Fun gift|
|      5|          great book|
|      5|a wonderful littl...|
|      2|The book is only ...|
+-------+--------------------+
only showing top 20 rows



In [10]:
random_state=42

In [9]:
from pyspark.sql.functions import col
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

In [11]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml import Pipeline

# Define the tokenizer
tokenizer = Tokenizer(inputCol="reviewText", outputCol="words")

# Define the stop words remover
stop_words_remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")

# Define the hashing term frequency (HTF) vectorizer
hashing_tf = HashingTF(inputCol="filtered_words", outputCol="raw_features")

# Define the inverse document frequency (IDF) transformer
idf = IDF(inputCol="raw_features", outputCol="features")

# Define the preprocessing pipeline
preprocessing_pipeline = Pipeline(stages=[tokenizer, stop_words_remover, hashing_tf, idf])

# Fit the preprocessing pipeline to the data and transform the data
preprocessed_data = preprocessing_pipeline.fit(sparkDF).transform(sparkDF).select("features", "overall")

In [12]:
# Split data into training and testing sets
(training_data, testing_data) = preprocessed_data.randomSplit([0.8, 0.2], seed=42)

Random Forest

In [14]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [16]:
# Train the Random Forest model
rf = RandomForestClassifier(featuresCol='features', numTrees=10, maxDepth=5, labelCol="overall")
pipeline = Pipeline(stages=[rf])
model = pipeline.fit(training_data)

# Make predictions on the test data
predictions = model.transform(testing_data)

# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol="overall", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %g" % accuracy)

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 43064)
Traceback (most recent call last):
  File "/usr/lib/python3.10/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(r

Py4JError: ignored

Logistic Regression

In [None]:
random_state=42

In [None]:
# Train logistic regression model
lr = LogisticRegression(featuresCol='features', labelCol='overall', maxIter=10, regParam=0.3, elasticNetParam=0)
lr_model = lr.fit(training_data)

In [None]:
# Evaluate logistic regression model on testing data
lr_predictions = lr_model.transform(testing_data)
lr_evaluator = MulticlassClassificationEvaluator(labelCol="overall", predictionCol="prediction", metricName="accuracy")
lr_accuracy = lr_evaluator.evaluate(lr_predictions)
print("Logistic Regression Accuracy = %g" % lr_accuracy)

Logistic Regression Accuracy = 0.613706


In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import col

# Evaluate the logistic regression model on the testing data
lr_predictions = lr_model.transform(testing_data)

# Compute the confusion matrix
lr_confusion_matrix = lr_predictions.groupBy("overall", "prediction").count().orderBy("overall", "prediction")
lr_confusion_matrix.show()

# Compute the accuracy
lr_evaluator = MulticlassClassificationEvaluator(labelCol="overall", predictionCol="prediction", metricName="accuracy")
lr_accuracy = lr_evaluator.evaluate(lr_predictions)
print("Logistic Regression Accuracy = %g" % lr_accuracy)

+-------+----------+-----+
|overall|prediction|count|
+-------+----------+-----+
|      1|       1.0| 3077|
|      1|       2.0|  210|
|      1|       3.0| 1080|
|      1|       4.0|  997|
|      1|       5.0| 7257|
|      2|       1.0|  810|
|      2|       2.0|  443|
|      2|       3.0| 1651|
|      2|       4.0| 1795|
|      2|       5.0| 6829|
|      3|       1.0|  367|
|      3|       2.0|  190|
|      3|       3.0| 3442|
|      3|       4.0| 5669|
|      3|       5.0|16194|
|      4|       1.0|  170|
|      4|       2.0|  127|
|      4|       3.0| 1528|
|      4|       4.0|12461|
|      4|       5.0|40099|
+-------+----------+-----+
only showing top 20 rows

Logistic Regression Accuracy = 0.613706


Sentiment


In [None]:
from pyspark.sql.functions import when

# Add new column based on the "overall" column
sparkDF = sparkDF.withColumn("Actual sentiment", when(sparkDF.overall >= 4, "Positive")
                                         .when(sparkDF.overall == 3, "Neutral")
                                         .otherwise("Negative"))

# Show the updated DataFrame
sparkDF.show()

+-------+--------------------+----------------+
|overall|          reviewText|Actual sentiment|
+-------+--------------------+----------------+
|      5|My 11 y.o. loved ...|        Positive|
|      5|The pictures are ...|        Positive|
|      5|MY HUSBAND LOVED ...|        Positive|
|      5|             love it|        Positive|
|      4|                cool|        Positive|
|      5|Exactly as descri...|        Positive|
|      5|Sometimes you nee...|        Positive|
|      2|This is indeed a ...|        Negative|
|      4|I bought several ...|        Positive|
|      1|total waste of mo...|        Negative|
|      3|This is pretty mu...|         Neutral|
|      4|its a cute little...|        Positive|
|      3|This is a tiny bo...|         Neutral|
|      2|They were ok but ...|        Negative|
|      5|Great fun for my ...|        Positive|
|      4|This was a stocki...|        Positive|
|      5|            Fun gift|        Positive|
|      5|          great book|        Po

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from textblob import TextBlob


# Define a UDF to compute the sentiment polarity using TextBlob
def get_sentiment(text):
    blob = TextBlob(text)
    sentiment = blob.sentiment.polarity
    if sentiment > 0:
        return "Positive"
    elif sentiment < 0:
        return "Negative"
    else:
        return "Neutral"

In [None]:
udf_get_sentiment = udf(get_sentiment, StringType())

# Add a new column with the sentiment polarity
sparkDF = sparkDF.withColumn("Predicted sentiment", udf_get_sentiment("reviewText"))

# Print the predicted sentiments
sparkDF.select("reviewText", "Predicted sentiment","Actual sentiment").show()

+--------------------+-------------------+----------------+
|          reviewText|Predicted sentiment|Actual sentiment|
+--------------------+-------------------+----------------+
|My 11 y.o. loved ...|           Positive|        Positive|
|The pictures are ...|           Positive|        Positive|
|MY HUSBAND LOVED ...|           Positive|        Positive|
|             love it|           Positive|        Positive|
|                cool|           Positive|        Positive|
|Exactly as descri...|           Positive|        Positive|
|Sometimes you nee...|           Positive|        Positive|
|This is indeed a ...|           Positive|        Negative|
|I bought several ...|           Positive|        Positive|
|total waste of mo...|           Positive|        Negative|
|This is pretty mu...|           Positive|         Neutral|
|its a cute little...|           Positive|        Positive|
|This is a tiny bo...|           Positive|         Neutral|
|They were ok but ...|           Negativ

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import when, count

# Compute the confusion matrix
confusion_matrix = (sparkDF.select(when((col("Predicted sentiment") == "Positive") & (col("Actual sentiment") == "Positive"), 1)
                              .when((col("Predicted sentiment") == "Positive") & (col("Actual sentiment") == "Negative"), 2)
                              .when((col("Predicted sentiment") == "Positive") & (col("Actual sentiment") == "Neutral"), 3)
                              .when((col("Predicted sentiment") == "Negative") & (col("Actual sentiment") == "Positive"), 4)
                              .when((col("Predicted sentiment") == "Negative") & (col("Actual sentiment") == "Negative"), 5)
                              .when((col("Predicted sentiment") == "Negative") & (col("Actual sentiment") == "Neutral"), 6)
                              .when((col("Predicted sentiment") == "Neutral") & (col("Actual sentiment") == "Positive"), 7)
                              .when((col("Predicted sentiment") == "Neutral") & (col("Actual sentiment") == "Negative"), 8)
                              .when((col("Predicted sentiment") == "Neutral") & (col("Actual sentiment") == "Neutral"), 9)
                              .alias("code"))
                   .groupBy("code")
                   .agg(count("code").alias("count"))
                   .orderBy("code"))

# Print the confusion matrix
confusion_matrix.show()

+----+------+
|code| count|
+----+------+
|   1|806548|
|   2| 61886|
|   3| 97058|
|   4| 48236|
|   5| 48650|
|   6| 25084|
|   7| 68921|
|   8| 11100|
|   9|  8338|
+----+------+

