In [1]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import col,udf
from pyspark.ml import Pipeline
from sparknlp.pretrained import PretrainedPipeline
from pyspark.sql import functions as F
from langdetect import detect
from pyspark.sql.types import StringType
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics


In [2]:
reviews = spark.read.json("Review_Data/*/**")
print(reviews.count())
print(reviews[reviews["label"]==1].count())
reviews_negative = reviews.filter(F.col('label')==0)


1436
1207


In [3]:
reviews = reviews.dropDuplicates(['review_id'])
reviews.show()


+-------+-----+---------+--------------------+
| app_id|label|review_id|         review_text|
+-------+-----+---------+--------------------+
| 847780|    1|106594456| Amusing short romp.|
| 847780|    1|119065268|I played this ove...|
| 847780|    1|120293405|Curiously enough ...|
| 847780|    1|120367991|Very fun, short R...|
| 861410|    1|122937487|                yea.|
| 861410|    1|126995532|Improving in many...|
|2179720|    0|128009196|Disclaimer: I'm n...|
| 861410|    0|128026207|I don't know if I...|
|2179720|    1|128182864|This game has a l...|
| 861410|    0|128203588|2/11/2023 Update:...|
|2179720|    1|128210905|Great game try it...|
| 861410|    0|128252677|This game is a cu...|
|2179720|    0|128763431|Found the airplan...|
| 861410|    1|129196909|Don't let the fra...|
| 861410|    0|129795263|[b]TL;DR: Don't b...|
|2077590|    1|130528878|Addictive little ...|
|2077590|    1|130529073|Fun gameplay and ...|
|2077590|    1|130571366|This is such a gr...|
|2179720|    

In [4]:

def detect_language(text):
    try:
        return detect(text)
    except:
        return None

In [5]:

detect_language_udf = udf(detect_language, StringType())

df = reviews.withColumn('language', detect_language_udf(reviews['review_text']))
df.show()
language_freq = df.groupBy('language').count().orderBy(col('count').desc())
language_freq.show()

+-------+-----+---------+--------------------+--------+
| app_id|label|review_id|         review_text|language|
+-------+-----+---------+--------------------+--------+
| 847780|    1|106594456| Amusing short romp.|      en|
| 847780|    1|119065268|I played this ove...|      en|
| 847780|    1|120293405|Curiously enough ...|      en|
| 847780|    1|120367991|Very fun, short R...|      en|
| 861410|    1|122937487|                yea.|      sw|
| 861410|    1|126995532|Improving in many...|      en|
|2179720|    0|128009196|Disclaimer: I'm n...|      en|
| 861410|    0|128026207|I don't know if I...|      en|
|2179720|    1|128182864|This game has a l...|      en|
| 861410|    0|128203588|2/11/2023 Update:...|      en|
|2179720|    1|128210905|Great game try it...|      en|
| 861410|    0|128252677|This game is a cu...|      en|
|2179720|    0|128763431|Found the airplan...|      en|
| 861410|    1|129196909|Don't let the fra...|      en|
| 861410|    0|129795263|[b]TL;DR: Don't b...|  

In [6]:
reviews = df.filter(df['language'] == 'en')

In [7]:
print(reviews.count())
print(reviews[reviews["label"]==1].count())
print(reviews[reviews["label"]==0].count())

reviews.show()

792
646
144
+-------+-----+---------+--------------------+--------+
| app_id|label|review_id|         review_text|language|
+-------+-----+---------+--------------------+--------+
| 847780|    1|106594456| Amusing short romp.|      en|
| 847780|    1|119065268|I played this ove...|      en|
| 847780|    1|120293405|Curiously enough ...|      en|
| 847780|    1|120367991|Very fun, short R...|      en|
| 861410|    1|126995532|Improving in many...|      en|
|2179720|    0|128009196|Disclaimer: I'm n...|      en|
| 861410|    0|128026207|I don't know if I...|      en|
|2179720|    1|128182864|This game has a l...|      en|
| 861410|    0|128203588|2/11/2023 Update:...|      en|
|2179720|    1|128210905|Great game try it...|      en|
| 861410|    0|128252677|This game is a cu...|      en|
|2179720|    0|128763431|Found the airplan...|      en|
| 861410|    0|129795263|[b]TL;DR: Don't b...|      en|
|2077590|    1|130528878|Addictive little ...|      en|
|2077590|    1|130529073|Fun gamepla

In [8]:
train_data, test_data = reviews.randomSplit([0.8, 0.2], seed=42)
print(train_data.count())
train_data_positive = train_data.filter(F.col('label')==1)
train_data_negative = train_data.filter(F.col('label')==0)
print(train_data_positive.count())
print(train_data_negative.count())

664
549
116


In [9]:
ratio = int(train_data_positive.count()/ train_data_negative.count())

oversampled_train_data = train_data_negative.withColumn("dummy",F.explode(F.array([F.lit(x) for x in range(ratio)]))).drop('dummy')

print("ratio befween positive and negative review is " + str(ratio))

ratio befween positive and negative review is 4


In [10]:
train_data = train_data_positive.unionAll(oversampled_train_data)

In [11]:
print(train_data.filter(F.col('label')==1).count())
print(train_data[train_data["label"]==1].count())
print(train_data[train_data["label"]==0].count())


547
545
488


In [12]:
tokenizer = Tokenizer(inputCol="review_text", outputCol="words")
stopwords = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="filtered_words")
hashingTF = HashingTF(numFeatures=1000, inputCol=stopwords.getOutputCol(), outputCol="features")
rf = RandomForestClassifier(labelCol="label", featuresCol="features")
pipeline = Pipeline(stages=[tokenizer, stopwords, hashingTF,rf])

model = pipeline.fit(train_data)

#############

predictions = model.transform(test_data)

evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="prediction", metricName="areaUnderROC")
auc = evaluator.evaluate(predictions)
print("AUC: ", auc)

AUC:  0.7502548419979613


In [14]:

# Convert prediction and label columns to float, necessary for the confusion matrix
predictions = predictions.withColumn("label", predictions["label"].cast("double"))
predictions = predictions.withColumn("prediction", predictions["prediction"].cast("double"))

# Select prediction and label columns
predictionAndLabels = predictions.select("prediction", "label")

# Compute raw scores on the test set
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
accuracy = evaluator.evaluate(predictionAndLabels)
print("Test set accuracy = " + str(accuracy))

# Create a confusion matrix
metrics = MulticlassMetrics(predictionAndLabels.rdd)
confusion_matrix = metrics.confusionMatrix().toArray()
print("Confusion Matrix:\n", confusion_matrix)

Test set accuracy = 0.873015873015873




Confusion Matrix:
 [[ 5. 20.]
 [ 7. 94.]]


In [15]:
#save model
model.write().overwrite().save("Model1.RF")