In [45]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [36]:
version = "3.5.3"
!wget https://downloads.apache.org/spark/spark-{version}/spark-{version}-bin-hadoop3.tgz

--2024-11-18 20:46:07--  https://downloads.apache.org/spark/spark-3.5.3/spark-3.5.3-bin-hadoop3.tgz
Resolving downloads.apache.org (downloads.apache.org)... 135.181.214.104, 88.99.208.237, 2a01:4f8:10a:39da::2, ...
Connecting to downloads.apache.org (downloads.apache.org)|135.181.214.104|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 400864419 (382M) [application/x-gzip]
Saving to: ‘spark-3.5.3-bin-hadoop3.tgz.1’

      spark-3.5.3-b   0%[                    ]   1.18M  1.01MB/s               ^C


In [37]:
!tar xzvf spark-{version}-bin-hadoop3.tgz

spark-3.5.3-bin-hadoop3/
spark-3.5.3-bin-hadoop3/data/
spark-3.5.3-bin-hadoop3/data/graphx/
spark-3.5.3-bin-hadoop3/data/graphx/users.txt
spark-3.5.3-bin-hadoop3/data/graphx/followers.txt
spark-3.5.3-bin-hadoop3/data/mllib/
spark-3.5.3-bin-hadoop3/data/mllib/sample_linear_regression_data.txt
spark-3.5.3-bin-hadoop3/data/mllib/sample_fpgrowth.txt
spark-3.5.3-bin-hadoop3/data/mllib/sample_libsvm_data.txt
spark-3.5.3-bin-hadoop3/data/mllib/gmm_data.txt
spark-3.5.3-bin-hadoop3/data/mllib/kmeans_data.txt
spark-3.5.3-bin-hadoop3/data/mllib/streaming_kmeans_data_test.txt
spark-3.5.3-bin-hadoop3/data/mllib/sample_lda_data.txt
spark-3.5.3-bin-hadoop3/data/mllib/sample_multiclass_classification_data.txt
spark-3.5.3-bin-hadoop3/data/mllib/pagerank_data.txt
spark-3.5.3-bin-hadoop3/data/mllib/sample_isotonic_regression_libsvm_data.txt
spark-3.5.3-bin-hadoop3/data/mllib/sample_lda_libsvm_data.txt
spark-3.5.3-bin-hadoop3/data/mllib/sample_movielens_data.txt
spark-3.5.3-bin-hadoop3/data/mllib/pic_data

In [38]:
!pip install -q findspark

In [39]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/spark-{version}-bin-hadoop3"

In [40]:
import findspark
findspark.init()

In [41]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col, udf, length, desc
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import StringIndexer, OneHotEncoder, Tokenizer, StopWordsRemover, HashingTF, IDF, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline
from itertools import product


spark = SparkSession.builder.appName("bot_classifier").getOrCreate()

df = spark.read.json("edits.json", multiLine=False)

df = df.select("type", "namespace", "comment", "bot", "length")


def calculate_length_udf(length):
    if isinstance(length, dict):
        return length.get('new', 0) - length.get('old', 0)
    return 0


length_udf = udf(calculate_length_udf, IntegerType())
df = df.withColumn("length_diff", length_udf(col("length")))

df = df.withColumn("comment_length", length(col("comment")))

train, test = df.randomSplit([0.8, 0.2], seed=42)


def create_pipeline(numFeatures, numTrees, maxDepth):
  type_indexer = StringIndexer(inputCol="type", outputCol="type_index", handleInvalid="keep")
  namespace_indexer = StringIndexer(inputCol="namespace", outputCol="namespace_index", handleInvalid="keep")
  type_encoder = OneHotEncoder(inputCol="type_index", outputCol="type_encoded")
  namespace_encoder = OneHotEncoder(inputCol="namespace_index", outputCol="namespace_encoded")

  tokenizer = Tokenizer(inputCol="comment", outputCol="words")
  stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
  hashing_tf = HashingTF(inputCol="filtered_words", outputCol="raw_features", numFeatures=numFeatures)
  idf = IDF(inputCol="raw_features", outputCol="tfidf_features")

  assembler = VectorAssembler(
      inputCols=["length_diff", "comment_length", "type_encoded", "namespace_encoded", "tfidf_features"],
      outputCol="features"
  )

  rf = RandomForestClassifier(labelCol="bot", featuresCol="features", numTrees=numTrees, maxDepth=maxDepth)

  pipeline = Pipeline(stages=[
      type_indexer, namespace_indexer,
      type_encoder, namespace_encoder,
      tokenizer, stopwords_remover, hashing_tf, idf,
      assembler, rf
  ])
  return pipeline


def train_and_evaluate(train, test, numFeatures, numTrees, maxDepth):
  pipeline = create_pipeline(numFeatures, numTrees, maxDepth)

  model = pipeline.fit(train)
  predictions = model.transform(test)

  evaluator_f1 = MulticlassClassificationEvaluator(labelCol="bot", predictionCol="prediction", metricName="f1")
  evaluator_accuracy = MulticlassClassificationEvaluator(labelCol="bot", predictionCol="prediction", metricName="accuracy")
  evaluator_precision = MulticlassClassificationEvaluator(labelCol="bot", predictionCol="prediction", metricName="weightedPrecision")
  evaluator_recall = MulticlassClassificationEvaluator(labelCol="bot", predictionCol="prediction", metricName="weightedRecall")

  f1_score = evaluator_f1.evaluate(predictions)
  accuracy = evaluator_accuracy.evaluate(predictions)
  precision = evaluator_precision.evaluate(predictions)
  recall = evaluator_recall.evaluate(predictions)

  return model, f1_score, accuracy, precision, recall

params = {
    "numFeatures": [100, 500, 1000],
    "numTrees": [10, 50, 100],
    "maxDepth": [5, 10, 20]
}

results = []

for numFeatures, numTrees, maxDepth in product(params["numFeatures"], params["numTrees"], params["maxDepth"]):
  print(f"Testing parameters numFeatures={numFeatures}, numTrees={numTrees}, maxDepth={maxDepth}")

  _, f1_score, accuracy, precision, recall = train_and_evaluate(train, test, numFeatures, numTrees, maxDepth)
  results.append(Row(numFeatures=numFeatures, numTrees=numTrees, maxDepth=maxDepth, f1_score=f1_score, accuracy=accuracy, precision=precision, recall=recall))

results_df = spark.createDataFrame(results)
best_result = results_df.orderBy(desc("f1_score")).first()

results_df.show()

Testing parameters numFeatures=100, numTrees=10, maxDepth=5
Testing parameters numFeatures=100, numTrees=10, maxDepth=10
Testing parameters numFeatures=100, numTrees=10, maxDepth=20
Testing parameters numFeatures=100, numTrees=50, maxDepth=5
Testing parameters numFeatures=100, numTrees=50, maxDepth=10
Testing parameters numFeatures=100, numTrees=50, maxDepth=20
Testing parameters numFeatures=100, numTrees=100, maxDepth=5
Testing parameters numFeatures=100, numTrees=100, maxDepth=10
Testing parameters numFeatures=100, numTrees=100, maxDepth=20
Testing parameters numFeatures=500, numTrees=10, maxDepth=5
Testing parameters numFeatures=500, numTrees=10, maxDepth=10
Testing parameters numFeatures=500, numTrees=10, maxDepth=20
Testing parameters numFeatures=500, numTrees=50, maxDepth=5
Testing parameters numFeatures=500, numTrees=50, maxDepth=10
Testing parameters numFeatures=500, numTrees=50, maxDepth=20
Testing parameters numFeatures=500, numTrees=100, maxDepth=5
Testing parameters numFeat

In [42]:
print(f"Best parameters: numFeatures={best_result.numFeatures}, numTrees={best_result.numTrees}, maxDepth={best_result.maxDepth}")
print(f"F1 Score: {f1_score};\tAccuracy: {accuracy};\tPrecision: {precision};\tRecall: {recall}")

Best parameters: numFeatures=100, numTrees=100, maxDepth=20
F1 Score: 0.911711785200636;	Accuracy: 0.9146480659480025;	Precision: 0.9226667180813601;	Recall: 0.9146480659480025


In [43]:
model, f1_score, accuracy, precision, recall = train_and_evaluate(train, test, best_result.numFeatures, best_result.numTrees, best_result.maxDepth)

best_model_path = "bot_classifier"
model.write().overwrite().save(best_model_path)

In [44]:
from pyspark.ml import PipelineModel


loaded_model = PipelineModel.load(best_model_path)