In [1]:
# SparkContext represents the connection to a Spark cluster
from pyspark.context import SparkContext
# Configuration for a Spark application
from pyspark.conf import SparkConf
# The entry point to programming Spark with the Dataset and DataFrame API
from pyspark.sql.session import SparkSession

# Create (or reuse) the notebook's Spark session, requesting 4g of memory for
# both the executors and the driver.
# Optional settings that were tried and left disabled; insert them before
# getOrCreate() to re-enable them:
#   .config("spark.sql.repl.eagerEval.enabled", True)
#   .config("spark.sql.repl.eagerEval.truncate", 500)
spark = (
    SparkSession.builder
    .appName("P03_Clustering")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)


In [2]:
# Location of the DBLP reference file, relative to the notebook's working directory.
dblp_ref_file_path = "dblp-ref/dblp-ref-0.json"

# Read the JSON records into a DataFrame; the schema is inferred from the data.
papers_df = spark.read.format("json").load(dblp_ref_file_path)

# Print the inferred column names and types.
papers_df.printSchema()

root
 |-- abstract: string (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- id: string (nullable = true)
 |-- n_citation: long (nullable = true)
 |-- references: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- title: string (nullable = true)
 |-- venue: string (nullable = true)
 |-- year: long (nullable = true)



In [3]:
# Preview the first rows of the raw papers DataFrame.
papers_df.show()

+--------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+----+
|            abstract|             authors|                  id|n_citation|          references|               title|               venue|year|
+--------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+----+
|The purpose of th...|[Makoto Satoh, Ry...|00127ee2-cb05-48c...|         0|[51c7e02e-f5ed-43...|Preliminary Desig...|international con...|2013|
|This paper descri...|[Gareth Beale, Gr...|001c58d3-26ad-46b...|        50|[10482dd3-4642-41...|A methodology for...|visual analytics ...|2011|
|This article appl...|[Altaf Hossain, F...|001c8744-73c4-4b0...|        50|[2d84c0f2-e656-4c...|Comparison of GAR...|pattern recogniti...|2009|
|                NULL|[Jea-Bum Park, By...|00338203-9eb3-40c...|         0|[8c78e4b0-632b-42...|Development of Re...|                   

## Preprocessing

### Required packages and UDFs

In [4]:
!pip install langdetect



In [5]:
!pip install fast-langdetect



In [6]:
# Smoke test for fast_langdetect. Run this cell if the language filter further
# down leaves the DataFrame empty: on its first use the library might need to
# download a small language file, and that download might not be triggered
# from inside the UDF.
from fast_langdetect import detect_langs

# Prints True when the sample sentence is labelled 'EN'.
print(detect_langs("Hello, world!") == 'EN')

True


In [7]:
from pyspark.sql.functions import col, udf, lower, regexp_replace, split, coalesce, array
from pyspark.sql.types import ArrayType, StringType, BooleanType
from pyspark.ml.feature import StopWordsRemover
from langdetect import detect, LangDetectException
from fast_langdetect import detect_langs

# Extra stop words removed after Spark's default StopWordsRemover pass.
# Titles and abstracts are lowercased before tokenisation, so every entry is
# written in lowercase; capitalised entries could never match a token.
# NOTE(review): 'fig.' and 'al.' contain a period, which remove_punctuation
# strips from the text, so they are unlikely to match; kept for compatibility.
custom_stop_words = ['doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et',
                     'al', 'author', 'figure', 'rights', 'reserved', 'permission', 'used', 'using',
                     'biorxiv', 'medrxiv', 'license', 'fig', 'fig.', 'al.', 'elsevier', 'pmc', 'czi', 'www']


def remove_punctuation(text):
    """Return a column expression equal to ``text`` with the listed punctuation characters deleted.

    The characters are removed outright (replaced by the empty string), not
    replaced by a space.
    """
    punctuation_pattern = r'[!()\[\]{};:"\,<>./?@#$%^&*_~]'
    return regexp_replace(text, punctuation_pattern, '')

# Language detection (EN)
def detect_language(text):
    """Return True when langdetect classifies ``text`` as English ('en').

    Only the first nine space-separated words are passed to the detector, to
    bound the cost on long inputs; shorter texts are passed whole.

    Args:
        text: The string to classify. ``None`` (a null column value), non-string
            values and blank strings are reported as not English.

    Returns:
        bool: True if the detected language code is 'en', otherwise False.
        A LangDetectException raised by the detector also yields False.
    """
    # Null titles reach the UDF as None; guard before calling str methods.
    if not isinstance(text, str) or not text.strip():
        return False

    words = text.split(" ")
    # Previously the truncated sample itself was compared with 'en', so every
    # text of nine or more words was rejected; it must go through detect().
    sample = text if len(words) < 9 else " ".join(words[:9])
    try:
        return detect(sample) == 'en'
    except LangDetectException:
        return False

# Spark UDF wrapper around detect_language; yields a boolean column.
detect_language_udf = udf(f=detect_language, returnType=BooleanType())

# ASCII detection - unused experiment but left it here for reference
def detect_ascii(text):
    """Return True when ``text`` consists only of ASCII characters.

    Args:
        text: The value to check. ``None`` (a null column value) is reported
            as False instead of raising AttributeError.

    Returns:
        bool: True for an all-ASCII (or empty) string, False otherwise.
    """
    return text is not None and text.isascii()

# Spark UDF wrapper around detect_ascii; yields a boolean column.
detect_ascii_udf = udf(f=detect_ascii, returnType=BooleanType())

# Fast language detection (EN)
def fast_detect_language(text):
    """Return True when fast_langdetect labels ``text`` as 'EN'.

    Any exception raised while detecting (for example on a null input) is
    treated as "not English" and yields False.
    """
    try:
        is_english = detect_langs(text) == 'EN'
    except Exception:
        return False
    return is_english

# Spark UDF wrapper around fast_detect_language; yields a boolean column.
fast_detect_language_udf = udf(f=fast_detect_language, returnType=BooleanType())

### Set up the required filtering and preprocessing steps

In [8]:
# Row filters: a usable abstract/title is non-null and contains at least one word character.
has_abstract = col("abstract").isNotNull() & (col("abstract").rlike(r'\w'))
has_title = col("title").isNotNull() & (col("title").rlike(r'\w'))

papers_cleaned_df = (
    papers_df
    .filter(has_abstract)
    # Alternatives that were tried at this point (kept for reference):
    #   .limit(100)                                  # useful for testing, takes only the first n entries
    #   .filter(detect_ascii_udf(col("title")))      # non-ASCII title heuristic (slow and kind of wrong)
    #   .filter(detect_language_udf(col("title")))   # base language library (very slow)
    .filter(fast_detect_language_udf(col("title")))
    # Abstract: strip punctuation, lowercase, then tokenise on single spaces.
    .withColumn("abstract", lower(remove_punctuation(col("abstract"))))
    .withColumn("abstract_words", split(col("abstract"), " "))
    # Replace a missing reference list with an empty array.
    .withColumn("references", coalesce("references", array()))
    # Title: same treatment as the abstract.
    .filter(has_title)
    .withColumn("title", lower(remove_punctuation(col("title"))))
    .withColumn("title_words", split(col("title"), " "))
)

# Remove Spark's default stop words, first from the title tokens and then from the abstract tokens.
for tokens_col, filtered_col in (
    ("title_words", "filtered_title_words"),
    ("abstract_words", "filtered_abstract_words"),
):
    remover = StopWordsRemover(inputCol=tokens_col, outputCol=filtered_col)
    papers_cleaned_df = remover.transform(papers_cleaned_df)

def remove_custom_stop_words(words, custom_stop_words):
    """Return ``words`` without the entries listed in ``custom_stop_words``.

    Matching is case-insensitive, consistent with the default behaviour of the
    StopWordsRemover stage that runs before this helper, so a stop word such
    as 'Elsevier' also removes the lowercased token 'elsevier'.

    Args:
        words: List of tokens, or ``None`` for a null array column value.
        custom_stop_words: Iterable of stop-word strings.

    Returns:
        A new list with the surviving tokens in their original order, or
        ``None`` when ``words`` is ``None``. ``None`` elements inside
        ``words`` are kept unchanged.
    """
    if words is None:
        return None
    # A set gives constant-time membership tests instead of a list scan per token.
    blocked = frozenset(stop_word.lower() for stop_word in custom_stop_words)
    return [word for word in words if word is None or word.lower() not in blocked]

# Spark UDF that applies remove_custom_stop_words with the notebook-level
# custom_stop_words list and returns an array of strings.
remove_custom_stop_words_udf = udf(
    lambda words: remove_custom_stop_words(words, custom_stop_words),
    ArrayType(StringType()),
)

# Apply the custom stop-word filter to the abstract tokens, then to the title tokens.
papers_cleaned_df = (
    papers_cleaned_df
    .withColumn("final_filtered_abstract", remove_custom_stop_words_udf(col("filtered_abstract_words")))
    .withColumn("final_filtered_title", remove_custom_stop_words_udf(col("filtered_title_words")))
)


### Run the preprocessing steps

In [9]:
import time

# count() and show() are the first actions on papers_cleaned_df, so the
# preprocessing defined above is executed (and timed) here.
# perf_counter() is a monotonic clock intended for measuring elapsed time;
# the values are seconds, hence the *_s names.
start_s = time.perf_counter()
print("Number of resulting papers:", papers_cleaned_df.count())

print()
papers_cleaned_df.show()
print()

elapsed_s = time.perf_counter() - start_s
print("The process took", elapsed_s, "seconds")

Number of resulting papers: 743421

+--------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+----+--------------------+--------------------+--------------------+-----------------------+-----------------------+--------------------+
|            abstract|             authors|                  id|n_citation|          references|               title|               venue|year|      abstract_words|         title_words|filtered_title_words|filtered_abstract_words|final_filtered_abstract|final_filtered_title|
+--------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+----+--------------------+--------------------+--------------------+-----------------------+-----------------------+--------------------+
|the purpose of th...|[Makoto Satoh, Ry...|00127ee2-cb05-48c...|         0|[51c7e02e-f5ed-43...|preliminary desig...|international con..

## Vectorization

In [10]:
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.sql.functions import when
#NB! this takes a long time to calculate so don't run this often (run time of ~1 minute)
"""hashing_tf = HashingTF(inputCol="final_filtered", outputCol="raw_features", numFeatures=1024)
tf_df = hashing_tf.transform(papers_cleaned_df)

idf = IDF(inputCol="raw_features", outputCol="features")
idf_model = idf.fit(tf_df)
tfidf_df = idf_model.transform(tf_df)
tfidf_df = tfidf_df.filter(col("features").isNotNull())
"""
# Term-frequency hashing (1024 buckets each) for the cleaned abstract and title tokens.
hashingTF_abstract = HashingTF(
    numFeatures=1024,
    inputCol="final_filtered_abstract",
    outputCol="raw_abstract_features",
)
hashingTF_title = HashingTF(
    numFeatures=1024,
    inputCol="final_filtered_title",
    outputCol="raw_title_features",
)

# IDF estimators that re-weight the hashed term frequencies.
idf_abstract = IDF(outputCol="abstract_features", inputCol="raw_abstract_features")
idf_title = IDF(outputCol="title_features", inputCol="raw_title_features")


In [11]:
from pyspark.ml.feature import StandardScaler, VectorAssembler
# Hash the abstract tokens, then the title tokens, into term-frequency vectors.
featurized_df = hashingTF_title.transform(hashingTF_abstract.transform(papers_cleaned_df))

# Fit each IDF model on the hashed counts and apply it.
abstract_idf_model = idf_abstract.fit(featurized_df)
rescaled_df = abstract_idf_model.transform(featurized_df)
title_idf_model = idf_title.fit(rescaled_df)
rescaled_df = title_idf_model.transform(rescaled_df)

# Normalize n_citation: wrap the scalar in a one-element vector, then scale it.
assembler = VectorAssembler(inputCols=["n_citation"], outputCol="n_citation_vec")
rescaled_df = assembler.transform(rescaled_df)

scaler = StandardScaler(inputCol="n_citation_vec", outputCol="scaled_n_citation")
scaler_model = scaler.fit(rescaled_df)
rescaled_df = scaler_model.transform(rescaled_df)

# Hash the author names into a 1024-bucket count vector.
hashingTF_authors = HashingTF(inputCol="authors", outputCol="authors_features", numFeatures=1024)
rescaled_df = hashingTF_authors.transform(rescaled_df)

# Concatenate all feature groups into the single "features" vector column.
feature_columns = ["abstract_features", "title_features", "scaled_n_citation", "authors_features"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
final_df = assembler.transform(rescaled_df)


+--------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+----+--------------------+--------------------+--------------------+-----------------------+-----------------------+--------------------+---------------------+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+--------------------+
|            abstract|             authors|                  id|n_citation|          references|               title|               venue|year|      abstract_words|         title_words|filtered_title_words|filtered_abstract_words|final_filtered_abstract|final_filtered_title|raw_abstract_features|  raw_title_features|   abstract_features|      title_features|n_citation_vec|   scaled_n_citation|    authors_features|            features|
+--------------------+--------------------+--------------------+----------+--------------------+--------------------+-----

In [16]:
# NOTE(review): show() executes the whole feature pipeline; the saved output of
# this cell shows the connection to the JVM being lost at this point.
# perf_counter() is a monotonic clock intended for measuring elapsed time;
# the values are seconds, hence the *_s names.
start_s = time.perf_counter()

print()
final_df.show()
print()

elapsed_s = time.perf_counter() - start_s
print("The process took", elapsed_s, "seconds")




ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving


ConnectionRefusedError: [Errno 111] Connection refused

In [12]:
# Intermediate columns produced by the preprocessing and vectorization stages;
# only the identifying columns, the final token lists and "features" are kept.
columns_to_drop = [
    "abstract",
    "abstract_words",
    "title",
    "title_words",
    "filtered_title_words",
    "raw_abstract_features",
    "raw_title_features",
    "n_citation_vec",
    "authors_features",
    "abstract_features",
    "title_features",
    "scaled_n_citation",
    "filtered_abstract_words",
]

final_dropped_df = final_df.drop(*columns_to_drop)

# Preview the trimmed DataFrame.
final_dropped_df.show()

+--------------------+--------------------+----------+--------------------+--------------------+----+-----------------------+--------------------+--------------------+
|             authors|                  id|n_citation|          references|               venue|year|final_filtered_abstract|final_filtered_title|            features|
+--------------------+--------------------+----------+--------------------+--------------------+----+-----------------------+--------------------+--------------------+
|[Makoto Satoh, Ry...|00127ee2-cb05-48c...|         0|[51c7e02e-f5ed-43...|international con...|2013|   [purpose, study, ...|[preliminary, des...|(3073,[68,73,104,...|
|[Gareth Beale, Gr...|001c58d3-26ad-46b...|        50|[10482dd3-4642-41...|visual analytics ...|2011|   [paper, describes...|[methodology, phy...|(3073,[28,49,59,7...|
|[Altaf Hossain, F...|001c8744-73c4-4b0...|        50|[2d84c0f2-e656-4c...|pattern recogniti...|2009|   [article, applied...|[comparison, garc...|(3073,[5,7,39,

## Clustering

In [14]:
from pyspark.ml.feature import PCA

# Work on the first 1000 rows only; adjust the limit to what your machine can handle.
limited_final_dropped_df = final_dropped_df.limit(1000)

# Fit a PCA that projects the assembled "features" vectors onto 128 components,
# then add the projection as the "pca_features" column.
pca = PCA(inputCol="features", outputCol="pca_features", k=128)
pca_model = pca.fit(limited_final_dropped_df)
pca_df = pca_model.transform(limited_final_dropped_df)

In [None]:
from pyspark.ml.feature import PCA
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.linalg import Vectors

# Every fit/transform/evaluate below is a separate Spark job over pca_df.
# Caching keeps the PCA output in memory so the upstream pipeline (language
# UDF, TF-IDF, PCA) is not re-run for each job.
pca_df.cache()

# Fixed seed so that centroid initialisation, and therefore the chosen K, is reproducible.
KMEANS_SEED = 42

# Score the clustering on the same column KMeans was trained on. The evaluator
# defaults to featuresCol="features", which here is the un-reduced vector.
evaluator = ClusteringEvaluator(featuresCol='pca_features')

# Determine the optimal number of clusters using the silhouette score.
# `cost` holds (k, silhouette) pairs; higher silhouette is better.
cost = []
models_by_k = {}

for k in range(3, 8):
    print(k)  # progress indicator
    kmeans = KMeans(featuresCol='pca_features', k=k, seed=KMEANS_SEED)
    model = kmeans.fit(pca_df)
    predictions = model.transform(pca_df)
    silhouette = evaluator.evaluate(predictions)
    cost.append((k, silhouette))
    models_by_k[k] = model
    print(f"With K={k}, the Silhouette score is {silhouette}")

# Choose the K with the highest silhouette score
best_k = max(cost, key=lambda item: item[1])[0]
print(f"Best K found: {best_k}")

# Reuse the model already fitted for the best K instead of fitting it again
model = models_by_k[best_k]
predictions = model.transform(pca_df)

# Show the resulting clusters
predictions.select("id", "prediction").show()

# If you want to see the cluster centers
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)


3



KeyboardInterrupt

