In [1]:
# SparkContext represents the connection to a Spark cluster
from pyspark.context import SparkContext
# Configuration for a Spark application
from pyspark.conf import SparkConf
# The entry point to programming Spark with the Dataset and DataFrame API
from pyspark.sql.session import SparkSession

spark = SparkSession.builder \
    .appName("P03_Clustering") \
    .config("spark.executor.memory", "4g") \
    .config("spark.driver.memory", "4g") \
    .getOrCreate()
    #.config("spark.sql.repl.eagerEval.enabled", True) \
    #.config("spark.sql.repl.eagerEval.truncate", 500) \
    #.getOrCreate()


In [2]:
dblp_ref_file_path = "dblp-ref/dblp-ref-0.json" # This is the first of four files
papers_df = spark.read.json(dblp_ref_file_path)

papers_df.printSchema()

root
 |-- abstract: string (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- id: string (nullable = true)
 |-- n_citation: long (nullable = true)
 |-- references: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- title: string (nullable = true)
 |-- venue: string (nullable = true)
 |-- year: long (nullable = true)



In [3]:
papers_df.show()

+--------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+----+
|            abstract|             authors|                  id|n_citation|          references|               title|               venue|year|
+--------------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+----+
|The purpose of th...|[Makoto Satoh, Ry...|00127ee2-cb05-48c...|         0|[51c7e02e-f5ed-43...|Preliminary Desig...|international con...|2013|
|This paper descri...|[Gareth Beale, Gr...|001c58d3-26ad-46b...|        50|[10482dd3-4642-41...|A methodology for...|visual analytics ...|2011|
|This article appl...|[Altaf Hossain, F...|001c8744-73c4-4b0...|        50|[2d84c0f2-e656-4c...|Comparison of GAR...|pattern recogniti...|2009|
|                NULL|[Jea-Bum Park, By...|00338203-9eb3-40c...|         0|[8c78e4b0-632b-42...|Development of Re...|                   

## Preprocessing

### Required packages and UDFs

In [4]:
!pip install langdetect



In [5]:
!pip install fast-langdetect



In [6]:
# If you're running the language detection for the first time, try running this if the dataframe is filtered to empty
# On first time run it might need to download a small language file that the udf might not trigger
from fast_langdetect import detect_langs

print(detect_langs("Hello, world!") == 'EN')

True


In [7]:
from pyspark.sql.functions import col, udf, lower, regexp_replace, split, coalesce, array
from pyspark.sql.types import ArrayType, StringType, BooleanType
from pyspark.ml.feature import StopWordsRemover
from langdetect import detect, LangDetectException
from fast_langdetect import detect_langs

custom_stop_words = ['doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 
                     'al', 'author', 'figure','rights', 'reserved', 'permission', 'used', 'using', 
                     'biorxiv', 'medrxiv', 'license', 'fig', 'fig.', 'al.', 'Elsevier', 'PMC', 'CZI', 'www']

# Remove punctuation
def remove_punctuation(text):
    return regexp_replace(text, r'[!()\[\]{};:"\,<>./?@#$%^&*_~]', '')

# Language detection (EN)
def detect_language(text):
    result = False
    try:
        split_text = text.split(" ")
        if len(split_text) < 9:
            result = detect(text) == 'en'
        else:
            result = " ".join(split_text[:9]) == 'en'
    except LangDetectException:
        result = False
    return result

detect_language_udf = udf(detect_language, BooleanType())

# ASCII detection - unused experiment but left it here for reference
def detect_ascii(text):
    return text.isascii()

detect_ascii_udf = udf(detect_ascii, BooleanType())

# Fast language detection (EN)
def fast_detect_language(text):
    result = False
    try:
        result = detect_langs(text) == 'EN'
    except Exception:
        result = False
    return result

fast_detect_language_udf = udf(fast_detect_language, BooleanType())

# Custom stop words removal
def remove_custom_stop_words(words, custom_stop_words):
    if words is None:
        return None
    return [word for word in words if word not in custom_stop_words]

remove_custom_stop_words_udf = udf(lambda words: remove_custom_stop_words(words, custom_stop_words), ArrayType(StringType()))

### Set up the required filtering and preprocessing steps

In [8]:
# Filter out papers that don't have useable abstracts
papers_cleaned_df = papers_df.filter(col("abstract").isNotNull() & (col("abstract").rlike(r'\w')))

# Filter out papers that are not in English
papers_cleaned_df = papers_cleaned_df.filter(fast_detect_language_udf(col("title")))
#papers_cleaned_df = papers_cleaned_df.limit(100) # Useful for testing, takes only the first n entries
#papers_cleaned_df = papers_cleaned_df.filter(detect_ascii_udf(col("title"))) # "Language detection" based on if the title contains any non-ASCII characters (slow and kind of wrong)
#papers_cleaned_df = papers_cleaned_df.filter(detect_language_udf(col("title"))) # Language detection using the base language library (very slow)

# Simplify the format of the abstract
papers_cleaned_df = papers_cleaned_df.withColumn("abstract", remove_punctuation(col("abstract")))
papers_cleaned_df = papers_cleaned_df.withColumn("abstract", lower(col("abstract")))
papers_cleaned_df = papers_cleaned_df.withColumn("abstract_words", split(col("abstract"), " "))

# Conform references
papers_cleaned_df = papers_cleaned_df.withColumn("references", coalesce("references", array()))

# Preprocess title column
papers_cleaned_df = papers_cleaned_df.filter(col("title").isNotNull() & (col("title").rlike(r'\w')))
papers_cleaned_df = papers_cleaned_df.withColumn("title", remove_punctuation(col("title")))
papers_cleaned_df = papers_cleaned_df.withColumn("title", lower(col("title")))
papers_cleaned_df = papers_cleaned_df.withColumn("title_words", split(col("title"), " "))

# Remove pre-given stop words from the title
remover = StopWordsRemover(inputCol="title_words", outputCol="filtered_title_words")
papers_cleaned_df = remover.transform(papers_cleaned_df)

# Remove pre-given stop words from the abstract
remover = StopWordsRemover(inputCol="abstract_words", outputCol="filtered_abstract_words")
papers_cleaned_df = remover.transform(papers_cleaned_df)

# Remove custom stop words from the title and the abstract
papers_cleaned_df = papers_cleaned_df.withColumn("final_filtered_abstract", remove_custom_stop_words_udf(col("filtered_abstract_words")))
papers_cleaned_df = papers_cleaned_df.withColumn("final_filtered_title", remove_custom_stop_words_udf(col("filtered_title_words")))

In [9]:
# Drop all columns that are not used further down the line
# Keeps the following:
# - id                          (for reference)
# - authors                     (used for clustering)
# - final_filtered_abstract     (used for clustering)
# - final_filtered_title        (used for clustering)
columns_to_drop = ["abstract", "abstract_words", "title", "title_words", "filtered_title_words",
                   "filtered_abstract_words", "n_citation", "references", "venue", "year"]

papers_cleaned_df = papers_cleaned_df.drop(*columns_to_drop)

### Run the preprocessing steps

In [10]:
import time

start = time.time_ns() / 1_000_000_000
print("Number of resulting papers:", papers_cleaned_df.count())

print()
papers_cleaned_df.show()
print()

end = time.time_ns() / 1_000_000_000
print("The process took", (end - start), "seconds")

Number of resulting papers: 743421

+--------------------+--------------------+-----------------------+--------------------+
|             authors|                  id|final_filtered_abstract|final_filtered_title|
+--------------------+--------------------+-----------------------+--------------------+
|[Makoto Satoh, Ry...|00127ee2-cb05-48c...|   [purpose, study, ...|[preliminary, des...|
|[Gareth Beale, Gr...|001c58d3-26ad-46b...|   [paper, describes...|[methodology, phy...|
|[Altaf Hossain, F...|001c8744-73c4-4b0...|   [article, applied...|[comparison, garc...|
|[Ankita Brahmacha...|00a119c4-d367-460...|   [recent, achievem...|[identifying, psy...|
|[Alvaro L. Islas,...|00bcf2d5-1592-46b...|   [recently, bridge...|[multisymplectic,...|
|[Patrick Cousot, ...|00c85316-bddf-4bc...|   [applications, ab...|[relational, abst...|
|[Minoru Shigenaga...|00ca027b-5174-40f...|   [three, speech, t...|[speech, training...|
|[Efthymios Alepis...|00dd5ece-1339-4cb...|   [paper, focuses, ...|[knowle

## Vectorization

In [11]:
# Parameters used throughout the process
N_FEATURES_ABSTRACT = 1024
N_FEATURES_TITLE = 1024
N_FEATURES_AUTHORS = 1024
DATA_LIMIT = 100_000 # Set negative to include all of the data
PCA_k = 128
CLUSTERING_k_RANGE = (3,6)
CLUSTERING_k_FINAL = 3

In [12]:
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.sql.functions import when
import time

# NB! This takes a long time to calculate so don't run this often (run time of ~1 minute)
"""hashing_tf = HashingTF(inputCol="final_filtered", outputCol="raw_features", numFeatures=1024)
tf_df = hashing_tf.transform(papers_cleaned_df)

idf = IDF(inputCol="raw_features", outputCol="features")
idf_model = idf.fit(tf_df)
tfidf_df = idf_model.transform(tf_df)
tfidf_df = tfidf_df.filter(col("features").isNotNull())
"""

# HashingTF for abstract and title
hashingTF_abstract = HashingTF(inputCol="final_filtered_abstract", outputCol="raw_abstract_features", numFeatures=N_FEATURES_ABSTRACT)
hashingTF_title = HashingTF(inputCol="final_filtered_title", outputCol="raw_title_features", numFeatures=N_FEATURES_TITLE)

# IDF for abstract
idf_abstract = IDF(inputCol="raw_abstract_features", outputCol="abstract_features")

# IDF for title
idf_title = IDF(inputCol="raw_title_features", outputCol="title_features")


In [13]:
from pyspark.ml.feature import StandardScaler, VectorAssembler

# Apply HashingTF to abstract and title
start = time.time_ns() / 1_000_000_000
featurized_df = hashingTF_abstract.transform(papers_cleaned_df)
featurized_df = hashingTF_title.transform(featurized_df)
end = time.time_ns() / 1_000_000_000
print("HashingTF for abstract and title took", (end - start), "seconds")

# Apply IDF on abstract
start = time.time_ns() / 1_000_000_000
rescaled_df = idf_abstract.fit(featurized_df).transform(featurized_df)
end = time.time_ns() / 1_000_000_000
print("IDF for abstract took", (end - start), "seconds")

# Apply IDF on title
start = time.time_ns() / 1_000_000_000
rescaled_df = idf_title.fit(rescaled_df).transform(rescaled_df)
end = time.time_ns() / 1_000_000_000
print("IDF for title took", (end - start), "seconds")

# Normalize n_citation
# Skipping this for now as n_citation had an anomaly where ~20% of all data points had the value 50, which did not make sense
#assembler = VectorAssembler(inputCols=["n_citation"], outputCol="n_citation_vec")
#rescaled_df = assembler.transform(rescaled_df)
#scaler = StandardScaler(inputCol="n_citation_vec", outputCol="scaled_n_citation")
#rescaled_df = scaler.fit(rescaled_df).transform(rescaled_df)

# HashingTF for authors
start = time.time_ns() / 1_000_000_000
hashingTF_authors = HashingTF(inputCol="authors", outputCol="authors_features", numFeatures=N_FEATURES_AUTHORS)
rescaled_df = hashingTF_authors.transform(rescaled_df)
end = time.time_ns() / 1_000_000_000
print("HashingTF for authors took", (end - start), "seconds")

# Combine all features
assembler = VectorAssembler(inputCols=["abstract_features", "title_features", "authors_features"], outputCol="features") # Removed "scaled_n_citation" for now
final_df = assembler.transform(rescaled_df)

HashingTF for abstract and title took 0.3699822425842285 seconds
IDF for abstract took 62.56141018867493 seconds
IDF for title took 19.113986253738403 seconds
HashingTF for authors took 0.03081822395324707 seconds


In [14]:
# Drop intermediary columns
columns_to_drop = ["final_filtered_abstract", "final_filtered_title", "authors",
                   "raw_abstract_features", "raw_title_features",
                   "abstract_features", "title_features", "authors_features" ]

final_df = final_df.drop(*columns_to_drop)

In [15]:
import time

start_ms = time.time_ns() / 1_000_000_000

print()
final_df.show()
print()

end_ms = time.time_ns() / 1_000_000_000
print("The process took", (end_ms - start_ms), "seconds")


+--------------------+--------------------+
|                  id|            features|
+--------------------+--------------------+
|00127ee2-cb05-48c...|(3072,[68,73,104,...|
|001c58d3-26ad-46b...|(3072,[28,49,59,7...|
|001c8744-73c4-4b0...|(3072,[5,7,39,44,...|
|00a119c4-d367-460...|(3072,[35,42,58,6...|
|00bcf2d5-1592-46b...|(3072,[23,94,100,...|
|00c85316-bddf-4bc...|(3072,[4,56,107,1...|
|00ca027b-5174-40f...|(3072,[6,7,39,56,...|
|00dd5ece-1339-4cb...|(3072,[5,6,16,80,...|
|01047814-b615-444...|(3072,[31,38,67,8...|
|010d4ce9-0279-416...|(3072,[11,16,30,3...|
|010d9907-45ef-459...|(3072,[84,92,109,...|
|012b88ae-a763-45d...|(3072,[36,43,56,7...|
|016a9a21-e882-4cd...|(3072,[29,31,39,4...|
|01705f09-d395-4a0...|(3072,[11,12,15,3...|
|01b6f2ca-3903-419...|(3072,[46,76,103,...|
|01ccb92f-46f1-400...|(3072,[56,57,58,7...|
|01edeac9-cd8b-46f...|(3072,[55,58,71,1...|
|0265aea8-65f3-4f4...|(3072,[10,17,20,3...|
|028d37c8-b571-41b...|(3072,[24,36,44,9...|
|02a5e8a8-061e-4c1...|(3072,[67

## Clustering

In [16]:
from pyspark.ml.feature import PCA
import time

# Select the first 1000 rows, change this how you see fit and how much your PC can handle
if DATA_LIMIT < 0:
    print("Taking full dataset")
    limited_final_dropped_df = final_df
else:
    print("Limiting the dataset to the first", DATA_LIMIT, "rows")
    limited_final_dropped_df = final_df.limit(DATA_LIMIT)

start = time.time_ns() / 1_000_000_000
pca = PCA(k=PCA_k, inputCol="features", outputCol="pca_features")
pca_model = pca.fit(limited_final_dropped_df)
pca_df = pca_model.transform(limited_final_dropped_df)

end = time.time_ns() / 1_000_000_000
print("PCA for features took", (end - start), "seconds")

Limiting the dataset to the first 100000 rows


ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

### Clustering with elbowing
Run this only if you want to **determine the best k** and then do **clustering with it**. <br>
<font color=red>**NB!** The current implementation is **very** slow</font>

In [41]:
from pyspark.ml.feature import PCA
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.linalg import Vectors
import time

# NB! This process is currently extremely slow

# Determine the optimal number of clusters using the silhouette score
cost = []
evaluator = ClusteringEvaluator()
start_det = time.time_ns() / 1_000_000_000

for k in range(CLUSTERING_k_RANGE[0], CLUSTERING_k_RANGE[1]):
    start = time.time_ns() / 1_000_000_000
    print("Clustering for k =", k)
    kmeans = KMeans(featuresCol='pca_features', k=k)
    model = kmeans.fit(pca_df)
    predictions = model.transform(pca_df)
    silhouette = evaluator.evaluate(predictions)
    cost.append((k, silhouette))
    print(f"With K={k}, the Silhouette score is {silhouette}")
    end = time.time_ns() / 1_000_000_000
    print("Clustering for k =", k, "took", (end - start), "seconds")

# Choose the best K (you can automate this step)
best_k = max(cost, key=lambda item: item[1])[0]
print(f"Best K found: {best_k}")
end_det = time.time_ns() / 1_000_000_000
print("Finding the best k in range", CLUSTERING_k_RANGE, "took", (end_det - start_det), "seconds")

# Fit the final K-means model with the best K
start = time.time_ns() / 1_000_000_000
kmeans = KMeans(featuresCol='pca_features', k=best_k)
model = kmeans.fit(pca_df)
predictions = model.transform(pca_df)

# Show the resulting clusters
predictions.select("id", "prediction").show()
end = time.time_ns() / 1_000_000_000
print("\nRefitting the best k took", (end - start), "seconds\n")

# If you want to see the cluster centers
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)


Clustering for k = 3
With K=3, the Silhouette score is 0.059832468082140836
Clustering for k = 3 took 78.08387064933777 seconds
Clustering for k = 4
With K=4, the Silhouette score is -0.07965425721369154
Clustering for k = 4 took 85.41783213615417 seconds
Clustering for k = 5
With K=5, the Silhouette score is -0.12199325453327732
Clustering for k = 5 took 83.16690015792847 seconds
Best K found: 3
Finding the best k in range (3, 6) took 246.67020845413208 seconds
+--------------------+----------+
|                  id|prediction|
+--------------------+----------+
|00127ee2-cb05-48c...|         0|
|001c58d3-26ad-46b...|         2|
|001c8744-73c4-4b0...|         0|
|00a119c4-d367-460...|         1|
|00bcf2d5-1592-46b...|         0|
|00c85316-bddf-4bc...|         0|
|00ca027b-5174-40f...|         0|
|00dd5ece-1339-4cb...|         0|
|01047814-b615-444...|         0|
|010d4ce9-0279-416...|         0|
|010d9907-45ef-459...|         0|
|012b88ae-a763-45d...|         0|
|016a9a21-e882-4cd...| 

### Clustering with predefined k
Run this only if you want to **cluster with the predefined k**. <br>
<font color=red>**NB!** The current implementation is slow, but less so than the previous cell</font>

In [None]:
import time

# Fit the final K-means model with the predefined k
start = time.time_ns() / 1_000_000_000
kmeans = KMeans(featuresCol='pca_features', k=CLUSTERING_k_FINAL)
model = kmeans.fit(pca_df)
predictions = model.transform(pca_df)
end = time.time_ns() / 1_000_000_000
print("Clustering for k =", CLUSTERING_k_FINAL, "took", (end - start), "seconds")

# Show the resulting clusters
predictions.select("id", "prediction").show()

# Show the silhouette score
start = time.time_ns() / 1_000_000_000
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print("\nSilhouette score is", silhouette)
print("Calculating the silhouette score took", (end - start), "seconds")

# If you want to see the cluster centers
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

In [None]:
predictions.select("id", "prediction").groupBy("prediction").count().show()

In [None]:
# NB! This overwrites the previous file if any exists
predictions.select("id", "prediction").write.mode('overwrite').format('json').save('prediction/prediction.json')