# Documents Clustering


# Spark-NLP

Clustering 20k documents using LDA model

# Environment Setup

In [1]:
#!pip install spark-nlp
#!pip install fastparquet 
#!pip install spark-nlp==2.6.1

In [1]:
import re
import numpy as np
import pandas as pd

from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.ml import Pipeline

import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *

from pyspark.sql.functions import udf
import pyspark.sql.types as T
from pyspark.sql.functions import col, size, length

import pyspark.sql.functions as F
from pyspark.sql.types import *

from pyspark.ml.feature import HashingTF, IDF, StringIndexer, SQLTransformer,IndexToString
from pyspark.ml.feature import CountVectorizer , IDF
from pyspark.ml.clustering import LDA

from custom_utils import CUSTOM_STOP_WORDS

In [3]:
spark = SparkSession.builder \
    .appName("Spark NLP")\
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.6.1")\
    .getOrCreate()

sqlContext = SQLContext(spark)

spark

# Load data

In [4]:
df_spark = sqlContext.read.parquet("newsgroup_20_data.parquet")
print(df_spark.count())
#df_spark.head(2)

18846


In [15]:
d = df_spark.head(1)
d = d[0]

print(d.asDict()['category'])
print(d.asDict()['news'])

rec.sport.hockey
From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>
Subject: Pens fans reactions
Organization: Post Office, Carnegie Mellon, Pittsburgh, PA
Lines: 12
NNTP-Posting-Host: po4.andrew.cmu.edu



I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game.          PENS RULE!!!




# Pre-Process Data

## 1. data cleaning
String level data cleaning

In [7]:
col_input = "news"
col_label = "category"
col_nlp = 'col_nlp'

In [16]:
def text_cleaner(sentence):
    
    # clean the punctuations
    punc_re = r'[^a-zA-Z0-9 &]'
    sentence = re.sub(punc_re, ' ', sentence)
    
    # tokens
    arr = sentence.split()
    
    # remove white spaces
    # lowercase
    # filter words having lenght <= 3
    arr = [word.strip().lower() for word in arr if word.isalpha() and len(word)>=4]
    
    # remove starting 4 words as they are email id
    arr = arr[20:-4]
    
    arr = " ".join(arr)
    return arr

In [17]:
#data = df_spark.limit(10000)
data = df_spark

udf_text_cleaner = F.udf(text_cleaner, StringType())

data_clean = data.withColumn(col_nlp, udf_text_cleaner(col_input))

print(data_clean.count())
data_clean.limit(2).show()

18846
+--------------------+--------------------+--------------------+
|                news|            category|             col_nlp|
+--------------------+--------------------+--------------------+
|From: Mamatha Dev...|    rec.sport.hockey|sure some bashers...|
|From: mblawson@mi...|comp.sys.ibm.pc.h...|midway uoknor org...|
+--------------------+--------------------+--------------------+



## 2. nlp pre-processing
token level data pre-processing

In [18]:
documentAssembler = DocumentAssembler()\
.setInputCol("col_nlp")\
.setOutputCol("document")

tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

stemmer = Stemmer() \
    .setInputCols(["token"]) \
    .setOutputCol("stem")

stopwords_cleaner = StopWordsCleaner()\
      .setInputCols("stem")\
      .setOutputCol("cleanTokens")\
      .setCaseSensitive(False)\
      #.setStopWords(["no", "without"])
      
tokenassembler = TokenAssembler()\
    .setInputCols(["document", "cleanTokens"]) \
    .setOutputCol("clean_text")


finisher = Finisher() \
    .setInputCols(["cleanTokens"]) \
    .setIncludeMetadata(False) # set to False to remove metadata

nlpPipeline = Pipeline(stages=[
 documentAssembler, 
 tokenizer,
 stemmer,
 stopwords_cleaner,
 #tokenassembler,
 finisher
 ])

empty_df = spark.createDataFrame([['']]).toDF("col_nlp")
pipelineModel = nlpPipeline.fit(empty_df)

In [19]:
#data_in = data_clean.limit(1000)
data_in = data_clean

data_arr = pipelineModel.transform(data_in)

data_arr = data_arr.withColumnRenamed("finished_cleanTokens", "col_nlp_arr")
data_arr = data_arr.select("news", "category", "col_nlp_arr")

data_arr.limit(2).show()

+--------------------+--------------------+--------------------+
|                news|            category|         col_nlp_arr|
+--------------------+--------------------+--------------------+
|From: Mamatha Dev...|    rec.sport.hockey|[sure, basher, pe...|
|From: mblawson@mi...|comp.sys.ibm.pc.h...|[midwai, uoknor, ...|
+--------------------+--------------------+--------------------+



## 3. tfidf
coverting text data to ML features i.e tfidf matrix

In [20]:
# TF
cv = CountVectorizer(inputCol="col_nlp_arr", outputCol="raw_features", minDF=10.0)
cvmodel = cv.fit(data_arr)
result_cv = cvmodel.transform(data_arr)

# IDF
idf = IDF(inputCol="raw_features", outputCol="features")
idfModel = idf.fit(result_cv)
result_tfidf = idfModel.transform(result_cv)


data_pp = result_tfidf.select('category', "news", "features")

print(type(data_pp))
print(data_pp.count())
data_pp.limit(2).show()

<class 'pyspark.sql.dataframe.DataFrame'>
18846
+--------------------+--------------------+--------------------+
|            category|                news|            features|
+--------------------+--------------------+--------------------+
|    rec.sport.hockey|From: Mamatha Dev...|(10936,[1,10,25,2...|
|comp.sys.ibm.pc.h...|From: mblawson@mi...|(10936,[13,16,18,...|
+--------------------+--------------------+--------------------+



In [21]:
# free up the memory by deleting meta data
del df_spark, data_in, data_clean, data, data_arr

# LDA Model

## 1. train model

In [22]:
numTopics = 20 # number of topics
 
lda = LDA(k=numTopics, seed = 1, optimizer="online", optimizeDocConcentration=True,
          maxIter = 100,           # number of iterations
          learningDecay = 0.51,   # kappa, learning rate
          learningOffset = 64.0,  # tau_0, larger values downweigh early iterations
          subsamplingRate = 0.05, # mini batch fraction 
          )
 
model = lda.fit(data_pp)
print("done....")

done....


In [None]:
ll = model.logLikelihood(data_pp)
lp = model.logPerplexity(data_pp)
print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
print("The upper bound on perplexity: " + str(lp))

The lower bound on the log likelihood of the entire corpus: -62048237.8191261
The upper bound on perplexity: 8.116322726619108


In [None]:
print("hello")

In [57]:
"""
The lower bound on the log likelihood of the entire corpus: -62048237.8191261
The upper bound on perplexity: 8.116322726619108

"""

print("modelling completed..!")

modelling completed..!


## 2. topic insights
analyze the baisc stats and results of LDA model

In [31]:
model.vocabSize()

10936

In [32]:
model.describeTopics().first()

Row(topic=0, termIndices=[1, 662, 4, 0, 491, 866, 188, 572, 3, 697], termWeights=[0.004411460852739662, 0.0035909083370883653, 0.0031902594842626924, 0.0030005012275865113, 0.0029926489641961982, 0.002930798711081479, 0.0027677610071719576, 0.002763779728452812, 0.002734556764124069, 0.002688112941764219])

In [33]:
print("The topics described by their top-weighted terms:")
model.describeTopics(5).limit(6).show()

The topics described by their top-weighted terms:
+-----+--------------------+--------------------+
|topic|         termIndices|         termWeights|
+-----+--------------------+--------------------+
|    0| [1, 662, 4, 0, 491]|[0.00441146085273...|
|    1|[1, 375, 129, 18, 4]|[0.00471588834969...|
|    2|[29, 85, 33, 41, ...|[0.01667896786871...|
|    3|[266, 165, 557, 5...|[0.00802559458063...|
|    4|[141, 547, 760, 8...|[0.02130935804497...|
|    5|[323, 625, 54, 17...|[0.01138897638407...|
+-----+--------------------+--------------------+



In [34]:
model.topicsMatrix()

DenseMatrix(10936, 20, [1024.5231, 1506.2962, 635.8746, 933.7162, 1089.3162, 542.5992, 417.4316, 839.7049, ..., 0.3166, 23.263, 1.348, 0.3556, 0.3033, 1.0197, 0.5744, 1.1436], 0)

## 3. topic assignment
assign most prevalent topic as the prediction label for each document

In [35]:
max_index = F.udf(lambda x: x.tolist().index(max(x)), IntegerType())


data_lda = model.transform(data_pp)
data_lda = data_lda.withColumn("topicID", max_index("topicDistribution"))

In [36]:
print(data_lda.count())
data_lda.limit(2).show()

18846
+--------------------+--------------------+--------------------+--------------------+-------+
|            category|                news|            features|   topicDistribution|topicID|
+--------------------+--------------------+--------------------+--------------------+-------+
|    rec.sport.hockey|From: Mamatha Dev...|(10936,[1,10,25,2...|[2.59616787230400...|      9|
|comp.sys.ibm.pc.h...|From: mblawson@mi...|(10936,[13,16,18,...|[2.63984907034494...|      8|
+--------------------+--------------------+--------------------+--------------------+-------+



In [38]:
"""
topicDistribution : list of topic weights (len==num_topics)
"""
print()




## 4. topic model assesment
analyze the qulaity of model predictions

In [40]:
X_topics = data_lda.select("category", "topicID").toPandas()

print(X_topics.shape)
X_topics.head(2)

(18846, 2)


Unnamed: 0,category,topicID
0,rec.sport.hockey,9
1,comp.sys.ibm.pc.hardware,8


In [42]:
def topic_metrics(df):
    #print(df.head(2))
    arr = df["topicID"].value_counts()
    max_topic = arr.index.values[0]
    perc_dominance = arr[max_topic] / arr.sum()
    
    result = pd.Series(data=[int(max_topic), perc_dominance], index=["category_pred", "perc_dominance"])
    
    return result

In [43]:
"""
for each category:
    1. select the category_pred i.e topicID which is assigned to max number of docs
    for ex: for category: "rec.autos", category_pred/topicID: "0" is assigned to 52% of records
    
We can observe that few category_pred are common in multiple category.
For ex: 
topicId : 1 --> (talk.politics.guns, talk.politics.mideast, talk.politics.misc)
topicId : 2 --> (comp.windows, comp.graphics, comp.os.ms-windows.misc)
This also makes good sense

By doing mannual evauation of topics, this mapping can be improved further
    
"""

X = X_topics.copy()
#X = X_topics.head(10)

X_label_mapping = X.groupby("category").apply(topic_metrics).reset_index()
X_label_mapping["category_pred"] = X_label_mapping["category_pred"].astype("int")
X_label_mapping["perc_dominance"] = np.round(X_label_mapping["perc_dominance"], 2)
X_label_mapping = X_label_mapping.sort_values(by=["category_pred", "perc_dominance"], ascending=[True, False])
X_label_mapping = X_label_mapping.reset_index(drop=True)
X_label_mapping

Unnamed: 0,category,category_pred,perc_dominance
0,rec.autos,0,0.52
1,talk.politics.guns,1,0.4
2,talk.politics.mideast,1,0.38
3,talk.politics.misc,1,0.25
4,comp.windows.x,2,0.74
5,comp.graphics,2,0.6
6,comp.os.ms-windows.misc,2,0.54
7,sci.crypt,3,0.66
8,comp.sys.ibm.pc.hardware,5,0.26
9,rec.sport.hockey,9,0.38


In [2]:
"""
Since I am using free tier AWS sagemaker service, the available instance can't handle the much larger dataset.
Therefore I am using only 20k rows datset.

Although the complete code is in PySpark, therefore based on underlying cluster, it can be scaled up for any size of dataset.
"""
print()


