# Environment Setup

In [1]:
#!pip install spark-nlp
#!pip install fastparquet 
#!pip install spark-nlp==2.6.1

In [2]:
import re
import numpy as np
import pandas as pd

from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.ml import Pipeline

import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *

from pyspark.sql.functions import udf
import pyspark.sql.types as T

import pyspark.sql.functions as F
from pyspark.sql.types import *

from pyspark.ml.feature import HashingTF, IDF, StringIndexer, SQLTransformer,IndexToString
from pyspark.ml.feature import CountVectorizer , IDF
from pyspark.ml.clustering import LDA

from utils import CUSTOM_STOP_WORDS

In [3]:
"""
spark = SparkSession.builder \
    .appName("Spark NLP")\
    .master("local[4]")\
    .config("spark.driver.memory","16G")\
    .config("spark.driver.maxResultSize", "0") \
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.6.1")\
    .config("spark.kryoserializer.buffer.max", "1000M")\
    .getOrCreate()

sqlContext = SQLContext(spark)

spark
"""
print()




In [4]:
spark = SparkSession.builder \
    .appName("Spark NLP")\
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.6.1")\
    .getOrCreate()

sqlContext = SQLContext(spark)

spark

# Load data

In [5]:
df_spark = sqlContext.read.parquet("newsgroup_20_data.parquet")
print(df_spark.count())
#df_spark.head(2)

18846


In [6]:
d = df_spark.head(1)
d = d[0]

print(d.asDict()['category'])
#print(d.asDict()['news'])

rec.sport.hockey


In [7]:
#trainingData = df_spark

# Pre-Process Data

## 1. data cleaning

In [8]:
col_input = "news"
col_label = "category"
col_nlp = 'col_nlp'

In [9]:
def text_cleaner(sentence):
    
    # clean the punctuations
    punc_re = r'[^a-zA-Z0-9 &]'
    sentence = re.sub(punc_re, ' ', sentence)
    
    # tokens
    arr = sentence.split()
    
    # remove white spaces
    # lowercase
    # filter words having lenght <= 3
    arr = [word.strip().lower() for word in arr if word.isalpha() and len(word)>=4]
    
    arr = " ".join(arr)
    return arr

In [10]:
data = df_spark.limit(10000)
#data = df_spark

udf_text_cleaner = F.udf(text_cleaner, StringType())

data_clean = data.withColumn(col_nlp, udf_text_cleaner(col_input))

print(data_clean.count())
data_clean.limit(2).show()

10000
+--------------------+--------------------+--------------------+
|                news|            category|             col_nlp|
+--------------------+--------------------+--------------------+
|From: Mamatha Dev...|    rec.sport.hockey|from mamatha devi...|
|From: mblawson@mi...|comp.sys.ibm.pc.h...|from mblawson mid...|
+--------------------+--------------------+--------------------+



In [11]:
from pyspark.sql.functions import col, size, length

In [12]:

#df = data_clean.filter(col("col_nlp").contains(col("number")))
#df = data_clean.where(length(col("col_nlp")) >=  3)

#df.show()

## 2. vectorizer

In [13]:
def type_changer(sentence):
    return sentence.split(" ")
udf_type_changer = F.udf(type_changer, ArrayType(elementType=StringType()))

#data_in = data_clean.limit(1000)
data_in = data_clean

# get tokens
data_arr = data_in.withColumn("col_nlp_arr", udf_type_changer(col_nlp))

data_arr = data_arr.where(size(col("col_nlp_arr")) >= 50)

# TF
cv = CountVectorizer(inputCol="col_nlp_arr", outputCol="raw_features", minDF=10.0)
cvmodel = cv.fit(data_arr)
result_cv = cvmodel.transform(data_arr)

# IDF
idf = IDF(inputCol="raw_features", outputCol="features")
idfModel = idf.fit(result_cv)
result_tfidf = idfModel.transform(result_cv)

data_pp = result_tfidf.select('category', col_input, "features")

print(type(data_pp))
print(data_pp.count())
data_pp.limit(2).show()

<class 'pyspark.sql.dataframe.DataFrame'>
8672
+--------------------+--------------------+--------------------+
|            category|                news|            features|
+--------------------+--------------------+--------------------+
|    rec.sport.hockey|From: Mamatha Dev...|(11222,[1,3,4,5,6...|
|comp.sys.ibm.pc.h...|From: mblawson@mi...|(11222,[0,1,3,4,6...|
+--------------------+--------------------+--------------------+



# LDA Model

## 1. train model

In [14]:
numTopics = 20 # number of topics
 
lda = LDA(k=numTopics, seed = 1, optimizer="online", optimizeDocConcentration=True,
          maxIter = 10,           # number of iterations
          learningDecay = 0.51,   # kappa, learning rate
          learningOffset = 64.0,  # tau_0, larger values downweigh early iterations
          subsamplingRate = 0.05, # mini batch fraction 
          )
 
model = lda.fit(data_pp)
print("done....")

done....


In [18]:
ll = model.logLikelihood(data_pp.limit(1000))
lp = model.logPerplexity(data_pp.limit(1000))
print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
print("The upper bound on perplexity: " + str(lp))

The lower bound on the log likelihood of the entire corpus: -3353019.809906279
The upper bound on perplexity: 8.438440880488498


In [1]:
print("modelling completed..!")

modelling completed..!


## 2. topic insights

In [18]:
model.vocabSize()

11222

In [19]:
model.describeTopics().first()

Row(topic=0, termIndices=[5, 23, 62, 292, 387, 2438, 338, 1082, 25, 140], termWeights=[0.006638956653846592, 0.0064438381700256155, 0.005173762968008978, 0.005089163464615525, 0.004763270586481971, 0.004142239044601799, 0.0037997531017483987, 0.0032487619708818087, 0.0032111439308443713, 0.00319652052859968])

In [20]:
print("The topics described by their top-weighted terms:")
model.describeTopics(5).limit(6).show()

The topics described by their top-weighted terms:
+-----+--------------------+--------------------+
|topic|         termIndices|         termWeights|
+-----+--------------------+--------------------+
|    0|[5, 23, 62, 292, ...|[0.00663895665384...|
|    1|[302, 439, 15, 12...|[0.00644475641676...|
|    2|[259, 23, 241, 19...|[0.00431153449081...|
|    3|[219, 1552, 1687,...|[0.01006660274848...|
|    4|[300, 370, 88, 33...|[0.00613868741279...|
|    5|[936, 829, 909, 1...|[0.00489366810433...|
+-----+--------------------+--------------------+



In [21]:
model.topicsMatrix()

DenseMatrix(11222, 20, [618.6914, 0.3279, 267.4274, 239.3289, 383.351, 1490.7257, 0.3215, 0.9759, ..., 0.4557, 0.3079, 0.5633, 12.3413, 0.711, 0.3757, 0.486, 0.3415], 0)

In [22]:
del df_spark

In [23]:
del data_in, data_clean, data

## 3. topic assignment

In [24]:
max_index = F.udf(lambda x: x.tolist().index(max(x)), IntegerType())


data_lda = model.transform(data_pp)
data_lda = data_lda.withColumn("topicID", max_index("topicDistribution"))

In [25]:
print(data_lda.count())
data_lda.limit(2).show()

8672
+--------------------+--------------------+--------------------+--------------------+-------+
|            category|                news|            features|   topicDistribution|topicID|
+--------------------+--------------------+--------------------+--------------------+-------+
|    rec.sport.hockey|From: Mamatha Dev...|(11222,[1,3,4,5,6...|[1.84369692171841...|     19|
|comp.sys.ibm.pc.h...|From: mblawson@mi...|(11222,[0,1,3,4,6...|[2.11782672755489...|     16|
+--------------------+--------------------+--------------------+--------------------+-------+



In [26]:
"""
topicDistribution : list of topic weights (len==num_topics)
"""
print()




## 4. topic model assesment

In [27]:
#lda_train_data.first()

In [28]:
X_topics = data_lda.select("category", "topicID").toPandas()

print(X_topics.shape)
X_topics.head(2)

(8672, 2)


Unnamed: 0,category,topicID
0,rec.sport.hockey,19
1,comp.sys.ibm.pc.hardware,16


In [29]:
def topic_metrics(df):
    #print(df.head(2))
    arr = df["topicID"].value_counts()
    max_topic = arr.index.values[0]
    perc_dominance = arr[max_topic] / arr.sum()
    
    result = pd.Series(data=[int(max_topic), perc_dominance], index=["category_pred", "perc_dominance"])
    
    return result

In [30]:
X = X_topics.copy()
#X = X_topics.head(10)

X_label_mapping = X.groupby("category").apply(topic_metrics).reset_index()
X_label_mapping["category_pred"] = X_label_mapping["category_pred"].astype("int")
X_label_mapping["perc_dominance"] = np.round(X_label_mapping["perc_dominance"], 2)
X_label_mapping

Unnamed: 0,category,category_pred,perc_dominance
0,alt.atheism,1,0.38
1,comp.graphics,16,0.41
2,comp.os.ms-windows.misc,16,0.43
3,comp.sys.ibm.pc.hardware,13,0.37
4,comp.sys.mac.hardware,16,0.29
5,comp.windows.x,16,0.29
6,misc.forsale,16,0.35
7,rec.autos,8,0.2
8,rec.motorcycles,8,0.13
9,rec.sport.baseball,7,0.5


In [None]:
dict_mapper = {}
for i in zip(X_label_mapping["category"], X_label_mapping["category_pred"]):
    dict_mapper[i[0]] = int(i[1])
    
X = X_topics.copy()
X["category_pred"] = X["topicID"].replace(dict_mapper)

print(X.shape)
X.head(2)

In [None]:
X_topics.info()