# Environment Setup

In [1]:
#!pip install spark-nlp
#!pip install fastparquet 
#!pip install spark-nlp==2.6.1

In [2]:
import re
import numpy as np
import pandas as pd

from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.ml import Pipeline

import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *

from pyspark.sql.functions import udf
import pyspark.sql.types as T

import pyspark.sql.functions as F
from pyspark.sql.types import *

from pyspark.ml.feature import HashingTF, IDF, StringIndexer, SQLTransformer,IndexToString
from pyspark.ml.feature import CountVectorizer , IDF
from pyspark.ml.clustering import LDA

from utils import CUSTOM_STOP_WORDS

In [3]:
"""
spark = SparkSession.builder \
    .appName("Spark NLP")\
    .master("local[4]")\
    .config("spark.driver.memory","16G")\
    .config("spark.driver.maxResultSize", "0") \
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.6.1")\
    .config("spark.kryoserializer.buffer.max", "1000M")\
    .getOrCreate()

sqlContext = SQLContext(spark)

spark
"""
print()




In [4]:
spark = SparkSession.builder \
    .appName("Spark NLP")\
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.6.1")\
    .getOrCreate()

sqlContext = SQLContext(spark)

spark

# Load data

In [5]:
df_spark = sqlContext.read.parquet("newsgroup_20_data.parquet")
print(df_spark.count())
#df_spark.head(2)

18846


In [6]:
d = df_spark.head(1)
d = d[0]

print(d.asDict()['category'])
#print(d.asDict()['news'])

rec.sport.hockey


In [7]:
#trainingData = df_spark

# Pre-Process Data

## 1. data cleaning

In [8]:
col_input = "news"
col_label = "category"
col_nlp = 'col_nlp'

In [9]:
def text_cleaner(sentence):
    
    # clean the punctuations
    punc_re = r'[^a-zA-Z0-9 &]'
    sentence = re.sub(punc_re, ' ', sentence)
    
    # tokens
    arr = sentence.split()
    
    # remove white spaces
    # lowercase
    # filter words having lenght <= 3
    arr = [word.strip().lower() for word in arr if word.isalpha() and len(word)>=4]
    
    arr = " ".join(arr)
    return arr

In [10]:
#data = df_spark.limit(10000)
data = df_spark

udf_text_cleaner = F.udf(text_cleaner, StringType())

data_clean = data.withColumn(col_nlp, udf_text_cleaner(col_input))

print(data_clean.count())
data_clean.limit(2).show()

18846
+--------------------+--------------------+--------------------+
|                news|            category|             col_nlp|
+--------------------+--------------------+--------------------+
|From: Mamatha Dev...|    rec.sport.hockey|from mamatha devi...|
|From: mblawson@mi...|comp.sys.ibm.pc.h...|from mblawson mid...|
+--------------------+--------------------+--------------------+



In [11]:
from pyspark.sql.functions import col, size, length

In [12]:

#df = data_clean.filter(col("col_nlp").contains(col("number")))
#df = data_clean.where(length(col("col_nlp")) >=  3)

#df.show()

## 2. vectorizer

In [13]:
def type_changer(sentence):
    return sentence.split(" ")
udf_type_changer = F.udf(type_changer, ArrayType(elementType=StringType()))

#data_in = data_clean.limit(1000)
data_in = data_clean

# get tokens
data_arr = data_in.withColumn("col_nlp_arr", udf_type_changer(col_nlp))

data_arr = data_arr.where(size(col("col_nlp_arr")) >= 50)

# TF
cv = CountVectorizer(inputCol="col_nlp_arr", outputCol="raw_features", minDF=10.0)
cvmodel = cv.fit(data_arr)
result_cv = cvmodel.transform(data_arr)

# IDF
idf = IDF(inputCol="raw_features", outputCol="features")
idfModel = idf.fit(result_cv)
result_tfidf = idfModel.transform(result_cv)

data_pp = result_tfidf.select('category', col_input, "features")

print(type(data_pp))
print(data_pp.count())
data_pp.limit(2).show()

<class 'pyspark.sql.dataframe.DataFrame'>
16339
+--------------------+--------------------+--------------------+
|            category|                news|            features|
+--------------------+--------------------+--------------------+
|    rec.sport.hockey|From: Mamatha Dev...|(16849,[1,3,4,5,6...|
|comp.sys.ibm.pc.h...|From: mblawson@mi...|(16849,[0,1,3,4,6...|
+--------------------+--------------------+--------------------+



In [14]:
del df_spark, data_in, data_clean, data

# LDA Model

## 1. train model

In [15]:
numTopics = 20 # number of topics
 
lda = LDA(k=numTopics, seed = 1, optimizer="online", optimizeDocConcentration=True,
          maxIter = 10,           # number of iterations
          learningDecay = 0.51,   # kappa, learning rate
          learningOffset = 64.0,  # tau_0, larger values downweigh early iterations
          subsamplingRate = 0.05, # mini batch fraction 
          )
 
model = lda.fit(data_pp)
print("done....")

done....


In [16]:
#ll = model.logLikelihood(data_pp.limit(1000))
#lp = model.logPerplexity(data_pp.limit(1000))
#print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
#print("The upper bound on perplexity: " + str(lp))

In [17]:
print("modelling completed..!")

modelling completed..!


## 2. topic insights

In [18]:
model.vocabSize()

16849

In [19]:
model.describeTopics().first()

Row(topic=0, termIndices=[189, 506, 489, 470, 202, 87, 1827, 1845, 348, 1189], termWeights=[0.009341909851538453, 0.008010012331209352, 0.007990287031446608, 0.006334700437839493, 0.004914310767257193, 0.0042202160860755076, 0.003952959419234768, 0.003765844853605157, 0.0029929132313235244, 0.002910429969153036])

In [20]:
print("The topics described by their top-weighted terms:")
model.describeTopics(5).limit(6).show()

The topics described by their top-weighted terms:
+-----+--------------------+--------------------+
|topic|         termIndices|         termWeights|
+-----+--------------------+--------------------+
|    0|[189, 506, 489, 4...|[0.00934190985153...|
|    1|[1963, 758, 1335,...|[0.00385391834769...|
|    2|[286, 228, 731, 3...|[0.00694982925464...|
|    3|[1919, 111, 5670,...|[0.00671800750702...|
|    4|[5, 24, 66, 31, 342]|[0.00716124958815...|
|    5|[681, 563, 435, 6...|[0.00446859164874...|
+-----+--------------------+--------------------+



In [21]:
model.topicsMatrix()

DenseMatrix(16849, 20, [499.2241, 0.3279, 559.173, 466.4238, 437.912, 346.0993, 0.3215, 2.102, ..., 0.3202, 0.3108, 0.3806, 10.4016, 21.3329, 6.7178, 0.3595, 2.4909], 0)

## 3. topic assignment

In [22]:
max_index = F.udf(lambda x: x.tolist().index(max(x)), IntegerType())


data_lda = model.transform(data_pp)
data_lda = data_lda.withColumn("topicID", max_index("topicDistribution"))

In [23]:
print(data_lda.count())
data_lda.limit(2).show()

16339
+--------------------+--------------------+--------------------+--------------------+-------+
|            category|                news|            features|   topicDistribution|topicID|
+--------------------+--------------------+--------------------+--------------------+-------+
|    rec.sport.hockey|From: Mamatha Dev...|(16849,[1,3,4,5,6...|[1.71382079449240...|      8|
|comp.sys.ibm.pc.h...|From: mblawson@mi...|(16849,[0,1,3,4,6...|[1.96591111033315...|     19|
+--------------------+--------------------+--------------------+--------------------+-------+



In [24]:
"""
topicDistribution : list of topic weights (len==num_topics)
"""
print()




## 4. topic model assesment

In [25]:
#lda_train_data.first()

In [26]:
X_topics = data_lda.select("category", "topicID").toPandas()

print(X_topics.shape)
X_topics.head(2)

(16339, 2)


Unnamed: 0,category,topicID
0,rec.sport.hockey,8
1,comp.sys.ibm.pc.hardware,19


In [29]:
def topic_metrics(df):
    #print(df.head(2))
    arr = df["topicID"].value_counts()
    max_topic = arr.index.values[0]
    perc_dominance = arr[max_topic] / arr.sum()
    
    result = pd.Series(data=[int(max_topic), perc_dominance], index=["category_pred", "perc_dominance"])
    
    return result

In [31]:
X = X_topics.copy()
#X = X_topics.head(10)

X_label_mapping = X.groupby("category").apply(topic_metrics).reset_index()
X_label_mapping["category_pred"] = X_label_mapping["category_pred"].astype("int")
X_label_mapping["perc_dominance"] = np.round(X_label_mapping["perc_dominance"], 2)
X_label_mapping = X_label_mapping.sort_values(by=["category_pred", "perc_dominance"], ascending=[True, False])
X_label_mapping = X_label_mapping.reset_index(drop=True)
X_label_mapping

Unnamed: 0,category,category_pred,perc_dominance
0,rec.sport.hockey,1,0.47
1,talk.politics.guns,4,0.17
2,misc.forsale,7,0.29
3,comp.sys.mac.hardware,7,0.2
4,rec.autos,7,0.13
5,sci.crypt,8,0.25
6,comp.sys.ibm.pc.hardware,10,0.31
7,talk.politics.misc,11,0.25
8,talk.politics.mideast,12,0.36
9,talk.religion.misc,12,0.21


In [36]:
X_topics["category"].value_counts()

sci.crypt                   931
sci.space                   917
soc.religion.christian      909
rec.motorcycles             902
rec.sport.hockey            893
sci.med                     888
talk.politics.mideast       875
talk.politics.guns          867
rec.autos                   847
sci.electronics             828
rec.sport.baseball          824
comp.windows.x              817
comp.os.ms-windows.misc     815
comp.sys.ibm.pc.hardware    813
comp.sys.mac.hardware       780
alt.atheism                 751
talk.politics.misc          731
comp.graphics               728
misc.forsale                639
talk.religion.misc          584
Name: category, dtype: int64

In [38]:
dict_mapper = {}
for i in zip(X_label_mapping["category"], X_label_mapping["category_pred"]):
    dict_mapper[str(i[1])] = i[0]
dict_mapper

{'1': 'rec.sport.hockey',
 '4': 'talk.politics.guns',
 '7': 'rec.autos',
 '8': 'sci.crypt',
 '10': 'comp.sys.ibm.pc.hardware',
 '11': 'talk.politics.misc',
 '12': 'talk.religion.misc',
 '14': 'rec.sport.baseball',
 '15': 'sci.electronics',
 '16': 'alt.atheism',
 '17': 'sci.space',
 '18': 'comp.graphics'}

In [39]:
custom_mapper = {'1': 'rec.sport.hockey',
                 '4': 'talk.politics.guns',
                 '7': 'rec.autos',
                 '8': 'sci.crypt',
                 '10': 'comp.sys.ibm.pc.hardware',
                 '11': 'talk.politics.misc',
                 '12': 'talk.religion.misc',
                 '14': 'rec.sport.baseball',
                 '15': 'sci.electronics',
                 '16': 'alt.atheism',
                 '17': 'sci.space',
                 '18': 'comp.graphics'}

In [51]:
X = X_topics.copy()
X["topicID"] = X["topicID"].astype("str")
X["category_pred"] = X["topicID"].replace(custom_mapper)

#X["category_pred"] = np.where(len(X["category_pred"])<=3, "NA", X["category_pred"])
X["category_pred"] = X["category_pred"].apply(lambda val : "NA" if len(val)<=3 else val)

print(X.shape)
X.head(2)

(16339, 3)


Unnamed: 0,category,topicID,category_pred
0,rec.sport.hockey,8,sci.crypt
1,comp.sys.ibm.pc.hardware,19,


### Classification metrices

In [65]:
from sklearn.metrics import accuracy_score, precision_score, recall_score 
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

In [63]:
y_pred = X["category_pred"]
y_true = X["category"]
#plot_cm()


ac = accuracy_score( y_true, y_pred )
all_vals = precision_recall_fscore_support(y_true, y_pred )
precision = all_vals[0][1]
recall = all_vals[1][1]
fscore = all_vals[2][1]
support = all_vals[3][1]

text_print_plot = \
"""
Confusion Matrix
{} = {} 
{} = {}, {} = {}
{} = {}
{} = {}
""".format(
    'Accuracy', round(ac,2), 
    'Precision', round(precision,2), 
    'Recall', round(recall, 2),
    'Fscore', round(fscore, 2),
    'Support', support
)


In [64]:
print(text_print_plot)


Confusion Matrix
Accuracy = 0.17 
Precision = 0.21, Recall = 0.31
Fscore = 0.25
Support = 751

