# Environment Setup

In [3]:
#!pip install spark-nlp
#!pip install fastparquet 
#!pip install spark-nlp==2.6.1

In [4]:
import re
import numpy as np
import pandas as pd

from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.ml import Pipeline

import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *

from pyspark.sql.functions import udf
import pyspark.sql.types as T
from pyspark.sql.functions import col, size, length

import pyspark.sql.functions as F
from pyspark.sql.types import *

from pyspark.ml.feature import HashingTF, IDF, StringIndexer, SQLTransformer,IndexToString
from pyspark.ml.feature import CountVectorizer , IDF
from pyspark.ml.clustering import LDA

from custom_utils import CUSTOM_STOP_WORDS

In [5]:
"""
spark = SparkSession.builder \
    .appName("Spark NLP")\
    .master("local[4]")\
    .config("spark.driver.memory","16G")\
    .config("spark.driver.maxResultSize", "0") \
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.6.1")\
    .config("spark.kryoserializer.buffer.max", "1000M")\
    .getOrCreate()

sqlContext = SQLContext(spark)

spark
"""
print()




In [6]:
spark = SparkSession.builder \
    .appName("Spark NLP")\
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.6.1")\
    .getOrCreate()

sqlContext = SQLContext(spark)

spark

# Load data

In [7]:
df_spark = sqlContext.read.parquet("newsgroup_20_data.parquet")
print(df_spark.count())
#df_spark.head(2)

18846


In [8]:
d = df_spark.head(1)
d = d[0]

print(d.asDict()['category'])
print(d.asDict()['news'])

rec.sport.hockey
From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>
Subject: Pens fans reactions
Organization: Post Office, Carnegie Mellon, Pittsburgh, PA
Lines: 12
NNTP-Posting-Host: po4.andrew.cmu.edu



I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game.          PENS RULE!!!




In [9]:
#trainingData = df_spark

# Pre-Process Data

## 1. data cleaning

In [10]:
col_input = "news"
col_label = "category"
col_nlp = 'col_nlp'

In [11]:
def text_cleaner(sentence):
    
    # clean the punctuations
    punc_re = r'[^a-zA-Z0-9 &]'
    sentence = re.sub(punc_re, ' ', sentence)
    
    # tokens
    arr = sentence.split()
    
    # remove white spaces
    # lowercase
    # filter words having lenght <= 3
    arr = [word.strip().lower() for word in arr if word.isalpha() and len(word)>=4]
    
    # remove starting 4 words as they are email id
    arr = arr[20:-4]
    
    arr = " ".join(arr)
    return arr

In [12]:
#data = df_spark.limit(10000)
data = df_spark

udf_text_cleaner = F.udf(text_cleaner, StringType())

data_clean = data.withColumn(col_nlp, udf_text_cleaner(col_input))

print(data_clean.count())
data_clean.limit(2).show()

18846
+--------------------+--------------------+--------------------+
|                news|            category|             col_nlp|
+--------------------+--------------------+--------------------+
|From: Mamatha Dev...|    rec.sport.hockey|sure some bashers...|
|From: mblawson@mi...|comp.sys.ibm.pc.h...|midway uoknor org...|
+--------------------+--------------------+--------------------+



## 2. nlp pre-processing

In [13]:
documentAssembler = DocumentAssembler()\
.setInputCol("col_nlp")\
.setOutputCol("document")

tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

stemmer = Stemmer() \
    .setInputCols(["token"]) \
    .setOutputCol("stem")

stopwords_cleaner = StopWordsCleaner()\
      .setInputCols("stem")\
      .setOutputCol("cleanTokens")\
      .setCaseSensitive(False)\
      #.setStopWords(["no", "without"]) (e.g. read a list of words from a txt)
      
tokenassembler = TokenAssembler()\
    .setInputCols(["document", "cleanTokens"]) \
    .setOutputCol("clean_text")


finisher = Finisher() \
    .setInputCols(["cleanTokens"]) \
    .setIncludeMetadata(False) # set to False to remove metadata

nlpPipeline = Pipeline(stages=[
 documentAssembler, 
 tokenizer,
 stemmer,
 stopwords_cleaner,
 #tokenassembler,
 finisher
 ])

empty_df = spark.createDataFrame([['']]).toDF("col_nlp")
pipelineModel = nlpPipeline.fit(empty_df)

In [14]:
#data_in = data_clean.limit(1000)
data_in = data_clean

data_arr = pipelineModel.transform(data_in)

data_arr = data_arr.withColumnRenamed("finished_cleanTokens", "col_nlp_arr")
data_arr = data_arr.select("news", "category", "col_nlp_arr")

data_arr.limit(2).show()

+--------------------+--------------------+--------------------+
|                news|            category|         col_nlp_arr|
+--------------------+--------------------+--------------------+
|From: Mamatha Dev...|    rec.sport.hockey|[sure, basher, pe...|
|From: mblawson@mi...|comp.sys.ibm.pc.h...|[midwai, uoknor, ...|
+--------------------+--------------------+--------------------+



## 3. tfidf

In [15]:
# TF
cv = CountVectorizer(inputCol="col_nlp_arr", outputCol="raw_features", minDF=10.0)
cvmodel = cv.fit(data_arr)
result_cv = cvmodel.transform(data_arr)

# IDF
idf = IDF(inputCol="raw_features", outputCol="features")
idfModel = idf.fit(result_cv)
result_tfidf = idfModel.transform(result_cv)


data_pp = result_tfidf.select('category', "news", "features")

print(type(data_pp))
print(data_pp.count())
data_pp.limit(2).show()

<class 'pyspark.sql.dataframe.DataFrame'>
18846
+--------------------+--------------------+--------------------+
|            category|                news|            features|
+--------------------+--------------------+--------------------+
|    rec.sport.hockey|From: Mamatha Dev...|(10936,[1,10,25,2...|
|comp.sys.ibm.pc.h...|From: mblawson@mi...|(10936,[13,16,18,...|
+--------------------+--------------------+--------------------+



In [16]:
del df_spark, data_in, data_clean, data, data_arr

# LDA Model

## 1. train model

In [None]:
numTopics = 20 # number of topics
 
lda = LDA(k=numTopics, seed = 1, optimizer="online", optimizeDocConcentration=True,
          maxIter = 100,           # number of iterations
          learningDecay = 0.51,   # kappa, learning rate
          learningOffset = 64.0,  # tau_0, larger values downweigh early iterations
          subsamplingRate = 0.05, # mini batch fraction 
          )
 
model = lda.fit(data_pp)
print("done....")

In [None]:
ll = model.logLikelihood(data_pp)
lp = model.logPerplexity(data_pp)
print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
print("The upper bound on perplexity: " + str(lp))

In [None]:
print("hello")

In [None]:
"""

"""

print("modelling completed..!")

## 2. topic insights

In [None]:
model.vocabSize()

In [None]:
model.describeTopics().first()

In [None]:
print("The topics described by their top-weighted terms:")
model.describeTopics(5).limit(6).show()

In [None]:
model.topicsMatrix()

## 3. topic assignment

In [None]:
max_index = F.udf(lambda x: x.tolist().index(max(x)), IntegerType())


data_lda = model.transform(data_pp)
data_lda = data_lda.withColumn("topicID", max_index("topicDistribution"))

In [None]:
print(data_lda.count())
data_lda.limit(2).show()

In [None]:
"""
topicDistribution : list of topic weights (len==num_topics)
"""
print()

## 4. topic model assesment

In [None]:
#lda_train_data.first()

In [None]:
X_topics = data_lda.select("category", "topicID").toPandas()

print(X_topics.shape)
X_topics.head(2)

In [None]:
def topic_metrics(df):
    #print(df.head(2))
    arr = df["topicID"].value_counts()
    max_topic = arr.index.values[0]
    perc_dominance = arr[max_topic] / arr.sum()
    
    result = pd.Series(data=[int(max_topic), perc_dominance], index=["category_pred", "perc_dominance"])
    
    return result

In [None]:
X = X_topics.copy()
#X = X_topics.head(10)

X_label_mapping = X.groupby("category").apply(topic_metrics).reset_index()
X_label_mapping["category_pred"] = X_label_mapping["category_pred"].astype("int")
X_label_mapping["perc_dominance"] = np.round(X_label_mapping["perc_dominance"], 2)
X_label_mapping = X_label_mapping.sort_values(by=["category_pred", "perc_dominance"], ascending=[True, False])
X_label_mapping = X_label_mapping.reset_index(drop=True)
X_label_mapping

In [None]:
dict_mapper = {}
for i in zip(X_label_mapping["category"], X_label_mapping["category_pred"]):
    dict_mapper[str(i[1])] = i[0]
dict_mapper

In [46]:
custom_mapper = {'0': 'rec.autos',
                 '1': 'talk.politics.misc',
                 '2': 'comp.os.ms-windows.misc',
                 '3': 'sci.crypt',
                 '5': 'comp.sys.ibm.pc.hardware',
                 '9': 'rec.motorcycles',
                 '12': 'sci.med',
                 '14': 'alt.atheism',
                 '15': 'rec.sport.baseball',
                 '16': 'sci.electronics',
                 '18': 'sci.space'}

In [47]:
X = X_topics.copy()
X["topicID"] = X["topicID"].astype("str")
X["category_pred"] = X["topicID"].replace(custom_mapper)

#X["category_pred"] = np.where(len(X["category_pred"])<=3, "NA", X["category_pred"])
X["category_pred"] = X["category_pred"].apply(lambda val : "NA" if len(val)<=3 else val)

print(X.shape)
X.head(2)

(18846, 3)


Unnamed: 0,category,topicID,category_pred
0,rec.sport.hockey,9,rec.motorcycles
1,comp.sys.ibm.pc.hardware,8,


### Classification metrices

In [48]:
from sklearn.metrics import accuracy_score, precision_score, recall_score 
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

In [49]:
y_pred = X["category_pred"]
y_true = X["category"]
#plot_cm()


ac = accuracy_score( y_true, y_pred )
all_vals = precision_recall_fscore_support(y_true, y_pred )
precision = all_vals[0][1]
recall = all_vals[1][1]
fscore = all_vals[2][1]
support = all_vals[3][1]

text_print_plot = \
"""
Confusion Matrix
{} = {} 
{} = {}, {} = {}
{} = {}
{} = {}
""".format(
    'Accuracy', round(ac,2), 
    'Precision', round(precision,2), 
    'Recall', round(recall, 2),
    'Fscore', round(fscore, 2),
    'Support', support
)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [1]:
print(text_print_plot)

NameError: name 'text_print_plot' is not defined

In [None]:
"""
Confusion Matrix
Accuracy = 0.2 
Precision = 0.21, Recall = 0.52
Fscore = 0.3
Support = 799

"""
print()