In [1]:
# Disabled setup code: the triple-quoted string below is a bare string
# expression, so nothing inside it is executed. It holds the snippet that
# builds a local SparkSession with the sagemaker_pyspark jars placed on
# spark.driver.extraClassPath.
"""
import os
import boto3

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

import sagemaker
from sagemaker import get_execution_role
import sagemaker_pyspark

role = get_execution_role()

# Configure Spark to use the SageMaker Spark dependency jars
jars = sagemaker_pyspark.classpath_jars()

classpath = ":".join(sagemaker_pyspark.classpath_jars())

# See the SageMaker Spark Github to learn how to connect to EMR from a notebook instance
spark = SparkSession.builder.config("spark.driver.extraClassPath", classpath)\
    .master("local[*]").getOrCreate()
    
spark
"""
# Emits a single empty line; the cell has no other visible effect.
print()




In [2]:
#!pip install spark-nlp
#!pip install fastparquet 

In [3]:
# Import Spark NLP
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline
import sparknlp
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

In [4]:
#!pip install spark-nlp
#!pip install fastparquet 

In [5]:
# Start a Spark session with Spark NLP.
#
# The session is created by sparknlp.start(). The hand-written builder
# configuration below is disabled and kept for reference only. It is stored as
# comments rather than a string literal so that the backslash line
# continuations are never parsed as string escape sequences.
#
# spark = SparkSession.builder \
#     .appName("BBC Text Categorization") \
#     .config("spark.driver.memory", "8G") \
#     .config("spark.memory.offHeap.enabled", True) \
#     .config("spark.memory.offHeap.size", "8G") \
#     .config("spark.driver.maxResultSize", "2G") \
#     .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.4.5") \
#     .config("spark.kryoserializer.buffer.max", "1000M") \
#     .config("spark.network.timeout", "3600s") \
#     .getOrCreate()

spark = sparknlp.start()

# Last expression of the cell: displays the session object.
spark

# Load data

## pandas

In [6]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

In [7]:
# Fetch the 20 Newsgroups corpus (subset='all') and unpack texts and targets.
data = fetch_20newsgroups(subset='all')
X, y = data.data, data.target

print(data.filenames.shape, len(X), y.shape)

# Lookup table: integer target id -> newsgroup name.
labels = data.target_names
name_dict = dict(enumerate(labels))

# One row per post; the numeric target is swapped for its readable name.
df = pd.DataFrame(data={"news": X, "category": y})
df["category"] = df["category"].replace(name_dict)

# Persist the frame as a gzip-compressed parquet file for the Spark cells.
#df.to_csv("newsgroup_20_data.csv", index=False)
df.to_parquet("newsgroup_20_data.parquet", compression="GZIP")

print(df.shape)
df.head(2)

(18846,) 18846 (18846,)
(18846, 2)


Unnamed: 0,news,category
0,From: Mamatha Devineni Ratnam <mr47+@andrew.cm...,rec.sport.hockey
1,From: mblawson@midway.ecn.uoknor.edu (Matthew ...,comp.sys.ibm.pc.hardware


In [8]:
# Read the parquet file back into pandas through fastparquet and show the
# first two rows.
# NOTE(review): `write` is imported here but not used in this cell.
from fastparquet import write, ParquetFile
df_parquet = ParquetFile("newsgroup_20_data.parquet").to_pandas()
df_parquet.head(2)

Unnamed: 0,news,category
0,From: Mamatha Devineni Ratnam <mr47+@andrew.cm...,rec.sport.hockey
1,From: mblawson@midway.ecn.uoknor.edu (Matthew ...,comp.sys.ibm.pc.hardware


## pyspark

In [9]:
from pyspark.sql import SQLContext

# SQLContext takes a SparkContext as its first argument; the SparkSession is
# passed separately so the context wraps the session created above instead of
# relying on the session object happening to expose the same private
# attributes as a SparkContext.
sqlContext = SQLContext(spark.sparkContext, sparkSession=spark)

In [10]:
# Load the parquet file (same path the pandas cell wrote to) into a Spark
# DataFrame and print its row count.
df_spark = sqlContext.read.parquet("newsgroup_20_data.parquet")
print(df_spark.count())
#df_spark.head(2)

18846


In [11]:
# Take the first Row of the Spark DataFrame and print its two fields.
d = df_spark.head(1)[0]
first_row = d.asDict()

print(first_row['category'])
print(first_row['news'])

rec.sport.hockey
From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>
Subject: Pens fans reactions
Organization: Post Office, Carnegie Mellon, Pittsburgh, PA
Lines: 12
NNTP-Posting-Host: po4.andrew.cmu.edu



I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game.          PENS RULE!!!




In [12]:
# Randomly split the rows into train/test using weights 0.7 and 0.3; the seed
# is fixed at 100.
(trainingData, testData) = df_spark.randomSplit([0.7, 0.3], seed = 100)

# NLP Pipeline using Spark NLP

In [13]:
from pyspark.ml.feature import HashingTF, IDF, StringIndexer, SQLTransformer,IndexToString
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator# convert text column to spark nlp document

In [14]:
input_col = "news"
output_col = "document"

# Convert the raw text column into a Spark NLP document annotation.
document_assembler = DocumentAssembler() \
    .setInputCol(input_col) \
    .setOutputCol(output_col)

# Convert document to array of tokens.
tokenizer = Tokenizer() \
    .setInputCols([output_col]) \
    .setOutputCol("token")

# Clean tokens.
normalizer = Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normalized")

# Remove stopwords (matching is case-insensitive).
stopwords_cleaner = StopWordsCleaner() \
    .setInputCols(["normalized"]) \
    .setOutputCol("cleanTokens") \
    .setCaseSensitive(False)

# Stem tokens to bring them to root form.
stemmer = Stemmer() \
    .setInputCols(["cleanTokens"]) \
    .setOutputCol("stem")

# Convert the annotation structure to a plain array of token strings; the
# intermediate annotation columns are kept (cleanAnnotations=False).
finisher = Finisher() \
    .setInputCols(["stem"]) \
    .setOutputCols(["token_features"]) \
    .setOutputAsArray(True) \
    .setCleanAnnotations(False)

# Term frequency via feature hashing into 1000 buckets.
hashingTF = HashingTF(inputCol="token_features", outputCol="rawFeatures", numFeatures=1000)

# Inverse document frequency; terms appearing in fewer than 5 documents are ignored.
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)

# Convert string labels to integer indices. The indexer is fitted here, on the
# training split, so that its label list can be passed to IndexToString below.
# A fitted StringIndexerModel is still a valid Pipeline stage.
# NOTE: if the pipeline is later fitted on a different DataFrame, refit this
# indexer on that DataFrame as well.
label_stringIdx = StringIndexer(inputCol="category", outputCol="label").fit(trainingData)

# Simple multinomial logistic regression model. Try different combinations of
# hyperparameters (or other algorithms) and compare the scores.
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.0)

# Convert the predicted index back to its class name. The input must be the
# "prediction" column (reading "label" would only reproduce the true category),
# and the labels are given explicitly because the prediction column does not
# carry the indexer's label metadata.
label_to_stringIdx = IndexToString(
    inputCol="prediction", outputCol="article_class", labels=label_stringIdx.labels)

In [15]:
# Full classification pipeline: Spark NLP text preprocessing, TF-IDF features,
# label indexing, logistic regression, and index-to-string conversion.
classification_stages = [
    document_assembler,
    tokenizer,
    normalizer,
    stopwords_cleaner,
    stemmer,
    finisher,
    hashingTF,
    idf,
    label_stringIdx,
    lr,
    label_to_stringIdx,
]
nlp_pipeline = Pipeline(stages=classification_stages)

# Reduced pipeline containing only document assembly and tokenization.
tokenize_only_stages = [document_assembler, tokenizer]
nlp_pipeline2 = Pipeline(stages=tokenize_only_stages)

In [16]:
# Fit all pipeline stages on the training split.
# NOTE(review): this rebinds `data`, which earlier held the
# fetch_20newsgroups result, to a Spark DataFrame.
data = trainingData
pipeline_model = nlp_pipeline.fit(data)

In [25]:
#data = trainingData.limit(10)
#data.count()

# Perform Predictions

In [18]:
# Apply the fitted pipeline to the test split; the result is used by the
# evaluation cells below.
predictions =  pipeline_model.transform(testData)

In [19]:
# Number of rows in the predictions DataFrame.
predictions.count()

5588

## Evaluate the Model

In [20]:
# import evaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy of the "prediction" column against the "label" column.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy", labelCol="label", predictionCol="prediction")
accuracy = evaluator.evaluate(predictions)
test_error = 1.0 - accuracy

print("Accuracy = %g" % accuracy)
print("Test Error = %g " % test_error)

Accuracy = 0.706156
Test Error = 0.293844 


In [21]:
# Weighted precision of the "prediction" column against the "label" column.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="weightedPrecision")
weightedPrecision = evaluator.evaluate(predictions)

print("weightedPrecision = %g" % (weightedPrecision))
# The complement of precision is not the test error (that is 1 - accuracy),
# so it is labelled for what it actually is.
print("1 - weightedPrecision = %g " % (1.0 - weightedPrecision))

weightedPrecision = 0.712184
Test Error = 0.287816 


In [22]:
# Weighted recall of the "prediction" column against the "label" column.
evaluator = MulticlassClassificationEvaluator(
    metricName="weightedRecall", labelCol="label", predictionCol="prediction")
weightedRecall = evaluator.evaluate(predictions)
recall_complement = 1.0 - weightedRecall

print("weightedRecall = %g" % weightedRecall)
print("Test Error = %g " % recall_complement)

weightedRecall = 0.706156
Test Error = 0.293844 


In [23]:
#pipeline_model.save('/path/to/storage_location')

In [24]:
# Preview: take two rows of the Spark DataFrame and convert them to pandas
# for display.
#df_spark.show()
df_spark.limit(2).toPandas()

Unnamed: 0,news,category
0,From: Mamatha Devineni Ratnam <mr47+@andrew.cm...,rec.sport.hockey
1,From: mblawson@midway.ecn.uoknor.edu (Matthew ...,comp.sys.ibm.pc.hardware
