In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import length,udf
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import Tokenizer,StopWordsRemover,CountVectorizer,IDF,StringIndexer,VectorAssembler
from pyspark.ml import pipeline,Pipeline
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import BinaryClassificationEvaluator,MulticlassClassificationEvaluator


In [0]:
# Get (or lazily create) the Spark session used by every cell below, under the app name 'NLP'.
SS = (
    SparkSession.builder
    .appName('NLP')
    .getOrCreate()
)

In [0]:
# Load the tab-separated SMS file; no header is declared, so the columns
# arrive with Spark's positional names (_c0, _c1, ...).
df = SS.read.csv(
    '/FileStore/tables/SMSSpamCollection',
    sep='\t',
    inferSchema=True,
)

In [0]:
# Quick look at the raw rows (these are show()'s default settings, spelled out).
df.show(n=20, truncate=True)

In [0]:
# Replace the positional column names with meaningful ones: _c0 -> Class, _c1 -> text.
df = (
    df
    .withColumnRenamed('_c0', 'Class')
    .withColumnRenamed('_c1', 'text')
)

In [0]:
def length(x):
  """Return the number of characters in ``x``, or ``None`` when ``x`` is ``None``.

  The function is wrapped as a Spark UDF, where a null cell reaches Python as
  ``None``; calling ``len(None)`` would raise ``TypeError`` and abort the job,
  so nulls are passed through instead.

  NOTE: this definition rebinds the name ``length`` that the import cell
  brings in from ``pyspark.sql.functions``.
  """
  if x is None:
    return None
  return len(x)

# Wrap the Python-level length() as a Spark UDF that yields an integer column.
new_udf = udf(length, returnType=IntegerType())

In [0]:
# Add a 'length' column by applying new_udf to every value of 'text'.
df = df.withColumn(colName='length', col=new_udf(df['text']))

In [0]:
# 70/30 train/test split. The fixed seed keeps the split (and therefore every
# downstream metric) the same from one run of the notebook to the next.
train, test = df.randomSplit([0.7, 0.3], seed=42)

In [0]:
# Feature/model pipeline. Each stage consumes the column written by an earlier one:
#   Class -> label
#   text -> t_text -> Stopwords -> Vector -> idf_vec
#   (idf_vec, length) -> features -> NaiveBayes

# Encode the string 'Class' column as the numeric 'label' column.
Categorize = StringIndexer(inputCol='Class', outputCol='label')

# Split each message into tokens.
Token = Tokenizer(inputCol='text', outputCol='t_text')

# Drop stop words from the token list.
Stopwords = StopWordsRemover(inputCol='t_text', outputCol='Stopwords')

# Turn the remaining tokens into term-count vectors.
countvec = CountVectorizer(inputCol='Stopwords', outputCol='Vector')

# Re-weight the term counts by inverse document frequency.
idf = IDF(inputCol='Vector', outputCol='idf_vec')

# Combine the TF-IDF vector and the message length into one feature vector.
VA = VectorAssembler(
    inputCols=['idf_vec', 'length'],
    outputCol='features',
)

# Classifier; the column names below are NaiveBayes' defaults, written out
# to make the wiring to the stages above visible.
NB = NaiveBayes(featuresCol='features', labelCol='label')

pipe = Pipeline(
    stages=[
        Categorize,
        Token,
        Stopwords,
        countvec,
        idf,
        VA,
        NB,
    ]
)

In [0]:
# Fit every pipeline stage on the training split only.
pipeline_p = pipe.fit(dataset=train)

In [0]:
# Run the fitted pipeline over the held-out split to obtain predictions.
pred = pipeline_p.transform(dataset=test)

In [0]:
# F1 evaluator; the column names are the evaluator's defaults, written out explicitly.
mce = MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol='label',
    metricName='f1',
)

In [0]:
# F1 score of the predictions on the held-out split.
eval_1 = mce.evaluate(dataset=pred)

In [0]:
# Display the score stored in eval_1 as the cell output.
eval_1

In [0]:
Bc = BinaryClassificationEvaluator(rawPredictionCol='prediction',metricName='accuracy')

In [0]:
eval_1 = mce.evaluate(pred)