### Create SPARK_HOME and PYLIB env var and update PATH env var

In [1]:
## Set Python - Spark environment.
import os
import sys
os.environ["SPARK_HOME"] = "/usr/hdp/current/spark2-client"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"
sys.path.insert(0, os.environ["PYLIB"] + "/py4j-0.10.6-src.zip")
sys.path.insert(0, os.environ["PYLIB"] + "/pyspark.zip")


### Initializing Spark

Build __SparkConf__ object 

    Contains information about your application.  


Create __SparkContext__ object 
    
    Tells Spark how to access a cluster. 
    

Create __SparkSession__ object

    The entry point to programming Spark with the Dataset and DataFrame API.

    Used to create DataFrame, register DataFrame as tables and execute SQL over tables etc.
    


#### Create the spark session with master local[* ] and app name with textapp_username

In [2]:
## Create  SparkSession
from pyspark.sql import SparkSession
from pyspark import SparkConf
spark = SparkSession.builder.master("local[*]") .appName("Structured_stream_SAI").getOrCreate()

### Loading the dependent libraries

In [3]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

### Problem Statement

#### Description:
The SMS Spam Collection v.1 (hereafter the corpus) is a set of SMS tagged messages that have been collected for SMS Spam research. It contains one set of SMS messages in English of 5,574 messages, tagged acording being ham (legitimate) or spam. 

### Reading the data and creating a dataframe

In [25]:
## Read data and create a dataframe
data = spark.read.csv("file:///home/2617B56/StrucutredStreaming/SMSSpamCollectionTrain",sep='\t',header=False)

In [40]:
# Show 3 rows without truncation
data.show(3)

+-----------+--------------------+
|messageType|             message|
+-----------+--------------------+
|        ham|Go until jurong p...|
|        ham|Ok lar... Joking ...|
|       spam|Free entry in 2 a...|
+-----------+--------------------+
only showing top 3 rows



#### Rename Columns as messageType and message

#data =
data = data.withColumnRenamed('_c0','target') 
data = data.withColumnRenamed('_c1','msgs')` 

In [39]:
data = data.withColumnRenamed('masg','message').withColumnRenamed('trgt','messageType')

### Understanding Data

#### Print Schema

In [41]:
data.printSchema()

root
 |-- messageType: string (nullable = true)
 |-- message: string (nullable = true)



#### Total number of Columns and Records

### Data Preprocessing

Checking for null values at each column

In [11]:
data.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in data.columns]).show()

+-----------+-------+
|messageType|message|
+-----------+-------+
|          0|      0|
+-----------+-------+



### Tokenizer

Tokenization is the process of taking text (such as a sentence) and breaking it into individual terms (usually words). Let's tokenize the messages and create a list of words of each message.

In [47]:
from pyspark.ml.feature import Tokenizer
tokenizer  = Tokenizer(inputCol = 'message',outputCol = 'words')

###  CountVectorizer

CountVectorizer converts the list of tokens above to vectors of token counts. 

In [48]:
from pyspark.ml.feature import CountVectorizer
countvectorizer = CountVectorizer(inputCol = 'words' , outputCol = 'rawFeature')

### Inverse Document Frequency

IDF down-weighs features which appear frequently in a corpus. This generally improves performance when using text as features since most frequent, and hence less important words, get down-weighed.

In [50]:
from pyspark.ml.feature import IDF

idf = IDF(inputCol = 'rawFeature',outputCol = 'features')

### String Indexer

In [52]:
from pyspark.ml.feature import StringIndexer
stringindexer = StringIndexer(inputCol = 'messageType' , outputCol = 'label')

### Create list of preprocessing Pipeline Stages

In [53]:
preprocessing_Stages = [tokenizer]+[countvectorizer]+[idf]+[stringindexer] 

### Build Logistic Regression Classification Model

In [55]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(labelCol='label',featuresCol="features")

In [59]:
from pyspark.ml import Pipeline

In [60]:
lr_Pipeline = Pipeline(stages=preprocessing_Stages + [lr]) 

lr_Pipeline_model = lr_Pipeline.fit(data)

### Save the Pipeline Model

In [61]:
lr_Pipeline_model.save("give path")

In [62]:
model = lr_Pipeline_model.load("give path")

In [63]:
testdata = spark.read.csv("file:///home/2617B56/StrucutredStreaming/SMSSpamCollectionTest",sep='\t',header=False)

##### Rename the columns as in model

In [71]:
testdata.show(3)

+-----------+--------------------+
|messageType|             message|
+-----------+--------------------+
|        ham|Go until jurong p...|
|        ham|Ok lar... Joking ...|
|       spam|Free entry in 2 a...|
+-----------+--------------------+
only showing top 3 rows



In [66]:
testdata = testdata.withColumnRenamed('_c0','messageType').withColumnRenamed('_c1','message')

##### Run the model on test data to get predictions and select only message,rawprediction and prediction for output

In [70]:
! hdfs dfs -ls

Found 9 items
drwx------   - 2617B56 2617B56          0 2019-04-13 16:03 .staging
drwxr-xr-x   - 2617B56 2617B56          0 2019-03-30 16:27 HDFS_DIR
-rw-r--r--   6 2617B56 2617B56         26 2019-03-30 16:32 Sample.txt
drwxr-xr-x   - 2617B56 2617B56          0 2019-04-21 11:27 Uber
drwxr-xr-x   - 2617B56 2617B56          0 2019-05-04 15:53 give path
drwxr-xr-x   - 2617B56 2617B56          0 2019-04-20 15:20 model_saving
drwxr-xr-x   - 2617B56 2617B56          0 2019-04-20 15:12 modrl_saving
drwxr-xr-x   - 2617B56 2617B56          0 2019-04-13 15:33 mr_wordcount_input
drwxr-xr-x   - 2617B56 2617B56          0 2019-04-21 16:18 uber
