#### 1. Create the Spark Env 2M

In [1]:
import os
import sys
os.environ["SPARK_HOME"] = "/usr/hdp/current/spark2-client"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"
sys.path.insert(0, os.environ["PYLIB"] + "/py4j-0.10.6-src.zip")
sys.path.insert(0, os.environ["PYLIB"] + "/pyspark.zip")

#### 2. Load the required Libraries 2M

In [121]:
from pyspark.conf import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer, IndexToString, StringIndexerModel
from pyspark.ml.classification import NaiveBayes
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml import PipelineModel
from itertools import chain
from pyspark.sql.functions import create_map, lit


import numpy as np
from IPython.core.display import display, HTML


In [3]:
np.random.seed(333)

#### 3. Create the Spark Configuration and set the Master and Appname 2M

In [4]:
conf = SparkConf().setAppName("big_data_phd_2618").setMaster('local[*]')
sc = SparkContext(conf=conf)

#### 4. Create the Spark Session 2M

In [5]:
spark = SparkSession(sc)

In [6]:
spark

#### 5. Create the function to get the list of the filepaths in a directory in plain Python 5M

In [7]:
def create_file_list(path):
    '''
    Arguments : path of the directory in linux format (Ex :'/home/username/directory')
    Return : List of files in the directory as file_list (Ex:'file:///home/username/phd/alt')
    '''
    file_list = [os.path.join("file://"+path, file) for file in os.listdir(path)]
    return file_list

#### 6.  Read the text data in each directory separtely as a spark data frame using read.text 5M
#### Read the data from local only do not move the data to HDFS

In [8]:
#path_root = "/home/2618B56/BigData_PHD/"
path_root = "/home/datasets/PHD_Dataset/"

In [9]:
df_alt = spark.read.text(create_file_list(path_root+"Train/alt"),  wholetext=True)
df_comp = spark.read.text(create_file_list(path_root+"Train/comp"),  wholetext=True)
df_misc = spark.read.text(create_file_list(path_root+"Train/misc"),  wholetext=True)
df_rec = spark.read.text(create_file_list(path_root+"Train/rec"),  wholetext=True)
df_sci = spark.read.text(create_file_list(path_root+"Train/sci"),  wholetext=True)
df_soc = spark.read.text(create_file_list(path_root+"Train/soc"),  wholetext=True)
df_talk = spark.read.text(create_file_list(path_root+"Train/talk"),  wholetext=True)

In [10]:
df_misc.count()

585

#### 7. Create the lable for the Data frames (*Hint : use withColumn and expr using repeat) 4M

In [11]:
df_alt_labeled = df_alt.withColumn("target_label", F.lit('alt'))
df_comp_labeled = df_comp.withColumn("target_label", F.lit('comp'))
df_misc_labeled = df_misc.withColumn("target_label", F.lit('misc'))
df_rec_labeled = df_rec.withColumn("target_label", F.lit('rec'))
df_sci_labeled = df_sci.withColumn("target_label", F.lit('sci'))
df_soc_labeled = df_soc.withColumn("target_label", F.lit('soc'))
df_talk_labeled = df_talk.withColumn("target_label", F.lit('talk'))

#### 8. Understand the below function and write the observation in MarkDown 2M

In [12]:
import functools 

def unionAll(list_of_dfs):
    return functools.reduce(lambda df1,df2: df1.union(df2.select(df1.columns)), list_of_dfs) 

This function is used to do a row bind of two dataframes and club into one. 

Note, this operations requires the dataframes to have same column name.

In the above implemenattion scenario, all seven dataframes will be joined into one row wise.

#### Create the data frame for the entire data set using the above function

In [13]:
df_final_with_label = unionAll([df_alt_labeled, df_comp_labeled, df_misc_labeled, df_rec_labeled, df_sci_labeled, df_soc_labeled, df_talk_labeled])


In [14]:
df_final_with_label.show()

+--------------------+------------+
|               value|target_label|
+--------------------+------------+
|The recent rise o...|         alt|
|Archive-name: ath...|         alt|
|Archive-name: ath...|         alt|
|Archive-name: ath...|         alt|
|Archive-name: ath...|         alt|
|This kind of argu...|         alt|
| 
: There are a c...|         alt|
|: I
: |> Jim,
: |...|         alt|
|
True.  At first,...|         alt|
|Archive-name: ath...|         alt|
|

You misrepresen...|         alt|
|->	First I want t...|         alt|
|A new alternative...|         alt|
|
Size of armies, ...|         alt|
|New in this versi...|         alt|
|

But you haven't...|         alt|
|[why do babies ge...|         alt|
|[deleted]
think:
...|         alt|
|
The number of ci...|         alt|
|




Okay... I ar...|         alt|
+--------------------+------------+
only showing top 20 rows



In [15]:
df_final_with_label.count()

11314

#### 9.Rename the Column value as News 2M

In [16]:
df_final_with_label_renamed = df_final_with_label.withColumnRenamed("value", "News")

#### 10. Display the first five rows in the data frame 2M

In [17]:
df_final_with_label_renamed.show(5)

+--------------------+------------+
|                News|target_label|
+--------------------+------------+
|The recent rise o...|         alt|
|Archive-name: ath...|         alt|
|Archive-name: ath...|         alt|
|Archive-name: ath...|         alt|
|Archive-name: ath...|         alt|
+--------------------+------------+
only showing top 5 rows



#### 11. Display the counts of classes in the label 2M

In [18]:
df_final_with_label_renamed.groupBy('target_label').count().withColumnRenamed('count','count_of_classes').show()


+------------+----------------+
|target_label|count_of_classes|
+------------+----------------+
|         soc|             599|
|         alt|             480|
|        talk|            1952|
|         sci|            2373|
|        misc|             585|
|         rec|            2389|
|        comp|            2936|
+------------+----------------+



#### 12. Perform the preprocessing Steps required for the Text Data  ,Split the data into train and validation sets 10M

In [19]:
df_final_with_label_renamed.printSchema()

root
 |-- News: string (nullable = true)
 |-- target_label: string (nullable = false)



In [20]:
(trainingData, testData) = df_final_with_label_renamed.randomSplit([0.7, 0.3])
print("Training data count : {}".format(trainingData.count()))
print("Test data count : {}".format(testData.count()))

Training data count : 7917
Test data count : 3397


In [21]:
trainingData_X = trainingData[['News']]
trainingData_y = trainingData[['target_label']]


testData_X = testData[['News']]
testData_y = testData[['target_label']]

In [51]:
# Tokenize the News column
tokenizer  = Tokenizer(inputCol = 'News',outputCol = 'with_stop_words')

In [52]:
# Remove stop words
remover = StopWordsRemover(inputCol="with_stop_words", outputCol="words")

In [53]:
#Extracts a vocabulary from document collections and generates a CountVectorizerModel.
countvectorizer = CountVectorizer(inputCol = 'words' , outputCol = 'rawFeature')

In [54]:
# Compute the Inverse Document Frequency (IDF) given a collection of documents.
idf = IDF(inputCol = 'rawFeature',outputCol = 'features')

In [55]:
# Use StringIndexer top convert String target_label to num
stringindexer = StringIndexer(inputCol = 'target_label' , outputCol = 'label')

In [56]:
# Create pre=processing flow
preprocessing_Stages = [tokenizer]+[remover]+[countvectorizer]+[idf]+[stringindexer] 

#### 13 .Build only the Naiveabayes Model 5M

In [57]:
nb_model = NaiveBayes(labelCol='label',featuresCol="features")
nb_Pipeline = Pipeline(stages=preprocessing_Stages + [nb_model]) 


#### 14. Evaluate the model using the MulticlassClassification Evaluator 3M

In [58]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", 
                                              predictionCol="prediction",
                                              metricName="accuracy")

In [59]:
#Fit the model with training data
nb_crossval_Model = nb_Pipeline.fit(trainingData)

In [60]:
# Test on training data
predictions_train = nb_crossval_Model.transform(trainingData)

In [61]:
predictions_train.select("prediction", "label").show(10)

+----------+-----+
|prediction|label|
+----------+-----+
|       6.0|  6.0|
|       6.0|  6.0|
|       6.0|  6.0|
|       6.0|  6.0|
|       6.0|  6.0|
|       6.0|  6.0|
|       6.0|  6.0|
|       6.0|  6.0|
|       6.0|  6.0|
|       6.0|  6.0|
+----------+-----+
only showing top 10 rows



In [62]:
# Test on test data
predictions_test = nb_crossval_Model.transform(testData)

In [63]:
predictions_test.select("prediction", "label").show(10)

+----------+-----+
|prediction|label|
+----------+-----+
|       6.0|  6.0|
|       5.0|  6.0|
|       6.0|  6.0|
|       6.0|  6.0|
|       6.0|  6.0|
|       3.0|  6.0|
|       6.0|  6.0|
|       3.0|  6.0|
|       6.0|  6.0|
|       6.0|  6.0|
+----------+-----+
only showing top 10 rows



#### 15. Print the accuracies and save the model 2M

In [64]:
evaluator.evaluate(predictions_train)

0.9642541366679298

In [65]:
evaluator.evaluate(predictions_test)

0.8118928466293789

In [66]:
# Save the model in hdfs
nb_crossval_Model.save("/user/2618B56/big_data_phd")

In [67]:
#Load the model for testing on actual test data
model_loaded = PipelineModel.load("/user/2618B56/big_data_phd")

In [68]:
#Load actual test data
df_test_dataset = spark.read.text(create_file_list(path_root+"Test/"),  wholetext=True).withColumnRenamed("value", "News")
df_test_dataset.count()

10

In [69]:
df_test_dataset.show()

+--------------------+
|                News|
+--------------------+
|
Well, they never...|
|

  I am not a pa...|
|I am writing this...|
|Is anyone out the...|
|Does any one know...|
|A friend recorded...|
|

I would expect ...|
|

Okay, here's wh...|
|Someone asked me ...|
|
Um, Kent... just...|
+--------------------+



In [70]:
# Predict using the above defined model
pred_df_test = model_loaded.transform(df_test_dataset)
pred_df_test.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|                News|     with_stop_words|               words|          rawFeature|            features|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|
Well, they never...|[, well,, they, n...|[, well,, never, ...|(148691,[0,6,10,1...|(148691,[0,6,10,1...|[-9977.1724783872...|[0.0,0.0,0.0,1.0,...|       3.0|
|

  I am not a pa...|[, , , , i, am, n...|[, , , , paranoid...|(148691,[0,4,6,11...|(148691,[0,4,6,11...|[-6898.7457826260...|[0.0,2.7689381950...|       3.0|
|I am writing this...|[i, am, writing, ...|[writing, find, f...|(148691,[0,8,13,1...|(148691,[0,8,13,1...|[-4430.5886414145...|[6.06515916728927...|       1.0|
|Is anyone out the...|[is, anyone, out,.

In [71]:
#Look at required fields
pred_df_test.select("News","probability","prediction").show()

+--------------------+--------------------+----------+
|                News|         probability|prediction|
+--------------------+--------------------+----------+
|
Well, they never...|[0.0,0.0,0.0,1.0,...|       3.0|
|

  I am not a pa...|[0.0,2.7689381950...|       3.0|
|I am writing this...|[6.06515916728927...|       1.0|
|Is anyone out the...|[9.09690000889191...|       3.0|
|Does any one know...|[1.0,8.4746028722...|       0.0|
|A friend recorded...|[6.62009756708638...|       3.0|
|

I would expect ...|[2.26271489587418...|       1.0|
|

Okay, here's wh...|[2.40635710054701...|       4.0|
|Someone asked me ...|[0.00526801016300...|       1.0|
|
Um, Kent... just...|[3.94698672173558...|       2.0|
+--------------------+--------------------+----------+



#### 16. Get the actual labels for the indexed ones after you have created the predictions and print the head for the label and prediction for train and validation sets for atleast 6 records 2M

In [83]:
#Check all stages
model_loaded.stages

[Tokenizer_444ea06fb8a34db0ec45,
 StopWordsRemover_48ed9c6c90820908de19,
 CountVectorizer_4e40aff9d9e88d540017,
 IDF_42919aa96d76def5a266,
 StringIndexer_46c987f5037159b1eb79,
 NaiveBayes_4efeabd2d361f99a89eb]

In [92]:
# Extract the StringIndexer Stage
stringIndexerStage = [x for x in model_loaded.stages if isinstance(x, StringIndexerModel)][0]
type(stringIndexerStage)

pyspark.ml.feature.StringIndexerModel

In [108]:
# check the actual label to index mapping
actual_label_to_index_mapping = stringIndexerStage.transform(trainingData).select('target_label', 'label').distinct()
actual_label_to_index_mapping.show()

+------------+-----+
|target_label|label|
+------------+-----+
|        comp|  0.0|
|         alt|  6.0|
|         sci|  1.0|
|        misc|  4.0|
|         rec|  2.0|
|         soc|  5.0|
|        talk|  3.0|
+------------+-----+



In [112]:
#actual_label_to_index_mapping_dict = 
list_actual_label_to_index_mapping = map(lambda row: row.asDict(), actual_label_to_index_mapping.collect())
dict_actual_label_to_index_mapping = {mapping['label']: mapping['target_label'] for mapping in list_actual_label_to_index_mapping}
dict_actual_label_to_index_mapping


{0.0: u'comp',
 1.0: u'sci',
 2.0: u'rec',
 3.0: u'talk',
 4.0: u'misc',
 5.0: u'soc',
 6.0: u'alt'}

In [117]:
pred_df_test.select("News","probability","prediction").show()

+--------------------+--------------------+----------+
|                News|         probability|prediction|
+--------------------+--------------------+----------+
|
Well, they never...|[0.0,0.0,0.0,1.0,...|       3.0|
|

  I am not a pa...|[0.0,2.7689381950...|       3.0|
|I am writing this...|[6.06515916728927...|       1.0|
|Is anyone out the...|[9.09690000889191...|       3.0|
|Does any one know...|[1.0,8.4746028722...|       0.0|
|A friend recorded...|[6.62009756708638...|       3.0|
|

I would expect ...|[2.26271489587418...|       1.0|
|

Okay, here's wh...|[2.40635710054701...|       4.0|
|Someone asked me ...|[0.00526801016300...|       1.0|
|
Um, Kent... just...|[3.94698672173558...|       2.0|
+--------------------+--------------------+----------+



In [123]:
mapping_expr = create_map([lit(x) for x in chain(*dict_actual_label_to_index_mapping.items())])
pred_df_test = pred_df_test.withColumn('prediction_actual_name', mapping_expr[pred_df_test['prediction']])
pred_df_test.select("News","probability","prediction", "prediction_actual_name").show(100)


+--------------------+--------------------+----------+----------------------+
|                News|         probability|prediction|prediction_actual_name|
+--------------------+--------------------+----------+----------------------+
|
Well, they never...|[0.0,0.0,0.0,1.0,...|       3.0|                  talk|
|

  I am not a pa...|[0.0,2.7689381950...|       3.0|                  talk|
|I am writing this...|[6.06515916728927...|       1.0|                   sci|
|Is anyone out the...|[9.09690000889191...|       3.0|                  talk|
|Does any one know...|[1.0,8.4746028722...|       0.0|                  comp|
|A friend recorded...|[6.62009756708638...|       3.0|                  talk|
|

I would expect ...|[2.26271489587418...|       1.0|                   sci|
|

Okay, here's wh...|[2.40635710054701...|       4.0|                  misc|
|Someone asked me ...|[0.00526801016300...|       1.0|                   sci|
|
Um, Kent... just...|[3.94698672173558...|       2.0|          

## Backup

In [106]:
converter = IndexToString(inputCol="label", outputCol="originalCategory")
converted = converter.transform(stringIndexerStage.transform(trainingData).select('target_label', 'label').distinct())

print("Transformed indexed column '%s' back to original string column '%s' using "
      "labels in metadata" % (converter.getInputCol(), converter.getOutputCol()))
converted.select("target_label", "label").show()

Transformed indexed column 'label' back to original string column 'originalCategory' using labels in metadata
+------------+-----+
|target_label|label|
+------------+-----+
|        comp|  0.0|
|         alt|  6.0|
|         sci|  1.0|
|        misc|  4.0|
|         rec|  2.0|
|         soc|  5.0|
|        talk|  3.0|
+------------+-----+

