# Q2.Part1: K-Means clustering using TF-IDF Encoding

In [None]:
# import basic spark session and requirements
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
from pyspark.sql import SparkSession


# importing ML functionalities
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
import matplotlib.pyplot as plt

# importing functions for concatenation of sentences
from pyspark.sql.functions import concat,lit

# importing libraries for performing tokenization and punctuation removal
import csv
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string


# importing TFIDF
from pyspark.ml.feature import HashingTF, IDF, Tokenizer


# Create (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("Assignment Q2 Solution")
    .getOrCreate()
)

# Load the tab-separated training set; its first line holds the column names.
data_path = "s3://akum44880/assignmen/train.tsv"
df = spark.read.csv(data_path, sep='\t', header=True)

# Keep only the label column and the two sentence columns.
data_df = df.select('genre', 'sentence1', 'sentence2')

# Merge both sentences into a single space-separated text column named 'joined'.
joined_text = concat(data_df.sentence1, lit(" "), data_df.sentence2).alias('joined')
df = data_df.select('genre', joined_text)

# Switch to the RDD API so plain Python mapper functions can be applied per row.
df_rdd = df.rdd

#creating mapper functions

def removePunctuationsFunct(record):
    """Strip ASCII punctuation from the text field of a (label, text) record.

    Args:
        record: Indexable pair-like object. ``record[0]`` is passed through
            unchanged and ``record[1]`` is the text to clean.

    Returns:
        A ``(record[0], cleaned_text)`` tuple where ``cleaned_text`` is
        ``str(record[1])`` with every character of ``string.punctuation``
        removed. A missing text (``None``) yields an empty string.
    """
    text = record[1]
    if text is None:
        # str(None) would inject the literal word "None" into the corpus.
        return (record[0], '')
    # A single translate() pass replaces one replace() call per punctuation mark.
    strip_table = str.maketrans('', '', string.punctuation)
    return (record[0], str(text).translate(strip_table))

def extract_tokens(record):
    """Tokenize the text field of a (label, text) record.

    The text in ``record[1]`` is split with ``word_tokenize``; only tokens
    made up entirely of alphabetic characters are kept, and each kept token
    is lower-cased.

    Returns:
        A ``(record[0], tokens)`` tuple where ``tokens`` is the list of
        kept, lower-cased words in their original order.
    """
    label, text = record[0], record[1]
    tokens = []
    for token in word_tokenize(text):
        if token.isalpha():
            tokens.append(token.lower())
    return (label, tokens)


# Strip punctuation from the text of every (genre, text) record.
new_rdd = df_rdd.map(removePunctuationsFunct)

# Tokenize each cleaned text into a list of lower-case alphabetic words.
tokenized = new_rdd.map(extract_tokens)

# Convert back to a DataFrame and give the tuple columns '_1'/'_2' readable names.
tokenized_df = (
    spark.createDataFrame(tokenized)
    .withColumnRenamed('_1', 'genre')
    .withColumnRenamed('_2', 'BOW')
)

# TF-IDF encoding, step 1: hash each bag of words into a 300-bucket
# term-frequency vector.
hashingTF = HashingTF(numFeatures=300, inputCol="BOW", outputCol="rawFeatures")
featurizedData = hashingTF.transform(tokenized_df)

# TF-IDF encoding, step 2: fit the inverse-document-frequency weights on the
# term-frequency vectors and apply them.
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

# Keep only the label and the final feature vector, and mark the result for
# caching because the clustering step reads it more than once.
data_embedding = rescaledData.select("genre", "features")
data_embedding.cache()

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
5,application_1589965105790_0006,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [None]:
# Attempt k-means clustering on the TF-IDF encoded dataset.

# Number of clusters; also drives how many per-cluster count lists are built below.
num_clusters = 5

kmeans = KMeans(featuresCol='features', k=num_clusters).setSeed(56)
model = kmeans.fit(data_embedding)
predictions = model.transform(data_embedding)
centers = model.clusterCenters()

# Final dataframe with true labels (genre) and predicted labels (prediction).
final_predicted = predictions.select('genre', 'prediction')

# Count how many rows of each genre landed in each cluster.
result = final_predicted.groupBy("genre", "prediction").count().orderBy("prediction", "count")
result.cache()

# For every cluster id, collect its (genre, prediction, count) rows ordered
# from the most to the least frequent genre. Entry i of the list belongs to
# cluster i and is used later to map cluster ids onto genre names.
list_of_labels = [
    result.filter(result.prediction == cluster_id).orderBy('count', ascending=False).collect()
    for cluster_id in range(num_clusters)
]

# Release the cached embeddings only now: `predictions` is lazy, so dropping
# the cache before the first action above would force the whole TF-IDF
# pipeline to be recomputed.
data_embedding.unpersist()



# If a genre has already been assigned to another predicted label, map the next available genre to this label instead.

def cmax(z, flags=None):
    """Return the first genre in ``z`` that has not been claimed yet.

    Args:
        z: Iterable of row-like objects supporting ``row['genre']``, expected
            in order of preference (callers pass them most frequent first).
        flags: Optional dict mapping genre -> 0 (free) or 1 (claimed). When
            omitted, the module-level ``flag`` dict is used, which matches
            the original behaviour.

    Returns:
        The first genre whose flag is 0; that flag is set to 1 as a side
        effect. Returns ``None`` when ``z`` is empty or every genre in it is
        already claimed.

    Raises:
        KeyError: If a genre in ``z`` is not a key of the flags dict.
    """
    if flags is None:
        # Resolved at call time because `flag` is defined after this function.
        flags = flag
    for row in z:
        genre = row['genre']
        if flags[genre] == 0:
            flags[genre] = 1
            return genre
    # Nothing left to claim; make the fall-through result explicit.
    return None
        
# Tracks whether each actual class has already been assigned to a predicted
# label (0 = still free, 1 = taken).
flag = {'fiction': 0, 'slate': 0, 'travel': 0, 'telephone': 0, 'government': 0}

# Maps each actual class to the predicted label (cluster id) it was assigned to.
enabler = {'fiction': 0, 'slate': 0, 'travel': 0, 'telephone': 0, 'government': 0}

# mapped_clusters[i] is the actual class chosen for cluster id i.
mapped_clusters = []

for cluster_id, genre_counts in enumerate(list_of_labels):
    g = cmax(genre_counts)
    mapped_clusters.append(g)
    enabler[g] = cluster_id

# Replace the numeric cluster labels with the mapped categorical labels.
# The cast references final_predicted's own column rather than a column of
# the aggregated `result` dataframe.
y = final_predicted.withColumn('prediction', final_predicted.prediction.cast('string'))
cluster_ids = [str(cluster_id) for cluster_id in range(len(mapped_clusters))]
y = y.na.replace(cluster_ids, mapped_clusters, 'prediction')

# import libraries for visualization and confusion matrix
import sklearn
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Collect the (genre, prediction) pairs into a 2-column numpy array.
x = np.array(y.collect())

# confusion_matrix(y_true, y_pred) puts true labels on rows and predictions on
# columns, so the actual genre (column 0) goes first. This matches the
# 'ACTUAL LABELS' y-axis / 'PREDICTED LABELS' x-axis used when plotting.
cf_matrix = confusion_matrix(x[:, 0], x[:, 1])

# create custom confusion matrix with percentages for our requirement
def my_confusion_matrix(array):
    """Convert a confusion matrix of counts into row-wise percentages.

    Args:
        array: 2-D matrix of counts (a numpy array or a list of rows), one
            row per label.

    Returns:
        A list of lists with the same shape as ``array``. Each cell is the
        share of its row total, in percent, rounded to two decimals. A row
        whose total is zero becomes all ``0.0`` instead of dividing by zero.
    """
    percentages = []
    for row in array:
        # Compute the row total once instead of once per cell.
        total = sum(row)
        if total == 0:
            percentages.append([0.0] * len(row))
            continue
        percentages.append(
            [round((float(count) / float(total)) * 100, 2) for count in row]
        )
    return percentages

# Row-normalised (percentage) version of the confusion matrix.
cd = my_confusion_matrix(cf_matrix)

# Tick labels for the heatmap. confusion_matrix orders its rows and columns by
# the sorted set of labels that occur in either input column, so the same
# order is rebuilt here from the collected (genre, prediction) array `x`.
# (The original referenced an undefined name `predicted`.)
labels = sorted(set(x[:, 0]) | set(x[:, 1]))

# visualize the final confusion matrix
fig, ax = plt.subplots(figsize=(7, 7))
sns.heatmap(cd, annot=True, fmt='.2f', cmap='Blues', xticklabels=labels, yticklabels=labels)
plt.ylabel('ACTUAL LABELS')
plt.xlabel('PREDICTED LABELS')
plt.show(block=False)

%matplot plt