In [77]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window as W
import pyspark.sql.functions as f
from pyspark.sql.types import *

import getpass

username = getpass.getuser()
spark = SparkSession. \
builder. \
config('spark.ui.port','0'). \
config("spark.sql.warehouse.dir", f"/user/{username}/warehouse"). \
enableHiveSupport(). \
master('yarn'). \
getOrCreate()

#### Create logic for linkedIn profile views in Apache Spark

In [9]:
#reading data
rdd1=spark.sparkContext.textFile("/user/itv009481/data/linkedin_profile.csv")
rdd1.take(5)

['1,Manasa,Sumit',
 '2,Deepa,Sumit',
 '3,Sumit,Manasa',
 '4,Manasa,Deepa',
 '5,Deepa,Manasa']

In [10]:
#How many times profile is viewed? 
#hence only last column is required
rdd2= rdd1.map(lambda x: (x.split(",")[2],1))
rdd2.take(5)

[('Sumit', 1), ('Sumit', 1), ('Manasa', 1), ('Deepa', 1), ('Manasa', 1)]

In [28]:
#Getting count of distinct names
rdd3=rdd2.reduceByKey(lambda x,y:x+y)
rdd4=rdd3.sortBy(lambda x: x[1],ascending=False)
rdd4.take(5)

[('Manasa', 3), ('Sumit', 2), ('Deepa', 1)]

In [29]:
#Alternate Approach if output is no longer needed
sorted(rdd2.countByKey().items(),key=lambda x: x[1],reverse=True)

[('Manasa', 3), ('Sumit', 2), ('Deepa', 1)]

In [32]:
#Saving to output file 
rdd4.saveAsTextFile('/user/itv009481/data/assignment/assignment1_op/')

#### Another Word Count Problem

In [78]:
rdd1=spark.sparkContext.textFile("/user/itv009481/data/input_data.txt")
rdd1.take(5)

["Reading books is a wonderful pastime that has numerous benefits for individuals of all ages. Whether you're a child just learning to read or an adult seeking knowledge and entertainment, books can enrich your life in many ways.",
 '',
 '1. Knowledge Expansion:',
 '   Reading books allows you to gain knowledge on a wide range of topics. You can explore history, science, philosophy, and more by delving into different books. The more you read, the more you learn.',
 '']

In [79]:
#Splitting based on space
rdd2=rdd1.map(lambda x: x.split(" "))

In [80]:
rdd3=rdd2.flatMap(lambda x: x)
rdd3.take(5)

['Reading', 'books', 'is', 'a', 'wonderful']

In [82]:
#Defining bad words
rdd_bad=spark.sparkContext.parallelize(['','5.'])
rdd_bad_brod=spark.sparkContext.broadcast(rdd_bad.collect())

In [89]:
rdd4=rdd3.map(lambda x: 'Error' if x in rdd_bad_brod.value else x)
rdd4.take(5)

['Reading', 'books', 'is', 'a', 'wonderful']

In [90]:
rdd4=rdd3.map(lambda x: (x,1))
rdd5=rdd4.reduceByKey(lambda x,y: x+y)
rdd6=rdd5.map(lambda x: (x[1],x[0]))
rdd6.take(5)

[(5, 'Reading'), (5, 'books'), (2, 'is'), (1, 'wonderful'), (1, 'benefits')]

In [91]:
rdd6.sortByKey(False).take(5)

[(30, ''), (15, 'a'), (14, 'and'), (8, 'can'), (8, 'you')]

#### Flat Map Example: Map vs Flat Map

In [49]:
# Exploding an RDD of arrays into individual elements
array_rdd = spark.sparkContext.parallelize([(1, [1, 2, 3]), (2, [4, 5]), (3, [6])])
array_rdd.take(5)


[(1, [1, 2, 3]), (2, [4, 5]), (3, [6])]

In [47]:
mapped_rdd=array_rdd.map(lambda x: [(x[0], item) for item in x[1]])
flattened_rdd = array_rdd.flatMap(lambda x: [(x[0], item) for item in x[1]])
flattened_rdd.take(5)

[(1, 1), (1, 2), (1, 3), (2, 4), (2, 5)]

In [48]:
mapped_rdd.take(5)

[[(1, 1), (1, 2), (1, 3)], [(2, 4), (2, 5)], [(3, 6)]]

In [92]:
spark.stop()