"""<br>
@Author: Prayag Bhoir<br>
@Date: 3-09-2024<br>
@Last Modified by: Prayag Bhoir<br>
@Last Modified time: 3-09-2024<br>
@Title : Python programs for word count using PySpark<br>
"""

In [141]:
pip install pyspark



In [142]:
from pyspark import SparkContext,SparkConf
from pyspark.sql import SparkSession

In [180]:
# Build the SparkSession for the word-count application, or reuse one that
# is already active.
spark = (
    SparkSession.builder
    .appName("WordCount")
    .getOrCreate()
)


## Word count using a text file

In [144]:
# Location of the input text file
input_path = "/content/text.txt"

# Read the file into a DataFrame; the text of each row is exposed through
# its `value` field (see the collected output below).
text_file_df = spark.read.text(input_path)

# Work on the underlying RDD of Row objects for the RDD-style word count
text_file_rdd = text_file_df.rdd

In [145]:
# Bring the rows back to the driver and print them to verify the file contents
loaded_rows = text_file_rdd.collect()
print(loaded_rows)

[Row(value="Today, I want to talk about the Global Happiness Index and India's position on it. The Global Happiness Index is a measure of well-being that evaluates how happy people are in different countries. It considers several factors, including income, life expectancy, social support, freedom to make life choices, and perceptions of corruption.")]


In [146]:
# Word count with RDD operations:
#   1. split the text of every row on whitespace (punctuation and case are kept as-is),
#   2. pair each token with an initial count of 1,
#   3. add up the counts of identical tokens.
word_counts = (
    text_file_rdd
    .flatMap(lambda row: row.value.split())
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
)


In [147]:
# Fetch the (word, count) pairs to the driver, then print one pair per line
collected_counts = word_counts.collect()
for word, count in collected_counts:
    print(f"{word} : {count}")


Today, : 1
I : 1
want : 1
to : 2
talk : 1
about : 1
the : 1
Global : 2
Happiness : 2
Index : 2
and : 2
India's : 1
position : 1
on : 1
it. : 1
The : 1
is : 1
a : 1
measure : 1
of : 2
well-being : 1
that : 1
evaluates : 1
how : 1
happy : 1
people : 1
are : 1
in : 1
different : 1
countries. : 1
It : 1
considers : 1
several : 1
factors, : 1
including : 1
income, : 1
life : 2
expectancy, : 1
social : 1
support, : 1
freedom : 1
make : 1
choices, : 1
perceptions : 1
corruption. : 1


In [193]:
# Stop the SparkSession used by the text-file example before the next section
# creates its own SparkContext.
spark.stop()

## Word count using a JSON file with SparkContext

In [191]:
import json

In [210]:
# Configure the application name and run Spark in local mode ("local[*]")
conf = SparkConf().setAppName("MyApp").setMaster("local[*]")

# Reuse the active SparkContext if there is one, otherwise create it from `conf`.
# Constructing SparkContext(conf=conf) directly raises a ValueError when a
# context is already running, e.g. when this cell is re-executed without
# calling sc.stop() first.
# NOTE: when an existing context is reused, the settings in `conf` are not applied to it.
sc = SparkContext.getOrCreate(conf=conf)



In [211]:
# Load the JSON file as an RDD of raw text lines
json_text_rdd = sc.textFile("/content/text.json")

# Decode every line with json.loads, so each line must hold one complete JSON document
parsed_rdd = json_text_rdd.map(json.loads)

In [212]:
# Inspect the raw lines read from the JSON file; as the last expression in the
# cell, the collected list is displayed by the notebook.
json_text_rdd.collect()

['{"paragraph": "In the quiet village nestled between the towering mountains, life moved at a slow, predictable pace. The villagers, known for their warm hospitality and simple way of life, gathered every evening at the central square to share stories passed down through generations. The ancient oak tree, standing tall in the middle of the square, was a silent witness to the countless tales of love, loss, and triumph. As the sun dipped below the horizon, casting a golden hue over the cobblestone streets, the village seemed to glow with an ethereal light, as if the very air was charged with the memories of the past."}']

In [213]:
# Pull the "paragraph" text out of every parsed JSON record
paragraph_rdd = parsed_rdd.map(lambda record: record["paragraph"])

# Split each paragraph on whitespace, emit (word, 1) pairs and sum them per word
words_rdd = (
    paragraph_rdd
    .flatMap(lambda text: text.split())
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
)



In [215]:
# Bring the (word, count) pairs back to the driver as a list
word_counts = words_rdd.collect()

# Print every pair on its own line
for pair in word_counts:
    word, count = pair
    print(f"{word}: {count}")


quiet: 1
nestled: 1
mountains,: 1
at: 2
pace.: 1
The: 2
known: 1
simple: 1
way: 1
of: 4
life,: 1
gathered: 1
central: 1
square: 1
share: 1
passed: 1
down: 1
generations.: 1
ancient: 1
oak: 1
tree,: 1
in: 1
middle: 1
square,: 1
was: 2
witness: 1
triumph.: 1
sun: 1
dipped: 1
below: 1
horizon,: 1
casting: 1
hue: 1
streets,: 1
seemed: 1
an: 1
light,: 1
as: 1
very: 1
memories: 1
past.: 1
In: 1
the: 13
village: 2
between: 1
towering: 1
life: 1
moved: 1
a: 3
slow,: 1
predictable: 1
villagers,: 1
for: 1
their: 1
warm: 1
hospitality: 1
and: 2
every: 1
evening: 1
to: 3
stories: 1
through: 1
standing: 1
tall: 1
silent: 1
countless: 1
tales: 1
love,: 1
loss,: 1
As: 1
golden: 1
over: 1
cobblestone: 1
glow: 1
with: 2
ethereal: 1
if: 1
air: 1
charged: 1


In [206]:
# Shut down the SparkContext created for the JSON example.
sc.stop()