In [63]:
from pyspark import SparkContext, SparkConf
import string
import warnings

In [90]:
# Single-process ("local") Spark configuration for the word-count jobs below.
conf = SparkConf().setAppName("wordcount").setMaster("local")
# getOrCreate reuses a context that is already running, so re-running this
# cell does not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

In [89]:
# Uncomment and run to stop the active SparkContext (e.g. before creating a new one).
# sc.stop()

In [93]:
def wordcountBasic(top25=False):
    """Count word occurrences in ``adollhouse.txt`` with the notebook's SparkContext ``sc``.

    Words are whitespace-delimited tokens; case and punctuation are kept as-is,
    so e.g. ``door`` and ``door,`` are counted separately.

    Parameters
    ----------
    top25 : bool, default False
        If True, print only the 25 most frequent words, highest count first.
        If False, print every ``word: count`` pair in the order Spark returns
        them and also write the same lines to ``wordcount_Basic_Output.txt``.

    Returns
    -------
    None
    """
    # NOTE: this changes the process-wide warning filters, not just this call.
    warnings.filterwarnings('ignore')
    lines = sc.textFile("adollhouse.txt")
    # split() with no argument splits on any run of whitespace (spaces, tabs, ...)
    # and never yields empty tokens, so no extra filtering is needed.
    words = lines.flatMap(lambda line: line.split())

    wordCounts = words.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)

    if top25:
        mostCommon = wordCounts.sortBy(lambda x: x[1], ascending=False).take(25)
        for word, count in mostCommon:
            print(f"{word}: {count}")
    else:
        result = wordCounts.collect()
        for word, count in result:
            print(f"{word}: {count}")
        # Explicit UTF-8: the text contains non-ASCII characters (curly quotes,
        # dashes), so relying on the platform default encoding is not portable.
        with open("wordcount_Basic_Output.txt", 'w', encoding="utf-8") as output:
            output.writelines(f"{word}: {count}\n" for word, count in result)

In [95]:
# top25=False: print every word count (unsorted) and also write them to wordcount_Basic_Output.txt
# top25=True: print only the 25 most common words, highest count first
wordcountBasic(top25=False)

ACT: 3
I: 909
_[SCENE.—A: 1
room: 19
furnished: 1
comfortably: 1
and: 493
tastefully,: 1
but: 70
not: 148
extravagantly.: 1
At: 6
the: 743
back,: 3
a: 453
door: 38
to: 655
right: 11
leads: 2
entrance-hall,: 1
another: 9
left: 7
Helmer’s: 2
study.: 2
Between: 1
doors: 1
stands: 10
piano.: 1
In: 14
middle: 3
of: 379
left-hand: 1
wall: 1
is: 396
door,: 7
beyond: 1
it: 322
window.: 1
Near: 1
window: 1
are: 121
round: 11
table,: 4
arm-chairs: 1
small: 10
sofa.: 4
right-hand: 2
wall,: 1
at: 151
farther: 1
end,: 2
door;: 1
on: 117
same: 11
side,: 3
nearer: 4
footlights,: 1
stove,: 4
two: 13
easy: 6
chairs: 2
rocking-chair;: 1
between: 10
stove: 2
table.: 4
Engravings: 1
walls;: 1
cabinet: 1
with: 127
china: 1
other: 21
objects;: 1
book-case: 1
well-bound: 1
books.: 2
The: 42
floors: 1
carpeted,: 1
fire: 2
burns: 1
in: 276
stove.: 2
It: 93
winter._: 1
_A: 1
bell: 3
rings: 2
hall;: 2
shortly: 1
afterwards: 3
heard: 18
open.: 3
Enter: 2
NORA,: 2
humming: 2
tune: 1
high: 3
spirits.: 1
She: 20
out

In [96]:
# top25=False: print every word count (unsorted) and also write them to wordcount_Basic_Output.txt
# top25=True: print only the 25 most common words, highest count first
wordcountBasic(top25=True)



I: 909
the: 743
you: 690
to: 655
NORA.: 566
and: 493
a: 453
is: 396
of: 379
it: 322
that: 314
have: 288
in: 276
HELMER.: 271
for: 196
MRS: 193
LINDE.: 193
my: 178
was: 176
be: 171
as: 167
your: 166
me: 160
will: 152
at: 151


In [91]:
def wordcountExtended(top25=False):
    """Count normalized, non-stopword words across two texts with the SparkContext ``sc``.

    Reads ``adollhouse.txt`` and ``metamorphosis.txt``, lowercases each
    whitespace-delimited token, deletes the characters in ``string.punctuation``
    from it, drops tokens that are empty or listed in ``stopwords.txt``, and
    counts what remains. Results are sorted by count, highest first.

    Parameters
    ----------
    top25 : bool, default False
        If True, print only the 25 most frequent words.
        If False, print every ``word: count`` pair.

    Returns
    -------
    None
    """
    # NOTE: this changes the process-wide warning filters, not just this call.
    warnings.filterwarnings('ignore')

    def loadStopWords():
        """Return the lowercased, stripped lines of stopwords.txt as a set."""
        with open("stopwords.txt", 'r') as file:
            return {word.strip().lower() for word in file}

    # Build the deletion table once instead of once per word.
    # string.punctuation is ASCII-only, so typographic characters such as the
    # curly apostrophe are left in place.
    punctuationTable = str.maketrans("", "", string.punctuation)

    def normalize(word):
        """Lowercase a token and delete ASCII punctuation from it."""
        return word.lower().translate(punctuationTable)

    # Broadcast the stopword set so each executor receives it once.
    stopwordsBroadcast = sc.broadcast(loadStopWords())

    inputFiles = ["adollhouse.txt", "metamorphosis.txt"]
    lines = sc.textFile(','.join(inputFiles))
    # split() with no argument splits on any run of whitespace.
    words = lines.flatMap(lambda line: (normalize(token) for token in line.split()))
    # A token made only of punctuation is "" after normalization; drop it here
    # so it is not counted as a word.
    filteredWords = words.filter(
        lambda word: word and word not in stopwordsBroadcast.value
    )

    wordCounts = filteredWords.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
    wordcountSorted = wordCounts.sortBy(lambda x: x[1], ascending=False)

    selected = wordcountSorted.take(25) if top25 else wordcountSorted.collect()
    for word, count in selected:
        print(f"{word}: {count}")

In [97]:
# top25=False: print every remaining word count, highest count first
# top25=True: print only the 25 most common words
wordcountExtended(top25=False)

nora: 682
helmer: 317
would: 263
mrs: 253
linde: 223
out: 193
from: 189
gregor: 188
room: 177
door: 161
yes: 160
could: 160
then: 155
krogstad: 154
about: 146
little: 144
one: 143
rank: 142
into: 141
must: 125
did: 119
father: 117
think: 114
back: 113
torvald: 112
know: 110
come: 105
well: 104
go: 99
gregor’s: 99
himself: 98
sister: 96
see: 96
time: 95
mother: 95
don’t: 94
way: 92
even: 92
like: 85
tell: 83
get: 79
never: 78
quite: 75
something: 74
after: 71
let: 71
look: 71
much: 71
again: 68
really: 68
first: 67
thing: 66
going: 65
over: 65
shall: 65
still: 63
things: 62
say: 61
said: 60
thought: 60
good: 58
money: 57
anything: 57
away: 56
make: 56
soon: 54
before: 53
want: 53
nothing: 53
without: 52
two: 51
made: 51
right: 50
head: 50
doctor: 50
everything: 50
open: 49
left: 49
life: 49
man: 49
can’t: 48
understand: 48
christine: 47
mean: 46
goes: 45
home: 45
oh: 45
long: 45
came: 45
dear: 45
course: 45
while: 45
take: 44
husband: 42
work: 42
because: 41
it’s: 40
children: 40
chief:

In [98]:
# top25=False: print every remaining word count, highest count first
# top25=True: print only the 25 most common words
wordcountExtended(top25=True)

nora: 682
helmer: 317
would: 263
mrs: 253
linde: 223
out: 193
from: 189
gregor: 188
room: 177
door: 161
yes: 160
could: 160
then: 155
krogstad: 154
about: 146
little: 144
one: 143
rank: 142
into: 141
must: 125
did: 119
father: 117
think: 114
back: 113
torvald: 112
