# Analyzing irregularities in large bodies of text

## Importing Packages

In [0]:
from __future__ import print_function

import sys
from operator import add
from pyspark.sql import SparkSession

## Mounting AWS bucket to access data

In [0]:
AWS_BUCKET_NAME = "estp-test-bucket-2"
MOUNT_NAME = "my-bucket"

ACCESS_KEY = "ACC"
SECRET_KEY = "SEC"
ENCODED_SECRET_KEY = SECRET_KEY.replace("/", "%2F")

dbutils.fs.mount("s3a://%s:%s@%s" % (ACCESS_KEY, ENCODED_SECRET_KEY, AWS_BUCKET_NAME), "/mnt/%s" % MOUNT_NAME)
display(dbutils.fs.ls("/mnt/%s" % MOUNT_NAME))

[0;31m---------------------------------------------------------------------------[0m
[0;31mExecutionError[0m                            Traceback (most recent call last)
[0;32m<command-1283093157892170>[0m in [0;36m<module>[0;34m[0m
[1;32m      6[0m [0mENCODED_SECRET_KEY[0m [0;34m=[0m [0mSECRET_KEY[0m[0;34m.[0m[0mreplace[0m[0;34m([0m[0;34m"/"[0m[0;34m,[0m [0;34m"%2F"[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[1;32m      7[0m [0;34m[0m[0m
[0;32m----> 8[0;31m [0mdbutils[0m[0;34m.[0m[0mfs[0m[0;34m.[0m[0mmount[0m[0;34m([0m[0;34m"s3a://%s:%s@%s"[0m [0;34m%[0m [0;34m([0m[0mACCESS_KEY[0m[0;34m,[0m [0mENCODED_SECRET_KEY[0m[0;34m,[0m [0mAWS_BUCKET_NAME[0m[0;34m)[0m[0;34m,[0m [0;34m"/mnt/%s"[0m [0;34m%[0m [0mMOUNT_NAME[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m      9[0m [0mdisplay[0m[0;34m([0m[0mdbutils[0m[0;34m.[0m[0mfs[0m[0;34m.[0m[0mls[0m[0;34m([0m[0;34m"/mnt/%s"[0m [0;34m%[0m [0mMOUNT_NA

## Initialize Spark session

In [0]:
spark = SparkSession\
        .builder\
        .appName("PythonWordCount")\
        .getOrCreate()

## Data Overview

In [0]:
pride10 = spark.read.text("/mnt/my-bucket/pride/pride10.txt")
pride2000 = spark.read.text("/mnt/my-bucket/pride/pride2000.txt")

In [0]:
pride2000.head(n=20)

Out[5]: [Row(value='PRIDE AND PREJUDICE'),
 Row(value=''),
 Row(value='By Jane Austen'),
 Row(value=''),
 Row(value=''),
 Row(value=''),
 Row(value='Chapter 1'),
 Row(value=''),
 Row(value=''),
 Row(value='It is a truth universally acknowledged that a single man in possession'),
 Row(value='of a good fortune must be in want of a wife'),
 Row(value=''),
 Row(value='However little known the feelings or views of such a man may be on his'),
 Row(value='first entering a neighbourhood this truth is so well fixed in the minds'),
 Row(value='of the surrounding families that he is considered the rightful property'),
 Row(value='of some one or other of their daughters'),
 Row(value=''),
 Row(value='My dear Mr Bennet said his lady to him one day have you heard that'),
 Row(value='Netherfield Park is let at last'),
 Row(value='')]

## Count Words
We use the common paradigm of tying the number `1` to each word (key: `word`, value: `1`) and sum (add) all values by key.

In [0]:
lines = pride10.rdd.map(lambda r: r[0])

counts = lines.flatMap(lambda x: x.split(' ')) \
                  .map(lambda x: (x, 1)) \
                  .reduceByKey(add)
output = counts.collect()

output

Out[6]: [('', 5034000),
 ('Jane', 522000),
 ('is', 1668000),
 ('of', 7176000),
 ('However', 12000),
 ('views', 22000),
 ('minds', 8000),
 ('said', 802000),
 ('have', 1660000),
 ('at', 1482000),
 ('Long', 30000),
 ('no', 868000),
 ('Do', 74000),
 ('was', 3678000),
 ('four', 58000),
 ('before', 448000),
 ('thousand', 68000),
 ('year', 56000),
 ('send', 46000),
 ('share', 28000),
 ('anything', 156000),
 ('her', 4256000),
 ('think', 420000),
 ('than', 564000),
 ('an', 692000),
 ('would', 932000),
 ('Indeed', 38000),
 ('us', 246000),
 ('overscrupulous', 2000),
 ('surely', 6000),
 ('though', 412000),
 ('goodhumoured', 12000),
 ('Lydia', 260000),
 ('none', 36000),
 ('something', 136000),
 ('compassion', 28000),
 ('poor', 58000),
 ('mistake', 16000),
 ('high', 34000),
 ('suffer', 26000),
 ('quick', 20000),
 ('reserve', 14000),
 ('Her', 174000),
 ('mind', 112000),
 ('uncertain', 4000),
 ('temper', 48000),
 ('fancied', 14000),
 ('The', 546000),
 ('solace', 2000),
 ('till', 172000),
 ('knowledge'

## Your turn!
Read the above line and understand what's happening. Then
- Find the one changed word in the text body of 10 books
- Repeat the above steps to find the changed word in the larger body of 2000 books
- Bonus: In which line numbers are the changes?