# Analyzing irregularities in large bodies of text

## Importing Packages

In [1]:
import os
import s3fs
from operator import add
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType

## Initialize Spark Session

In [2]:
# Obtain the Spark session for this notebook via the builder's getOrCreate(),
# registered under the application name "PythonWordCount".
spark = (
    SparkSession.builder
    .appName("PythonWordCount")
    .getOrCreate()
)

## Access Data from S3 Bucket

In [3]:
# Name of the S3 bucket that holds the input text; it is prefixed to the
# object key when the file is opened.
AWS_BUCKET_NAME = "estp-test-bucket-2"
# NOTE(review): MOUNT_NAME is not referenced by any other cell visible in this
# notebook — confirm whether it is still needed.
MOUNT_NAME = "my-bucket"

# Replace the credentials `ACC` and `SEC` here!
# These assignments overwrite any values already present in the process
# environment for the two variables. Do not commit real credentials.
os.environ["AWS_ACCESS_KEY_ID"] = "ACC"
os.environ["AWS_SECRET_ACCESS_KEY"] = "SEC"

# NOTE(review): anon=True requests an anonymous connection, so the credentials
# set above are presumably not used by this filesystem object — confirm
# against the s3fs documentation and whether the bucket is meant to be public.
s3 = s3fs.S3FileSystem(anon=True)

## Read Data into Spark DataFrame

In [4]:
# Fetch the text from S3, decode it as UTF-8 and split it into lines.
# The file handle is only needed for the read, so it is closed before any
# Spark work starts.
with s3.open(AWS_BUCKET_NAME + "/pride/pride10.txt") as f:
    pride_10_lines = f.read().decode('utf-8').splitlines()

# Build a DataFrame with a single string column, one row per line of text.
# The intermediate list keeps its own name so that `pride_10` only ever
# refers to the DataFrame.
pride_10 = spark.createDataFrame(pride_10_lines, StringType())

## Data Overview

In [5]:
# Peek at the first 20 rows (one row per line of the text)
pride_10.take(20)

[Row(value='PRIDE AND PREJUDICE'),
 Row(value=''),
 Row(value='By Jane Austen'),
 Row(value=''),
 Row(value=''),
 Row(value=''),
 Row(value='Chapter 1'),
 Row(value=''),
 Row(value=''),
 Row(value='It is a truth universally acknowledged that a single man in possession'),
 Row(value='of a good fortune must be in want of a wife'),
 Row(value=''),
 Row(value='However little known the feelings or views of such a man may be on his'),
 Row(value='first entering a neighbourhood this truth is so well fixed in the minds'),
 Row(value='of the surrounding families that he is considered the rightful property'),
 Row(value='of some one or other of their daughters'),
 Row(value=''),
 Row(value='My dear Mr Bennet said his lady to him one day have you heard that'),
 Row(value='Netherfield Park is let at last'),
 Row(value='')]

## Count Words: Map Reduce

In [6]:
# Unwrap each Row into the plain string held in its first (only) column
lines = pride_10.rdd.map(lambda row: row[0])

# Map/reduce word count:
#   1. split every line on single spaces into tokens,
#   2. pair each token with a count of 1,
#   3. sum the counts per token.
# Splitting on ' ' yields an empty string for blank lines and between
# consecutive spaces, so '' is counted as a token as well.
counts = (
    lines
    .flatMap(lambda text: text.split(' '))
    .map(lambda word: (word, 1))
    .reduceByKey(add)
)

# Bring the full list of (word, count) pairs back to the driver
output = counts.collect()

# Show the first 10 (word, count) pairs
output[:10]

[('PRIDE', 10),
 ('AND', 10),
 ('PREJUDICE', 10),
 ('', 25170),
 ('By', 140),
 ('Jane', 2610),
 ('Austen', 20),
 ('Chapter', 610),
 ('1', 10),
 ('It', 2460)]

## Your Turn!

Figure out which word in the text was changed, using the word counts collected in `output`.

In [7]:
# Exercise placeholder: replace `pass` with your own check.
# Each element of `output` is a (word, count) tuple from the word count above.
# Hint (assumption to verify): the sample counts shown earlier are all
# multiples of 10, so a count that is not may point to the changed word.
for line in output:
    pass