# Text Preparation Notebook
This notebook reads a raw public-domain book (a plain-text file) and processes it, line by line, for use in transfer learning.

In [1]:
from pyspark.sql import SparkSession
# Reuse an already-running Spark session if there is one; otherwise start one
# under the application name "Text Processing".
spark = (
    SparkSession.builder
    .appName("Text Processing")
    .getOrCreate()
)

In [2]:
# Rows whose `id` is below this value are dropped by the line-filtering cell
# (`id >= skip_lines`). Ids are assigned there *after* lines containing '[' have
# been removed, so this counts surviving rows, not raw file lines.
# Presumably this skips the Project Gutenberg header/front matter of this
# particular edition — TODO confirm the count if the source file changes.
skip_lines = 426

# Load the book text file
Source: https://www.gutenberg.org/files/61043/61043-0.txt

Title: The Color of a Great City

Author: Theodore Dreiser

In [3]:
# Read the book as a DataFrame with one row per line of the file; the text
# lands in a single string column named `value`.
raw_text = (
    spark.read
    .text("data/61043-0.txt")
)

In [4]:
from pyspark.sql import Window
from pyspark.sql.functions import col
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import row_number


def number_rows(df, order_by):
    """Return `df` with an `id` column holding consecutive row numbers 0, 1, 2, ...

    `monotonically_increasing_id()` on its own is only guaranteed to be unique
    and increasing, not consecutive: as soon as the data spans more than one
    partition the values jump. Using it directly as a line count or as a
    sequential index therefore only works by accident of a single-partition
    load. `row_number()` over an ordered window is consecutive by definition.

    The window has no partitioning, so Spark gathers all rows into a single
    partition (and logs a warning saying so); that is acceptable for one book.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Frame to number. An existing `id` column is replaced in place.
    order_by : Column or str
        Expression that defines the row order.
    """
    ordering = Window.orderBy(order_by)
    return df.withColumn("id", row_number().over(ordering) - 1)


# One row per line of the book; `spoken` keeps the text exactly as read.
lines = raw_text.select(col('value').alias('spoken'))
# Drop every line that contains a '[' character.
lines = lines.filter(~ lines.spoken.contains('['))
# `normalised` starts as a copy of `spoken`; it is rewritten later by the normaliseText UDF.
lines = lines.withColumn('normalised', lines.spoken)
# Number the surviving rows in file order, then drop the first `skip_lines` of them.
lines = number_rows(lines, monotonically_increasing_id())
lines = lines.filter(col('id') >= skip_lines)
lines = lines.filter("spoken != ''")
# Renumber from zero (keeping the existing order) so the ids are gap-free
# before they are turned into RR001-NNNN labels.
lines = number_rows(lines, col('id'))
# NOTE(review): nothing here trims the end of the file — the exported frame's
# last rows are Project Gutenberg boilerplate (e.g. 'http://www.gutenberg.org').
# Consider cutting at the end-of-book marker if that text is unwanted.

lines.createOrReplaceTempView("lines")
lines.head()

Row(spoken='It was silent, the city of my dreams, marble and serene, due perhaps', normalised='It was silent, the city of my dreams, marble and serene, due perhaps', id=0)

In [5]:
from pyspark.sql.types import StringType

# Spoken form of every ASCII digit. "0" is included so that no digit is left
# behind as a raw character (previously "10" came out as "one0").
numbers_to_word = {"0": "zero", "1": "one", "2": "two", "3": "three", "4": "four", "5": "five", "6": "six", "7": "seven", "8": "eight", "9":"nine"}
abbr_to_word = {"No.": "number"}
specials_handling = {"--":" -- "}


def _spell_out_digits(s):
    """Replace each digit in `s` with its word, putting a space between the words of adjacent digits."""
    pieces = []
    previous_was_digit = False
    for ch in s:
        word = numbers_to_word.get(ch)
        if word is None:
            pieces.append(ch)
            previous_was_digit = False
        else:
            # Without the separator "12" would become the non-word "onetwo".
            if previous_was_digit:
                pieces.append(" ")
            pieces.append(word)
            previous_was_digit = True
    return "".join(pieces)


def normaliseText(s):
    """Return the normalised (spelled-out) form of one line of text.

    Steps, in order:
      1. strip leading/trailing whitespace;
      2. spell out digits one by one ("12" -> "one two", "5" -> "five");
      3. expand the abbreviations in `abbr_to_word` ("No." -> "number");
      4. apply `specials_handling` ("--" -> " -- ").

    Numbers are read digit by digit, not as quantities ("1910" becomes
    "one nine one zero").

    Parameters
    ----------
    s : str or None
        The raw line. `None` (a SQL NULL reaching the UDF) is passed through.

    Returns
    -------
    str or None
        The normalised line, or `None` when `s` is `None`.
    """
    if s is None:
        return None
    s = _spell_out_digits(s.strip())
    # NOTE(review): "No." is replaced wherever it occurs, including a sentence
    # that is just the word "No." — confirm that is acceptable for this corpus.
    for key, value in abbr_to_word.items():
        s = s.replace(key, value)
    for key, value in specials_handling.items():
        s = s.replace(key, value)
    return s

def formatIndex(d):
    """Build a row label: the prefix 'RR001-' followed by `d` zero-padded to at least four characters."""
    return 'RR001-' + format(d, '04')

# Expose both Python helpers to Spark SQL under their own names; each one
# returns a string column.
for udf_name, udf_func in (("normaliseText", normaliseText), ("formatIndex", formatIndex)):
    spark.udf.register(udf_name, udf_func, StringType())

In [6]:
# Project the `lines` view into the output layout: formatted label, original
# text, and the text run through the normaliseText UDF.
processed_lines_query = """
    select formatIndex(id) as index, spoken, normaliseText(normalised) as normalised
    from lines
"""
processed_lines = spark.sql(processed_lines_query)
processed_lines.head(5)

[Row(index='RR001-0000', spoken='It was silent, the city of my dreams, marble and serene, due perhaps', normalised='It was silent, the city of my dreams, marble and serene, due perhaps'),
 Row(index='RR001-0001', spoken='to the fact that in reality I knew nothing of crowds, poverty, the', normalised='to the fact that in reality I knew nothing of crowds, poverty, the'),
 Row(index='RR001-0002', spoken='winds and storms of the inadequate that blow like dust along the paths', normalised='winds and storms of the inadequate that blow like dust along the paths'),
 Row(index='RR001-0003', spoken='of life. It was an amazing city, so far-flung, so beautiful, so dead.', normalised='of life. It was an amazing city, so far-flung, so beautiful, so dead.'),
 Row(index='RR001-0004', spoken='There were tracks of iron stalking through the air, and streets that', normalised='There were tracks of iron stalking through the air, and streets that')]

# Generate sentence fragments
Format the fragments into the output dataframe.

In [7]:
# Print the column names and types of the projected frame as a sanity check
# before converting it to pandas.
processed_lines.printSchema()

root
 |-- index: string (nullable = true)
 |-- spoken: string (nullable = true)
 |-- normalised: string (nullable = true)



In [8]:
# Collect the full Spark DataFrame into an in-memory pandas DataFrame.
processed_lines_pd = processed_lines.toPandas()
# Bare expression so the notebook renders the (truncated) frame as the cell output.
processed_lines_pd

Unnamed: 0,index,spoken,normalised
0,RR001-0000,"It was silent, the city of my dreams, marble a...","It was silent, the city of my dreams, marble a..."
1,RR001-0001,to the fact that in reality I knew nothing of ...,to the fact that in reality I knew nothing of ...
2,RR001-0002,winds and storms of the inadequate that blow l...,winds and storms of the inadequate that blow l...
3,RR001-0003,"of life. It was an amazing city, so far-flung,...","of life. It was an amazing city, so far-flung,..."
4,RR001-0004,There were tracks of iron stalking through the...,There were tracks of iron stalking through the...
...,...,...,...
6775,RR001-6775,http://www.gutenberg.org,http://www.gutenberg.org
6776,RR001-6776,This Web site includes information about Proje...,This Web site includes information about Proje...
6777,RR001-6777,including how to make donations to the Project...,including how to make donations to the Project...
6778,RR001-6778,"Archive Foundation, how to help produce our ne...","Archive Foundation, how to help produce our ne..."


In [9]:
# Write the frame as pipe-separated text, one row per line, with neither a
# header row nor the pandas index column.
# NOTE(review): pandas' default quoting is left on, so a field containing '|'
# or a double-quote character is wrapped in double quotes with inner quotes
# doubled — confirm the consumer of this file parses quoted fields.
processed_lines_pd.to_csv(
    "The-Color-of-a-Great-City.csv",
    sep='|',
    header=False,
    index=False,
)