In [0]:
# Delete /FileStore/tables; the second argument (True) requests a recursive delete.
# NOTE(review): a later cell reads '/FileStore/tables/WordData.txt'. On a fresh
# "Run All" this cell would remove that input before it is read — confirm the
# cleanup is intended and that the file is re-uploaded before the extract step.
dbutils.fs.rm('/FileStore/tables', True)

Out[4]: True

# Extracting Data

In [0]:
from pyspark.sql import SparkSession

# Reuse the active Spark session, or create one named 'ETL' if none exists yet.
spark = (
    SparkSession.builder
    .appName('ETL')
    .getOrCreate()
)

# Read the raw text file: each line of the file becomes one row in a single
# string column named `value` (see the rendered table below).
df = spark.read.text('/FileStore/tables/WordData.txt')
display(df)

value
This is a Japanese doll
The team members were hard to tell apart since they all wore their hair in a ponytail
As the years pass by we all know owners look more and more like their dogs
If you don't like toenails you probably shouldn't look at your feet
He was disappointed when he found the beach to be so sandy and the sun so sunny
When he encountered maize for the first time he thought it incredibly corny
Situps are a terrible way to end your day
Toddlers feeding raccoons surprised even the seasoned park ranger
Edith could decide if she should paint her teeth or brush her nails
Her daily goal was to improve on yesterday


# Transforming Data

In [0]:
from pyspark.sql.functions import lit, col, explode
import pyspark.sql.functions as f

In [0]:
# Tokenise each sentence into an array of words, stored in `splitedData`.
# The line is trimmed of leading/trailing spaces and then split on the regex \s+
# (one or more whitespace characters) instead of a single literal space, so that
# repeated or surrounding spaces no longer produce empty-string "words" that
# would be counted downstream.
# NOTE: a completely blank input line still yields an array holding one empty string.
df1 = df.withColumn('splitedData', f.split(f.trim(f.col('value')), r'\s+'))
display(df1)

value,splitedData
This is a Japanese doll,"List(This, is, a, Japanese, doll)"
The team members were hard to tell apart since they all wore their hair in a ponytail,"List(The, team, members, were, hard, to, tell, apart, since, they, all, wore, their, hair, in, a, ponytail)"
As the years pass by we all know owners look more and more like their dogs,"List(As, the, years, pass, by, we, all, know, owners, look, more, and, more, like, their, dogs)"
If you don't like toenails you probably shouldn't look at your feet,"List(If, you, don't, like, toenails, you, probably, shouldn't, look, at, your, feet)"
He was disappointed when he found the beach to be so sandy and the sun so sunny,"List(He, was, disappointed, when, he, found, the, beach, to, be, so, sandy, and, the, sun, so, sunny)"
When he encountered maize for the first time he thought it incredibly corny,"List(When, he, encountered, maize, for, the, first, time, he, thought, it, incredibly, corny)"
Situps are a terrible way to end your day,"List(Situps, are, a, terrible, way, to, end, your, day)"
Toddlers feeding raccoons surprised even the seasoned park ranger,"List(Toddlers, feeding, raccoons, surprised, even, the, seasoned, park, ranger)"
Edith could decide if she should paint her teeth or brush her nails,"List(Edith, could, decide, if, she, should, paint, her, teeth, or, brush, her, nails)"
Her daily goal was to improve on yesterday,"List(Her, daily, goal, was, to, improve, on, yesterday)"


In [0]:
# Flatten the word arrays: explode() produces one output row per array element,
# placing the element in `wordsData` and repeating the row's other columns.
df2 = df1.withColumn('wordsData', f.explode(f.col('splitedData')))
display(df2)

value,splitedData,wordsData
This is a Japanese doll,"List(This, is, a, Japanese, doll)",This
This is a Japanese doll,"List(This, is, a, Japanese, doll)",is
This is a Japanese doll,"List(This, is, a, Japanese, doll)",a
This is a Japanese doll,"List(This, is, a, Japanese, doll)",Japanese
This is a Japanese doll,"List(This, is, a, Japanese, doll)",doll
The team members were hard to tell apart since they all wore their hair in a ponytail,"List(The, team, members, were, hard, to, tell, apart, since, they, all, wore, their, hair, in, a, ponytail)",The
The team members were hard to tell apart since they all wore their hair in a ponytail,"List(The, team, members, were, hard, to, tell, apart, since, they, all, wore, their, hair, in, a, ponytail)",team
The team members were hard to tell apart since they all wore their hair in a ponytail,"List(The, team, members, were, hard, to, tell, apart, since, they, all, wore, their, hair, in, a, ponytail)",members
The team members were hard to tell apart since they all wore their hair in a ponytail,"List(The, team, members, were, hard, to, tell, apart, since, they, all, wore, their, hair, in, a, ponytail)",were
The team members were hard to tell apart since they all wore their hair in a ponytail,"List(The, team, members, were, hard, to, tell, apart, since, they, all, wore, their, hair, in, a, ponytail)",hard


In [0]:
# Keep only the individual-word column; the sentence and array columns are dropped.
wordsDF = df2.select('wordsData')
display(wordsDF)

wordsData
This
is
a
Japanese
doll
The
team
members
were
hard


In [0]:
# Aggregate: one row per distinct word with its number of occurrences in `count`.
countDF = (
    wordsDF
    .groupBy('wordsData')
    .count()
)

# Print a plain-text preview of the result (show() displays the first 20 rows).
countDF.show()

+-----------+-----+
|  wordsData|count|
+-----------+-----+
|   Tomorrow|    4|
|         If|    8|
|      leave|    4|
|      corny|    4|
|        day|    4|
|preoccupied|    4|
|       even|    8|
|      crazy|    4|
|    bananas|    4|
|     priest|    4|
|        did|    4|
|    whether|    4|
|     Having|    4|
|        I'm|    4|
|      crime|    4|
|  surprised|    4|
|      James|    4|
|      could|    8|
|        buy|    4|
|        him|    8|
+-----------+-----+
only showing top 20 rows



# Loading Data

In [0]:
# Prerequisite: create an AWS RDS PostgreSQL instance and connect to it from pgAdmin before running the cells below.

In [0]:
# JDBC connection settings for the target PostgreSQL database on AWS RDS.
driver = 'org.postgresql.Driver'
url = 'jdbc:postgresql://database-1.c1catb0at1ho.us-east-1.rds.amazonaws.com/'

# Destination table, written as <schema>.<table>.
# NOTE(review): 'WorkCount' may be a typo for 'WordCount' — confirm before renaming,
# because changing this string changes which table is written.
table = 'rohith_schema_pyspark.WorkCount'
user = 'postgres'

# Do not keep the database password in the notebook source: it leaks through
# version control, exports and shared workspaces. Read it from a Databricks secret
# scope at run time instead. The scope and key names below are placeholders — create
# them (or change the names to match an existing secret) before running this cell.
# The previously committed password should be treated as exposed and rotated.
password = dbutils.secrets.get(scope='etl', key='postgres-password')

In [0]:
# Persist the word counts to PostgreSQL over JDBC.
# The save mode has to be set with DataFrameWriter.mode(). Passing it as
# .option('mode', 'append') does not set the save mode, so the writer kept its
# default (fail if the table already exists) and every run after the first one
# raised an error. With .mode('append') new rows are appended to the existing
# table, and the table is still created on the first run.
(
    countDF.write
    .format('jdbc')
    .option('driver', driver)
    .option('url', url)
    .option('dbtable', table)
    .option('user', user)
    .option('password', password)
    .mode('append')
    .save()
)