## Loading the data into dataframes

In [0]:
# Obtain the Spark session for this notebook; getOrCreate() reuses an
# already-running session instead of starting a second one.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Table Loading").getOrCreate()

# Handle on the SparkContext that backs the session
sc = spark.sparkContext

#### Creating the `Posts` dataframe

In [0]:
# List the contents of /mnt/bd-project to see which inputs are available
display(dbutils.fs.ls("/mnt/bd-project"))

In [0]:
# Read everything matching the Posts/* glob as Parquet into `posts`
file_location = "/mnt/bd-project/ML_Training/Posts/*"

posts = spark.read.parquet(file_location)

display(posts)

#### Creating the `PostTypes` dataframe

In [0]:
# Explicit schema for the PostTypes file: an integer `id` and a string `Type`,
# both nullable.
from pyspark.sql.types import *

PT_schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("Type", StringType(), True)
)

In [0]:
# Load PostTypes.txt as comma-separated text with a header row, using the
# explicit PT_schema rather than schema inference.
file_location = "/mnt/bd-project/ML_Training/PostTypes.txt"

postType = spark.read.csv(
    file_location,
    schema=PT_schema,
    header=True,
    sep=",",
)

display(postType)

#### Creating the `Users` dataframe

In [0]:
# Explicit schema for the users file. Columns are listed in file order as
# (name, type) pairs; every column is declared nullable.
from pyspark.sql.types import *

users_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("id", IntegerType()),
        ("Age", IntegerType()),
        ("CreationDate", DateType()),
        ("DisplayName", StringType()),
        ("DownVotes", IntegerType()),
        ("EmailHash", StringType()),
        ("Location", StringType()),
        ("Reputation", IntegerType()),
        ("UpVotes", IntegerType()),
        ("Views", IntegerType()),
        ("WebsiteUrl", StringType()),
        ("AccountId", IntegerType()),
    )
])

In [0]:
# Load users.csv as comma-separated text with a header row, using the
# explicit users_schema rather than schema inference.
file_location = "/mnt/bd-project/ML_Training/users.csv"

users = spark.read.csv(
    file_location,
    schema=users_schema,
    header=True,
    sep=",",
)

display(users)

#### Saving the dataframes for easy retrieval

In [0]:
# Persist the three dataframes as Parquet under /tmp/project so later steps
# can reload them without re-parsing the source files.
#
# DataFrameWriter's default save mode raises an error when the target path
# already exists, so this cell used to fail on every run after the first.
# Writing with mode("overwrite") makes the cell safe to re-run; each run
# replaces the previously saved copy.
# NOTE(review): these paths are read back through dbutils.fs in the next cell,
# so they presumably live on DBFS rather than a node-local disk — confirm.
posts.write.mode("overwrite").parquet("/tmp/project/posts")
postType.write.mode("overwrite").parquet("/tmp/project/PostType")
users.write.mode("overwrite").parquet("/tmp/project/user")

In [0]:
# List /tmp/project/ to confirm the saved tables are present
display(
    dbutils.fs.ls("/tmp/project/")
)