# Data Analytics Model 1 - Getting Comfortable


### Loading the CSV dataset into the Databricks File System (DBFS)

In [0]:
%sh
# Rebuild the lab folder from scratch and download one sales CSV per year.
# Abort on the first failing command so a failed download fails the cell
# instead of leaving an empty or partial file behind for later cells to read.
set -e

# -f / -p keep the cell re-runnable: no error when the folder is missing
# (first run) and no error if it already exists.
rm -rf /dbfs/spark_lab
mkdir -p /dbfs/spark_lab

base_url=https://raw.githubusercontent.com/kuljotSB/DatabricksUdemyCourse/refs/heads/main/DataAnalytics
for year in 2019 2020 2021; do
    wget -O "/dbfs/spark_lab/${year}.csv" "${base_url}/${year}.csv"
done

### Loading the CSV files into a DataFrame

In [0]:
# First look at the raw files: no schema is supplied, so Spark falls back to
# its defaults for CSV (generic column names, every value read as a string).
# An absolute path is used so the read does not depend on the default
# filesystem's working directory.
df = spark.read.load('/spark_lab/*.csv', format='csv')

# Preview only the first 100 rows rather than the whole dataset.
display(df.limit(100))

### Defining a schema for the DataFrame

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
# Explicit schema for the sales order files: gives the columns meaningful
# names and proper types instead of the string defaults of a schema-less read.
orderSchema = StructType([
    StructField("SalesOrderNumber", StringType()),
    StructField("SalesOrderLineNumber", IntegerType()),
    StructField("OrderDate", DateType()),
    StructField("CustomerName", StringType()),
    StructField("Email", StringType()),
    StructField("Item", StringType()),
    StructField("Quantity", IntegerType()),
    # DoubleType rather than FloatType: 32-bit floats round typical price
    # values noticeably, and that error accumulates when revenue is summed.
    StructField("UnitPrice", DoubleType()),
    StructField("Tax", DoubleType()),
])

# Re-read the same files, this time applying the schema defined above.
df = spark.read.load('/spark_lab/*.csv', format='csv', schema=orderSchema)

# Preview only the first 100 rows rather than the whole dataset.
display(df.limit(100))

### Query Data using Spark SQL

In [0]:
# Register the DataFrame as a temporary view so it can be queried by name
# from Spark SQL.
df.createOrReplaceTempView("salesorders")

# Read every row back through the SQL interface and render the result.
select_all = "SELECT * FROM salesorders"
spark_df = spark.sql(select_all)
display(spark_df)

In [0]:
# Gross revenue per order year: (UnitPrice * Quantity) + Tax summed over all
# order lines, grouped by the year of OrderDate (cast to text so it is treated
# as a label rather than a number) and sorted by that year.
# A triple-quoted string keeps the SQL readable and avoids backslash line
# continuations, which break if any whitespace follows the backslash.
sqlQuery = """
    SELECT CAST(YEAR(OrderDate) AS CHAR(4)) AS OrderYear,
           SUM((UnitPrice * Quantity) + Tax) AS GrossRevenue
    FROM salesorders
    GROUP BY CAST(YEAR(OrderDate) AS CHAR(4))
    ORDER BY OrderYear
"""
df_spark = spark.sql(sqlQuery)

# Rendered with display() like the other result tables in this notebook.
display(df_spark)

### Using Matplotlib for visualisation

In [0]:
from matplotlib import pyplot as plt

# matplotlib works with pandas data, not Spark DataFrames, so collect the
# aggregated result (one row per order year) into a pandas DataFrame.
df_sales = df_spark.toPandas()

# Use an explicit figure/axes pair instead of the implicit pyplot state, and
# label the chart so it can be understood on its own.
fig, ax = plt.subplots()
ax.bar(x=df_sales['OrderYear'], height=df_sales['GrossRevenue'])
ax.set_title('Gross Revenue by Year')
ax.set_xlabel('Order Year')
ax.set_ylabel('Gross Revenue')

# Display the plot
plt.show()

### Using the Seaborn library

In [0]:
import seaborn as sns

# Draw on a fresh figure/axes pair rather than clearing whatever figure
# happens to be current, so the cell does not depend on leftover plot state.
fig, ax = plt.subplots()

# Create a bar chart of revenue by year on the axes created above.
sns.barplot(x="OrderYear", y="GrossRevenue", data=df_sales, ax=ax)
ax.set(title="Gross Revenue by Year", xlabel="Order Year", ylabel="Gross Revenue")

plt.show()