In [79]:
from datetime import datetime, date
import pandas as pd
from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import DateType



In [8]:
spark = SparkSession.builder.getOrCreate()

In [9]:
df = spark.createDataFrame([
    Row(a=1, b=2., c='string1', d=date(2000, 1, 1), e=datetime(2000, 1, 1, 12, 0)),
    Row(a=2, b=3., c='string2', d=date(2000, 2, 1), e=datetime(2000, 1, 2, 12, 0)),
    Row(a=4, b=5., c='string3', d=date(2000, 3, 1), e=datetime(2000, 1, 3, 12, 0))
],  schema = 'a long, b double, c string, d date, e timestamp'

)
print(df)

DataFrame[a: bigint, b: double, c: string, d: date, e: timestamp]


In [10]:
df.show()
df.printSchema()

                                                                                

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
|  2|3.0|string2|2000-02-01|2000-01-02 12:00:00|
|  4|5.0|string3|2000-03-01|2000-01-03 12:00:00|
+---+---+-------+----------+-------------------+

root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)



In [12]:
df.show(0)

+---+---+---+---+---+
|  a|  b|  c|  d|  e|
+---+---+---+---+---+
+---+---+---+---+---+
only showing top 0 rows



In [14]:
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)
df

a,b,c,d,e
1,2.0,string1,2000-01-01,2000-01-01 12:00:00
2,3.0,string2,2000-02-01,2000-01-02 12:00:00
4,5.0,string3,2000-03-01,2000-01-03 12:00:00


In [15]:
df.show(1, vertical=True)

-RECORD 0------------------
 a   | 1                   
 b   | 2.0                 
 c   | string1             
 d   | 2000-01-01          
 e   | 2000-01-01 12:00:00 
only showing top 1 row



In [18]:
df.columns
df.printSchema()

root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)



In [19]:
df.select("a","b","c").describe().show()

[Stage 19:>                                                         (0 + 1) / 1]

+-------+------------------+------------------+-------+
|summary|                 a|                 b|      c|
+-------+------------------+------------------+-------+
|  count|                 3|                 3|      3|
|   mean|2.3333333333333335|3.3333333333333335|   null|
| stddev|1.5275252316519468|1.5275252316519468|   null|
|    min|                 1|               2.0|string1|
|    max|                 4|               5.0|string3|
+-------+------------------+------------------+-------+



                                                                                

In [21]:
df

a,b,c,d,e
1,2.0,string1,2000-01-01,2000-01-01 12:00:00
2,3.0,string2,2000-02-01,2000-01-02 12:00:00
4,5.0,string3,2000-03-01,2000-01-03 12:00:00


In [24]:
df.filter(df.a ==1).show()

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
+---+---+-------+----------+-------------------+



In [25]:
df_gro = spark.createDataFrame([
    ['red', 'banana', 1, 10], ['blue', 'banana', 2, 20], ['red', 'carrot', 3, 30],
    ['blue', 'grape', 4, 40], ['red', 'carrot', 5, 50], ['black', 'carrot', 6, 60],
    ['red', 'banana', 7, 70], ['red', 'grape', 8, 80]], schema=['color', 'fruit', 'v1', 'v2'])
df_gro.show()

+-----+------+---+---+
|color| fruit| v1| v2|
+-----+------+---+---+
|  red|banana|  1| 10|
| blue|banana|  2| 20|
|  red|carrot|  3| 30|
| blue| grape|  4| 40|
|  red|carrot|  5| 50|
|black|carrot|  6| 60|
|  red|banana|  7| 70|
|  red| grape|  8| 80|
+-----+------+---+---+



# First real live project 
ETL Extract Transform Load

In [106]:
import requests
import pandas as pd
from pyspark.sql import SparkSession

def get_dataset_from_github(url):
    """This function allows you to retrieve data from GitHub repositories.
    
    Input:
    - url: String from the GitHub repository for the dataset in raw data
    
    Output:
    - A dataset if successful, otherwise None
    """
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raises an exception if the status code is not 200 (OK)
        dataset = pd.read_csv(url)
        return dataset
    except requests.exceptions.RequestException as e:
        print('Error downloading the dataset:', e)
        return None



In [107]:
df_pd_1 = get_dataset_from_github(url='https://raw.githubusercontent.com/jhnwr/auto-reporting/main/report1.csv')
df_pd_2 = get_dataset_from_github(url='https://raw.githubusercontent.com/jhnwr/auto-reporting/main/report2.csv')
df_pd_3 = get_dataset_from_github(url='https://raw.githubusercontent.com/jhnwr/auto-reporting/main/report3.csv')
df_pd_4 = get_dataset_from_github(url='https://raw.githubusercontent.com/jhnwr/auto-reporting/main/report4.csv')
df_pd_5 = get_dataset_from_github(url='https://raw.githubusercontent.com/jhnwr/auto-reporting/main/report5.csv')
df_pd_6 = get_dataset_from_github(url='https://raw.githubusercontent.com/jhnwr/auto-reporting/main/report6.csv')

df_concat = pd.concat([df_pd_1, df_pd_2, df_pd_3, df_pd_4, df_pd_5, df_pd_6], verify_integrity=True, ignore_index=True)
df_concat.head()

24/02/22 18:29:01 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:322)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:117)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:116)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.isExecutorAlive$lzycompute$1(BlockManagerMasterEndpoint.scala:611)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.isExecutorAlive$1(BlockManagerMasterEndpoint.scala:610)
	at org.apache.spar

Unnamed: 0,order_id,date,email,first_name,last_name,address,country,item,size,qty
0,01HPYDWKXGEBJD9ZJDKZZRSJPA,7/19/2023,brieger0@skype.com,Bat,Rieger,8529 Park Meadow Street,United States,Hoody,XX-Large,2
1,01HPYDWKZDZCTC2TPJWM300P14,7/10/2023,mabbitt1@bing.com,Mary,Abbitt,8 Debs Junction,Norway,Gloves,Medium,1
2,01HPYDWKZGM48TRP1XMMB8W8VH,10/7/2023,ckeave2@bravesites.com,Cordula,Keave,618 Packers Avenue,Brazil,Sweater,X-Large,2
3,01HPYDWKZJ71D89EN3A9HKJ7PK,9/27/2023,rgymlett3@bbc.co.uk,Rhoda,Gymlett,30 Stang Lane,Sweden,Hoody,XX-Large,1
4,01HPYDWKZNDT89QSVM5Z5QVAJC,2/8/2023,mdorre4@google.cn,Mead,Dorre,09 Rockefeller Circle,Argentina,Tshirt,Large,4


24/02/22 18:29:11 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:322)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:117)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:116)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.isExecutorAlive$lzycompute$1(BlockManagerMasterEndpoint.scala:611)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.isExecutorAlive$1(BlockManagerMasterEndpoint.scala:610)
	at org.apache.spar

#### 1. Create Spark session and a app
Two Fundamental step 

- 1. Sparkseesion.builder.appName
- 2. .getOrCreate()

In [90]:
spark = SparkSession.builder.appName("GitHub Dataset").getOrCreate()

#### 2. Create A dataframe with the concated dataframes 
- important spark uses .show() to inspect the sparkframe

In [92]:
df_spark = spark.createDataFrame(df_concat)
df_spark.show()
df_spark.printSchema()

+--------------------+----------+--------------------+----------+----------+--------------------+-------------+-------+--------+---+
|            order_id|      date|               email|first_name| last_name|             address|      country|   item|    size|qty|
+--------------------+----------+--------------------+----------+----------+--------------------+-------------+-------+--------+---+
|01HPYDWKXGEBJD9ZJ...| 7/19/2023|  brieger0@skype.com|       Bat|    Rieger|8529 Park Meadow ...|United States|  Hoody|XX-Large|  2|
|01HPYDWKZDZCTC2TP...| 7/10/2023|   mabbitt1@bing.com|      Mary|    Abbitt|     8 Debs Junction|       Norway| Gloves|  Medium|  1|
|01HPYDWKZGM48TRP1...| 10/7/2023|ckeave2@bravesite...|   Cordula|     Keave|  618 Packers Avenue|       Brazil|Sweater| X-Large|  2|
|01HPYDWKZJ71D89EN...| 9/27/2023| rgymlett3@bbc.co.uk|     Rhoda|   Gymlett|       30 Stang Lane|       Sweden|  Hoody|XX-Large|  1|
|01HPYDWKZNDT89QSV...|  2/8/2023|   mdorre4@google.cn|      Mead|    

In [105]:
display(df_spark)

DataFrame[order_id: string, date: string, email: string, first_name: string, last_name: string, address: string, country: string, item: string, size: string, qty: bigint]

24/02/22 18:27:21 WARN Executor: Issue communicating with driver in heartbeater
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:322)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:101)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:85)
	at org.apache.spark.storage.BlockManagerMaster.registerBlockManager(BlockManagerMaster.scala:80)
	at org.apache.spark.storage.BlockManager.reregister(BlockManager.scala:641)
	at org.apache.spark.executor.Executor.reportHeartBeat(Executor.scala:1111)
	at org.apache.spark.executor.Executor.$anonfun$heartbeater$1(Executor.scala:244)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:2088)
	at org.apache.spark.Heartbeater$$anon$1.run(Heartbeater.scala:46)
	at java.util.c

#### The Describe function is the same as is pandas.
Describe gives us all coumns informations 
- count,
- mean,
- staddev,
- min,
- max

In [104]:
df_spark.filter((df_spark.first_name == 'Bat') & (df_spark.last_name == 'Rieger')).show()

+--------------------+---------+------------------+----------+---------+--------------------+-------------+-----+--------+---+
|            order_id|     date|             email|first_name|last_name|             address|      country| item|    size|qty|
+--------------------+---------+------------------+----------+---------+--------------------+-------------+-----+--------+---+
|01HPYDWKXGEBJD9ZJ...|7/19/2023|brieger0@skype.com|       Bat|   Rieger|8529 Park Meadow ...|United States|Hoody|XX-Large|  2|
+--------------------+---------+------------------+----------+---------+--------------------+-------------+-----+--------+---+



24/02/22 14:23:23 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 370503 ms exceeds timeout 120000 ms
24/02/22 14:23:23 WARN SparkContext: Killing executors is not supported by current scheduler.
24/02/22 14:23:29 WARN Executor: Issue communicating with driver in heartbeater
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:322)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:101)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:85)
	at org.apache.spark.storage.BlockManagerMaster.registerBlockManager(BlockManagerMaster.scala:80)
	at org.apache.spark.storage.BlockManager.reregister(BlockManager.scala:641)
	at org.apache.spark.executor.Executor.reportHeartBeat(Executor.scala:1111)
	at org.apache.spark.executor.Executor.$anonfun$heartbeater$1(Executor.scala:244)
	at sc

#### How to filter with on one condition

In [87]:
df_spark.fitler("")

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `2023-01-01` cannot be resolved. Did you mean one of the following? [`address`, `country`, `date`, `email`, `first_name`].;
'Filter (date#5373 > to_date('2023-01-01, None, Some(Europe/Berlin)))
+- Project [order_id#1656, cast(date#5362 as date) AS date#5373, email#1658, first_name#1659, last_name#1660, address#1661, country#1662, item#1663, size#1664, qty#1665L]
   +- Project [order_id#1656, cast(date#1657 as date) AS date#5362, email#1658, first_name#1659, last_name#1660, address#1661, country#1662, item#1663, size#1664, qty#1665L]
      +- LogicalRDD [order_id#1656, date#1657, email#1658, first_name#1659, last_name#1660, address#1661, country#1662, item#1663, size#1664, qty#1665L], false
