diff --git a/.obsidian/plugins/remember-cursor-position/cursor-positions.json b/.obsidian/plugins/remember-cursor-position/cursor-positions.json index a4636157..0d9ed417 100644 --- a/.obsidian/plugins/remember-cursor-position/cursor-positions.json +++ b/.obsidian/plugins/remember-cursor-position/cursor-positions.json @@ -1 +1 @@ -{"Islam/Meetings/جلسة سورة البقرة.md":{"scroll":0,"cursor":{"from":{"ch":80,"line":17},"to":{"ch":80,"line":17}}},"Secrets/Work Drafts.md":{"scroll":28.5228,"cursor":{"from":{"ch":26,"line":35},"to":{"ch":26,"line":35}}},"Islam/Courses/Self Restart (إعادة تشغيل الذات)/1. Preparatory Introduction (مقدمات إستعدادية).md":{"scroll":384.7819,"cursor":{"from":{"ch":0,"line":392},"to":{"ch":0,"line":392}}},"Sciences/Applied Sciences/Programming/Versioning/Git.md":{"scroll":66.3854,"cursor":{"from":{"ch":233,"line":40},"to":{"ch":233,"line":40}}},"Sciences/Applied Sciences/Programming/DevOps/DevOps.md":{"scroll":43.6234,"cursor":{"from":{"ch":68,"line":47},"to":{"ch":68,"line":47}}},"Sciences/Applied Sciences/Programming/DevOps/CI-CD.md":{"scroll":86.8542,"cursor":{"from":{"ch":0,"line":96},"to":{"ch":0,"line":96}}},"Obsidian/My Obsidian.md":{"scroll":31.4895,"cursor":{"from":{"ch":23,"line":36},"to":{"ch":15,"line":36}}},"Sciences/Formal Sciences/Mathematics/Mathematics.md":{"scroll":10.0591,"cursor":{"from":{"ch":496,"line":12},"to":{"ch":496,"line":12}}},"Sciences/Applied Sciences/Programming/AI/Data Engineering/Data Pipeline.md":{"scroll":42.3143,"cursor":{"from":{"ch":140,"line":50},"to":{"ch":140,"line":50}}},"Sciences/Applied Sciences/Programming/AI/Data Engineering/Apache Spark.md":{"scroll":93.8008,"cursor":{"from":{"ch":71,"line":101},"to":{"ch":71,"line":101}}},"Sciences/Applied Sciences/Programming/AI/Data Engineering/GCP Big Data And ML Fundamentals Course.md":{"scroll":6.3718,"cursor":{"from":{"ch":0,"line":0},"to":{"ch":0,"line":0}}},"Sciences/Applied Sciences/Programming/AI/Data Engineering/DE Road Map.md":{"scroll":3.6662,"cursor":{"from":{"ch":0,"line":0},"to":{"ch":0,"line":0}}},"Sciences/Applied Sciences/Programming/AI/Data Engineering/Data Engineering.md":{"scroll":0,"cursor":{"from":{"ch":0,"line":2},"to":{"ch":0,"line":2}}},"Sciences/Applied Sciences/Programming/AI/Data Engineering/Hadoop.md":{"scroll":37.9236,"cursor":{"from":{"ch":37,"line":51},"to":{"ch":37,"line":51}}},"Sciences/Applied Sciences/Programming/AI/Data Engineering/PySpark.md":{"scroll":161.0122,"cursor":{"from":{"ch":101,"line":147},"to":{"ch":101,"line":147}}},"Islam/Courses/Self Restart (إعادة تشغيل الذات)/2. Daily Habits (العادات اليومية).md":{"scroll":135.4565,"cursor":{"from":{"ch":0,"line":149},"to":{"ch":0,"line":149}}}} \ No newline at end of file +{"Islam/Meetings/جلسة سورة البقرة.md":{"scroll":0,"cursor":{"from":{"ch":80,"line":17},"to":{"ch":80,"line":17}}},"Secrets/Work Drafts.md":{"scroll":28.5228,"cursor":{"from":{"ch":26,"line":35},"to":{"ch":26,"line":35}}},"Islam/Courses/Self Restart (إعادة تشغيل الذات)/1. Preparatory Introduction (مقدمات إستعدادية).md":{"scroll":384.7819,"cursor":{"from":{"ch":0,"line":392},"to":{"ch":0,"line":392}}},"Sciences/Applied Sciences/Programming/Versioning/Git.md":{"scroll":66.3854,"cursor":{"from":{"ch":233,"line":40},"to":{"ch":233,"line":40}}},"Sciences/Applied Sciences/Programming/DevOps/DevOps.md":{"scroll":43.6234,"cursor":{"from":{"ch":68,"line":47},"to":{"ch":68,"line":47}}},"Sciences/Applied Sciences/Programming/DevOps/CI-CD.md":{"scroll":86.8542,"cursor":{"from":{"ch":0,"line":96},"to":{"ch":0,"line":96}}},"Obsidian/My Obsidian.md":{"scroll":31.4895,"cursor":{"from":{"ch":23,"line":36},"to":{"ch":15,"line":36}}},"Sciences/Formal Sciences/Mathematics/Mathematics.md":{"scroll":10.0591,"cursor":{"from":{"ch":496,"line":12},"to":{"ch":496,"line":12}}},"Sciences/Applied Sciences/Programming/AI/Data Engineering/Data Pipeline.md":{"scroll":42.3143,"cursor":{"from":{"ch":140,"line":50},"to":{"ch":140,"line":50}}},"Sciences/Applied Sciences/Programming/AI/Data Engineering/Apache Spark.md":{"scroll":0,"cursor":{"from":{"ch":1115,"line":2},"to":{"ch":1115,"line":2}}},"Sciences/Applied Sciences/Programming/AI/Data Engineering/GCP Big Data And ML Fundamentals Course.md":{"scroll":6.3718,"cursor":{"from":{"ch":0,"line":0},"to":{"ch":0,"line":0}}},"Sciences/Applied Sciences/Programming/AI/Data Engineering/DE Road Map.md":{"scroll":3.6662,"cursor":{"from":{"ch":0,"line":0},"to":{"ch":0,"line":0}}},"Sciences/Applied Sciences/Programming/AI/Data Engineering/Data Engineering.md":{"scroll":0,"cursor":{"from":{"ch":0,"line":2},"to":{"ch":0,"line":2}}},"Sciences/Applied Sciences/Programming/AI/Data Engineering/Hadoop.md":{"scroll":36.9228,"cursor":{"from":{"ch":37,"line":51},"to":{"ch":37,"line":51}}},"Sciences/Applied Sciences/Programming/AI/Data Engineering/PySpark.md":{"scroll":19.25,"cursor":{"from":{"ch":244,"line":34},"to":{"ch":244,"line":34}}},"Islam/Courses/Self Restart (إعادة تشغيل الذات)/2. Daily Habits (العادات اليومية).md":{"scroll":135.0906,"cursor":{"from":{"ch":0,"line":149},"to":{"ch":0,"line":149}}}} \ No newline at end of file diff --git a/.obsidian/workspace.json b/.obsidian/workspace.json index 3d1b95a4..75959165 100644 --- a/.obsidian/workspace.json +++ b/.obsidian/workspace.json @@ -56,7 +56,7 @@ } } ], - "currentTab": 3 + "currentTab": 2 } ], "direction": "vertical" @@ -173,7 +173,7 @@ "state": { "type": "outline", "state": { - "file": "Islam/Courses/Self Restart (إعادة تشغيل الذات)/2. Daily Habits (العادات اليومية).md" + "file": "Sciences/Applied Sciences/Programming/AI/Data Engineering/PySpark.md" } } }, @@ -195,7 +195,7 @@ "state": { "type": "backlink", "state": { - "file": "Islam/Courses/Self Restart (إعادة تشغيل الذات)/2. Daily Habits (العادات اليومية).md", + "file": "Sciences/Applied Sciences/Programming/AI/Data Engineering/PySpark.md", "collapseAll": false, "extraContext": false, "sortOrder": "alphabetical", @@ -212,7 +212,7 @@ "state": { "type": "outgoing-link", "state": { - "file": "Islam/Courses/Self Restart (إعادة تشغيل الذات)/2. Daily Habits (العادات اليومية).md", + "file": "Sciences/Applied Sciences/Programming/AI/Data Engineering/PySpark.md", "linksCollapsed": false, "unlinkedCollapsed": true } @@ -259,23 +259,24 @@ "periodic-notes:Open today": false } }, - "active": "b65f4c8e5504c5f4", + "active": "95cc1ae200dc70fe", "lastOpenFiles": [ - "Islam/Courses/Self Restart (إعادة تشغيل الذات)/1. Preparatory Introduction (مقدمات إستعدادية).md", - "Islam/Courses/Self Restart (إعادة تشغيل الذات)/2. Daily Habits (العادات اليومية).md", + "Sciences/Applied Sciences/Programming/AI/Data Engineering/Media-Temp/Pasted image 20240126145307.png", + "Sciences/Applied Sciences/Programming/AI/Data Engineering/Media-Temp/Pasted image 20240126145133.png", + "Sciences/Applied Sciences/Programming/AI/Data Engineering/Media-Temp/Pasted image 20240126145130.png", + "Sciences/Applied Sciences/Programming/AI/Data Engineering/Apache Spark.md", "Sciences/Applied Sciences/Programming/AI/Data Engineering/PySpark.md", - "Sciences/Applied Sciences/Programming/AI/Data Engineering/Media-Temp/Pasted image 20240125184550.png", + "Islam/Courses/Self Restart (إعادة تشغيل الذات)/2. Daily Habits (العادات اليومية).md", "Sciences/Applied Sciences/Programming/AI/Data Engineering/Hadoop.md", + "Sciences/Applied Sciences/Programming/AI/Data Engineering/Media-Temp/Pasted image 20240126144538.png", + "Sciences/Applied Sciences/Programming/AI/Data Engineering/Media-Temp/Pasted image 20240126144452.png", + "Islam/Courses/Self Restart (إعادة تشغيل الذات)/1. Preparatory Introduction (مقدمات إستعدادية).md", + "Sciences/Applied Sciences/Programming/AI/Data Engineering/Media-Temp/Pasted image 20240125184550.png", "Sciences/Applied Sciences/Programming/AI/Data Engineering/Media-Temp/Pasted image 20240125163021.png", "Sciences/Applied Sciences/Programming/AI/Data Engineering/Media-Temp/Pasted image 20240125162942.png", "Sciences/Applied Sciences/Programming/AI/Data Engineering/Media-Temp/Pasted image 20240125162936.png", - "Sciences/Applied Sciences/Programming/AI/Data Engineering/Apache Spark.md", "Sciences/Applied Sciences/Programming/AI/Data Engineering/Media-Temp/Pasted image 20240125144217.png", "Sciences/Applied Sciences/Programming/AI/Data Engineering/Media-Temp/Pasted image 20240125144210.png", - "Sciences/Applied Sciences/Programming/AI/Data Engineering/Media-Temp/Pasted image 20240125143003.png", - "Sciences/Applied Sciences/Programming/AI/Data Engineering/Media-Temp/Pasted image 20240125142157.png", - "Sciences/Applied Sciences/Programming/AI/Data Engineering/Media-Temp/Pasted image 20240125142054.png", - "Sciences/Applied Sciences/Programming/AI/Data Engineering/Media-Temp/Pasted image 20240125121533.png", "Sciences/Formal Sciences/Mathematics/Graph Theory/Directed Acyclic Graph (DAG).md", "Sciences/Formal Sciences/Mathematics/Math Jargon.md", "Sciences/Formal Sciences/Mathematics/Graph Theory", @@ -305,7 +306,6 @@ "Islam/Meetings/جلسة.md", "Tasks/Personal Tasks/Daily/2024-01-07.md", "Tasks/Personal Tasks/Weekly/2024-W01.md", - "Tasks/Personal Tasks/Yearly/Annual Template.md", "Tasks/Personal Tasks/Yearly/Annual Template.md.crdownload", "Islam/The Quran/Untitled.canvas" ] diff --git a/Sciences/Applied Sciences/Programming/AI/Data Engineering/Apache Spark.md b/Sciences/Applied Sciences/Programming/AI/Data Engineering/Apache Spark.md index 6fe45d19..4bd32ef9 100644 --- a/Sciences/Applied Sciences/Programming/AI/Data Engineering/Apache Spark.md +++ b/Sciences/Applied Sciences/Programming/AI/Data Engineering/Apache Spark.md @@ -1,7 +1,6 @@ #apache-spark -[source 2](https://www.databricks.com/blog/2014/01/21/spark-and-hadoop.html), [source 3](https://www.youtube.com/watch?v=AGgyf9bO_8M&list=PLlUZLZydkS7_8WnK8fMENmJFSfPwxw9Fi), [source 4](https://emerginginsightsnow.com/2015/05/17/apache-spark-ecosystem-grows-rapidly-has-hadoop-met-its-match/), [source 5](https://mydataexperiments.com/2017/04/11/hadoop-ecosystem-a-quick-glance/), [source 6](https://inoxoft.com/blog/key-differences-between-mapreduce-and-spark/), [source 7](https://www.youtube.com/watch?v=GAK3mbI_sPY&list=PLlUZLZydkS7_8WnK8fMENmJFSfPwxw9Fi&index=3), [source 8](https://www.quora.com/What-is-the-difference-between-spark-and-pyspark), [source 9](https://www.youtube.com/watch?v=YEGnTKRHpu8&list=PLlUZLZydkS7_8WnK8fMENmJFSfPwxw9Fi&index=3), [source 10](https://www.javatpoint.com/apache-spark-architecture), [source 11](https://mallikarjuna_g.gitbooks.io/spark/content/), [source 12](), - +[source 2](https://www.databricks.com/blog/2014/01/21/spark-and-hadoop.html), [source 3](https://www.youtube.com/watch?v=AGgyf9bO_8M&list=PLlUZLZydkS7_8WnK8fMENmJFSfPwxw9Fi), [source 4](https://emerginginsightsnow.com/2015/05/17/apache-spark-ecosystem-grows-rapidly-has-hadoop-met-its-match/), [source 5](https://mydataexperiments.com/2017/04/11/hadoop-ecosystem-a-quick-glance/), [source 6](https://inoxoft.com/blog/key-differences-between-mapreduce-and-spark/), [source 7](https://www.youtube.com/watch?v=GAK3mbI_sPY&list=PLlUZLZydkS7_8WnK8fMENmJFSfPwxw9Fi&index=3), [source 8](https://www.quora.com/What-is-the-difference-between-spark-and-pyspark), [source 9](https://www.youtube.com/watch?v=YEGnTKRHpu8&list=PLlUZLZydkS7_8WnK8fMENmJFSfPwxw9Fi&index=3), [source 10](https://www.javatpoint.com/apache-spark-architecture), [source 11](https://mallikarjuna_g.gitbooks.io/spark/content/), [source 12](), [source 13](https://www.youtube.com/watch?v=71ntq5LImRc), Side notes: * ([s11](https://mallikarjuna_g.gitbooks.io/spark/content/)) is amazing for mastering Spark. It is neither a book, nor a bunch of articles; it's something in between, and its explanations are straight to the point. Kudos to [Jacek Laskowski](https://pl.linkedin.com/in/jaceklaskowski) @@ -134,15 +133,27 @@ Visualization of Spark Context's internal workings ([s11](https://mallikarjuna_g Side note 1: "access is slower" in PySpark because it has to retrieve (i.e., access) the data from multiple nodes. +# Resilient Distributed Datasets (RDDs) + +#apache-spark #rdd -# Important Spark Concepts +TODO. -## Resilient Distributed Datasets (RDDs) +# RDD vs Spark DataFrame vs Spark Datasets -#apache-spark #rdd +#apache-spark #rdd #spark-dataframe #dataframe #spark-dataset #pyspark-dataframe #pandas-dataframe #pandas + +![](Media-Temp/Pasted%20image%2020240126144452.png) + +([s13](https://www.youtube.com/watch?v=71ntq5LImRc)) + +![](Media-Temp/Pasted%20image%2020240126144538.png) + +([s13](https://www.youtube.com/watch?v=71ntq5LImRc)) +Check the [PySpark](PySpark.md) note for [difference](PySpark.md#PySpark%20DataFrame%20vs%20Pandas%20DataFrame) between PySpark's DataFrame and Pandas' DataFrame. -## Data Lineage Graph (DLG) +# Data Lineage Graph (DLG) #apache-spark #data-lineage-graph diff --git a/Sciences/Applied Sciences/Programming/AI/Data Engineering/Media-Temp/Pasted image 20240126144452.png b/Sciences/Applied Sciences/Programming/AI/Data Engineering/Media-Temp/Pasted image 20240126144452.png new file mode 100644 index 00000000..15872667 Binary files /dev/null and b/Sciences/Applied Sciences/Programming/AI/Data Engineering/Media-Temp/Pasted image 20240126144452.png differ diff --git a/Sciences/Applied Sciences/Programming/AI/Data Engineering/Media-Temp/Pasted image 20240126144538.png b/Sciences/Applied Sciences/Programming/AI/Data Engineering/Media-Temp/Pasted image 20240126144538.png new file mode 100644 index 00000000..972c10d9 Binary files /dev/null and b/Sciences/Applied Sciences/Programming/AI/Data Engineering/Media-Temp/Pasted image 20240126144538.png differ diff --git a/Sciences/Applied Sciences/Programming/AI/Data Engineering/Media-Temp/Pasted image 20240126145130.png b/Sciences/Applied Sciences/Programming/AI/Data Engineering/Media-Temp/Pasted image 20240126145130.png new file mode 100644 index 00000000..c9ed7ef9 Binary files /dev/null and b/Sciences/Applied Sciences/Programming/AI/Data Engineering/Media-Temp/Pasted image 20240126145130.png differ diff --git a/Sciences/Applied Sciences/Programming/AI/Data Engineering/Media-Temp/Pasted image 20240126145133.png b/Sciences/Applied Sciences/Programming/AI/Data Engineering/Media-Temp/Pasted image 20240126145133.png new file mode 100644 index 00000000..c9ed7ef9 Binary files /dev/null and b/Sciences/Applied Sciences/Programming/AI/Data Engineering/Media-Temp/Pasted image 20240126145133.png differ diff --git a/Sciences/Applied Sciences/Programming/AI/Data Engineering/Media-Temp/Pasted image 20240126145307.png b/Sciences/Applied Sciences/Programming/AI/Data Engineering/Media-Temp/Pasted image 20240126145307.png new file mode 100644 index 00000000..76be79a7 Binary files /dev/null and b/Sciences/Applied Sciences/Programming/AI/Data Engineering/Media-Temp/Pasted image 20240126145307.png differ diff --git a/Sciences/Applied Sciences/Programming/AI/Data Engineering/PySpark.md b/Sciences/Applied Sciences/Programming/AI/Data Engineering/PySpark.md index ee8273d4..6e49d16d 100644 --- a/Sciences/Applied Sciences/Programming/AI/Data Engineering/PySpark.md +++ b/Sciences/Applied Sciences/Programming/AI/Data Engineering/PySpark.md @@ -1,6 +1,6 @@ #pyspark -[source 1](https://www.quora.com/What-is-the-difference-between-spark-and-pyspark), [source 2](https://www.youtube.com/watch?v=YEGnTKRHpu8&list=PLlUZLZydkS7_8WnK8fMENmJFSfPwxw9Fi&index=3), [source 3](https://medium.com/analytics-vidhya/solving-complex-big-data-problems-using-combinations-of-window-functions-deep-dive-in-pyspark-b1830eb00b7d), [source 4](https://yuanxu-li.github.io/technical/2018/06/10/reduce-and-fold-in-spark.html), [source 5](https://github.com/tirthajyoti/Spark-with-Python), [source 6](https://www.youtube.com/watch?v=rkoYVCJPX6o&list=PLlUZLZydkS7_8WnK8fMENmJFSfPwxw9Fi&index=6) +[source 1](https://www.quora.com/What-is-the-difference-between-spark-and-pyspark), [source 2](https://www.youtube.com/watch?v=YEGnTKRHpu8&list=PLlUZLZydkS7_8WnK8fMENmJFSfPwxw9Fi&index=3), [source 3](https://medium.com/analytics-vidhya/solving-complex-big-data-problems-using-combinations-of-window-functions-deep-dive-in-pyspark-b1830eb00b7d), [source 4](https://yuanxu-li.github.io/technical/2018/06/10/reduce-and-fold-in-spark.html), [source 5](https://github.com/tirthajyoti/Spark-with-Python), [source 6](https://www.youtube.com/watch?v=rkoYVCJPX6o&list=PLlUZLZydkS7_8WnK8fMENmJFSfPwxw9Fi&index=6), [source 7](https://towardsdatascience.com/parallelize-pandas-dataframe-computations-w-spark-dataframe-bba4c924487c), [source 8](https://medium.zenika.com/a-comparison-between-rdd-dataframe-and-dataset-in-spark-from-a-developers-point-of-view-a539b5acf734), [source 9](https://igorshvab.medium.com/from-pandas-to-pyspark-dataframes-c25104879c29) # PySpark vs Apache Spark @@ -20,6 +20,20 @@ Side note 1: "access is slower" in PySpark because it has to retrieve (i.e., access) the data from multiple nodes. +## PySpark DataFrame vs Pandas DataFrame + +#pyspark #pyspark-dataframe #spark-dataframe #pandas-dataframe #pandas + +![](Media-Temp/Pasted%20image%2020240126145130.png) + +([s7](https://towardsdatascience.com/parallelize-pandas-dataframe-computations-w-spark-dataframe-bba4c924487c)) + +![](Media-Temp/Pasted%20image%2020240126145307.png) + +([s8](https://medium.zenika.com/a-comparison-between-rdd-dataframe-and-dataset-in-spark-from-a-developers-point-of-view-a539b5acf734#705c)) + +Important: images comparing between the two DataFrame types can be found in ([s9](https://igorshvab.medium.com/from-pandas-to-pyspark-dataframes-c25104879c29#:~:text=Below%20is%20short%20cheatsheet)). + # PySpark Syntax ## Reduce and Fold @@ -150,7 +164,6 @@ Example 1 ([s6, 17:20](https://youtu.be/rkoYVCJPX6o?list=PLlUZLZydkS7_8WnK8fMENm ![](Media-Temp/Pasted%20image%2020240125184550.png) -# Important PySpark Concepts ## PySpark Window