<a href="https://colab.research.google.com/github/Sam-Ny/PySpark/blob/main/Pyspark_fakefriends_csv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark py4j

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 316.9/316.9 MB 4.5 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=8a051bf5a4ecd977c93c3401ffac32346a6b6c14a9425b916ae336579262148f
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
import pyspark.sql.functions as func

In [3]:
# Obtain the SparkSession for this notebook: reuse an existing session if one
# is already active, otherwise start a new one named "FirstApp".
spark = (
    SparkSession.builder
    .appName("FirstApp")
    .getOrCreate()
)

In [4]:
# Explicit schema applied when reading the CSV: four columns, all nullable.
myschema = StructType([
    StructField("userID", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
    StructField("friends", IntegerType(), True),
])

In [5]:
# Read the CSV file into a DataFrame using the explicit schema `myschema`.
# No header option is set, so every line of the file is treated as data.
people = (
    spark.read
    .format("CSV")
    .schema(myschema)
    .option("path", "/content/fakefriends.csv")
    .load()
)

In [6]:
# Build the result DataFrame, one transformation per line:
#   1. project the four source columns,
#   2. keep only rows where age is below 30,
#   3. append an `insert_timestamp` column holding the current timestamp,
#   4. sort by userID.
output = (
    people
    .select(people.userID, people.name, people.age, people.friends)
    .where(people.age < 30)
    .withColumn('insert_timestamp', func.current_timestamp())
    .orderBy(people.userID)
)

In [7]:
# Display the first 20 rows of the output DataFrame (long cell values are
# truncated). Note: this prints rows; it does not compute a row count.
output.show(n=20, truncate=True)

+------+--------+---+-------+--------------------+
|userID|    name|age|friends|    insert_timestamp|
+------+--------+---+-------+--------------------+
|     1|Jean-Luc| 26|      2|2024-02-14 08:20:...|
|     9|    Hugh| 27|    181|2024-02-14 08:20:...|
|    16|  Weyoun| 22|    323|2024-02-14 08:20:...|
|    21|   Miles| 19|    268|2024-02-14 08:20:...|
|    24|  Julian| 25|      1|2024-02-14 08:20:...|
|    25|     Ben| 21|    445|2024-02-14 08:20:...|
|    26|  Julian| 22|    100|2024-02-14 08:20:...|
|    32|     Nog| 26|    281|2024-02-14 08:20:...|
|    35| Beverly| 27|    305|2024-02-14 08:20:...|
|    46|    Morn| 25|     96|2024-02-14 08:20:...|
|    47|   Brunt| 24|     49|2024-02-14 08:20:...|
|    48|     Nog| 20|      1|2024-02-14 08:20:...|
|    52| Beverly| 19|    269|2024-02-14 08:20:...|
|    54|   Brunt| 19|      5|2024-02-14 08:20:...|
|    60|  Geordi| 20|    100|2024-02-14 08:20:...|
|    66|  Geordi| 21|    477|2024-02-14 08:20:...|
|    72|  Kasidy| 22|    179|20

In [None]:
# Register the `output` DataFrame as a temporary view named "peoples" so it can
# be referenced by name from Spark SQL; an existing view with that name is replaced.
output.createOrReplaceTempView("peoples")

In [None]:
# Query the "peoples" temporary view with Spark SQL and print the result.
spark.sql(
    "select userId,name,age,friends,insert_timestamp from peoples"
).show()

+------+--------+---+-------+--------------------+
|userId|    name|age|friends|    insert_timestamp|
+------+--------+---+-------+--------------------+
|     1|Jean-Luc| 26|      2|2024-02-13 06:47:...|
|     9|    Hugh| 27|    181|2024-02-13 06:47:...|
|    16|  Weyoun| 22|    323|2024-02-13 06:47:...|
|    21|   Miles| 19|    268|2024-02-13 06:47:...|
|    24|  Julian| 25|      1|2024-02-13 06:47:...|
|    25|     Ben| 21|    445|2024-02-13 06:47:...|
|    26|  Julian| 22|    100|2024-02-13 06:47:...|
|    32|     Nog| 26|    281|2024-02-13 06:47:...|
|    35| Beverly| 27|    305|2024-02-13 06:47:...|
|    46|    Morn| 25|     96|2024-02-13 06:47:...|
|    47|   Brunt| 24|     49|2024-02-13 06:47:...|
|    48|     Nog| 20|      1|2024-02-13 06:47:...|
|    52| Beverly| 19|    269|2024-02-13 06:47:...|
|    54|   Brunt| 19|      5|2024-02-13 06:47:...|
|    60|  Geordi| 20|    100|2024-02-13 06:47:...|
|    66|  Geordi| 21|    477|2024-02-13 06:47:...|
|    72|  Kasidy| 22|    179|20