<a href="https://colab.research.google.com/github/Sam-Ny/PySpark/blob/main/Pyspark_basics_csv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark py4j

# To load and analyse the fakefriends.csv data.

In [85]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
import pyspark.sql.functions as func

In [86]:
# Build the SparkSession for the fakefriends analysis; getOrCreate() hands
# back an already-active session if there is one.
spark_fakefriends = (
    SparkSession.builder
    .appName("FirstApp")
    .getOrCreate()
)

In [87]:
# Explicit schema for the fakefriends DataFrame.
# The third argument of every StructField (True) marks the column as nullable.
myschema = StructType(
    [
        StructField("userID", IntegerType(), True),
        StructField("name", StringType(), True),
        StructField("age", IntegerType(), True),
        StructField("friends", IntegerType(), True),
    ]
)

In [88]:
# Load fakefriends.csv into a DataFrame using the explicit schema.
# The reader comes from `spark_fakefriends`, the session this notebook creates;
# no cell defines a bare `spark`, so using it raises NameError on a fresh
# kernel (Restart & Run All).
people = (
    spark_fakefriends.read.format("CSV")
    .schema(myschema)
    .option("path", "/content/fakefriends.csv")
    .load()
)

In [89]:
# Transformations: keep the four source columns, restrict to people younger
# than 30, stamp every row with the current timestamp, and sort by userID.
output = (
    people.select(people.userID, people.name, people.age, people.friends)
    .where(people.age < 30)
    .withColumn('insert_timestamp', func.current_timestamp())
    .orderBy(people.userID)
)

In [90]:
# Display the rows of the output DataFrame (this prints rows, it does not count them)
output.show()

+------+--------+---+-------+--------------------+
|userID|    name|age|friends|    insert_timestamp|
+------+--------+---+-------+--------------------+
|     1|Jean-Luc| 26|      2|2024-02-15 08:04:...|
|     9|    Hugh| 27|    181|2024-02-15 08:04:...|
|    16|  Weyoun| 22|    323|2024-02-15 08:04:...|
|    21|   Miles| 19|    268|2024-02-15 08:04:...|
|    24|  Julian| 25|      1|2024-02-15 08:04:...|
|    25|     Ben| 21|    445|2024-02-15 08:04:...|
|    26|  Julian| 22|    100|2024-02-15 08:04:...|
|    32|     Nog| 26|    281|2024-02-15 08:04:...|
|    35| Beverly| 27|    305|2024-02-15 08:04:...|
|    46|    Morn| 25|     96|2024-02-15 08:04:...|
|    47|   Brunt| 24|     49|2024-02-15 08:04:...|
|    48|     Nog| 20|      1|2024-02-15 08:04:...|
|    52| Beverly| 19|    269|2024-02-15 08:04:...|
|    54|   Brunt| 19|      5|2024-02-15 08:04:...|
|    60|  Geordi| 20|    100|2024-02-15 08:04:...|
|    66|  Geordi| 21|    477|2024-02-15 08:04:...|
|    72|  Kasidy| 22|    179|20

In [91]:
# Register the output DataFrame as a temp view named "peoples" so it can be queried with Spark SQL
output.createOrReplaceTempView("peoples")

In [92]:
# Run a simple Spark SQL query against the "peoples" temp view.
# The query goes through `spark_fakefriends`, the session this notebook
# creates; no cell defines a bare `spark`, so `spark.sql(...)` raises
# NameError on a fresh kernel.
spark_fakefriends.sql("select userId,name,age,friends,insert_timestamp from peoples").show()

+------+--------+---+-------+--------------------+
|userId|    name|age|friends|    insert_timestamp|
+------+--------+---+-------+--------------------+
|     1|Jean-Luc| 26|      2|2024-02-15 08:04:...|
|     9|    Hugh| 27|    181|2024-02-15 08:04:...|
|    16|  Weyoun| 22|    323|2024-02-15 08:04:...|
|    21|   Miles| 19|    268|2024-02-15 08:04:...|
|    24|  Julian| 25|      1|2024-02-15 08:04:...|
|    25|     Ben| 21|    445|2024-02-15 08:04:...|
|    26|  Julian| 22|    100|2024-02-15 08:04:...|
|    32|     Nog| 26|    281|2024-02-15 08:04:...|
|    35| Beverly| 27|    305|2024-02-15 08:04:...|
|    46|    Morn| 25|     96|2024-02-15 08:04:...|
|    47|   Brunt| 24|     49|2024-02-15 08:04:...|
|    48|     Nog| 20|      1|2024-02-15 08:04:...|
|    52| Beverly| 19|    269|2024-02-15 08:04:...|
|    54|   Brunt| 19|      5|2024-02-15 08:04:...|
|    60|  Geordi| 20|    100|2024-02-15 08:04:...|
|    66|  Geordi| 21|    477|2024-02-15 08:04:...|
|    72|  Kasidy| 22|    179|20

# To load and analyse the operations_management.csv data.

In [43]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, desc

In [23]:
# Session handle for the operations_management analysis; getOrCreate() hands
# back an already-active session if there is one.
spark_operations = (
    SparkSession.builder
    .appName('operations_management data analisation')
    .getOrCreate()
)

In [None]:
# Print the Spark version of this section's session. `spark_operations` is the
# session defined in this notebook; a bare `spark` is never defined and would
# raise NameError on a fresh kernel.
print(spark_operations.version)

In [61]:
# Read operations_management.csv with the header and inferSchema options
# enabled, so column names and types come from the file itself.
data_frame = (
    spark_operations.read.format('CSV')
    .option('inferSchema', 'true')
    .option('header', 'true')
    .option('path', '/content/operations_management.csv')
    .load()
)

In [None]:
# Show the column names and the types Spark inferred for the loaded CSV
data_frame.printSchema()

In [67]:
# Keep only industry/value for rows whose value exceeds 10000, largest first.
data_frame_2 = (
    data_frame.select('industry', 'value')
    .where(data_frame.value > 10000)
    .orderBy(desc('value'))
)

In [None]:
# Inspect the schema of the filtered selection (industry, value)
data_frame_2.printSchema()

In [None]:
# Preview the first 5 rows; data_frame_2 is ordered by value, descending
data_frame_2.show(5)

In [65]:
# Alternative to where(): filter() with col() expressions.
# Keeps rows with value above 200 whose industry is not 'total', largest first.
data_frame_3 = (
    data_frame.select('industry', 'value')
    .filter((col('value') > 200) & (col('industry') != 'total'))
    .orderBy(desc('value'))
)

In [None]:
# Inspect the schema of the filter()-based selection (industry, value)
data_frame_3.printSchema()

In [None]:
# Preview the first 5 rows; data_frame_3 is ordered by value, descending
data_frame_3.show(5)

In [73]:
# Register data_frame_3 as a temp view so it can be queried with Spark SQL
data_frame_3.createOrReplaceTempView('data') # 'data' is an arbitrary name chosen for the view

In [None]:
# Query the 'data' temp view with Spark SQL and show the first 5 rows by descending value.
# NOTE(review): the view is built from data_frame_3, which already applies the
# same value > 200 / industry != 'total' filter, so this WHERE clause repeats it.
spark_operations.sql('''select industry, value
from data
where value >200 and
industry !="total" order by value desc
''').show(5)