<a href="https://colab.research.google.com/github/Sam-Ny/PySpark/blob/main/Pyspark_basics_csv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [93]:
!pip install pyspark py4j



# To load and analyse the fakefriends.csv data.

In [94]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
import pyspark.sql.functions as func

In [95]:
#Creating the SparkSession
spark_fakefriends = SparkSession.builder.appName("FirstApp").getOrCreate()

In [96]:
#Defining schema for your Dataframe
myschema = StructType([\
                       StructField("userID", IntegerType(),True), #True means is nullable
                       StructField("name", StringType(),True),
                       StructField("age", IntegerType(),True),
                       StructField("friends", IntegerType(),True)
                       ])

In [97]:
#Creating Dataframe on a CSV file
people = spark.read.format("CSV")\
      .schema(myschema)\
      .option("path","/content/fakefriends.csv")\
      .load()

In [98]:
#Performing all the transformations
output=people.select(people.userID,people.name,people.age,people.friends).where(people.age<30).withColumn('insert_timestamp',func.current_timestamp()).orderBy(people.userID)

In [99]:
#taking the count of o/p dataframe
output.show()

+------+--------+---+-------+--------------------+
|userID|    name|age|friends|    insert_timestamp|
+------+--------+---+-------+--------------------+
|     1|Jean-Luc| 26|      2|2024-02-15 09:12:...|
|     9|    Hugh| 27|    181|2024-02-15 09:12:...|
|    16|  Weyoun| 22|    323|2024-02-15 09:12:...|
|    21|   Miles| 19|    268|2024-02-15 09:12:...|
|    24|  Julian| 25|      1|2024-02-15 09:12:...|
|    25|     Ben| 21|    445|2024-02-15 09:12:...|
|    26|  Julian| 22|    100|2024-02-15 09:12:...|
|    32|     Nog| 26|    281|2024-02-15 09:12:...|
|    35| Beverly| 27|    305|2024-02-15 09:12:...|
|    46|    Morn| 25|     96|2024-02-15 09:12:...|
|    47|   Brunt| 24|     49|2024-02-15 09:12:...|
|    48|     Nog| 20|      1|2024-02-15 09:12:...|
|    52| Beverly| 19|    269|2024-02-15 09:12:...|
|    54|   Brunt| 19|      5|2024-02-15 09:12:...|
|    60|  Geordi| 20|    100|2024-02-15 09:12:...|
|    66|  Geordi| 21|    477|2024-02-15 09:12:...|
|    72|  Kasidy| 22|    179|20

In [100]:
#Creating a Temp View
output.createOrReplaceTempView("peoples")

In [101]:
#Running a simple Spark SQL query
spark.sql("select userId,name,age,friends,insert_timestamp from peoples").show()

+------+--------+---+-------+--------------------+
|userId|    name|age|friends|    insert_timestamp|
+------+--------+---+-------+--------------------+
|     1|Jean-Luc| 26|      2|2024-02-15 09:12:...|
|     9|    Hugh| 27|    181|2024-02-15 09:12:...|
|    16|  Weyoun| 22|    323|2024-02-15 09:12:...|
|    21|   Miles| 19|    268|2024-02-15 09:12:...|
|    24|  Julian| 25|      1|2024-02-15 09:12:...|
|    25|     Ben| 21|    445|2024-02-15 09:12:...|
|    26|  Julian| 22|    100|2024-02-15 09:12:...|
|    32|     Nog| 26|    281|2024-02-15 09:12:...|
|    35| Beverly| 27|    305|2024-02-15 09:12:...|
|    46|    Morn| 25|     96|2024-02-15 09:12:...|
|    47|   Brunt| 24|     49|2024-02-15 09:12:...|
|    48|     Nog| 20|      1|2024-02-15 09:12:...|
|    52| Beverly| 19|    269|2024-02-15 09:12:...|
|    54|   Brunt| 19|      5|2024-02-15 09:12:...|
|    60|  Geordi| 20|    100|2024-02-15 09:12:...|
|    66|  Geordi| 21|    477|2024-02-15 09:12:...|
|    72|  Kasidy| 22|    179|20

# To load and analyse the operations_management.csv data.

In [102]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, desc

In [103]:
spark_operations = SparkSession.builder.appName('operations_management data analisation').getOrCreate()

In [104]:
print(spark.version)

3.5.0


In [105]:
data_frame = spark_operations.read.format('CSV').\
option('inferSchema','true').\
option('header','true').\
option('path','/content/operations_management.csv').\
load()

In [106]:
data_frame.printSchema()

root
 |-- description: string (nullable = true)
 |-- industry: string (nullable = true)
 |-- level: integer (nullable = true)
 |-- size: string (nullable = true)
 |-- line_code: string (nullable = true)
 |-- value: integer (nullable = true)



In [107]:
data_frame_2 = data_frame.select('industry','value').\
where(data_frame.value > 10000).\
orderBy(desc('value'))

In [108]:
data_frame_2.printSchema()

root
 |-- industry: string (nullable = true)
 |-- value: integer (nullable = true)



In [109]:
data_frame_2.show(5)

+--------+-----+
|industry|value|
+--------+-----+
|   total|41091|
|   total|40431|
|   total|33984|
|   total|33750|
|   total|32652|
+--------+-----+
only showing top 5 rows



In [110]:
# Or we can use filter instead of where clause to filter using value columns
data_frame_3 = data_frame.select('industry','value').\
filter((col('value') > 200) & (col('industry') != 'total')).\
orderBy(desc('value'))

In [111]:
data_frame_3.printSchema()

root
 |-- industry: string (nullable = true)
 |-- value: integer (nullable = true)



In [112]:
data_frame_3.show(5)

+--------------------+-----+
|            industry|value|
+--------------------+-----+
|        Construction| 6030|
|        Construction| 5904|
|        Construction| 5229|
|Accommodation & f...| 5058|
|        Construction| 4965|
+--------------------+-----+
only showing top 5 rows



In [113]:
# Creating a Temp View
data_frame_3.createOrReplaceTempView('data') #Here data is any name given to view

In [114]:
# To get the temp view data
spark_operations.sql('''select industry, value
from data
where value >200 and
industry !="total" order by value desc
''').show(5)

+--------------------+-----+
|            industry|value|
+--------------------+-----+
|        Construction| 6030|
|        Construction| 5904|
|        Construction| 5229|
|Accommodation & f...| 5058|
|        Construction| 4965|
+--------------------+-----+
only showing top 5 rows



# To Create Global and Section Scope view using operations_management.csv data

In [226]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, desc

In [227]:
spark_operations = SparkSession.builder.appName('operations_management data analisation').getOrCreate()

In [228]:
print(spark_operations.version)

3.5.0


In [229]:
data_frame = spark_operations.read.format('CSV').\
option('inferSchema','true').\
option('header','true').\
option('path','/content/operations_management.csv').\
load()

In [230]:
data_frame.printSchema()

root
 |-- description: string (nullable = true)
 |-- industry: string (nullable = true)
 |-- level: integer (nullable = true)
 |-- size: string (nullable = true)
 |-- line_code: string (nullable = true)
 |-- value: integer (nullable = true)



In [231]:
data_frame_3.createOrReplaceGlobalTempView("test")

In [232]:
data_frame_4 = spark_operations.sql('select * from test')
data_frame_4.show()

AnalysisException: [TABLE_OR_VIEW_NOT_FOUND] The table or view `test` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS.; line 1 pos 14;
'Project [*]
+- 'UnresolvedRelation [test], [], false


In [None]:
spark_operations.catalog.dropGlobalTempView('test')
spark_operations.catalog.listDatabases()

In [None]:
# Assuming spark is your SparkSession object
global_temp_views = spark_operations.catalog.listTables("global_temp")

# Print the list of global temporary views
for view in global_temp_views:
    print(view)
