<a href="https://colab.research.google.com/github/Sam-Ny/PySpark/blob/main/Pyspark_basics_csv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark py4j

# To load and analyse the fakefriends.csv data | Data Frame reader and writer.

In [19]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
import pyspark.sql.functions as func

In [20]:
#Creating the SparkSession
spark_fakefriends = SparkSession.builder.appName("FirstApp").getOrCreate()

In [21]:
#Defining schema for your Dataframe
myschema = StructType([\
                       StructField("userID", IntegerType(),True), #True means is nullable
                       StructField("name", StringType(),True),
                       StructField("age", IntegerType(),True),
                       StructField("friends", IntegerType(),True)
                       ])

In [23]:
#Creating Dataframe on a CSV file
people = spark_fakefriends.read.format("CSV")\
      .schema(myschema)\
      .option("path","/content/fakefriends.csv")\
      .load()

people.printSchema()

root
 |-- userID: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- friends: integer (nullable = true)



In [24]:
#Performing all the transformations
output=people.select(people.userID,people.name,people.age,people.friends).where(people.age<30).withColumn('insert_timestamp',func.current_timestamp()).orderBy(people.userID).cache()

In [None]:
#taking the count of o/p dataframe
output.show()

In [None]:
#Creating a Temp View
output.createOrReplaceTempView("peoples")

In [None]:
#Running a simple Spark SQL query
spark_fakefriends.sql("select userId,name,age,friends,insert_timestamp from peoples").show()

In [34]:
output.write\
.format("CSV")\
.mode("overwrite")\
.option('path','/content/spark-warehouse/')\
.bucketBy(4,'age')\
.saveAsTable('bucketed_fakefreinds')

In [35]:
bucketed_fakefreinds_df = spark_fakefriends.sql('select * from bucketed_fakefreinds')
bucketed_fakefreinds_df.show(1000)

+------+--------+---+-------+--------------------+
|userID|    name|age|friends|    insert_timestamp|
+------+--------+---+-------+--------------------+
|     1|Jean-Luc| 26|      2|2024-02-17 08:26:...|
|     9|    Hugh| 27|    181|2024-02-17 08:26:...|
|    25|     Ben| 21|    445|2024-02-17 08:26:...|
|    32|     Nog| 26|    281|2024-02-17 08:26:...|
|    35| Beverly| 27|    305|2024-02-17 08:26:...|
|    47|   Brunt| 24|     49|2024-02-17 08:26:...|
|    66|  Geordi| 21|    477|2024-02-17 08:26:...|
|    89|    Worf| 24|    492|2024-02-17 08:26:...|
|   126|   Brunt| 26|     84|2024-02-17 08:26:...|
|   165|   Leeta| 26|    282|2024-02-17 08:26:...|
|   178|  Kasidy| 26|    381|2024-02-17 08:26:...|
|   182|  Weyoun| 26|    145|2024-02-17 08:26:...|
|   200|  Kasidy| 21|    472|2024-02-17 08:26:...|
|   206|    Will| 21|    491|2024-02-17 08:26:...|
|   209|   Brunt| 27|    174|2024-02-17 08:26:...|
|   219| Lwaxana| 26|    345|2024-02-17 08:26:...|
|   221|   Dukat| 27|    150|20

# To load and analyse the operations_management.csv data.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, desc

In [None]:
spark_operations = SparkSession.builder.appName('operations_management data analisation').getOrCreate()

In [None]:
print(spark.version)

3.5.0


In [None]:
data_frame = spark_operations.read.format('CSV').\
option('inferSchema','true').\
option('header','true').\
option('path','/content/operations_management.csv').\
load()

In [None]:
data_frame.printSchema()

root
 |-- description: string (nullable = true)
 |-- industry: string (nullable = true)
 |-- level: integer (nullable = true)
 |-- size: string (nullable = true)
 |-- line_code: string (nullable = true)
 |-- value: integer (nullable = true)



In [None]:
data_frame_2 = data_frame.select('industry','value').\
where(data_frame.value > 10000).\
orderBy(desc('value'))

In [None]:
data_frame_2.printSchema()

root
 |-- industry: string (nullable = true)
 |-- value: integer (nullable = true)



In [None]:
data_frame_2.show(5)

+--------+-----+
|industry|value|
+--------+-----+
|   total|41091|
|   total|40431|
|   total|33984|
|   total|33750|
|   total|32652|
+--------+-----+
only showing top 5 rows



In [None]:
# Or we can use filter instead of where clause to filter using value columns
data_frame_3 = data_frame.select('industry','value').\
filter((col('value') > 200) & (col('industry') != 'total')).\
orderBy(desc('value'))

In [None]:
data_frame_3.printSchema()

root
 |-- industry: string (nullable = true)
 |-- value: integer (nullable = true)



In [None]:
data_frame_3.show(5)

+--------------------+-----+
|            industry|value|
+--------------------+-----+
|        Construction| 6030|
|        Construction| 5904|
|        Construction| 5229|
|Accommodation & f...| 5058|
|        Construction| 4965|
+--------------------+-----+
only showing top 5 rows



In [None]:
# Creating a Temp View
data_frame_3.createOrReplaceTempView('data') #Here data is any name given to view

In [None]:
# To get the temp view data
spark_operations.sql('''select industry, value
from data
where value >200 and
industry !="total" order by value desc
''').show(5)

+--------------------+-----+
|            industry|value|
+--------------------+-----+
|        Construction| 6030|
|        Construction| 5904|
|        Construction| 5229|
|Accommodation & f...| 5058|
|        Construction| 4965|
+--------------------+-----+
only showing top 5 rows



# To Create Global and Section Scope view using operations_management.csv data

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, desc

In [3]:
spark_operations = SparkSession.builder.appName('operations_management data analisation').getOrCreate()

In [4]:
print(spark_operations.version)

3.5.0


In [5]:
data_frame = spark_operations.read.format('CSV').\
option('inferSchema','true').\
option('header','true').\
option('path','/content/operations_management.csv').\
load()

In [6]:
data_frame.printSchema()

root
 |-- description: string (nullable = true)
 |-- industry: string (nullable = true)
 |-- level: integer (nullable = true)
 |-- size: string (nullable = true)
 |-- line_code: string (nullable = true)
 |-- value: integer (nullable = true)



In [15]:
data_frame.createOrReplaceGlobalTempView("test")

In [16]:
data_frame_4 = spark_operations.sql('select * from test')
data_frame_4.show()

AnalysisException: [TABLE_OR_VIEW_NOT_FOUND] The table or view `test` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS.; line 1 pos 14;
'Project [*]
+- 'UnresolvedRelation [test], [], false


In [17]:
# spark_operations.catalog.dropGlobalTempView('test')
spark_operations.catalog.listDatabases()

[Database(name='default', catalog='spark_catalog', description='default database', locationUri='file:/content/spark-warehouse')]

In [18]:
# Assuming spark is your SparkSession object
global_temp_views = spark_operations.catalog.listTables("global_temp")

# Print the list of global temporary views
for view in global_temp_views:
    print(view)


Table(name='test', catalog=None, namespace=['global_temp'], description=None, tableType='TEMPORARY', isTemporary=True)
