<a href="https://colab.research.google.com/github/TanishqLambhate/Data-Science-Training/blob/pyspark/Pyspark_day_4_View.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=a772049ac7204c83a3623e5ecef94ef2d4bd5c6a1101078af7dc08e8b8e3fbdf
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [2]:
from pyspark.sql import SparkSession

spark=SparkSession.builder.appName("Example").getOrCreate()

In [3]:
csv_file_path="/content/sample_data/people.csv"

#Now you can read it with pyspark
df_csv=spark.read.format("csv").option("header","true").load(csv_file_path)
df_csv.show()

+----+----+-------+
|Name| Age| Gender|
+----+----+-------+
|John|  28|   Male|
|Jane|  32| Female|
+----+----+-------+



In [4]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Define the schema for the JSON file
schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
    StructField("gender", StringType(), True),
    StructField("address", StructType([
        StructField("street", StringType(), True),
        StructField("city", StringType(), True)
    ]), True)
])

#load the complex JSON file with the correct path

json_file_path="/content/sample_data/sample.json"
df_json_comlplex=spark.read.schema(schema).json(json_file_path)

# Read the file as text to inspect its contents
with open(json_file_path,'r') as f:
  data=f.read()
  print(data)

[
  {
    "name": "John",
    "age": 28,
    "gender": "Male",
    "address": {
      "street": "123 Main St",
      "city": "New York"
    }
  },
  {
    "name": "Jane",
    "age": 32,
    "gender": "Female",
    "address": {
      "street": "456 Elm St",
      "city": "San Francisco"
    }
  }
]


In [6]:
import pandas as pd

data={
    "name":["John","Jane","Mike","Emily"],
    "age":[25,30,22,28],
    "gender":["Male","Female","Male","Female"],
    "city":["New York","London","Paris","Tokyo"]
}

df=pd.DataFrame(data)

#Save the Dataframe
csv_file_path="/content/sample_data/sample_people.csv"
df.to_csv(csv_file_path,index=False)

#confirm the file has been created
print(f"csv file created at:{csv_file_path}")

from pyspark.sql import SparkSession

spark=SparkSession.builder.appName("Create View").getOrCreate()

df_people=spark.read.format("csv").option("header","true").load(csv_file_path)
df_people.show()

#create a temporary view
df_people.createOrReplaceTempView("people_temp_view")

#Run an sql query on the view
result_temp_view=spark.sql("Select name,age,city from people_temp_view where age<30")

result_temp_view.show()

#create a global view
df_people.createOrReplaceGlobalTempView("people_global_view")

#run an sql query on the global view
result_global_view=spark.sql("Select name,age,city from global_temp.people_global_view where age<30")

result_global_view.show()

#List all temprorary views and tables
spark.catalog.listTables()

#Drop a temporary view
spark.catalog.dropTempView("people_temp_view")

#List all temporary views and tables again
spark.catalog.listTables

#Drop the global temprorary view
spark.catalog.dropGlobalTempView("people_global_view")

#List all temporary views and tables again
spark.catalog.listTables

csv file created at:/content/sample_data/sample_people.csv
+-----+---+------+--------+
| name|age|gender|    city|
+-----+---+------+--------+
| John| 25|  Male|New York|
| Jane| 30|Female|  London|
| Mike| 22|  Male|   Paris|
|Emily| 28|Female|   Tokyo|
+-----+---+------+--------+

+-----+---+--------+
| name|age|    city|
+-----+---+--------+
| John| 25|New York|
| Mike| 22|   Paris|
|Emily| 28|   Tokyo|
+-----+---+--------+

+-----+---+--------+
| name|age|    city|
+-----+---+--------+
| John| 25|New York|
| Mike| 22|   Paris|
|Emily| 28|   Tokyo|
+-----+---+--------+



In [None]:
# # Create a new database in Spark SQL
# spark.sql("CREATE DATABASE IF NOT EXISTS my_database")

# # Use the created database
# spark.sql("USE my_database")

# # Verify that the database is being used
# spark.sql("SHOW DATABASES").show()

In [None]:
# Create a new database in Spark SQL

spark.sql("CREATE DATABASE IF NOT EXISTS my_database")

# Use the created database

spark.sql("USE my_database")

# Verify that the database is being used

spark.sql("SHOW DATABASES").show()


import pandas as pd

# Create a sample CSV data

data = {

    "name": ["John", "Jane", "Mike", "Emily", "Alex"],

    "age": [28, 32, 45, 23, 36],

    "gender": ["Male", "Female", "Male", "Female", "Male"],

    "salary": [60000, 72000, 84000, 52000, 67000]

}

df = pd.DataFrame(data)

# Save the DataFrame as a CSV file

csv_file_path = "/content/sample_people.csv"

df.to_csv(csv_file_path, index=False)

# Confirm the CSV file is created

print(f"CSV file created at: {csv_file_path}")

