In [None]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 317.3/317.3 MB 4.1 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=fa8c40c59e80e7785cb75de57f3de6530ace8e4e3a4ed5a904b14610c9d8c49e
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [None]:
from pyspark.sql import SparkSession

# Build the Spark session (or reuse one that is already running) that the
# following cells read data through.
spark = (
    SparkSession.builder
    .appName('Data Ingestion')
    .getOrCreate()
)

In [None]:
# Location of the sample CSV file to ingest.
csv_file_path = "/content/sample_data/people.csv"

# Read the CSV, using its first line as column names instead of as a data row.
df_csv = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load(csv_file_path)
)
df_csv.show()

+----+---+------+
|Name|Age|Gender|
+----+---+------+
|John| 28|  Male|
|Jane| 32|Female|
+----+---+------+



In [None]:
from pyspark.sql.types import StructType,StructField,StringType,IntegerType

# Schema for the JSON file: three scalar fields plus a nested "address" struct
# holding the street and city.
schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
    StructField("gender", StringType(), True),
    StructField("address", StructType([
        StructField("street", StringType(), True),
        StructField("city", StringType(), True)
    ]), True)
])

json_file_path = "/content/sample_data/sample.json"

# The file is a single pretty-printed JSON array spread over many lines.
# By default Spark's JSON reader expects one complete JSON record per line, so
# without multiLine each physical line fails to parse and the rows come back
# as nulls. multiLine makes Spark parse the file as a whole, giving one row per
# array element.
df_json_complex = (
    spark.read
    .schema(schema)
    .option("multiLine", "true")
    .json(json_file_path)
)

# Echo the raw file contents so the structure being parsed is visible.
with open(json_file_path, 'r') as file:
    json_data = file.read()

print(json_data)




[
  {
    "name": "John",
    "age": 28,
    "gender": "Male",
    "address": {
      "street": "123 Main St",
      "city": "New York"
    }
  },
  {
    "name": "Jane",
    "age": 32,
    "gender": "Female",
    "address": {
      "street": "456 Elm St",
      "city": "San Francisco"
    }
  }
]


In [None]:
import pandas as pd
from pyspark.sql import SparkSession

# Create a small sample dataset with pandas.
data = {
    "name": ["John", "Jane", "Mike", "Emily"],
    "age": [28, 32, 45, 23],
    "gender": ["Male", "Female", "Male", "Female"],
    "City": ["New York", "San Francisco", "Los Angeles", "Chicago"]
}

df = pd.DataFrame(data)

# Save the dataframe to a CSV file (without the pandas index column).
csv_file_path = "/content/sample_people.csv"
df.to_csv(csv_file_path, index=False)

# Confirm the file has been created.
print(f"CSV file created at {csv_file_path}")

# Obtain the SparkSession. getOrCreate() reuses an already-running session
# when there is one instead of starting a second one.
spark = SparkSession.builder.appName("Create view example").getOrCreate()

# Load the CSV file into a PySpark dataframe; inferSchema lets Spark derive the
# column types from the data, which the age comparisons below rely on.
df_people = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(csv_file_path)
)

df_people.show()

# Create a temporary view and query it.
# The CSV header spells the column "City"; the lowercase `city` in the queries
# resolves because Spark SQL matches column names case-insensitively by default.
df_people.createOrReplaceTempView("people_temp_view")

result_temp_view = spark.sql("SELECT name, age, gender, city FROM people_temp_view WHERE age>30 ")
result_temp_view.show()

# Create a global temporary view; it is addressed through the `global_temp`
# database qualifier.
df_people.createOrReplaceGlobalTempView("people_global_view")

result_global_view = spark.sql("SELECT name, age, gender, city FROM global_temp.people_global_view WHERE age<30 ")
result_global_view.show()

# Print the catalog listing explicitly: a bare expression in the middle of a
# cell is evaluated but never displayed, so the listing was previously lost.
print(spark.catalog.listTables())

# Clean up both views. The final call stays the last expression of the cell,
# so its boolean result is what the cell displays.
spark.catalog.dropTempView("people_temp_view")

spark.catalog.dropGlobalTempView("people_global_view")


CSV file created at /content/sample_people.csv
+-----+---+------+-------------+
| name|age|gender|         City|
+-----+---+------+-------------+
| John| 28|  Male|     New York|
| Jane| 32|Female|San Francisco|
| Mike| 45|  Male|  Los Angeles|
|Emily| 23|Female|      Chicago|
+-----+---+------+-------------+

+----+---+------+-------------+
|name|age|gender|         city|
+----+---+------+-------------+
|Jane| 32|Female|San Francisco|
|Mike| 45|  Male|  Los Angeles|
+----+---+------+-------------+

+-----+---+------+--------+
| name|age|gender|    city|
+-----+---+------+--------+
| John| 28|  Male|New York|
|Emily| 23|Female| Chicago|
+-----+---+------+--------+



True

In [None]:
# Create the database if it is missing and make it the current one.
for setup_statement in (
    "CREATE DATABASE IF NOT EXISTS my_database",
    "USE my_database",
):
    spark.sql(setup_statement)

# List the databases known to the session.
spark.sql("SHOW DATABASES").show()

+-----------+
|  namespace|
+-----------+
|    default|
|my_database|
+-----------+

