<a href="https://colab.research.google.com/github/Vasugi2003/Big-Data-Analytics/blob/main/Transformations_using_PySpark_SQL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285387 sha256=376418883169ab34ab8c569c9e6a1e0f1a0bec60a2c832befcdf6b3bfe53131f
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


In [None]:
# Bring in the entry point for the DataFrame / SQL API.
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one; otherwise start a new one
# under this application name.
spark = (
    SparkSession.builder
    .appName("PySparkTransformationsExample")
    .getOrCreate()
)

# Example records: one tuple per person, fields in the order given by `columns`.
columns = ["name", "job", "age"]
data = [
    ("aravind", "DataAnalyst", 28),
    ("Banu", "Analyst", 22),
    ("Cibi", "Manager", 35),
    ("Devi", "Engineer", 30),
]

# Build the DataFrame. Only column names are supplied, so Spark infers the types.
df = spark.createDataFrame(data, schema=columns)

# Expose the DataFrame to Spark SQL under the view name "people".
df.createOrReplaceTempView("people")

In [None]:

# Projection: keep only the name and age columns of the "people" view.
select_query = "SELECT name, age FROM people"
selected_df = spark.sql(select_query)

# Label the output, then print the resulting rows.
print("Selected Columns:")
selected_df.show()

Selected Columns:
+-------+---+
|   name|age|
+-------+---+
|aravind| 28|
|   Banu| 22|
|   Cibi| 35|
|   Devi| 30|
+-------+---+



In [None]:
# Row filter: keep people whose age is strictly greater than 25.
filter_query = "SELECT * FROM people WHERE age > 25"
filtered_df = spark.sql(filter_query)

print("Filtered Rows:")
filtered_df.show()

Filtered Rows:
+-------+-----------+---+
|   name|        job|age|
+-------+-----------+---+
|aravind|DataAnalyst| 28|
|   Cibi|    Manager| 35|
|   Devi|   Engineer| 30|
+-------+-----------+---+



In [None]:
# Column aliases: expose name as full_name and job as occupation.
rename_query = "SELECT name AS full_name, job AS occupation FROM people"
renamed_df = spark.sql(rename_query)

print("Renamed Columns:")
renamed_df.show()

Renamed Columns:
+---------+-----------+
|full_name| occupation|
+---------+-----------+
|  aravind|DataAnalyst|
|     Banu|    Analyst|
|     Cibi|    Manager|
|     Devi|   Engineer|
+---------+-----------+



In [None]:
# Derived column: keep every existing column and append new_age = age + 5.
new_age_query = "SELECT *, age + 5 AS new_age FROM people"
new_age_df = spark.sql(new_age_query)

print("Added New Column (new_age):")
new_age_df.show()


Added New Column (new_age):
+-------+-----------+---+-------+
|   name|        job|age|new_age|
+-------+-----------+---+-------+
|aravind|DataAnalyst| 28|     33|
|   Banu|    Analyst| 22|     27|
|   Cibi|    Manager| 35|     40|
|   Devi|   Engineer| 30|     35|
+-------+-----------+---+-------+



In [None]:

# Aggregation: average age per job.
# GROUP BY alone does not guarantee any row order, so the result is sorted by
# job to keep the displayed table stable from one run to the next.
avg_age_by_job = spark.sql(
    "SELECT job, AVG(age) AS avg_age "
    "FROM people "
    "GROUP BY job "
    "ORDER BY job"
)
print("Aggregation (Average Age by Job):")
avg_age_by_job.show()


Aggregation (Average Age by Job):
+-----------+-------+
|        job|avg_age|
+-----------+-------+
|    Analyst|   22.0|
|DataAnalyst|   28.0|
|   Engineer|   30.0|
|    Manager|   35.0|
+-----------+-------+



In [None]:
# Ordering: list every person from oldest to youngest.
sort_query = "SELECT * FROM people ORDER BY age DESC"
sorted_df = spark.sql(sort_query)

print("Sorted by Age:")
sorted_df.show()


Sorted by Age:
+-------+-----------+---+
|   name|        job|age|
+-------+-----------+---+
|   Cibi|    Manager| 35|
|   Devi|   Engineer| 30|
|aravind|DataAnalyst| 28|
|   Banu|    Analyst| 22|
+-------+-----------+---+



In [None]:
# Distinct values in the job column.
# DISTINCT does not guarantee any row order, so the result is sorted by job
# to keep the displayed table stable from one run to the next.
distinct_jobs_df = spark.sql("SELECT DISTINCT job FROM people ORDER BY job")
print("Distinct Jobs:")
distinct_jobs_df.show()

Distinct Jobs:
+-----------+
|        job|
+-----------+
|    Analyst|
|DataAnalyst|
|   Engineer|
|    Manager|
+-----------+



In [None]:
# Second dataset for the join on job: one (job, skill) pair per row.
columns2 = ["job", "skill"]
data2 = [
    ("Engineer", "Python"),
    ("Analyst", "SQL"),
]
df2 = spark.createDataFrame(data2, schema=columns2)
df2.show()

+--------+------+
|     job| skill|
+--------+------+
|Engineer|Python|
| Analyst|   SQL|
+--------+------+



In [None]:
# Register the (job, skill) DataFrame as a temporary view so SQL can reference it.
df2.createOrReplaceTempView("people_1")

# Inner join on job. The columns are listed explicitly (instead of SELECT *)
# so the result carries a single `job` column; with SELECT * both sides' `job`
# columns are kept and any later reference to "job" on the result is ambiguous.
joined_df = spark.sql(
    "SELECT p.name, p.job, p.age, s.skill "
    "FROM people AS p "
    "JOIN people_1 AS s ON p.job = s.job"
)

print("Joined DataFrames:")
joined_df.show()

Joined DataFrames:
+----+--------+---+--------+------+
|name|     job|age|     job| skill|
+----+--------+---+--------+------+
|Banu| Analyst| 22| Analyst|   SQL|
|Devi|Engineer| 30|Engineer|Python|
+----+--------+---+--------+------+



In [None]:
# Step 6: Stop the Spark session
# Run this last: every cell above that calls spark.sql(...) or
# spark.createDataFrame(...) uses this same `spark` session.
spark.stop()