# Alias

In [None]:
# What is alias in PySpark?

# alias is used to give a temporary name to a column or expression in a DataFrame.
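# Often used with select, groupBy, or agg. (With withColumn, the new column takes the name passed as the first argument, so an alias there has no effect.)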
# It does not rename the column in the original DataFrame; it’s just for the result of that operation.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col, upper, lit

# Start (or reuse) a Spark session for the examples below.
spark = (
    SparkSession.builder
    .appName("select")
    .getOrCreate()
)

# Small in-memory sample: one (name, age, country) tuple per row.
data = [
    ("Alice", 25, "USA"),
    ("Bob", 30, "UK"),
]
df = spark.createDataFrame(data, schema=["name", "age", "country"])
df.show()

3️⃣ Using alias in select

In [None]:
# Each projected expression gets a result-only name via alias();
# the columns of `df` itself are left unchanged.
df.select(
    [
        col("name").alias("employee_name"),            # plain column, renamed
        (col("age") + 5).alias("age_plus_5"),          # arithmetic expression
        upper(col("country")).alias("country_upper"),  # function result
    ]
).show()

4️⃣ Using alias in withColumn (the alias is ignored — the column keeps the name passed to withColumn)

In [None]:
# withColumn() names the result after its first argument, so the new column is
# "age_new". An .alias() on the expression passed to withColumn() would be
# overridden and never appear in the output, so none is used here.
df_new = df.withColumn("age_new", col("age") + 10)
df_new.show()

5️⃣ Using alias in groupBy / agg

In [None]:
from pyspark.sql.functions import avg

# Average age per country; alias() gives the aggregate column a readable name
# in place of the auto-generated one.
(
    df.groupBy("country")
    .agg(avg("age").alias("average_age"))
    .show()
)

In [27]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import first

# Obtain a Spark session for the Excel example via getOrCreate().
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("ExcelRead")
    .getOrCreate()
)

# Load a single worksheet into pandas first; Spark then converts the result.
pdf = pd.read_excel(
    r"C:\Git files\My git files\PySpark\files\7_Day_Data_Engineer_Interview_Plan.xlsx",
    sheet_name="7-Day Timed Plan",
)

df = spark.createDataFrame(pdf)

# Disabled pivot example, kept for reference:
# pivot_df = (
#     df.groupBy("Time")          # string OR col() both OK
#       .pivot("Focus Area")      # ❗ MUST be string
#       .agg(first("Topics"))
# )
# pivot_df.show(truncate=False)

# Show every row without truncating long cell values.
df.show(truncate=False)


+------+-----------+----------+---------------------------------+---------------------+
|Date  |Time       |Focus Area|Topics                           |Interview Angle      |
+------+-----------+----------+---------------------------------+---------------------+
|26 Dec|10:30-11:00|Warm-up   |Revision + mindset               |Speak answers aloud  |
|26 Dec|11:00-13:00|Python    |Lists, Dicts, Comprehensions     |Core Python questions|
|26 Dec|13:00-14:00|Break     |Lunch                            |NULL                 |
|26 Dec|14:00-15:30|SQL       |Joins, WHERE vs HAVING           |Production SQL       |
|26 Dec|15:30-15:45|Break     |Tea                              |NULL                 |
|26 Dec|15:45-17:30|PySpark   |RDD vs DataFrame, Lazy evaluation|Internals            |
|26 Dec|17:30-18:00|Break     |Rest                             |NULL                 |
|26 Dec|18:00-19:30|GCP       |BigQuery basics                  |Analytics design     |
|26 Dec|19:30-20:00|Break     |D

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 59351)
Traceback (most recent call last):
  File "c:\Users\sachi\AppData\Local\Programs\Python\Python310\lib\socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "c:\Users\sachi\AppData\Local\Programs\Python\Python310\lib\socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "c:\Users\sachi\AppData\Local\Programs\Python\Python310\lib\socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "c:\Users\sachi\AppData\Local\Programs\Python\Python310\lib\socketserver.py", line 747, in __init__
    self.handle()
  File "c:\Users\sachi\AppData\Local\Programs\Python\Python310\lib\site-packages\pyspark\accumulators.py", line 295, in handle
    poll(accum_updates)
  File "c:\Users\sachi\AppData\Local\Programs\Python\Python310\lib