# BESICS

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("ExampleApp").master("local[*]").getOrCreate()
print("=" * 50)
print("1. INITIAL DATAFRAME")
print("=" * 50)
data = [("Alice", 34), ("Bob", 45), ("Cathy", 29)]
df = spark.createDataFrame(data, ["Name", "Age"])
df.show()
spark.stop()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
26/01/22 07:24:54 WARN Utils: Your hostname, Somnath, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
26/01/22 07:24:54 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/22 07:24:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


1. INITIAL DATAFRAME


                                                                                

+-----+---+
| Name|Age|
+-----+---+
|Alice| 34|
|  Bob| 45|
|Cathy| 29|
+-----+---+



In [None]:
spark._jvm.org.apache.hadoop.util.VersionInfo.getVersion()


'3.4.1'

In [None]:
from pyspark.sql.functions import col
df_filtered = df.filter(col("Age") > 30)
df_filtered.show()
spark.stop()

+-----+---+
| Name|Age|
+-----+---+
|Alice| 34|
|  Bob| 45|
+-----+---+



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when, avg, sum, count, upper, lower, concat
print("=" * 50)
print("2. ADDING MORE ROWS")
print("=" * 50)
new_data = [("David", 38), ("Emma", 27), ("Frank", 52)]
new_df = spark.createDataFrame(new_data, ["Name", "Age"])
new_df.show()


2. ADDING MORE ROWS
+-----+---+
| Name|Age|
+-----+---+
|David| 38|
| Emma| 27|
|Frank| 52|
+-----+---+



In [None]:
df_combined = df.union(new_df)
print("After adding more rows:")
df_combined.show()


After adding more rows:
+-----+---+
| Name|Age|
+-----+---+
|Alice| 34|
|  Bob| 45|
|Cathy| 29|
|David| 38|
| Emma| 27|
|Frank| 52|
+-----+---+



In [None]:
# 1. Re-create the full combined DataFrame (if you overwrote it with the filter)
df_full = df.union(new_df)

# 2. Sort the DataFrame by 'Age' in descending order and show the top row
print("=" * 50)
print("OLDEST PERSON IN THE COMBINED DATASET")
print("=" * 50)

df_full.orderBy(col("Age").desc()).show(4, truncate=False)

OLDEST PERSON IN THE COMBINED DATASET
+-----+---+
|Name |Age|
+-----+---+
|Frank|52 |
|Bob  |45 |
|David|38 |
|Alice|34 |
+-----+---+
only showing top 4 rows


In [None]:
more_data = [
    ("Grace", 31),
    ("Henry", 41),
    ("Iris", 25)
]
more_df = spark.createDataFrame(more_data, ["Name", "Age"])
df_all = df_combined.union(more_df)
print("After adding even more rows:")
df_all.show()

After adding even more rows:
+-----+---+
| Name|Age|
+-----+---+
|Alice| 34|
|  Bob| 45|
|Cathy| 29|
|David| 38|
| Emma| 27|
|Frank| 52|
|Grace| 31|
|Henry| 41|
| Iris| 25|
+-----+---+



In [None]:
data = [("A", 34), ("B", 45), ("C", 29)]
df1 = spark.createDataFrame(data, ["Name", "Age"])
df1.show()

+----+---+
|Name|Age|
+----+---+
|   A| 34|
|   B| 45|
|   C| 29|
+----+---+



In [None]:
df_allin = df_combined.union(df1)
print("After adding even more rows:")
df_allin.show()

After adding even more rows:
+-----+---+
| Name|Age|
+-----+---+
|Alice| 34|
|  Bob| 45|
|Cathy| 29|
|David| 38|
| Emma| 27|
|Frank| 52|
|    A| 34|
|    B| 45|
|    C| 29|
+-----+---+



In [None]:
print("\n" + "=" * 50)
print("3. ADDING NEW COLUMNS")
print("=" * 50)



3. ADDING NEW COLUMNS


In [None]:
df_with_country = df_allin.withColumn("Country", lit("USA"))
print("Added Country column:")
df_with_country.show()

Added Country column:
+-----+---+-------+
| Name|Age|Country|
+-----+---+-------+
|Alice| 34|    USA|
|  Bob| 45|    USA|
|Cathy| 29|    USA|
|David| 38|    USA|
| Emma| 27|    USA|
|Frank| 52|    USA|
|    A| 34|    USA|
|    B| 45|    USA|
|    C| 29|    USA|
+-----+---+-------+



In [None]:
df_with_age_calc = df_with_country.withColumn("Age_in_5_years", col("Age") + 5)
print("Added calculated column:")
df_with_age_calc.show()

Added calculated column:
+-----+---+-------+--------------+
| Name|Age|Country|Age_in_5_years|
+-----+---+-------+--------------+
|Alice| 34|    USA|            39|
|  Bob| 45|    USA|            50|
|Cathy| 29|    USA|            34|
|David| 38|    USA|            43|
| Emma| 27|    USA|            32|
|Frank| 52|    USA|            57|
|    A| 34|    USA|            39|
|    B| 45|    USA|            50|
|    C| 29|    USA|            34|
+-----+---+-------+--------------+



In [None]:
df_multi_cols = df_with_age_calc \
    .withColumn("Salary", lit(50000)) \
    .withColumn("Department", lit("IT"))
print("Added multiple columns:")
df_multi_cols.show()

Added multiple columns:
+-----+---+-------+--------------+------+----------+
| Name|Age|Country|Age_in_5_years|Salary|Department|
+-----+---+-------+--------------+------+----------+
|Alice| 34|    USA|            39| 50000|        IT|
|  Bob| 45|    USA|            50| 50000|        IT|
|Cathy| 29|    USA|            34| 50000|        IT|
|David| 38|    USA|            43| 50000|        IT|
| Emma| 27|    USA|            32| 50000|        IT|
|Frank| 52|    USA|            57| 50000|        IT|
|    A| 34|    USA|            39| 50000|        IT|
|    B| 45|    USA|            50| 50000|        IT|
|    C| 29|    USA|            34| 50000|        IT|
+-----+---+-------+--------------+------+----------+



# Add column with conditional logic 

In [None]:
df_with_category = df_multi_cols.withColumn(
    "Age_Category",
    when(col("Age") < 30, "Young")
    .when((col("Age") >= 30) & (col("Age") < 40), "Middle")
    .otherwise("Senior")
)
print("Added conditional column:")
df_with_category.show()

Added conditional column:


                                                                                

+-----+---+-------+--------------+------+----------+------------+
| Name|Age|Country|Age_in_5_years|Salary|Department|Age_Category|
+-----+---+-------+--------------+------+----------+------------+
|Alice| 34|    USA|            39| 50000|        IT|      Middle|
|  Bob| 45|    USA|            50| 50000|        IT|      Senior|
|Cathy| 29|    USA|            34| 50000|        IT|       Young|
|David| 38|    USA|            43| 50000|        IT|      Middle|
| Emma| 27|    USA|            32| 50000|        IT|       Young|
|Frank| 52|    USA|            57| 50000|        IT|      Senior|
|    A| 34|    USA|            39| 50000|        IT|      Middle|
|    B| 45|    USA|            50| 50000|        IT|      Senior|
|    C| 29|    USA|            34| 50000|        IT|       Young|
+-----+---+-------+--------------+------+----------+------------+



# csv file generation

# excel file generation

# Filter by age

In [None]:
print("People older than 35:")
df_with_category.filter(col("Age") > 35).show()

People older than 35:
+-----+---+-------+--------------+------+----------+------------+
| Name|Age|Country|Age_in_5_years|Salary|Department|Age_Category|
+-----+---+-------+--------------+------+----------+------------+
|  Bob| 45|    USA|            50| 50000|        IT|      Senior|
|David| 38|    USA|            43| 50000|        IT|      Middle|
|Frank| 52|    USA|            57| 50000|        IT|      Senior|
|    B| 45|    USA|            50| 50000|        IT|      Senior|
+-----+---+-------+--------------+------+----------+------------+



# Filter with multiple conditions (AND)

In [None]:
print("\n Middle-aged people from USA:")
df_with_category.filter(
    (col("Age_Category") == "Middle") & (col("Country") == "USA")
).show()


 Middle-aged people from USA:


                                                                                

+-----+---+-------+--------------+------+----------+------------+
| Name|Age|Country|Age_in_5_years|Salary|Department|Age_Category|
+-----+---+-------+--------------+------+----------+------------+
|Alice| 34|    USA|            39| 50000|        IT|      Middle|
|David| 38|    USA|            43| 50000|        IT|      Middle|
|    A| 34|    USA|            39| 50000|        IT|      Middle|
+-----+---+-------+--------------+------+----------+------------+



In [None]:
print("\nYoung OR Senior people:")
df_with_category.filter(
    (col("Age_Category") == "Young") | (col("Age_Category") == "Senior")
).show()


Young OR Senior people:
+-----+---+-------+--------------+------+----------+------------+
| Name|Age|Country|Age_in_5_years|Salary|Department|Age_Category|
+-----+---+-------+--------------+------+----------+------------+
|  Bob| 45|    USA|            50| 50000|        IT|      Senior|
|Cathy| 29|    USA|            34| 50000|        IT|       Young|
| Emma| 27|    USA|            32| 50000|        IT|       Young|
|Frank| 52|    USA|            57| 50000|        IT|      Senior|
|    B| 45|    USA|            50| 50000|        IT|      Senior|
|    C| 29|    USA|            34| 50000|        IT|       Young|
+-----+---+-------+--------------+------+----------+------------+



In [None]:
df_with_category = df_with_category.withColumnRenamed(" Name", "Name")
df_with_category.select("Name").show()



+-----+
| Name|
+-----+
|Alice|
|  Bob|
|Cathy|
|David|
| Emma|
|Frank|
|    A|
|    B|
|    C|
+-----+



In [None]:
from pyspark.sql.functions import col

print("\nMiddle-aged people from USA:")
df_with_category.filter(
    (col("Age_Category") == "Middle") & (col("Country") == "USA")
).show()



Middle-aged people from USA:
+-----+---+-------+--------------+------+----------+------------+
| Name|Age|Country|Age_in_5_years|Salary|Department|Age_Category|
+-----+---+-------+--------------+------+----------+------------+
|Alice| 34|    USA|            39| 50000|        IT|      Middle|
|David| 38|    USA|            43| 50000|        IT|      Middle|
|    A| 34|    USA|            39| 50000|        IT|      Middle|
+-----+---+-------+--------------+------+----------+------------+



In [None]:
print("\nAge between 30 and 40:")
df_with_category.filter("Age >= 30 AND Age <= 40").show()


Age between 30 and 40:
+-----+---+-------+--------------+------+----------+------------+
| Name|Age|Country|Age_in_5_years|Salary|Department|Age_Category|
+-----+---+-------+--------------+------+----------+------------+
|Alice| 34|    USA|            39| 50000|        IT|      Middle|
|David| 38|    USA|            43| 50000|        IT|      Middle|
|    A| 34|    USA|            39| 50000|        IT|      Middle|
+-----+---+-------+--------------+------+----------+------------+



In [None]:
print("Sorted by Age (ascending):")
df_with_category.orderBy("Age").show()

Sorted by Age (ascending):




+-----+---+-------+--------------+------+----------+------------+
| Name|Age|Country|Age_in_5_years|Salary|Department|Age_Category|
+-----+---+-------+--------------+------+----------+------------+
| Emma| 27|    USA|            32| 50000|        IT|       Young|
|    C| 29|    USA|            34| 50000|        IT|       Young|
|Cathy| 29|    USA|            34| 50000|        IT|       Young|
|    A| 34|    USA|            39| 50000|        IT|      Middle|
|Alice| 34|    USA|            39| 50000|        IT|      Middle|
|David| 38|    USA|            43| 50000|        IT|      Middle|
|    B| 45|    USA|            50| 50000|        IT|      Senior|
|  Bob| 45|    USA|            50| 50000|        IT|      Senior|
|Frank| 52|    USA|            57| 50000|        IT|      Senior|
+-----+---+-------+--------------+------+----------+------------+



                                                                                

In [None]:
print("\nSorted by Age (descending):")
df_with_category.orderBy(col("Age").desc()).show()


Sorted by Age (descending):
+-----+---+-------+--------------+------+----------+------------+
| Name|Age|Country|Age_in_5_years|Salary|Department|Age_Category|
+-----+---+-------+--------------+------+----------+------------+
|Frank| 52|    USA|            57| 50000|        IT|      Senior|
|    B| 45|    USA|            50| 50000|        IT|      Senior|
|  Bob| 45|    USA|            50| 50000|        IT|      Senior|
|David| 38|    USA|            43| 50000|        IT|      Middle|
|    A| 34|    USA|            39| 50000|        IT|      Middle|
|Alice| 34|    USA|            39| 50000|        IT|      Middle|
|    C| 29|    USA|            34| 50000|        IT|       Young|
|Cathy| 29|    USA|            34| 50000|        IT|       Young|
| Emma| 27|    USA|            32| 50000|        IT|       Young|
+-----+---+-------+--------------+------+----------+------------+



                                                                                

In [None]:
print("\nName in uppercase and Age:")
df_with_category.select(
    upper(col("Name")).alias("UPPERCASE_NAME"),
    col("Age")
).show()


Name in uppercase and Age:


                                                                                

+--------------+---+
|UPPERCASE_NAME|Age|
+--------------+---+
|         ALICE| 34|
|           BOB| 45|
|         CATHY| 29|
|         DAVID| 38|
|          EMMA| 27|
|         FRANK| 52|
|             A| 34|
|             B| 45|
|             C| 29|
+--------------+---+



In [None]:
print("\nSorted by Category then Age:")
df_with_category.orderBy("Age_Category", col("Age").desc()).show()



Sorted by Category then Age:




+-----+---+-------+--------------+------+----------+------------+
| Name|Age|Country|Age_in_5_years|Salary|Department|Age_Category|
+-----+---+-------+--------------+------+----------+------------+
|David| 38|    USA|            43| 50000|        IT|      Middle|
|    A| 34|    USA|            39| 50000|        IT|      Middle|
|Alice| 34|    USA|            39| 50000|        IT|      Middle|
|Frank| 52|    USA|            57| 50000|        IT|      Senior|
|    B| 45|    USA|            50| 50000|        IT|      Senior|
|  Bob| 45|    USA|            50| 50000|        IT|      Senior|
|    C| 29|    USA|            34| 50000|        IT|       Young|
|Cathy| 29|    USA|            34| 50000|        IT|       Young|
| Emma| 27|    USA|            32| 50000|        IT|       Young|
+-----+---+-------+--------------+------+----------+------------+



                                                                                

In [None]:
df_renamed = df_with_category.withColumnRenamed("Name", "Full_Name")
print("Renamed 'Name' to 'Full_Name':")

Renamed 'Name' to 'Full_Name':


In [None]:
df_dropped = df_with_category.drop("Department", "Salary")
print("Dropped Department and Salary columns:")
df_dropped.show()

Dropped Department and Salary columns:
+-----+---+-------+--------------+------------+
| Name|Age|Country|Age_in_5_years|Age_Category|
+-----+---+-------+--------------+------------+
|Alice| 34|    USA|            39|      Middle|
|  Bob| 45|    USA|            50|      Senior|
|Cathy| 29|    USA|            34|       Young|
|David| 38|    USA|            43|      Middle|
| Emma| 27|    USA|            32|       Young|
|Frank| 52|    USA|            57|      Senior|
|    A| 34|    USA|            39|      Middle|
|    B| 45|    USA|            50|      Senior|
|    C| 29|    USA|            34|       Young|
+-----+---+-------+--------------+------------+



In [None]:
df_dropped1 = df_dropped.drop("Age_in_5_years")
print("Dropped Age_in_5_years columns:")
df_dropped1.show()

Dropped Age_in_5_years columns:
+-----+---+-------+------------+
| Name|Age|Country|Age_Category|
+-----+---+-------+------------+
|Alice| 34|    USA|      Middle|
|  Bob| 45|    USA|      Senior|
|Cathy| 29|    USA|       Young|
|David| 38|    USA|      Middle|
| Emma| 27|    USA|       Young|
|Frank| 52|    USA|      Senior|
|    A| 34|    USA|      Middle|
|    B| 45|    USA|      Senior|
|    C| 29|    USA|       Young|
+-----+---+-------+------------+



In [None]:
print("\nStatistics by Age Category:")
df_with_category.groupBy("Age_Category").agg(
    count("*").alias("count"),
    avg("Age").alias("avg_age"),
    sum("Salary").alias("total_salary")
).show()


Statistics by Age Category:




+------------+-----+------------------+------------+
|Age_Category|count|           avg_age|total_salary|
+------------+-----+------------------+------------+
|      Middle|    3|35.333333333333336|      150000|
|      Senior|    3|47.333333333333336|      150000|
|       Young|    3|28.333333333333332|      150000|
+------------+-----+------------------+------------+



                                                                                

In [None]:
total_rows = df_with_category.count()
print(f"Total number of rows: {total_rows}")



Total number of rows: 9


                                                                                

In [None]:
spark.stop()
