# BASICS

In [None]:
from pyspark.sql import SparkSession
# Start (or reuse) a Spark session named "ExampleApp" with a local[*] master.
# The session is deliberately NOT stopped at the end of this cell: the cells
# below keep using `spark` and `df`, and stopping here makes them fail. The
# notebook's last cell is the one that calls spark.stop().
spark = SparkSession.builder.appName("ExampleApp").master("local[*]").getOrCreate()

print("=" * 50)
print("1. INITIAL DATAFRAME")
print("=" * 50)

# Three (Name, Age) rows used as the starting dataset for the whole notebook.
data = [("Alice", 34), ("Bob", 45), ("Cathy", 29)]
df = spark.createDataFrame(data, ["Name", "Age"])
df.show()

In [None]:
# Reach Hadoop's VersionInfo class through the session's JVM handle and ask it
# for its version; as the cell's final expression the result is echoed by the
# notebook.
hadoop_version_info = spark._jvm.org.apache.hadoop.util.VersionInfo
hadoop_version_info.getVersion()


In [None]:
from pyspark.sql.functions import col
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
import pandas as pd
import os
import sys
# Keep only the people older than 30 and display them.
# The session is not stopped here: `spark` and `df` are still needed by the
# cells that follow (the next one calls spark.createDataFrame).
df_filtered = df.filter(col("Age") > 30)
df_filtered.show()

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when, avg, sum, count, upper, lower, concat
# Banner for the second section of the walkthrough.
section_rule = "=" * 50
print(section_rule)
print("2. ADDING MORE ROWS")
print(section_rule)

# Three additional people, using the same (Name, Age) column names as `df`.
new_data = [
    ("David", 38),
    ("Emma", 27),
    ("Frank", 52),
]
new_df = spark.createDataFrame(new_data, schema=["Name", "Age"])
new_df.show()


In [None]:
print("After adding more rows:")
# Append the rows of `new_df` to those of `df`; both frames were created with
# the column names ["Name", "Age"].
df_combined = df.union(new_df)
df_combined.show()


In [None]:
# 1. Rebuild the combined DataFrame from its two sources, so this cell still
#    works if an earlier combined frame was overwritten (e.g. by a filter).
df_full = df.union(new_df)

print("=" * 50)
print("OLDEST PERSON IN THE COMBINED DATASET")
print("=" * 50)

# 2. Sort by Age, highest first, and display only the first row, i.e. the
#    oldest person. show(4, ...) used to print four rows under a banner that
#    announces a single person.
df_full.orderBy(col("Age").desc()).show(1, truncate=False)

In [None]:
# A third batch of (Name, Age) rows, appended to the already-combined frame.
more_data = [("Grace", 31), ("Henry", 41), ("Iris", 25)]
more_df = spark.createDataFrame(more_data, schema=["Name", "Age"])

print("After adding even more rows:")
df_all = df_combined.union(more_df)
df_all.show()

In [None]:
# Another small (Name, Age) frame with single-letter names.
# Note: this rebinds the notebook-level name `data`.
data = [
    ("A", 34),
    ("B", 45),
    ("C", 29),
]
df1 = spark.createDataFrame(data, schema=["Name", "Age"])
df1.show()

In [None]:
print("After adding even more rows:")
# NOTE(review): the base frame is `df_combined`, not `df_all`, so the rows
# that were added through `more_df` are not part of `df_allin` - confirm this
# is intended.
df_allin = df_combined.union(df1)
df_allin.show()

In [None]:
# Banner for the third section, preceded by a blank line.
section_rule = "=" * 50
print(f"\n{section_rule}")
print("3. ADDING NEW COLUMNS")
print(section_rule)


In [None]:
# Give every row the same literal value "USA" in a new Country column.
country_literal = lit("USA")
df_with_country = df_allin.withColumn("Country", country_literal)
print("Added Country column:")
df_with_country.show()

In [None]:
# Derive a new column from an existing one: each person's Age plus five.
age_plus_five = col("Age") + 5
df_with_age_calc = df_with_country.withColumn("Age_in_5_years", age_plus_five)
print("Added calculated column:")
df_with_age_calc.show()

In [None]:
# Add two constant columns in one chained expression: Salary (50000) and
# Department ("IT").
df_multi_cols = (
    df_with_age_calc
    .withColumn("Salary", lit(50000))
    .withColumn("Department", lit("IT"))
)
print("Added multiple columns:")
df_multi_cols.show()

# Add column with conditional logic 

In [None]:
# Bucket each row by Age: below 30 -> "Young", 30 up to (not including) 40 ->
# "Middle", everything else -> "Senior".
age_col = col("Age")
age_category = (
    when(age_col < 30, "Young")
    .when((age_col >= 30) & (age_col < 40), "Middle")
    .otherwise("Senior")
)
df_with_category = df_multi_cols.withColumn("Age_Category", age_category)
print("Added conditional column:")
df_with_category.show()

# CSV file generation

# Excel file generation

# Filter by age

In [None]:
print("People older than 35:")
# where() is an alias of filter(); keep the rows whose Age exceeds 35.
older_than_35 = df_with_category.where(col("Age") > 35)
older_than_35.show()

# Filter with multiple conditions (AND)

In [None]:
# Two independent conditions, combined with AND in the filter below.
is_middle_aged = col("Age_Category") == "Middle"
is_from_usa = col("Country") == "USA"

print("\n Middle-aged people from USA:")
df_with_category.filter(is_middle_aged & is_from_usa).show()

In [None]:
print("\nYoung OR Senior people:")
# Keep rows whose Age_Category is either of the two listed values.
young_or_senior = col("Age_Category").isin("Young", "Senior")
df_with_category.filter(young_or_senior).show()

In [None]:
# Rename a column called " Name" (with a leading space) to "Name", then show
# only that column. Per the PySpark documentation withColumnRenamed does
# nothing when the source column is absent - presumably this is a safeguard
# against a stray leading space in the header; confirm whether it is needed.
df_with_category = df_with_category.withColumnRenamed(existing=" Name", new="Name")
name_only = df_with_category.select("Name")
name_only.show()



In [None]:
from pyspark.sql.functions import col

print("\nMiddle-aged people from USA:")
# Both conditions must hold: category is "Middle" AND country is "USA".
middle_aged_in_usa = (col("Age_Category") == "Middle") & (col("Country") == "USA")
df_with_category.where(middle_aged_in_usa).show()


In [None]:
print("\nAge between 30 and 40:")
# The condition is given as a SQL expression string; both bounds are inclusive.
age_30_to_40 = df_with_category.filter("Age >= 30 AND Age <= 40")
age_30_to_40.show()

In [None]:
print("Sorted by Age (ascending):")
# sort() and orderBy() are aliases; the default direction is ascending.
youngest_first = df_with_category.sort("Age")
youngest_first.show()

In [None]:
print("\nSorted by Age (descending):")
# Build the descending sort key first, then apply it.
age_descending = col("Age").desc()
df_with_category.sort(age_descending).show()

In [None]:
print("\nName in uppercase and Age:")
# Project two columns: the upper-cased Name (exposed as UPPERCASE_NAME) and
# the unchanged Age.
uppercase_name = upper(col("Name")).alias("UPPERCASE_NAME")
df_with_category.select(uppercase_name, col("Age")).show()

In [None]:
print("\nSorted by Category then Age:")
# Primary key: Age_Category (ascending); secondary key: Age, highest first.
sort_keys = ["Age_Category", col("Age").desc()]
df_with_category.orderBy(*sort_keys).show()


In [None]:
# Produce a copy of the frame in which the Name column is called Full_Name.
# `df_with_category` itself is left untouched and keeps its Name column.
df_renamed = df_with_category.withColumnRenamed("Name", "Full_Name")
print("Renamed 'Name' to 'Full_Name':")
# Display the result: the label above used to be printed with nothing after
# it, unlike every other cell in this notebook.
df_renamed.show()

In [None]:
# Remove two columns in a single drop() call.
columns_to_drop = ("Department", "Salary")
df_dropped = df_with_category.drop(*columns_to_drop)
print("Dropped Department and Salary columns:")
df_dropped.show()

In [None]:
print("Dropped Age_in_5_years columns:")
# Continue from the already-reduced frame and remove the derived age column.
df_dropped1 = df_dropped.drop("Age_in_5_years")
df_dropped1.show()

In [None]:
print("\nStatistics by Age Category:")
# One output row per Age_Category with: the row count, the mean Age and the
# summed Salary. `count`, `avg` and `sum` are the pyspark.sql.functions
# versions imported earlier in the notebook (this `sum` is not the builtin).
category_groups = df_with_category.groupBy("Age_Category")
category_stats = category_groups.agg(
    count("*").alias("count"),
    avg("Age").alias("avg_age"),
    sum("Salary").alias("total_salary"),
)
category_stats.show()

In [None]:
# Count the rows of the final frame and report the number.
total_rows = df_with_category.count()
print("Total number of rows: {}".format(total_rows))

In [None]:
# Shut the Spark session down now that the walkthrough is finished; no cell
# after this one uses `spark`.
spark.stop()
