In [10]:
#-----------------------------------| IMPORT & INITIALIZATION |-------------#
import kagglehub
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import desc
from pyspark.sql.functions import lit
from pyspark.sql.functions import when
# Start (or reuse) a Spark session running locally on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("dataproc_spark")
    .getOrCreate()
)

# Fetch the dataset via kagglehub; the returned location is handed straight to the CSV reader.
path = kagglehub.dataset_download("iamsouravbanerjee/world-population-dataset")

# header=True takes column names from the first line; inferSchema=True lets Spark guess column types.
# Readers for other formats: read.json(path), read.parquet(path), read.orc(path), read.text(path).
df = spark.read.csv(path, header=True, inferSchema=True)

# Reading from JDBC Sources
# df = spark.read \
#     .format("jdbc") \
#     .option("url", "jdbc:postgresql://host:port/database") \
#     .option("dbtable", "table_name") \
#     .option("user", "username") \
#     .option("password", "password") \
#     .load()


In [48]:
# NOTE: head() and count() return values instead of printing them, and a notebook cell
# only echoes its last expression, so both are wrapped in print() to make them visible.
print(df.head(5)) # Returns the first n rows as a list of Row objects (n=5 here; with no argument a single Row is returned)

df.show(5) # Prints the first 5 rows as a table (column names in the header row)

df.printSchema() # Prints the schema (column names and types)

print(df.count()) # Returns the number of rows

df.select(df.columns[:1]).show() # Returns a DataFrame with only the selected columns (here: the first column)

df.fillna(0, subset=["2022 Population"]).show() # Replace nulls in '2022 Population' with 0

# how="any" drops a row when at least one of the listed columns is null;
# how="all" would drop it only when every listed column is null.
df.dropna(how="any", subset=["Continent", "Country/Territory"]).show() # Drop rows with nulls in either 'Continent' or 'Country/Territory'

df.replace("Asia", "Asia Pacific", "Continent").show() # Replace "Asia" with "Asia Pacific" in the 'Continent' column

df.filter(df["World Population Percentage"] > 1).show() # Keep only the rows that satisfy the condition

df.filter(col("Continent").isNotNull() & col("Country/Territory").isNotNull()).show() # Keep only rows where both columns are non-null

df.distinct().show() # Prints the distinct rows

df.withColumnRenamed("Country/Territory","Country").show() # Renames the column

df.drop("1980 Population","1970 Population").show() # Drops the columns

df.withColumn("Population Density", df["2022 Population"] / df["Area (km²)"]).show() # Calculate population density

df.withColumn("High Population", when(df["2022 Population"] > 100000000, True).otherwise(False)).show() # Categorize high population countries

# Total 2022 population per continent, largest first.
# The grouped sum() names its result column "sum(2022 Population)", which is the sort key below.
df.groupBy("Continent") \
  .sum("2022 Population") \
  .orderBy(desc("sum(2022 Population)")).show()

# Add a column with a default value, then overwrite it only for the matching rows.
# The final filter just narrows the display to the row that was updated.
df.withColumn("Visited", lit("NO")) \
  .withColumn("Visited", when(col("Country/Territory") == "India", lit("YES")).otherwise(col("Visited"))) \
  .filter(col("Country/Territory") == "India").show()






+----+----+-----------------+----------------+---------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+----------+-----------------+-----------+---------------------------+
|Rank|CCA3|Country/Territory|         Capital|Continent|2022 Population|2020 Population|2015 Population|2010 Population|2000 Population|1990 Population|1980 Population|1970 Population|Area (km²)|Density (per km²)|Growth Rate|World Population Percentage|
+----+----+-----------------+----------------+---------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+----------+-----------------+-----------+---------------------------+
|  36| AFG|      Afghanistan|           Kabul|     Asia|       41128771|       38972230|       33753499|       28189672|       19542982|       10694796|       12486631|       10752971|    652230|          63.0587|     1.0257|             