In [39]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("spark advanced").getOrCreate()

df = spark.read.option("inferschema","true").option("header","true").csv("/content/top_100_saas_companies_2025.csv")

df.printSchema()
df.show()

root
 |-- Company Name: string (nullable = true)
 |-- Founded Year: integer (nullable = true)
 |-- HQ: string (nullable = true)
 |-- Industry: string (nullable = true)
 |-- Total Funding: string (nullable = true)
 |-- ARR: string (nullable = true)
 |-- Valuation: string (nullable = true)
 |-- Employees: string (nullable = true)
 |-- Top Investors: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- G2 Rating: double (nullable = true)

+------------+------------+--------------------+--------------------+-------------+------+-------------------+---------+--------------------+--------------------+---------+
|Company Name|Founded Year|                  HQ|            Industry|Total Funding|   ARR|          Valuation|Employees|       Top Investors|             Product|G2 Rating|
+------------+------------+--------------------+--------------------+-------------+------+-------------------+---------+--------------------+--------------------+---------+
|   Microsoft|        197

In [41]:
#Problem 1: UDF – Standardize Currency Columns

from pyspark.sql.functions import udf, col, regexp_replace
from pyspark.sql.types import DoubleType

df = df.withColumn("Valuation_modified", regexp_replace("Valuation",r"\s*\(.*\)",""))

def currency_columns(val):
  val = val.replace("$","")
  if "T" in val:
    return float(val.replace("T",""))*1000000000000
  if "B" in val:
    return float(val.replace("B",""))*1000000000
  if "M" in val:
    return float(val.replace("M",""))*1000000

currency_udf = udf(currency_columns, DoubleType())

df = df.withColumn("ARR_Num", currency_udf(col("ARR"))).withColumn("Valuation_Num", currency_udf(col("Valuation_modified"))).withColumn("Funding_Num", currency_udf(col("Total Funding")))
df.select("ARR","ARR_Num","Valuation","Valuation_Num","Total Funding","Funding_Num").show()

+------+--------------------+-------------------+-------------+-------------+--------------------+
|   ARR|             ARR_Num|          Valuation|Valuation_Num|Total Funding|         Funding_Num|
+------+--------------------+-------------------+-------------+-------------+--------------------+
| $270B|              2.7E11|                $3T|       3.0E12|          $1B|               1.0E9|
|$37.9B|             3.79E10|            $227.8B|     2.278E11|       $65.4M| 6.540000000000001E7|
|$19.4B|             1.94E10|              $240B|       2.4E11|        $2.5M|           2500000.0|
|$52.9B|             5.29E10|              $350B|       3.5E11|          $2K|                NULL|
|$32.5B|             3.25E10|              $215B|      2.15E11|          N/A|                NULL|
|$14.4B|             1.44E10|              $180B|       1.8E11|        $273M|              2.73E8|
| $8.9B|               8.9E9|              $147B|      1.47E11|       $82.5M|              8.25E7|
| $7.3B|  

In [50]:
# Problem 2: Window Function – Identify Top Performers per Industry
# Title: Top 2 Companies by Valuation Within Each Industry

from pyspark.sql.functions import desc, col, rank
from pyspark.sql.window import Window

windowed_df = Window.partitionBy("Industry").orderBy(col("Valuation_Num").desc())

df.withColumn("rank", rank().over(windowed_df)).filter(col("rank")<=2).select("Industry","Company Name","Valuation_Num","rank").show()


+--------------------+------------+-------------+----+
|            Industry|Company Name|Valuation_Num|rank|
+--------------------+------------+-------------+----+
|                 APM| AppDynamics|        3.7E9|   1|
|                BNPL|      Affirm|       1.2E10|   1|
|Business Intellig...|      Looker|        2.6E9|   1|
|               CI/CD|    CircleCI|        1.7E9|   1|
|                 CRM|  Salesforce|     2.278E11|   1|
|        Card Issuing|     Marqeta|        4.3E9|   1|
|      Cloud Security|     Zscaler|       3.0E10|   1|
|      Cloud Security|    Netskope|        7.5E9|   2|
|       Cloud Storage|     Dropbox|        8.5E9|   1|
|       Cloud Storage|         Box|        3.5E9|   2|
|       Collaboration|        Miro|      1.75E10|   1|
|Collaboration Sof...|   Atlassian|       5.5E10|   1|
|      Communications|      Twilio|       1.2E10|   1|
|      Communications| RingCentral|        5.0E9|   2|
|        Construction|     Procore|        9.0E9|   1|
|      Con

In [56]:
# Problem 3: Window Function – ARR Growth Gaps
# Title: Understand Revenue Distribution Among Competitors

from pyspark.sql.functions import desc, col, lag
from pyspark.sql.window import Window

windowed_df = Window.partitionBy("Industry").orderBy(col("ARR_Num").desc())

df.withColumn("lag_ARR_Num", lag("ARR_Num").over(windowed_df)).filter(col("lag_ARR_Num")-col("ARR_Num")>1000000000).select("Industry","ARR_Num","lag_ARR_Num").show()

+-------------------+-------+--------------------+
|           Industry|ARR_Num|         lag_ARR_Num|
+-------------------+-------+--------------------+
|     Cloud Security|  5.0E8|               1.6E9|
|      Cloud Storage|  1.0E9|               2.5E9|
|     Communications|  2.2E9|4.0999999999999995E9|
|      Cybersecurity|  3.1E9|               7.5E9|
|     Data Analytics|  2.2E9|               3.7E9|
|           Database|  1.0E8|               1.7E9|
|             Design|  6.0E8|               2.0E9|
|Enterprise Software|3.25E10|              2.7E11|
|           Payments| 1.4E10|             1.97E10|
+-------------------+-------+--------------------+



In [61]:
# Problem 4: CASE WHEN – Label Companies by G2 Rating
# Title: Classify Companies Based on User Sentiment

from pyspark.sql.functions import col, when

df = df.withColumn("Rating_Tier", when(col("G2 Rating")>=4.7, "Excellent")
            .when((col("G2 Rating")>=4.3) & (col("G2 Rating")<4.7), "Very Good")
            .when((col("G2 Rating")>=4.0) & (col("G2 Rating")<4.3), "Good")
            .otherwise("Average"))
df.select("Industry", "Company Name", "Rating_Tier").show()

+--------------------+------------+-----------+
|            Industry|Company Name|Rating_Tier|
+--------------------+------------+-----------+
| Enterprise Software|   Microsoft|  Very Good|
|                 CRM|  Salesforce|  Very Good|
|   Creative Software|       Adobe|  Very Good|
|Database & Enterp...|      Oracle|       Good|
| Enterprise Software|         SAP|       Good|
|  Financial Software|      Intuit|  Very Good|
|IT Service Manage...|  ServiceNow|  Very Good|
|        HR & Finance|     Workday|       Good|
|Video Communications|        Zoom|  Very Good|
|          E-commerce|     Shopify|  Very Good|
|Collaboration Sof...|   Atlassian|  Very Good|
|    Data Warehousing|   Snowflake|  Very Good|
|   Marketing & Sales|     HubSpot|  Very Good|
|  Digital Agreements|    DocuSign|  Very Good|
|  Team Communication|       Slack|  Very Good|
|        Productivity|      Notion|  Excellent|
|Monitoring & Anal...|     Datadog|  Very Good|
|            Database|     MongoDB|  Ver

In [93]:
# Problem 5: Join – Investor Tier Enrichment
# Title: Understand Impact of Tier-1 Investors

from pyspark.sql.functions import split

investor_tiers = spark.createDataFrame([
    ("Accel", "Tier 1"),
    ("Sequoia", "Tier 1"),
    ("Andreessen Horowitz", "Tier 1"),
    ("SoftBank", "Tier 2"),
    ("Lightspeed", "Tier 2"),
    ("Unknown", "Tier 3")
], ["Investor", "Tier"])

df = df.withColumn("list_of_investors", split(col("Top Investors"),","))
df = df.withColumn("Investor",col("list_of_investors")[0])
# df.show()

joined_df = df.join(investor_tiers, df.Investor==investor_tiers.Investor, "inner")
# joined_df.show()
filtered_df = joined_df.filter((col("Tier")=="Tier 1")|(col("Tier")=="Tier 2"))
filtered_df.select(df.Industry, "Company Name", df.Investor, "Tier").show()

+--------------------+------------------+-------------------+------+
|            Industry|      Company Name|           Investor|  Tier|
+--------------------+------------------+-------------------+------+
|  Search & Discovery|           Algolia|              Accel|Tier 1|
|      Log Management|        Sumo Logic|              Accel|Tier 1|
|       Customer Data|           Segment|              Accel|Tier 1|
| Customer Engagement|        Freshworks|              Accel|Tier 1|
|  Team Communication|             Slack|              Accel|Tier 1|
|      Cloud Security|          Netskope|            Sequoia|Tier 1|
|   Physical Security|           Verkada|            Sequoia|Tier 1|
|Revenue Intelligence|              Gong|            Sequoia|Tier 1|
|Experience Manage...|         Qualtrics|            Sequoia|Tier 1|
|      Communications|       RingCentral|            Sequoia|Tier 1|
|       Cybersecurity|Palo Alto Networks|            Sequoia|Tier 1|
|   Product Analytics|         Amp

In [78]:
# Problem 6: Join – Compare with Industry Median
# Title: Classify Companies Based on Valuation Position

industry_medians = spark.createDataFrame([
    ("Enterprise Software", 150_000_000_000),
    ("CRM", 100_000_000_000),
    ("AI", 70_000_000_000),
    ("HRTech", 50_000_000_000),
], ["Industry", "Median_Valuation"])

joined_df = df.join(industry_medians, df.Industry==industry_medians.Industry, "inner")

joined_df = joined_df.withColumn("Valuation_Position", when(col("Valuation_Num")>col("Median_Valuation"), "Median")
              .otherwise("Below"))
joined_df.select(df.Industry, "Company Name", "Valuation_Num", "Median_Valuation", "Valuation_Position").show()



+-------------------+------------+-------------+----------------+------------------+
|           Industry|Company Name|Valuation_Num|Median_Valuation|Valuation_Position|
+-------------------+------------+-------------+----------------+------------------+
|Enterprise Software|         SAP|      2.15E11|    150000000000|            Median|
|Enterprise Software|   Microsoft|       3.0E12|    150000000000|            Median|
|                CRM|  Salesforce|     2.278E11|    100000000000|            Median|
+-------------------+------------+-------------+----------------+------------------+

