In [2]:
from google.colab import files
uploaded=files.upload()

Saving top_100_saas_companies_2025.csv to top_100_saas_companies_2025.csv


In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, udf
from pyspark.sql.types import FloatType

# Start Spark Session
spark = SparkSession.builder.appName("Valuation vs Median").getOrCreate()

# Load the dataset
df = spark.read.csv("/content/top_100_saas_companies_2025.csv", header=True, inferSchema=True)

# Updated Valuation parser to handle both 'B' and 'T'
def parse_valuation(val):
    if val is not None:
        val = val.replace('$', '').strip()
        if val.endswith('B'):
            return float(val[:-1]) * 1e9
        elif val.endswith('T'):
            return float(val[:-1]) * 1e12
    return None

valuation_udf = udf(parse_valuation, FloatType())

df = df.withColumn("Valuation_Num", valuation_udf(col("Valuation")))

# Industry Medians DataFrame
industry_medians = spark.createDataFrame([
    ("Enterprise Software", 150_000_000_000),
    ("CRM", 100_000_000_000),
    ("AI", 70_000_000_000),
    ("HRTech", 50_000_000_000),
], ["Industry", "Median_Valuation"])

# Join with industry median
joined_df = df.join(industry_medians, on="Industry", how="left")

# Create Valuation_Position column
joined_df = joined_df.withColumn(
    "Valuation_Position",
    when(col("Valuation_Num") > col("Median_Valuation"), "Above Median").otherwise("Below Median")
)

#Show final result
joined_df.select("Company Name", "Industry", "Valuation", "Valuation_Num", "Median_Valuation", "Valuation_Position").show(truncate=False)


+------------+----------------------+-------------------+-------------+----------------+------------------+
|Company Name|Industry              |Valuation          |Valuation_Num|Median_Valuation|Valuation_Position|
+------------+----------------------+-------------------+-------------+----------------+------------------+
|Zoom        |Video Communications  |$85B               |8.5000004E10 |NULL            |Below Median      |
|Stripe      |Payments              |$65B               |6.5000002E10 |NULL            |Below Median      |
|Atlassian   |Collaboration Software|$55B               |5.4999998E10 |NULL            |Below Median      |
|Oracle      |Database & Enterprise |$350B              |3.49999989E11|NULL            |Below Median      |
|Workday     |HR & Finance          |$65B               |6.5000002E10 |NULL            |Below Median      |
|Slack       |Team Communication    |$27.7B (Salesforce)|NULL         |NULL            |Below Median      |
|Adobe       |Creative Softw