In [49]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import DoubleType
from pyspark.sql import Window


In [50]:

spark = SparkSession.builder.appName("CurrencyConversionUDF").getOrCreate()


df = spark.read.csv("/content/top_100_saas_companies_2025.csv", header=True, inferSchema=True)


def convert_currency(value):
    if value is None:
        return None
    value = value.strip().replace("$", "").replace(",", "").upper()
    try:
        if value.endswith("T"):
            return float(value[:-1]) * 1e12
        elif value.endswith("B"):
            return float(value[:-1]) * 1e9
        elif value.endswith("M"):
            return float(value[:-1]) * 1e6
        elif value.endswith("K"):
            return float(value[:-1]) * 1e3
        elif value.replace('.', '', 1).isdigit():
            return float(value)
        else:
            return None
    except:
        return None

convert_currency_udf = udf(convert_currency, DoubleType())


cols_to_convert = ["ARR", "Valuation", "Total Funding"]
for c in cols_to_convert:
    df = df.withColumn(f"{c}_Num", convert_currency_udf(col(c)))

df.select("Company Name", "Valuation", "Valuation_Num",
          "ARR", "ARR_Num",
          "Total Funding", "Total Funding_Num").show(5)


+------------+---------+-------------+------+-------+-------------+-------------------+
|Company Name|Valuation|Valuation_Num|   ARR|ARR_Num|Total Funding|  Total Funding_Num|
+------------+---------+-------------+------+-------+-------------+-------------------+
|   Microsoft|      $3T|       3.0E12| $270B| 2.7E11|          $1B|              1.0E9|
|  Salesforce|  $227.8B|     2.278E11|$37.9B|3.79E10|       $65.4M|6.540000000000001E7|
|       Adobe|    $240B|       2.4E11|$19.4B|1.94E10|        $2.5M|          2500000.0|
|      Oracle|    $350B|       3.5E11|$52.9B|5.29E10|          $2K|             2000.0|
|         SAP|    $215B|      2.15E11|$32.5B|3.25E10|          N/A|               NULL|
+------------+---------+-------------+------+-------+-------------+-------------------+
only showing top 5 rows



In [51]:
window_spec = Window.partitionBy('Industry').orderBy(col('Valuation_Num').desc())

In [52]:
df1 = df.withColumn('rank', rank().over(window_spec))
df1.show(5)



+------------+------------+--------------------+--------------------+-------------+------+--------------+---------+--------------------+--------------------+---------+-------+-------------+-------------------+----+
|Company Name|Founded Year|                  HQ|            Industry|Total Funding|   ARR|     Valuation|Employees|       Top Investors|             Product|G2 Rating|ARR_Num|Valuation_Num|  Total Funding_Num|rank|
+------------+------------+--------------------+--------------------+-------------+------+--------------+---------+--------------------+--------------------+---------+-------+-------------+-------------------+----+
| AppDynamics|        2008|San Francisco, CA...|                 APM|        $315M| $600M| $3.7B (Cisco)|    2,000|Lightspeed, Greylock|Application Perfo...|      4.3|  6.0E8|         NULL|             3.15E8|   1|
|      Affirm|        2012|San Francisco, CA...|                BNPL|        $1.5B| $1.6B|          $12B|    2,500|Founders Fund, Li...|Buy 

In [53]:
df2 = df1.filter(col('rank') < 3 ).show(5)

+------------+------------+--------------------+--------------------+-------------+------+--------------+---------+--------------------+--------------------+---------+-------+-------------+-------------------+----+
|Company Name|Founded Year|                  HQ|            Industry|Total Funding|   ARR|     Valuation|Employees|       Top Investors|             Product|G2 Rating|ARR_Num|Valuation_Num|  Total Funding_Num|rank|
+------------+------------+--------------------+--------------------+-------------+------+--------------+---------+--------------------+--------------------+---------+-------+-------------+-------------------+----+
| AppDynamics|        2008|San Francisco, CA...|                 APM|        $315M| $600M| $3.7B (Cisco)|    2,000|Lightspeed, Greylock|Application Perfo...|      4.3|  6.0E8|         NULL|             3.15E8|   1|
|      Affirm|        2012|San Francisco, CA...|                BNPL|        $1.5B| $1.6B|          $12B|    2,500|Founders Fund, Li...|Buy 

In [58]:
window_spec1 = Window.partitionBy('Industry').orderBy(col('ARR_Num').desc())
df3 = df.withColumn('prev_arr', lag(col('ARR_Num')).over(window_spec1))



In [61]:
df3 = df3.withColumn("ARR_Difference", round(col("prev_arr") - col("ARR_Num"), 2))
df3 = df3.filter((col("ARR_Difference").isNotNull()) & (col('ARR_Difference') > 1_000_000_000))


In [63]:
df3.show(5)

+------------+------------+--------------------+--------------+-------------+-----+---------+---------+--------------------+--------------------+---------+-------+-------------+-----------------+--------------------+--------------+
|Company Name|Founded Year|                  HQ|      Industry|Total Funding|  ARR|Valuation|Employees|       Top Investors|             Product|G2 Rating|ARR_Num|Valuation_Num|Total Funding_Num|            prev_arr|ARR_Difference|
+------------+------------+--------------------+--------------+-------------+-----+---------+---------+--------------------+--------------------+---------+-------+-------------+-----------------+--------------------+--------------+
|    Netskope|        2012|Santa Clara, CA, USA|Cloud Security|        $1.4B|$500M|    $7.5B|    2,500| Sequoia, Lightspeed|       SASE Platform|      4.6|  5.0E8|        7.5E9|            1.4E9|               1.6E9|         1.1E9|
|         Box|        2005|Redwood City, CA,...| Cloud Storage|      $56