In [24]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [43]:
spark= SparkSession.builder.appName("pyspark_test").getOrCreate()
df = spark.read.csv("/content/top_100_saas_companies_2025.csv",header=True , inferSchema=True)
df.show(10)

+------------+------------+--------------------+--------------------+-------------+------+---------+---------+--------------------+--------------------+---------+
|Company Name|Founded Year|                  HQ|            Industry|Total Funding|   ARR|Valuation|Employees|       Top Investors|             Product|G2 Rating|
+------------+------------+--------------------+--------------------+-------------+------+---------+---------+--------------------+--------------------+---------+
|   Microsoft|        1975|    Redmond, WA, USA| Enterprise Software|          $1B| $270B|      $3T|  221,000|Bill Gates, Paul ...|Azure, Office 365...|      4.4|
|  Salesforce|        1999|San Francisco, CA...|                 CRM|       $65.4M|$37.9B|  $227.8B|   75,000|Halsey Minor, Lar...|Sales Cloud, Serv...|      4.3|
|       Adobe|        1982|   San Jose, CA, USA|   Creative Software|        $2.5M|$19.4B|    $240B|   29,945|   Hambrecht & Quist|Creative Cloud, D...|      4.5|
|      Oracle|        

In [44]:
# Print the schema Spark inferred for the loaded CSV (column names, types, nullability).
df.printSchema()

root
 |-- Company Name: string (nullable = true)
 |-- Founded Year: integer (nullable = true)
 |-- HQ: string (nullable = true)
 |-- Industry: string (nullable = true)
 |-- Total Funding: string (nullable = true)
 |-- ARR: string (nullable = true)
 |-- Valuation: string (nullable = true)
 |-- Employees: string (nullable = true)
 |-- Top Investors: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- G2 Rating: double (nullable = true)



In [45]:
# Discard every row that is missing ARR, Valuation or G2 Rating.
df1 = df.na.drop(subset=["ARR", "Valuation", "G2 Rating"])

In [46]:
# Make sure the rating column is typed as a double before it is compared numerically.
df1 = df1.withColumn(
    "G2 Rating",
    col("G2 Rating").astype("double"),
)

In [47]:

# Names of the columns that the loop at the end of this cell passes through convert_unit.
cols_to_convert = ["ARR", "Valuation", "Total Funding"]

from pyspark.sql.functions import col, regexp_replace, when, trim, upper

def convert_unit(col_name: str):
    """Build a Column expression that parses abbreviated amounts into doubles.

    The named column is cast to string, stripped of ``$`` and ``,``, trimmed
    and upper-cased. A value ending in ``T``, ``B``, ``M`` or ``K`` has that
    single trailing letter removed and the remainder multiplied by 1e12, 1e9,
    1e6 or 1e3 respectively. A plain number is cast to double as-is; this
    includes an optional sign and scientific notation such as ``2.7E11``, so
    applying the expression to a column that already holds doubles keeps its
    values instead of turning them into nulls. Every other value becomes null.

    Args:
        col_name: Name of the column to parse.

    Returns:
        A ``Column`` expression yielding the parsed double, or null when the
        value matches none of the recognised forms.
    """
    # Explicit string cast so the expression also accepts a numeric column
    # (for example when the cell that overwrites the column is run twice).
    cleaned = upper(trim(regexp_replace(col(col_name).cast("string"), "[$,]", "")))

    # Ordered (suffix, multiplier) pairs; the order matches the branch order.
    unit_multipliers = (("T", 1e12), ("B", 1e9), ("M", 1e6), ("K", 1e3))

    converted = None
    for suffix, multiplier in unit_multipliers:
        has_suffix = cleaned.endswith(suffix)
        # Anchor the pattern at the end of the string so that only the trailing
        # unit letter is removed; a value with stray unit letters elsewhere then
        # fails the double cast instead of being silently accepted.
        scaled = regexp_replace(cleaned, suffix + "$", "").cast("double") * multiplier
        if converted is None:
            converted = when(has_suffix, scaled)
        else:
            converted = converted.when(has_suffix, scaled)

    # Plain numbers, optionally signed and optionally in scientific notation.
    plain_number = "^[+-]?[0-9.]+(E[+-]?[0-9]+)?$"

    return (
        converted
        .when(cleaned.rlike(plain_number), cleaned.cast("double"))
        .otherwise(None)
    )

# Replace each listed column, one at a time, with its convert_unit expression.
for c in cols_to_convert:
    parsed_amount = convert_unit(c)
    df1 = df1.withColumn(colName=c, col=parsed_amount)

# Preview the first 5 rows after the conversion.
df1.show(n=5)


+------------+------------+--------------------+--------------------+-------------------+-------+---------+---------+--------------------+--------------------+---------+
|Company Name|Founded Year|                  HQ|            Industry|      Total Funding|    ARR|Valuation|Employees|       Top Investors|             Product|G2 Rating|
+------------+------------+--------------------+--------------------+-------------------+-------+---------+---------+--------------------+--------------------+---------+
|   Microsoft|        1975|    Redmond, WA, USA| Enterprise Software|              1.0E9| 2.7E11|   3.0E12|  221,000|Bill Gates, Paul ...|Azure, Office 365...|      4.4|
|  Salesforce|        1999|San Francisco, CA...|                 CRM|6.540000000000001E7|3.79E10| 2.278E11|   75,000|Halsey Minor, Lar...|Sales Cloud, Serv...|      4.3|
|       Adobe|        1982|   San Jose, CA, USA|   Creative Software|          2500000.0|1.94E10|   2.4E11|   29,945|   Hambrecht & Quist|Creative Clo

In [39]:

# Keep a row when at least one of the four predicates holds (they are OR-ed).
# NOTE(review): confirm that OR is intended here rather than AND; as written,
# a row that satisfies any single condition is kept.
df2 = df1.where(
    (col("ARR") > 1E8)
    | (col("Valuation") < 5E8)
    | (col("G2 Rating") >= 4)
    | (col("Founded Year") >= 2015)
)
df2.show(n=5)

+------------+------------+--------------------+--------------------+-------------+-------+---------+---------+--------------------+--------------------+---------+
|Company Name|Founded Year|                  HQ|            Industry|Total Funding|    ARR|Valuation|Employees|       Top Investors|             Product|G2 Rating|
+------------+------------+--------------------+--------------------+-------------+-------+---------+---------+--------------------+--------------------+---------+
|   Microsoft|        1975|    Redmond, WA, USA| Enterprise Software|          $1B| 2.7E11|   3.0E12|  221,000|Bill Gates, Paul ...|Azure, Office 365...|      4.4|
|  Salesforce|        1999|San Francisco, CA...|                 CRM|       $65.4M|3.79E10| 2.278E11|   75,000|Halsey Minor, Lar...|Sales Cloud, Serv...|      4.3|
|       Adobe|        1982|   San Jose, CA, USA|   Creative Software|        $2.5M|1.94E10|   2.4E11|   29,945|   Hambrecht & Quist|Creative Cloud, D...|      4.5|
|      Oracle|  

In [49]:

# Per-industry row count with the mean ARR and mean valuation,
# sorted by mean ARR in descending order.
df3 = (
    df2.groupBy("Industry")
    .agg(
        count("*").alias("Company Count"),
        avg("ARR").alias("avg_arr"),
        avg("Valuation").alias("avg_valuation"),
    )
    .sort(col("avg_arr").desc())
)

# Display only the industries that have more than one row.
df3.where(col("Company Count") > 1).show()

+-------------------+-------------+---------+-------------+
|           Industry|Company Count|  avg_arr|avg_valuation|
+-------------------+-------------+---------+-------------+
|Enterprise Software|            2|1.5125E11|    1.6075E12|
|           Payments|            2| 1.685E10|       7.5E10|
|      Cybersecurity|            2|    5.3E9|      8.25E10|
|     Communications|            2|   3.15E9|        8.5E9|
|     Data Analytics|            2|   2.95E9|       5.2E10|
|      Cloud Storage|            2|   1.75E9|        6.0E9|
|             Design|            2|    1.3E9|       4.0E10|
|     Cloud Security|            2|   1.05E9|     1.875E10|
|           Database|            2|    9.0E8|       1.4E10|
|    Work Management|            3|   7.56E8|       9.75E9|
|Customer Engagement|            2|   5.08E8|        5.4E9|
|             DevOps|            2|  4.225E8|       6.85E9|
|   Sales Engagement|            2|    2.0E8|       3.35E9|
|  Product Analytics|            2|  1.6