In [31]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,avg,round,filter

spark = SparkSession.builder.appName("Spark SQL").getOrCreate()
sc = spark.sparkContext

In [32]:
df = spark.read.option("inferSchema",True).parquet(r"C:\Users\Admin\Downloads\csv_files\titanic.parquet")
df.createOrReplaceTempView("passengers")


In [33]:
# Problem 1: Survival Rate by Passenger Class and Gender. Calculate the survival rate grouped by Pclass and Sex. Sort the result by Pclass and descending survival rate.
# Expected Output Columns: Pclass, Sex, SurvivalRate (rounded to 2 decimal places)

# df_survival_rate = spark.sql("""
#                     SELECT Pclass, Sex, ROUND(AVG(Survived), 2) AS SurvivalRate
#                     FROM passengers
#                     GROUP BY Pclass, Sex
#                     ORDER BY Pclass, SurvivalRate DESC
#                 """
#                 )
# df_survival_rate.show()

# OR

# df_survival_rate = df.groupBy("Pclass", "Sex").agg(round(avg("Survived"), 2).alias("SurvivalRate")).orderBy("Pclass", "SurvivalRate", ascending=[True, False])
df_survival_rate = df.groupBy("Pclass","Sex").agg(round(avg("Survived"), 2).alias("SurvivalRate")).orderBy(col("Pclass").asc(),col("SurvivalRate").desc())
df_survival_rate.show()

+------+------+------------+
|Pclass|   Sex|SurvivalRate|
+------+------+------------+
|     1|female|        0.97|
|     1|  male|        0.37|
|     2|female|        0.92|
|     2|  male|        0.16|
|     3|female|         0.5|
|     3|  male|        0.14|
+------+------+------------+



In [34]:
# Problem 2: Average Fare and Age by Embarkation Port. Find the average Fare and Age of passengers grouped by Embarked. Exclude rows where Fare or Age is NULL. Order by average fare descending.
# Expected Output Columns: Embarked, AvgFare, AvgAge

# df_average_fare = spark.sql("""
#                                 SELECT Embarked, ROUND(AVG(Fare), 2) AS AvgFare, ROUND(AVG(Age), 2) AS AvgAge
#                                 FROM passengers
#                                 WHERE Fare IS NOT NULL AND Age IS NOT NULL
#                                 GROUP BY Embarked
#                                 ORDER BY AvgFare DESC
#                             """)
# df_average_fare.show()

# OR

df_average_fare = df.groupBy("Embarked").agg(round(avg("Fare"),2).alias("AvgFare"),round(avg("Age"),2).alias("AvgAge"))
df_average_fare.show()


+--------+-------+------+
|Embarked|AvgFare|AvgAge|
+--------+-------+------+
|       Q|  13.28| 28.09|
|    NULL|   80.0|  50.0|
|       C|  59.95| 30.81|
|       S|  27.08| 29.45|
+--------+-------+------+



In [35]:
# Problem 3: Top 5 Paying Passengers Who Survived. Find the top 5 passengers (by Fare) who survived. Display their Name, Pclass, Sex, Fare, and Cabin.


# df_survived = spark.sql("""
#                         SELECT Name, Pclass, Sex, Fare, Cabin
#                         FROM passengers
#                         WHERE Survived = 1
#                         ORDER BY Fare DESC
#                         LIMIT 5
#                         """)
# df_survived.show(truncate=False)

# OR

df_survived = df.select("Name", "Pclass", "Sex", "Fare", "Cabin").filter(col("survived")== 1).orderBy(col("Fare").desc())
df_survived.show(5,truncate=False)

+----------------------------------+------+------+--------+-----------+
|Name                              |Pclass|Sex   |Fare    |Cabin      |
+----------------------------------+------+------+--------+-----------+
|Ward, Miss. Anna                  |1     |female|512.3292|NULL       |
|Cardeza, Mr. Thomas Drake Martinez|1     |male  |512.3292|B51 B53 B55|
|Lesurer, Mr. Gustave J            |1     |male  |512.3292|B101       |
|Fortune, Miss. Mabel Helen        |1     |female|263.0   |C23 C25 C27|
|Fortune, Miss. Alice Elizabeth    |1     |female|263.0   |C23 C25 C27|
+----------------------------------+------+------+--------+-----------+
only showing top 5 rows



In [36]:
# Problem Statement:

# You're tasked with identifying "hidden gem" startups from the attached CSV— companies that are not heavily funded, but are showing strong performance metrics. Select companies that meet the following conditions:

# -Have an ARR over $100M
# -Have a Valuation under $500M
# -Have a G2 Rating of 4.0 or above
# -Were founded in or after 2015

# You must:
# 1)Group the results by Industry
# 2)For each industry, calculate:
# 3)The number of such companies
# 4)The average ARR
# 5)The average Valuation
# 6)Sort the industries by average ARR descending

# Display only industries that have at least 2 companies matching the above criteria.

In [37]:
df_companies = spark.read.option("header",True).option("inferSchema",True).csv(r"C:\Users\Admin\Downloads\csv_files\top_100_saas_companies_2025.csv")

In [38]:
df_companies.show(5)

+------------+------------+--------------------+--------------------+-------------+------+---------+---------+--------------------+--------------------+---------+
|Company Name|Founded Year|                  HQ|            Industry|Total Funding|   ARR|Valuation|Employees|       Top Investors|             Product|G2 Rating|
+------------+------------+--------------------+--------------------+-------------+------+---------+---------+--------------------+--------------------+---------+
|   Microsoft|        1975|    Redmond, WA, USA| Enterprise Software|          $1B| $270B|      $3T|  221,000|Bill Gates, Paul ...|Azure, Office 365...|      4.4|
|  Salesforce|        1999|San Francisco, CA...|                 CRM|       $65.4M|$37.9B|  $227.8B|   75,000|Halsey Minor, Lar...|Sales Cloud, Serv...|      4.3|
|       Adobe|        1982|   San Jose, CA, USA|   Creative Software|        $2.5M|$19.4B|    $240B|   29,945|   Hambrecht & Quist|Creative Cloud, D...|      4.5|
|      Oracle|        

In [39]:
df_companies.dtypes

[('Company Name', 'string'),
 ('Founded Year', 'int'),
 ('HQ', 'string'),
 ('Industry', 'string'),
 ('Total Funding', 'string'),
 ('ARR', 'string'),
 ('Valuation', 'string'),
 ('Employees', 'string'),
 ('Top Investors', 'string'),
 ('Product', 'string'),
 ('G2 Rating', 'double')]

In [40]:
df_companies.createOrReplaceTempView("companies")


In [41]:
from pyspark.sql.functions import regexp_replace, col, when, avg, count

df_cleaned = df_companies.withColumn(
    "Valuation_cleaned_raw",
    regexp_replace(col("Valuation"), r"\(.*?\)", "")
).withColumn(
    "ARR_cleaned_raw",
    regexp_replace(col("ARR"), r"\(.*?\)", "")
)

df_cleaned = df_cleaned.withColumn(
    "ARR_clean",
    when(col("ARR_cleaned_raw").contains("B"), regexp_replace(col("ARR_cleaned_raw"), "[$B]", "").cast("double") * 1000)
    .otherwise(regexp_replace(col("ARR_cleaned_raw"), "[$M]", "").cast("double"))
).withColumn(
    "Valuation_clean",
    when(col("Valuation_cleaned_raw").contains("B"), regexp_replace(col("Valuation_cleaned_raw"), "[$B]", "").cast("double") * 1000)
    .when(col("Valuation_cleaned_raw").contains("T"), regexp_replace(col("Valuation_cleaned_raw"), "[$T]", "").cast("double") * 1000)
)
 
filtered = df_cleaned.filter(
    (col("ARR_clean") > 100) |
    (col("Valuation_clean") < 500) |
    (col("G2 Rating") >= 4.0) |
    (col("Founded Year") >= 2015)
)


final = filtered.select("Company Name", "ARR","ARR_Clean", "Valuation","Valuation_clean", "Founded Year", "G2 Rating","Industry")
final.show(truncate=False)


+------------+------+---------+-------------------+---------------+------------+---------+----------------------+
|Company Name|ARR   |ARR_Clean|Valuation          |Valuation_clean|Founded Year|G2 Rating|Industry              |
+------------+------+---------+-------------------+---------------+------------+---------+----------------------+
|Microsoft   |$270B |270000.0 |$3T                |3000.0         |1975        |4.4      |Enterprise Software   |
|Salesforce  |$37.9B|37900.0  |$227.8B            |227800.0       |1999        |4.3      |CRM                   |
|Adobe       |$19.4B|19400.0  |$240B              |240000.0       |1982        |4.5      |Creative Software     |
|Oracle      |$52.9B|52900.0  |$350B              |350000.0       |1977        |4.0      |Database & Enterprise |
|SAP         |$32.5B|32500.0  |$215B              |215000.0       |1972        |4.1      |Enterprise Software   |
|Intuit      |$14.4B|14400.0  |$180B              |180000.0       |1983        |4.4     

In [42]:

result = final.groupBy("Industry").agg(
        count("*").alias("CompanyCount"),
        avg("ARR_clean").alias("AvgARR"),
        avg("Valuation_clean").alias("AvgValuation")
    ).filter(col("CompanyCount") >= 2).orderBy(col("AvgARR").desc())

result.show()


+-------------------+------------+--------+------------+
|           Industry|CompanyCount|  AvgARR|AvgValuation|
+-------------------+------------+--------+------------+
|Enterprise Software|           2|151250.0|    109000.0|
|           Payments|           2| 16850.0|     75000.0|
|      Cybersecurity|           2|  5300.0|     82500.0|
|     Communications|           2|  3150.0|      8500.0|
|     Data Analytics|           2|  2950.0|     40000.0|
|      Cloud Storage|           2|  1750.0|      6000.0|
|             Design|           2|  1300.0|     30000.0|
|     Cloud Security|           2|  1050.0|     18750.0|
|           Database|           2|   900.0|     14000.0|
|    Work Management|           3|   756.0|      9300.0|
|Customer Engagement|           2|   508.0|      5400.0|
|             DevOps|           2|   422.5|      6850.0|
|   Sales Engagement|           2|   200.0|      3350.0|
|  Product Analytics|           2|   166.5|      2525.0|
+-------------------+----------

In [44]:
sc.stop()