In [14]:
pip install pyspark



In [15]:
from pyspark.sql import SparkSession

In [16]:
spark = SparkSession.builder \
    .appName("PySpark DataFrame Tutorial") \
    .getOrCreate()

In [17]:
df = spark.read.parquet("/content/titanic.parquet")
df.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| NULL|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| NULL|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| NULL|       S|
|          6|       0|     3|    Moran, Mr. James|  male|NULL|    0|    0|      

In [18]:
df.printSchema()

root
 |-- PassengerId: long (nullable = true)
 |-- Survived: long (nullable = true)
 |-- Pclass: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: long (nullable = true)
 |-- Parch: long (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [19]:
df.describe().show()

+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|summary|      PassengerId|           Survived|            Pclass|                Name|   Sex|               Age|             SibSp|              Parch|            Ticket|             Fare|Cabin|Embarked|
+-------+-----------------+-------------------+------------------+--------------------+------+------------------+------------------+-------------------+------------------+-----------------+-----+--------+
|  count|              891|                891|               891|                 891|   891|               714|               891|                891|               891|              891|  204|     889|
|   mean|            446.0| 0.3838383838383838| 2.308641975308642|                NULL|  NULL| 29.69911764705882|0.5230078563411896|0.38159371492704824|260318.54916792738| 32.20420

In [20]:
from pyspark.sql.functions import avg, col, round

In [43]:
df.select("Pclass", "Sex", "Survived") \
.groupby("Pclass", "Sex") \
.agg(round(avg(col("Survived")), 2).alias("SurvivalRate")) \
.orderBy("Pclass", col("SurvivalRate").desc()) \
.show()


+------+------+------------+
|Pclass|   Sex|SurvivalRate|
+------+------+------------+
|     1|female|        0.97|
|     1|  male|        0.37|
|     2|female|        0.92|
|     2|  male|        0.16|
|     3|female|         0.5|
|     3|  male|        0.14|
+------+------+------------+



In [22]:
# ------------ 2 -------------

In [24]:
df.show(5)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| NULL|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| NULL|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| NULL|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+------

In [27]:
from pyspark.sql.functions import desc

In [44]:
df.select("Embarked", "Fare", "Age") \
.where(col("Fare").isNotNull() & col("Age").isNotNull()) \
.groupby("Embarked") \
.agg(round(avg("Fare"), 2).alias("AvgFare"), round(avg("Age"), 2).alias("AvgAge")) \
.orderBy(desc("AvgFare")) \
.show()



+--------+-------+------+
|Embarked|AvgFare|AvgAge|
+--------+-------+------+
|    NULL|   80.0|  50.0|
|       C|   68.3| 30.81|
|       S|  27.48| 29.45|
|       Q|  18.27| 28.09|
+--------+-------+------+



In [46]:
# --------------- 3 -----------------

In [56]:
df.select("Name", "Pclass", "Sex", "Fare", "Cabin") \
.where(col("Survived") == 1) \
.orderBy(col("Fare").desc()) \
.limit(5) \
.show()

+--------------------+------+------+--------+-----------+
|                Name|Pclass|   Sex|    Fare|      Cabin|
+--------------------+------+------+--------+-----------+
|    Ward, Miss. Anna|     1|female|512.3292|       NULL|
|Cardeza, Mr. Thom...|     1|  male|512.3292|B51 B53 B55|
|Lesurer, Mr. Gust...|     1|  male|512.3292|       B101|
|Fortune, Miss. Ma...|     1|female|   263.0|C23 C25 C27|
|Fortune, Miss. Al...|     1|female|   263.0|C23 C25 C27|
+--------------------+------+------+--------+-----------+



In [57]:
# --------------- Final Task -------------------

In [61]:
df = spark.read.option("header", True).csv("/content/top_100_saas_companies_2025.csv")
df.show(5)

+------------+------------+--------------------+--------------------+-------------+------+---------+---------+--------------------+--------------------+---------+
|Company Name|Founded Year|                  HQ|            Industry|Total Funding|   ARR|Valuation|Employees|       Top Investors|             Product|G2 Rating|
+------------+------------+--------------------+--------------------+-------------+------+---------+---------+--------------------+--------------------+---------+
|   Microsoft|        1975|    Redmond, WA, USA| Enterprise Software|          $1B| $270B|      $3T|  221,000|Bill Gates, Paul ...|Azure, Office 365...|      4.4|
|  Salesforce|        1999|San Francisco, CA...|                 CRM|       $65.4M|$37.9B|  $227.8B|   75,000|Halsey Minor, Lar...|Sales Cloud, Serv...|      4.3|
|       Adobe|        1982|   San Jose, CA, USA|   Creative Software|        $2.5M|$19.4B|    $240B|   29,945|   Hambrecht & Quist|Creative Cloud, D...|      4.5|
|      Oracle|        

In [62]:
df.printSchema()

root
 |-- Company Name: string (nullable = true)
 |-- Founded Year: string (nullable = true)
 |-- HQ: string (nullable = true)
 |-- Industry: string (nullable = true)
 |-- Total Funding: string (nullable = true)
 |-- ARR: string (nullable = true)
 |-- Valuation: string (nullable = true)
 |-- Employees: string (nullable = true)
 |-- Top Investors: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- G2 Rating: string (nullable = true)



In [87]:
from pyspark.sql.functions import count

In [88]:
df.select("Company Name", "ARR", "Valuation") \
.where((col("ARR") > "100") & (col("Valuation") < "$500M") & (col("G2 Rating") >= "4.0") & (col("Founded Year") >= "2015")) \
.groupby(col("Industry")) \
.agg(count("Company Name"), avg("ARR"), avg("Valuation")) \
.orderBy(col("ARR").desc()) \
.show()

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `Industry` cannot be resolved. Did you mean one of the following? [`ARR`, `Valuation`, `Company Name`].;
'Aggregate ['Industry], ['Industry, count(Company Name#2124) AS count(Company Name)#2357L, avg(cast(ARR#2129 as double)) AS avg(ARR)#2358, avg(cast(Valuation#2130 as double)) AS avg(Valuation)#2359]
+- Project [Company Name#2124, ARR#2129, Valuation#2130]
   +- Filter ((((ARR#2129 > 100) AND (Valuation#2130 < $500M)) AND (G2 Rating#2134 >= 4.0)) AND (Founded Year#2125 >= 2015))
      +- Project [Company Name#2124, ARR#2129, Valuation#2130, G2 Rating#2134, Founded Year#2125]
         +- Relation [Company Name#2124,Founded Year#2125,HQ#2126,Industry#2127,Total Funding#2128,ARR#2129,Valuation#2130,Employees#2131,Top Investors#2132,Product#2133,G2 Rating#2134] csv
