# Install

In [1]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 27 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 48.2 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=6c0861cfc1fbf3fbc35853cda047235a43d618540a4540e7c91a31dedf2ac539
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1


In [2]:
import pyspark
from pyspark.sql import SparkSession
import pandas as pd

# Start a session

In [3]:
# Create the Spark session for this notebook; getOrCreate() hands back an
# already-running session instead of starting a second one when the cell is re-run.
spark = (
    SparkSession.builder
    .appName("DataFramePractise")
    .getOrCreate()
)

In [4]:
# Bare expression as the last line of the cell: the notebook renders the
# session object's representation as the cell output.
spark

# Load dataset

In [5]:
!wget https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv

--2022-05-31 03:32:41--  https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 60302 (59K) [text/plain]
Saving to: ‘titanic.csv’


2022-05-31 03:32:41 (4.49 MB/s) - ‘titanic.csv’ saved [60302/60302]



In [6]:
# Path of the downloaded CSV, relative to the working directory; this is the
# filename the wget cell above reports saving to.
DATA_PATH = "titanic.csv"

In [7]:
# Load the CSV into a Spark DataFrame.
#   header=True      -> take column names from the first line of the file
#   inferSchema=True -> let Spark infer column types from the data instead of
#                       treating every column as a string
df = spark.read.csv(
    DATA_PATH,
    header=True,
    inferSchema=True,
)

# Print the leading rows as a text table to sanity-check the load.
df.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          6|       0|     3|    Moran, Mr. James|  male|null|    0|    0|      

# GroupBy and Aggregate

In [10]:
# Group passengers by ticket class. groupBy() returns a GroupedData object, not
# a DataFrame (see the repr below); an aggregation such as sum()/avg() has to be
# applied to it to get results. `g` is reused by the following cells.
g = df.groupBy("Pclass")
g

<pyspark.sql.group.GroupedData at 0x7fca15846150>

In [11]:
# Per-class totals. Called with no column names, sum() aggregates every numeric
# column, which is why identifier-like columns (PassengerId, Pclass) are summed
# too; those totals carry no real meaning.
g.sum().show()

+------+----------------+-------------+-----------+--------+----------+----------+------------------+
|Pclass|sum(PassengerId)|sum(Survived)|sum(Pclass)|sum(Age)|sum(SibSp)|sum(Parch)|         sum(Fare)|
+------+----------------+-------------+-----------+--------+----------+----------+------------------+
|     1|           99705|          136|        216| 7111.42|        90|        77|18177.412499999984|
|     3|          215625|          119|       1473| 8924.92|       302|       193| 6714.695100000002|
|     2|           82056|           87|        368| 5168.83|        74|        70|3801.8416999999995|
+------+----------------+-------------+-----------+--------+----------+----------+------------------+



In [13]:
# Per-class averages of every numeric column. Survived appears as 0/1 in the
# rows shown earlier, so avg(Survived) can be read as the share of survivors
# within each class.
g.avg().show()

+------+------------------+-------------------+-----------+------------------+-------------------+-------------------+------------------+
|Pclass|  avg(PassengerId)|      avg(Survived)|avg(Pclass)|          avg(Age)|         avg(SibSp)|         avg(Parch)|         avg(Fare)|
+------+------------------+-------------------+-----------+------------------+-------------------+-------------------+------------------+
|     1|461.59722222222223| 0.6296296296296297|        1.0|38.233440860215055| 0.4166666666666667|0.35648148148148145| 84.15468749999992|
|     3| 439.1547861507128|0.24236252545824846|        3.0| 25.14061971830986|  0.615071283095723|0.39307535641547864|13.675550101832997|
|     2|445.95652173913044|0.47282608695652173|        2.0| 29.87763005780347|0.40217391304347827| 0.3804347826086957| 20.66218315217391|
+------+------------------+-------------------+-----------+------------------+-------------------+-------------------+------------------+



In [15]:
# Same aggregation grouped by Sex instead of Pclass. The result columns are
# labelled avg(...), i.e. mean() produces the same output as avg().
df.groupBy("Sex").mean().show()

+------+------------------+-------------------+-----------------+------------------+-------------------+-------------------+------------------+
|   Sex|  avg(PassengerId)|      avg(Survived)|      avg(Pclass)|          avg(Age)|         avg(SibSp)|         avg(Parch)|         avg(Fare)|
+------+------------------+-------------------+-----------------+------------------+-------------------+-------------------+------------------+
|female|431.02866242038215| 0.7420382165605095|2.159235668789809|27.915708812260537| 0.6942675159235668| 0.6496815286624203| 44.47981783439487|
|  male| 454.1473136915078|0.18890814558058924|2.389948006932409| 30.72664459161148|0.42980935875216636|0.23570190641247835|25.523893414211418|
+------+------------------+-------------------+-----------------+------------------+-------------------+-------------------+------------------+



In [20]:
# Passing a column name limits the aggregation to that column: highest fare
# paid within each Sex group.
df.groupBy("Sex").max("Fare").show()

+------+---------+
|   Sex|max(Fare)|
+------+---------+
|female| 512.3292|
|  male| 512.3292|
+------+---------+



In [19]:
# Number of rows (passengers) in each Sex group.
df.groupBy("Sex").count().show()

+------+-----+
|   Sex|count|
+------+-----+
|female|  314|
|  male|  577|
+------+-----+



In [21]:
# Aggregate over the whole DataFrame (no grouping): the dict maps a column name
# to the name of the aggregate function to apply, giving a single-row result.
df.agg({"Fare": "sum"}).show()

+------------------+
|         sum(Fare)|
+------------------+
|28693.949299999967|
+------------------+

