## Importing Required Libraries

In [2]:
import pyspark
from pyspark.sql import SparkSession

## Initializing a Spark session

In [3]:
# Reuse the active session if one exists; otherwise start one named 'Testing'
spark = (
    SparkSession.builder
    .appName('Testing')
    .getOrCreate()
)

In [4]:
# Display the session object created above
spark

### Reading data from a CSV file

In [17]:
# header=True       -  use the first row of the file as the column names
# inferSchema=True  -  infer each column's type; if False, every column is read as a string
df = spark.read.csv('airline-safety.csv', header=True, inferSchema=True)

# Equivalent form that sets the header through option():
# df = spark.read.option('header', 'true').csv('airline-safety.csv', inferSchema=True)

In [18]:
# Check what kind of object the CSV reader returned
type(df)

pyspark.sql.dataframe.DataFrame

In [19]:
# Print the first 5 rows as a text table
df.show(5)

+--------------------+----------------------+---------------+---------------------+----------------+---------------+---------------------+----------------+
|             airline|avail_seat_km_per_week|incidents_85_99|fatal_accidents_85_99|fatalities_85_99|incidents_00_14|fatal_accidents_00_14|fatalities_00_14|
+--------------------+----------------------+---------------+---------------------+----------------+---------------+---------------------+----------------+
|          Aer Lingus|             320906734|              2|                    0|               0|              0|                    0|               0|
|           Aeroflot*|            1197672318|             76|                   14|             128|              6|                    1|              88|
|Aerolineas Argent...|             385803648|              6|                    0|               0|              1|                    0|               0|
|         Aeromexico*|             596871813|              3|   

### Check the schema of the DataFrame

In [20]:
# Print each column's name, data type and nullability as a tree
df.printSchema()

root
 |-- airline: string (nullable = true)
 |-- avail_seat_km_per_week: long (nullable = true)
 |-- incidents_85_99: integer (nullable = true)
 |-- fatal_accidents_85_99: integer (nullable = true)
 |-- fatalities_85_99: integer (nullable = true)
 |-- incidents_00_14: integer (nullable = true)
 |-- fatal_accidents_00_14: integer (nullable = true)
 |-- fatalities_00_14: integer (nullable = true)



### Playing with columns and rows

In [25]:
# select() builds a new DataFrame that holds only the requested columns

df.select('airline', 'fatal_accidents_85_99').show(5)

# The result of select() is itself a DataFrame
type(df.select('airline', 'fatal_accidents_85_99'))

+--------------------+---------------------+
|             airline|fatal_accidents_85_99|
+--------------------+---------------------+
|          Aer Lingus|                    0|
|           Aeroflot*|                   14|
|Aerolineas Argent...|                    0|
|         Aeromexico*|                    1|
|          Air Canada|                    0|
+--------------------+---------------------+
only showing top 5 rows



pyspark.sql.dataframe.DataFrame

In [34]:
# Indexing by column name gives a Column object, not the column's values
df['airline']

Column<'airline'>

In [32]:
# Check the data types of the columns: a list of (column name, type name) pairs

df.dtypes

[('airline', 'string'),
 ('avail_seat_km_per_week', 'bigint'),
 ('incidents_85_99', 'int'),
 ('fatal_accidents_85_99', 'int'),
 ('fatalities_85_99', 'int'),
 ('incidents_00_14', 'int'),
 ('fatal_accidents_00_14', 'int'),
 ('fatalities_00_14', 'int')]

In [37]:
# Similar to pandas describe: summary statistics (count, mean, stddev, ...) per column.
# The string column (airline) shows NULL for mean and stddev.

df.describe().show()

+-------+---------------+----------------------+------------------+---------------------+------------------+------------------+---------------------+------------------+
|summary|        airline|avail_seat_km_per_week|   incidents_85_99|fatal_accidents_85_99|  fatalities_85_99|   incidents_00_14|fatal_accidents_00_14|  fatalities_00_14|
+-------+---------------+----------------------+------------------+---------------------+------------------+------------------+---------------------+------------------+
|  count|             56|                    56|                56|                   56|                56|                56|                   56|                56|
|   mean|           NULL|   1.384621304732143E9| 7.178571428571429|   2.1785714285714284|112.41071428571429|             4.125|   0.6607142857142857|55.517857142857146|
| stddev|           NULL|  1.4653168949166625E9|11.035656495456639|    2.861068731385928|146.69111354205404|4.5449772476678225|   0.8586836800228957| 111.3