## Importing Required Libraries

In [111]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit

## Initializing a spark session

In [112]:
spark = SparkSession.builder.appName('Testing').getOrCreate()

In [113]:
spark

### Reading data from a csv file

In [114]:
df = spark.read.option('header', 'true').csv('airline-safety.csv', inferSchema = True)
# or
# df = spark.read.csv('airline-safety.csv', header = True, inferSchema = True)

# option('header', 'true')  -  To enable the first row as the column name
# inferSchema = True)       -  If false, every data will be taken as string

In [115]:
type(df)

pyspark.sql.dataframe.DataFrame

In [116]:
df.show(5)

+--------------------+----------------------+---------------+---------------------+----------------+---------------+---------------------+----------------+
|             airline|avail_seat_km_per_week|incidents_85_99|fatal_accidents_85_99|fatalities_85_99|incidents_00_14|fatal_accidents_00_14|fatalities_00_14|
+--------------------+----------------------+---------------+---------------------+----------------+---------------+---------------------+----------------+
|          Aer Lingus|             320906734|              2|                    0|               0|              0|                    0|               0|
|           Aeroflot*|            1197672318|             76|                   14|             128|              6|                    1|              88|
|Aerolineas Argent...|             385803648|              6|                    0|               0|              1|                    0|               0|
|         Aeromexico*|             596871813|              3|   

### Check the Schema of the dataframe

In [117]:
df.printSchema()

root
 |-- airline: string (nullable = true)
 |-- avail_seat_km_per_week: long (nullable = true)
 |-- incidents_85_99: integer (nullable = true)
 |-- fatal_accidents_85_99: integer (nullable = true)
 |-- fatalities_85_99: integer (nullable = true)
 |-- incidents_00_14: integer (nullable = true)
 |-- fatal_accidents_00_14: integer (nullable = true)
 |-- fatalities_00_14: integer (nullable = true)



### Playing with columns and rows

In [118]:
# Select or extract specific columns into a separate dataframe

df.select(['airline', 'fatal_accidents_85_99']).show(5)

type(df.select(['airline', 'fatal_accidents_85_99']))

+--------------------+---------------------+
|             airline|fatal_accidents_85_99|
+--------------------+---------------------+
|          Aer Lingus|                    0|
|           Aeroflot*|                   14|
|Aerolineas Argent...|                    0|
|         Aeromexico*|                    1|
|          Air Canada|                    0|
+--------------------+---------------------+
only showing top 5 rows



pyspark.sql.dataframe.DataFrame

In [119]:
df['airline']

Column<'airline'>

In [120]:
# Check the datatypes of the columns

df.dtypes

[('airline', 'string'),
 ('avail_seat_km_per_week', 'bigint'),
 ('incidents_85_99', 'int'),
 ('fatal_accidents_85_99', 'int'),
 ('fatalities_85_99', 'int'),
 ('incidents_00_14', 'int'),
 ('fatal_accidents_00_14', 'int'),
 ('fatalities_00_14', 'int')]

In [121]:
# Similar to pandas describe

df.describe().show()

+-------+---------------+----------------------+------------------+---------------------+------------------+------------------+---------------------+------------------+
|summary|        airline|avail_seat_km_per_week|   incidents_85_99|fatal_accidents_85_99|  fatalities_85_99|   incidents_00_14|fatal_accidents_00_14|  fatalities_00_14|
+-------+---------------+----------------------+------------------+---------------------+------------------+------------------+---------------------+------------------+
|  count|             56|                    56|                56|                   56|                56|                56|                   56|                56|
|   mean|           NULL|   1.384621304732143E9| 7.178571428571429|   2.1785714285714284|112.41071428571429|             4.125|   0.6607142857142857|55.517857142857146|
| stddev|           NULL|  1.4653168949166625E9|11.035656495456639|    2.861068731385928|146.69111354205404|4.5449772476678225|   0.8586836800228957| 111.3

In [122]:
# Drop a column from the DataFrame

col = df.select('incidents_85_99')

df = df.drop('incidents_85_99')

In [123]:
df.show(5)

+--------------------+----------------------+---------------------+----------------+---------------+---------------------+----------------+
|             airline|avail_seat_km_per_week|fatal_accidents_85_99|fatalities_85_99|incidents_00_14|fatal_accidents_00_14|fatalities_00_14|
+--------------------+----------------------+---------------------+----------------+---------------+---------------------+----------------+
|          Aer Lingus|             320906734|                    0|               0|              0|                    0|               0|
|           Aeroflot*|            1197672318|                   14|             128|              6|                    1|              88|
|Aerolineas Argent...|             385803648|                    0|               0|              1|                    0|               0|
|         Aeromexico*|             596871813|                    1|              64|              5|                    0|               0|
|          Air Canad

In [124]:
col

DataFrame[incidents_85_99: int]

In [128]:
# Add a column to the DataFrame

df = df.withColumn("incidents_85_99", lit(0))

# df['incidents_85_99'] = col['incidents_85_99']

df = df.withColumn("incidents_85_99", col['incidents_85_99'])

df.show(5)

AnalysisException: [MISSING_ATTRIBUTES.RESOLVED_ATTRIBUTE_APPEAR_IN_OPERATION] Resolved attribute(s) "incidents_85_99" missing from "airline", "avail_seat_km_per_week", "fatal_accidents_85_99", "fatalities_85_99", "incidents_00_14", "fatal_accidents_00_14", "fatalities_00_14", "incidents_85_99" in operator !Project [airline#4683, avail_seat_km_per_week#4684L, fatal_accidents_85_99#4686, fatalities_85_99#4687, incidents_00_14#4688, fatal_accidents_00_14#4689, fatalities_00_14#4690, incidents_85_99#4685 AS incidents_85_99#5502]. Attribute(s) with the same name appear in the operation: "incidents_85_99".
Please check if the right attribute(s) are used.;
!Project [airline#4683, avail_seat_km_per_week#4684L, fatal_accidents_85_99#4686, fatalities_85_99#4687, incidents_00_14#4688, fatal_accidents_00_14#4689, fatalities_00_14#4690, incidents_85_99#4685 AS incidents_85_99#5502]
+- Project [airline#4683, avail_seat_km_per_week#4684L, fatal_accidents_85_99#4686, fatalities_85_99#4687, incidents_00_14#4688, fatal_accidents_00_14#4689, fatalities_00_14#4690, 0 AS incidents_85_99#5493]
   +- Project [airline#4683, avail_seat_km_per_week#4684L, fatal_accidents_85_99#4686, fatalities_85_99#4687, incidents_00_14#4688, fatal_accidents_00_14#4689, fatalities_00_14#4690, 0 AS incidents_85_99#5484]
      +- Project [airline#4683, avail_seat_km_per_week#4684L, fatal_accidents_85_99#4686, fatalities_85_99#4687, incidents_00_14#4688, fatal_accidents_00_14#4689, fatalities_00_14#4690, 0 AS incidents_85_99#5435]
         +- Project [airline#4683, avail_seat_km_per_week#4684L, fatal_accidents_85_99#4686, fatalities_85_99#4687, incidents_00_14#4688, fatal_accidents_00_14#4689, fatalities_00_14#4690, 0 AS incidents_85_99#5425]
            +- Project [airline#4683, avail_seat_km_per_week#4684L, fatal_accidents_85_99#4686, fatalities_85_99#4687, incidents_00_14#4688, fatal_accidents_00_14#4689, fatalities_00_14#4690]
               +- Relation [airline#4683,avail_seat_km_per_week#4684L,incidents_85_99#4685,fatal_accidents_85_99#4686,fatalities_85_99#4687,incidents_00_14#4688,fatal_accidents_00_14#4689,fatalities_00_14#4690] csv


In [130]:
# Renaming the column name
df = df.withColumnRenamed('airline', 'Airline_Name')

df.show(5)

+--------------------+----------------------+---------------------+----------------+---------------+---------------------+----------------+---------------+
|        Airline_Name|avail_seat_km_per_week|fatal_accidents_85_99|fatalities_85_99|incidents_00_14|fatal_accidents_00_14|fatalities_00_14|incidents_85_99|
+--------------------+----------------------+---------------------+----------------+---------------+---------------------+----------------+---------------+
|          Aer Lingus|             320906734|                    0|               0|              0|                    0|               0|              0|
|           Aeroflot*|            1197672318|                   14|             128|              6|                    1|              88|              0|
|Aerolineas Argent...|             385803648|                    0|               0|              1|                    0|               0|              0|
|         Aeromexico*|             596871813|                   

### Handling missing values

#### Drop rows that have nulls

In [140]:
# Find the shape of the pySpark DataFrame
df.count(), len(df.columns)

(56, 8)

In [146]:
df.na.drop(how="any", thresh=2).count()

# how --> 'any' - drop all rows that have a null value
# how --> 'all' - drop all rows that have all the values in the row as null
# thresh --> n - drop the row if it has less than n number of non-null values

56

In [148]:
df.na.drop(how="any", subset=['avail_seat_km_per_week', 'fatal_accidents_85_99']).count()

# subset --> drop all rows that have null value in the specified columns

56

#### Filling the missing values

In [150]:
# Replace null values with a specified value

df.na.fill(0, ['fatal_accidents_85_99', 'Airline_Name']).show(5)
#       value to replace null,            columns to replace in

+--------------------+----------------------+---------------------+----------------+---------------+---------------------+----------------+---------------+
|        Airline_Name|avail_seat_km_per_week|fatal_accidents_85_99|fatalities_85_99|incidents_00_14|fatal_accidents_00_14|fatalities_00_14|incidents_85_99|
+--------------------+----------------------+---------------------+----------------+---------------+---------------------+----------------+---------------+
|          Aer Lingus|             320906734|                    0|               0|              0|                    0|               0|              0|
|           Aeroflot*|            1197672318|                   14|             128|              6|                    1|              88|              0|
|Aerolineas Argent...|             385803648|                    0|               0|              1|                    0|               0|              0|
|         Aeromexico*|             596871813|                   

In [152]:
# Replacing the null values with the mean or median of a particular column using Imputer

# from pyspark.ml.feature import Imputer

# imputer = Imputer(
#     inputCols=['fatal_accidents_85_99', 'Airline_Name'],
#     outputCols=
# )