# Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load data

In [2]:
# Load the FAA wildlife-strike records from the CSV export.
data = pd.read_csv('faa_data_subset.xlsx - FAA Wildlife Strikes.csv')

# Preview 10 random rows. The fixed random_state keeps this preview identical
# on every Restart & Run All, so the displayed sample stays reproducible.
data.sample(10, random_state=42)

Unnamed: 0,Airport: Code,Airport: Name,Origin State,Origin State Code,Country,Aircraft: Type,Aircraft: Number of engines,Collision Date and Time,When: Time of day,When: Phase of flight,...,Days,Feet above ground,Miles from airport,Wildlife: Animal Category,Wildlife: Species Order,Wildlife: Species Group,Wildlife: Species,Wildlife: Species ID,Number of Strikes,Record ID
12578,KJAX,JACKSONVILLE INTL,Florida,FL,United States,Airplane,2.0,8/20/06 17:15,Day,Landing Roll,...,,0.0,0.0,Birds,Perching Birds,Sparrows,Sparrows,ZX3,1,236127
666,KALB,ALBANY INTL,New York,NY,United States,Airplane,2.0,8/29/11 23:20,Night,Approach,...,,100.0,,Birds,"Gulls, Terns, Sandpipers, Plovers, Skimmers","Sandpipers, Curlews, Phalaropes",American woodcock,N6011,1,315837
1916,KBHM,BIRMINGHAM-SHUTTLESWORTH INTL,Alabama,AL,United States,Airplane,2.0,8/21/02 0:35,Day,Approach,...,,50.0,,Birds,Pigeons and Doves,Pigeons,Rock pigeon,O2111,1,213272
27997,KUNV,UNIVERSITY PARK ARPT,Pennsylvania,PA,United States,Airplane,2.0,2/7/12 15:30,Day,Approach,...,0.041667,,0.0,Birds,Perching Birds,Starlings,European starling,YL001,1,320352
5511,KCXY,CAPITAL CITY ARPT (PA),Pennsylvania,PA,United States,Airplane,1.0,8/16/05 10:37,Day,Climb,...,,50.0,,Birds,Perching Birds,Swallows and Martins,Swallows,YI,1,234357
6655,KDEN,DENVER INTL AIRPORT,Colorado,CO,United States,Airplane,2.0,8/23/11 16:00,Day,Descent,...,,8000.0,5.0,Birds,"Hawks, Kites, Eagles, Ospreys, Vultures","Kites, Hawks, Eagles",Hawks,K33,1,315741
2633,KBOS,GENERAL EDWARD LAWRENCE LOGAN INTL ARPT,Massachusetts,MA,United States,Airplane,2.0,10/3/09 11:25,Day,Take-off run,...,,0.0,0.0,Birds,"Gulls, Terns, Sandpipers, Plovers, Skimmers","Gulls, terns, kittiwakes",Herring gull,NE101,1,266849
3715,KCAE,COLUMBIA METRO,South Carolina,SC,United States,Airplane,2.0,5/31/07 17:30,Day,Approach,...,,50.0,0.0,Birds,Perching Birds,Swallows and Martins,Purple martin,YI001,1,200663
776,KAPC,NAPA COUNTY ARPT,California,CA,United States,Airplane,2.0,2/27/05 9:20,Day,Take-off run,...,,0.0,0.0,Birds,"Ducks, Geese, Swans, Waterfowl",Ducks,Ducks,J21,1,234151
23609,KSAW,SAWYER INTL,Michigan,MI,United States,Airplane,2.0,7/29/07 16:22,Day,Take-off run,...,,0.0,0.0,Birds,"Gulls, Terns, Sandpipers, Plovers, Skimmers","Gulls, terns, kittiwakes",Gulls,NE1,1,249880


In [3]:
# Work on an independent deep copy so that exploration and manipulation
# never modify the originally loaded frame.
my_copy = data.copy(deep=True)

# Cleaning and EDA

## nulls, duplicates, data types, size

In [6]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted).
my_dups = my_copy.duplicated(keep="first").sum()
my_dups

0

In [8]:
# Tally missing values column by column (isnull is pandas' alias of isna).
my_nulls = my_copy.isnull().sum(axis=0)
my_nulls

Airport: Code                                     0
Airport: Name                                     0
Origin State                                      0
Origin State Code                                 0
Country                                           0
Aircraft: Type                                  831
Aircraft: Number of engines                    2380
Collision Date and Time                           0
When: Time of day                               643
When: Phase of flight                           406
Effect: Amount of damage (detailed)           25187
Effect: Impact to flight                      25265
Effect: Indicated Damage                          0
Cost: Aircraft time out of service (hours)    25369
Cost: Total $                                     0
Days                                          25369
Feet above ground                              3426
Miles from airport                             8491
Wildlife: Animal Category                         0
Wildlife: Sp

It looks like there are a lot of nulls across a few features.

My research goal is to predict the cost-related features from aircraft type, time of day, phase of flight, and animal category.

For Aircraft Type, Time of Day, and Phase of Flight, I will simply drop the rows with nulls, as these make up a small percentage of the data.

For the Effect-related features with lots of nulls (Amount of damage and Impact to flight), I will drop the feature entirely, as speculating about what the values may be could introduce unintended bias.

For the Cost: Aircraft time out of service (hours) and Days features, I will treat the nulls as a cost of 0, as both features have the same number of nulls (25,369); alternatively, I could drop them as targets.



In [9]:
# Print a summary of the frame: number of rows and columns, plus each
# column's non-null count and dtype.
my_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28298 entries, 0 to 28297
Data columns (total 25 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Airport: Code                               28298 non-null  object 
 1   Airport: Name                               28298 non-null  object 
 2   Origin State                                28298 non-null  object 
 3   Origin State Code                           28298 non-null  object 
 4   Country                                     28298 non-null  object 
 5   Aircraft: Type                              27467 non-null  object 
 6   Aircraft: Number of engines                 25918 non-null  float64
 7   Collision Date and Time                     28298 non-null  object 
 8   When: Time of day                           27655 non-null  object 
 9   When: Phase of flight                       27892 non-null  object 
 10  Effect: Am

Several features have a large number of nulls.

In [10]:
# Number of distinct values in each column; missing values are not counted.
my_copy.nunique(axis=0, dropna=True)

Airport: Code                                   771
Airport: Name                                   771
Origin State                                     49
Origin State Code                                49
Country                                           1
Aircraft: Type                                    2
Aircraft: Number of engines                       4
Collision Date and Time                       27915
When: Time of day                                 4
When: Phase of flight                            11
Effect: Amount of damage (detailed)               4
Effect: Impact to flight                          4
Effect: Indicated Damage                          2
Cost: Aircraft time out of service (hours)      189
Cost: Total $                                  1097
Days                                            189
Feet above ground                               215
Miles from airport                               62
Wildlife: Animal Category                         4
Wildlife: Sp

In [None]:
#collisions per state

In [None]:
#collisions per year


In [None]:
#collisions per aircraft type

In [None]:
#collisions per month per year

In [None]:
#collisions per animal category

In [None]:
#collisions - Time of day

In [None]:
#collisions - Phase of flight