In [20]:
import pandas as pd
import numpy as np

In [2]:
# Load the airline survey responses from a CSV file given by a relative path.
# NOTE(review): the recorded preview shows 'Unnamed: 0.1' and 'Unnamed: 0' columns,
# which look like index columns written out by earlier saves — confirm whether
# they should be dropped.
airlines = pd.read_csv("airlines_final.csv")
# Preview the first rows to check which columns were read.
airlines.head()

Unnamed: 0.1,Unnamed: 0,id,day,airline,destination,dest_region,dest_size,boarding_area,dept_time,wait_min,cleanliness,safety,satisfaction
0,0,1351,Tuesday,UNITED INTL,KANSAI,Asia,Hub,Gates 91-102,2018-12-31,115.0,Clean,Neutral,Very satisfied
1,1,373,Friday,ALASKA,SAN JOSE DEL CABO,Canada/Mexico,Small,Gates 50-59,2018-12-31,135.0,Clean,Very safe,Very satisfied
2,2,2820,Thursday,DELTA,LOS ANGELES,West US,Hub,Gates 40-48,2018-12-31,70.0,Average,Somewhat safe,Neutral
3,3,1157,Tuesday,SOUTHWEST,LOS ANGELES,West US,Hub,Gates 20-39,2018-12-31,190.0,Clean,Very safe,Somewhat satsified
4,4,2992,Wednesday,AMERICAN,MIAMI,East US,Hub,Gates 50-59,2018-12-31,559.0,Somewhat clean,Very safe,Somewhat satsified


In [6]:
# Reference table holding the accepted answer options for each survey question.
cleanliness_levels = ['Clean', 'Average', 'Somewhat clean', 'Somewhat dirty', 'Dirty']
safety_levels = ['Neutral', 'Very safe', 'Somewhat safe', 'Very unsafe', 'Somewhat unsafe']
satisfaction_levels = ['Very satisfied', 'Neutral', 'Somewhat satisfied',
                       'Somewhat unsatisfied', 'Very unsatisfied']

# One column per survey question; each column lists that question's valid labels.
data = dict(
    cleanliness=cleanliness_levels,
    safety=safety_levels,
    satisfaction=satisfaction_levels,
)

categories = pd.DataFrame(data)
categories

Unnamed: 0,cleanliness,safety,satisfaction
0,Clean,Neutral,Very satisfied
1,Average,Very safe,Neutral
2,Somewhat clean,Somewhat safe,Somewhat satisfied
3,Somewhat dirty,Very unsafe,Somewhat unsatisfied
4,Dirty,Somewhat unsafe,Very unsatisfied


### Finding inconsistent categories

In [7]:
# Show the distinct answers recorded in each survey column of airlines,
# one labelled line per column followed by a blank separator line.
for label, column in (
    ('Cleanliness: ', 'cleanliness'),
    ('Safety: ', 'safety'),
    ('Satisfaction: ', 'satisfaction'),
):
    print(label, airlines[column].unique(), "\n")

Cleanliness:  ['Clean' 'Average' 'Somewhat clean' 'Somewhat dirty' 'Dirty'] 

Safety:  ['Neutral' 'Very safe' 'Somewhat safe' 'Very unsafe' 'Somewhat unsafe'] 

Satisfaction:  ['Very satisfied' 'Neutral' 'Somewhat satsified' 'Somewhat unsatisfied'
 'Very unsatisfied'] 



In [5]:
# Cleanliness labels that occur in airlines but are absent from the reference table
observed_clean = set(airlines['cleanliness'])
allowed_clean = set(categories["cleanliness"])
cat_clean = observed_clean - allowed_clean
cat_clean

set()

In [6]:
# Build a boolean mask over airlines: True where the row's cleanliness value is one
# of the labels collected in cat_clean (i.e. not listed in categories).
cat_clean_rows = airlines['cleanliness'].isin(cat_clean)
# Display the mask itself.
cat_clean_rows

0       False
1       False
2       False
3       False
4       False
        ...  
2472    False
2473    False
2474    False
2475    False
2476    False
Name: cleanliness, Length: 2477, dtype: bool

In [7]:
# Show the rows of airlines flagged by the cat_clean_rows mask
airlines.loc[cat_clean_rows]

Unnamed: 0.1,Unnamed: 0,id,day,airline,destination,dest_region,dest_size,boarding_area,dept_time,wait_min,cleanliness,safety,satisfaction


In [8]:
# Collect the cleanliness labels used in airlines that the categories table does not list:
# turn both columns into sets and subtract the reference labels from the observed ones.
observed_clean = set(airlines['cleanliness'])
allowed_clean = set(categories['cleanliness'])
cat_clean = observed_clean - allowed_clean

# Flag the airlines rows carrying one of those labels, then print the flagged rows.
cat_clean_rows = airlines['cleanliness'].isin(cat_clean)
flagged = airlines.loc[cat_clean_rows]
print(flagged)

In [10]:
# Invert the mask to keep only rows whose cleanliness value is a listed category
consistent_rows = ~cat_clean_rows
print(airlines[consistent_rows])

      Unnamed: 0    id        day        airline        destination  \
0              0  1351    Tuesday    UNITED INTL             KANSAI   
1              1   373     Friday         ALASKA  SAN JOSE DEL CABO   
2              2  2820   Thursday          DELTA        LOS ANGELES   
3              3  1157    Tuesday      SOUTHWEST        LOS ANGELES   
4              4  2992  Wednesday       AMERICAN              MIAMI   
...          ...   ...        ...            ...                ...   
2472        2804  1475    Tuesday         ALASKA       NEW YORK-JFK   
2473        2805  2222   Thursday      SOUTHWEST            PHOENIX   
2474        2806  2684     Friday         UNITED            ORLANDO   
2475        2807  2549    Tuesday        JETBLUE         LONG BEACH   
2476        2808  2162   Saturday  CHINA EASTERN            QINGDAO   

        dest_region dest_size boarding_area   dept_time  wait_min  \
0              Asia       Hub  Gates 91-102  2018-12-31     115.0   
1     Can

## Working with Categorical Variables

### Ex 1: Inconsistent categories

In this exercise, you will examine two `categorical` columns from this DataFrame, `dest_region` and `dest_size` respectively, assess how to address them and make sure that they are cleaned and ready for analysis.

In [32]:
# Preview the first rows of airlines before cleaning dest_region and dest_size.
# NOTE(review): the recorded output already shows lowercased dest_region values and
# the wait_type / day_week columns, which are only created by later cells — this cell
# appears to have been run out of order; re-run the notebook top to bottom.
airlines.head()

Unnamed: 0.1,Unnamed: 0,id,day,airline,destination,dest_region,dest_size,boarding_area,dept_time,wait_min,cleanliness,safety,satisfaction,wait_type,day_week
0,0,1351,Tuesday,UNITED INTL,KANSAI,asia,Hub,Gates 91-102,2018-12-31,115.0,Clean,Neutral,Very satisfied,medium,weekday
1,1,373,Friday,ALASKA,SAN JOSE DEL CABO,canada/mexico,Small,Gates 50-59,2018-12-31,135.0,Clean,Very safe,Very satisfied,medium,weekday
2,2,2820,Thursday,DELTA,LOS ANGELES,west us,Hub,Gates 40-48,2018-12-31,70.0,Average,Somewhat safe,Neutral,medium,weekday
3,3,1157,Tuesday,SOUTHWEST,LOS ANGELES,west us,Hub,Gates 20-39,2018-12-31,190.0,Clean,Very safe,Somewhat satsified,long,weekday
4,4,2992,Wednesday,AMERICAN,MIAMI,east us,Hub,Gates 50-59,2018-12-31,559.0,Somewhat clean,Very safe,Somewhat satsified,long,weekday


In [11]:
# Show the distinct values of dest_region to spot inconsistent labels
# (dest_size is inspected separately in the next cell).

airlines['dest_region'].unique()

array(['Asia', 'Canada/Mexico', 'West US', 'East US', 'Midwest US',
       'EAST US', 'Middle East', 'Europe', 'eur', 'Central/South America',
       'Australia/New Zealand', 'middle east'], dtype=object)

In [12]:
# Show the distinct values of dest_size to spot inconsistent labels.
airlines['dest_size'].unique()

array(['Hub', 'Small', '    Hub', 'Medium', 'Large', 'Hub     ',
       '    Small', 'Medium     ', '    Medium', 'Small     ',
       '    Large', 'Large     '], dtype=object)

In [14]:
# The dest_region column has inconsistent values due to capitalization
# (for example both 'East US' and 'EAST US' appear in its unique values).

# Lowercase every dest_region value so that differently-cased spellings collapse
# into one label. This overwrites the column in place.
airlines['dest_region'] = airlines['dest_region'].str.lower()

In [15]:
# dest_region uses the abbreviation "eur" for some rows; map it to the full label.
region_fixes = {'eur': 'europe'}

airlines['dest_region'] = airlines['dest_region'].replace(region_fixes)

In [16]:
# Verify the clean-up: print the distinct dest_region values that remain after
# lowercasing and remapping 'eur'.
print(airlines['dest_region'].unique())

array(['asia', 'canada/mexico', 'west us', 'east us', 'midwest us',
       'middle east', 'europe', 'central/south america',
       'australia/new zealand'], dtype=object)

In [17]:
# The dest_size column has inconsistent values due to leading and trailing spaces
# (for example '    Hub' and 'Hub     ' appear alongside 'Hub').
# Remove the surrounding whitespace with the .str.strip() accessor method.
# This overwrites the column in place.

airlines['dest_size'] = airlines['dest_size'].str.strip()

In [18]:
# Verify the clean-up: print the distinct dest_size values that remain after
# stripping whitespace.

print(airlines['dest_size'].unique())

['Hub' 'Small' 'Medium' 'Large']


### Ex 2: Remapping categories

To better understand survey respondents from `airlines`, you want to find out if there is a `relationship` between certain responses and the `day of the week` and `wait time` at the gate.

The `airlines` DataFrame contains the `day` and `wait_min` columns, which are `categorical` and `numerical` respectively.

The `day` column contains the `exact day` a flight took place

and `wait_min` contains the `amount of minutes` it took travelers to wait at the gate.

To make your analysis easier, you want to create two new categorical variables:

1. `wait_type`: `'short'` for 0-60 min, `'medium'` for 60-180 min and `'long'` for 180+ min

2. `day_week`: `'weekday'` if the day falls on a weekday, `'weekend'` if it falls on the weekend.

In [21]:
# Names of the wait_type buckets, listed from shortest to longest wait
label_names = ["short", "medium", "long"]
# Bucket edges in minutes; np.inf leaves the last bucket open-ended.
# There is one more edge than there are names.
label_ranges = [0, 60, 180, np.inf]

In [24]:
# Create the wait_type column from wait_min with pd.cut(), passing label_ranges as the
# bin edges and label_names as the bin labels.
# include_lowest=True closes the first bin on the left, so a wait of exactly 0 minutes
# is labelled 'short'; with the default it falls outside (0, 60] and becomes NaN.

airlines['wait_type'] = pd.cut(
    airlines["wait_min"],
    bins=label_ranges,
    labels=label_names,
    include_lowest=True,
)

In [25]:
# Lookup from each day name to its 'weekday' / 'weekend' label.
weekday_names = ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday')
weekend_names = ('Saturday', 'Sunday')

mappings = {
    **dict.fromkeys(weekday_names, 'weekday'),
    **dict.fromkeys(weekend_names, 'weekend'),
}

In [27]:
# Create the day_week column by using .replace(): each day name that is a key of
# mappings is swapped for its 'weekday' / 'weekend' label.
# Values of day that are not keys of mappings are left unchanged by .replace().
airlines['day_week'] = airlines['day'].replace(mappings)
#airlines['day_week']

### Ex 3: Cleaning Text Data:

While collecting survey respondent metadata in the `airlines` DataFrame, the full name of respondents was saved in the `full_name` column. However upon closer inspection, you found that a lot of the different names are prefixed by `honorifics` such as `"Dr."`, `"Mr."`, `"Ms."` and `"Miss"`.

Your ultimate objective is to create two new columns named `first_name` and `last_name`, containing the first and last names of respondents respectively. Before doing so however, you need to `remove honorifics`.

In [30]:
# airlines['full_name']

In [None]:
# Remove the honorifics "Dr.", "Mr.", "Miss" and "Ms." from full_name.
# regex=False treats each honorific as literal text. Without it, older pandas versions
# interpret the pattern as a regular expression, where "." matches any character.
for honorific in ("Dr.", "Mr.", "Miss", "Ms."):
    airlines['full_name'] = airlines['full_name'].str.replace(honorific, "", regex=False)

# Dropping a leading honorific leaves the space that followed it; strip it so the
# name can later be split cleanly into first and last name.
airlines['full_name'] = airlines['full_name'].str.strip()

In [None]:
# Check with .str.contains() that full_name no longer contains any of the honorifics.
# The dots are escaped so they match a literal "." only; unescaped, "Mr." would also
# match e.g. "Mrs" and "Dr." would match e.g. "Drew", failing the check spuriously.
# na=False counts missing names as "no honorific present".
assert not airlines['full_name'].str.contains(r'Ms\.|Mr\.|Miss|Dr\.', na=False).any()

### Ex 4: Keeping it descriptive

To further understand travelers' experiences in the San Francisco Airport, the quality assurance department sent out a qualitative questionnaire to all travelers who gave the airport the worst score on all possible categories. Their response is stored in the `survey_response` column. Upon a closer look, you realized a few of the answers gave the shortest possible character amount without much substance.

In this exercise, you will `isolate` the responses with a `character` count `higher than 40`, and use an assert statement to make sure your new DataFrame contains only responses longer than 40 characters.

In [None]:
# Using the airlines DataFrame, store the length of each entry of the
# survey_response column in resp_length by using .str.len().

resp_length = airlines["survey_response"].str.len()
# Print the per-row lengths for inspection.
print(resp_length)

In [None]:
# Keep only the rows of airlines whose response is longer than 40 characters.

is_descriptive = resp_length > 40
airlines_survey = airlines.loc[is_descriptive]
print(airlines_survey)

In [None]:
# Assert that the shortest survey_response in airlines_survey is strictly longer
# than 40 characters (a response of exactly 40 characters would fail this check).
assert airlines_survey['survey_response'].str.len().min() > 40

In [None]:
# Print the survey_response column of the filtered airlines_survey DataFrame.
print(airlines_survey['survey_response'])