In [59]:
# Step 1: Load the dataset without headers so that pandas assigns default column names
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
raw_df = pd.read_csv("Delta Airlines Loyalty Customer Raw Data.csv", header=None)

# Step 2: Display the first few rows to verify that default column names are assigned correctly
print(raw_df.head())


                 0                                                  1   \
0               NaN  MAKE A COPY TO BEGIN YOUR WORK. THIS IS A READ...   
1  Record Creation                               Frequent Flier Number   
2        2024-02-27                                         7234617746   
3        2021-03-07                                         7234617746   
4        2023-01-31                                         7234617746   

           2           3          4              5                     6   \
0         NaN         NaN        NaN            NaN                   NaN   
1   Join Date  First Name  Last Name   Inquiry Type  Most Recent Flight #   
2  1988/07/08      debbie     spears  Flight Status             G35206241   
3  1988/07/08      debbie     Spears     New Flight             V81311927   
4  07/08/1988      debbie     spears  Cancel Flight             H82295055   

             7                  8                9               10  \
0           NaN      

In [61]:
#from the columns, it looks like index 1 contains the columns so thus we shall...
# Step 3: Display the first row to see if it contains the actual headers
print("First row of the dataset (potential headers):")
print(raw_df.iloc[1])


First row of the dataset (potential headers):
0               Record Creation 
1          Frequent Flier Number
2                      Join Date
3                     First Name
4                      Last Name
5                   Inquiry Type
6           Most Recent Flight #
7                   Lounge Used?
8              Departing Airport
9                Arrival Airport
10                Planned Snack?
11             Additional Snack?
12    # of Included Checked Bags
13       Total # of Checked Bags
14               Flight Delayed?
Name: 1, dtype: object


In [63]:
# Step 4: Reload dataset with the correct headers
raw_df = pd.read_csv('Delta Airlines Loyalty Customer Raw Data.csv', header=1)  # Set header row as row 1 (index 1)

# Reset index to avoid the first row being treated as an index
raw_df.reset_index(drop=True, inplace=True)

# Display the columns to confirm the headers are set correctly
#This is for verification
print("Column names in the dataset:")
print(raw_df.columns)


Column names in the dataset:
Index(['Record Creation ', 'Frequent Flier Number', 'Join Date', 'First Name',
       'Last Name', 'Inquiry Type', 'Most Recent Flight #', 'Lounge Used?',
       'Departing Airport', 'Arrival Airport', 'Planned Snack?',
       'Additional Snack?', '# of Included Checked Bags',
       'Total # of Checked Bags', 'Flight Delayed?'],
      dtype='object')


In [65]:
# Step 5: Handle missing values and correct data types

# Identify missing values
print("Missing values per column:")
print(raw_df.isnull().sum())

# Fill missing values with appropriate defaults or placeholders
raw_df['Frequent Flier Number'] = raw_df['Frequent Flier Number'].fillna("Unknown")  # Fill missing IDs with 'Unknown'
raw_df['Lounge Used?'] = raw_df['Lounge Used?'].fillna("False")                        # Fill missing lounge usage with 'False'
raw_df['# of Included Checked Bags'] = raw_df['# of Included Checked Bags'].fillna(0)  # Fill missing bag counts with 0

# Check for missing values after filling
print("\nMissing values after filling:")
print(raw_df.isnull().sum())

# Correcting data types accordingly
raw_df['Frequent Flier Number'] = raw_df['Frequent Flier Number'].astype(str)  # Ensure it's a string
raw_df['# of Included Checked Bags'] = raw_df['# of Included Checked Bags'].astype(int)  # Ensure it's an integer

# Verify the data types after conversion
print("\nData types after conversion:")
print(raw_df.dtypes)


Missing values per column:
Record Creation               461
Frequent Flier Number         532
Join Date                     535
First Name                    521
Last Name                     532
Inquiry Type                  521
Most Recent Flight #          519
Lounge Used?                  524
Departing Airport             499
Arrival Airport               500
Planned Snack?                529
Additional Snack?             540
# of Included Checked Bags    248
Total # of Checked Bags       533
Flight Delayed?               530
dtype: int64

Missing values after filling:
Record Creation               461
Frequent Flier Number           0
Join Date                     535
First Name                    521
Last Name                     532
Inquiry Type                  521
Most Recent Flight #          519
Lounge Used?                    0
Departing Airport             499
Arrival Airport               500
Planned Snack?                529
Additional Snack?             540
# of Includ

In [67]:
# Step 6: Convert columns to appropriate data types

# Convert 'Join Date' to datetime format
raw_df['Join Date'] = pd.to_datetime(raw_df['Join Date'], errors='coerce')

# Convert numeric columns to numeric types (errors='coerce' will handle invalid values by turning them to NaN)
raw_df['# of Included Checked Bags'] = pd.to_numeric(raw_df['# of Included Checked Bags'], errors='coerce')
raw_df['Total # of Checked Bags'] = pd.to_numeric(raw_df['Total # of Checked Bags'], errors='coerce')

# Convert boolean-like columns to actual boolean values
raw_df['Lounge Used?'] = raw_df['Lounge Used?'].apply(lambda x: True if str(x).lower() == 'true' else False)
raw_df['Flight Delayed?'] = raw_df['Flight Delayed?'].apply(lambda x: True if str(x).lower() == 'true' else False)

# Display the updated data types to verify and confirm the changes
print("Data types after conversion:")
print(raw_df.dtypes)


Data types after conversion:
Record Creation                       object
Frequent Flier Number                 object
Join Date                     datetime64[ns]
First Name                            object
Last Name                             object
Inquiry Type                          object
Most Recent Flight #                  object
Lounge Used?                            bool
Departing Airport                     object
Arrival Airport                       object
Planned Snack?                        object
Additional Snack?                     object
# of Included Checked Bags             int32
Total # of Checked Bags              float64
Flight Delayed?                         bool
dtype: object


In [69]:
# Step 7: Handle outliers and unusual values

# Check for negative values in numeric columns and set them to NaN
raw_df['# of Included Checked Bags'] = raw_df['# of Included Checked Bags'].apply(lambda x: x if x >= 0 else np.nan)
raw_df['Total # of Checked Bags'] = raw_df['Total # of Checked Bags'].apply(lambda x: x if x >= 0 else np.nan)

# For boolean-like columns, check for unexpected values and set them to NaN if needed
raw_df['Lounge Used?'] = raw_df['Lounge Used?'].apply(lambda x: x if isinstance(x, bool) else np.nan)
raw_df['Flight Delayed?'] = raw_df['Flight Delayed?'].apply(lambda x: x if isinstance(x, bool) else np.nan)

# Display data to check for any issues
print("Data after handling outliers and unusual values:")
print(raw_df.head())


Data after handling outliers and unusual values:
  Record Creation  Frequent Flier Number  Join Date First Name Last Name  \
0       2024-02-27          7234617746.0 1988-07-08     debbie    spears   
1       2021-03-07          7234617746.0 1988-07-08     debbie    Spears   
2       2023-01-31          7234617746.0        NaT     debbie    spears   
3       2021-08-21           551121699.0        NaT       kyle     Boone   
4       2022-03-01           551121699.0        NaT       KYLE     Boone   

    Inquiry Type Most Recent Flight #  Lounge Used? Departing Airport  \
0  Flight Status            G35206241         False               DFW   
1     New Flight            V81311927          True               JFK   
2  Cancel Flight            H82295055         False               MIA   
3     New Flight            E39886834          True               ttt   
4     New Flight            R04390271         False               SFO   

  Arrival Airport Planned Snack? Additional Snack?  \
0

In [71]:
# Step 8: Data Transformation

# Convert date columns to datetime format
raw_df['Join Date'] = pd.to_datetime(raw_df['Join Date'], errors='coerce')
raw_df['Record Creation '] = pd.to_datetime(raw_df['Record Creation '], errors='coerce')

# List of categorical columns
categorical_columns = ['Inquiry Type', 'Departing Airport', 'Arrival Airport', 'Planned Snack?', 
                       'Additional Snack?', 'Flight Delayed?']

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Apply label encoding to each categorical column
for col in categorical_columns:
    raw_df[col] = label_encoder.fit_transform(raw_df[col])

# Check the transformed data
print(raw_df.head())  # Show the first few rows of the dataframe to verify the transformation

  Record Creation  Frequent Flier Number  Join Date First Name Last Name  \
0       2024-02-27          7234617746.0 1988-07-08     debbie    spears   
1       2021-03-07          7234617746.0 1988-07-08     debbie    Spears   
2       2023-01-31          7234617746.0        NaT     debbie    spears   
3       2021-08-21           551121699.0        NaT       kyle     Boone   
4       2022-03-01           551121699.0        NaT       KYLE     Boone   

   Inquiry Type Most Recent Flight #  Lounge Used?  Departing Airport  \
0             2            G35206241         False                  3   
1             3            V81311927          True                  4   
2             0            H82295055         False                  6   
3             3            E39886834          True                 10   
4             3            R04390271         False                  9   

   Arrival Airport  Planned Snack?  Additional Snack?  \
0                2               0             

In [73]:
# Export the cleaned data to a CSV file
raw_df.to_csv('Delta Airlines Frequent Flier Raw Cleaned_data.csv', index=False)  # Ensure index is not included in the CSV
