In [79]:
import pandas as pd
import numpy as np
import math

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer

In [80]:
airline_data = pd.read_csv("project_airline_data.csv")

In [81]:
airline_data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,neutral,...,very satisfied,satisfied,neutral,satisfied,satisfied,very satisfied,very satisfied,25.0,18.0,neutral or dissatisfied
1,1,1,5047,Male,disloyal Customer,25,Business travel,Business,235,neutral,...,very dissatisfied,very dissatisfied,very satisfied,neutral,very dissatisfied,satisfied,very dissatisfied,1.0,6.0,neutral or dissatisfied
2,2,2,110028,Female,Loyal Customer,26,Business travel,Business,1142,dissatisfied,...,very satisfied,satisfied,neutral,satisfied,satisfied,satisfied,very satisfied,0.0,0.0,satisfied
3,3,3,24026,Female,Loyal Customer,25,Business travel,Business,562,dissatisfied,...,dissatisfied,dissatisfied,very satisfied,neutral,very dissatisfied,satisfied,dissatisfied,11.0,9.0,neutral or dissatisfied
4,4,4,119299,Male,Loyal Customer,61,Business travel,Business,214,neutral,...,neutral,neutral,satisfied,satisfied,neutral,neutral,neutral,0.0,0.0,satisfied


In [82]:
airline_data.shape

(129880, 26)

In [83]:
airline_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129880 entries, 0 to 129879
Data columns (total 26 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Unnamed: 0.1                       129880 non-null  int64  
 1   Unnamed: 0                         129880 non-null  int64  
 2   id                                 129880 non-null  int64  
 3   Gender                             129880 non-null  object 
 4   Customer Type                      129880 non-null  object 
 5   Age                                129880 non-null  int64  
 6   Type of Travel                     129880 non-null  object 
 7   Class                              129880 non-null  object 
 8   Flight Distance                    129880 non-null  int64  
 9   Inflight wifi service              129880 non-null  object 
 10  Departure/Arrival time convenient  129880 non-null  object 
 11  Ease of Online booking             1181

In [84]:
# Check the first few rows of the 'satisfaction' column
print(airline_data["satisfaction"].head())

0    neutral or dissatisfied
1    neutral or dissatisfied
2                  satisfied
3    neutral or dissatisfied
4                  satisfied
Name: satisfaction, dtype: object


In [85]:
# Check for NaN values in the 'satisfaction' column
nan_count = airline_data["satisfaction"].isnull().sum()
print(f"Number of NaN values in 'satisfaction': {nan_count}")

Number of NaN values in 'satisfaction': 0


In [86]:
# Splitting data and target
X = airline_data.drop(["satisfaction"], axis=1)
y = airline_data["satisfaction"]

# Now, split the data into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Print the shapes of the training and test sets
print("Train shape: ", X_train.shape)
print("Test shape: ", X_test.shape)


Train shape:  (103904, 25)
Test shape:  (25976, 25)


In [87]:
# Print columns
print(airline_data.columns)

# Check if 'Customer Type' exists in the DataFrame
if 'Customer Type' in airline_data.columns:
    airline_data['Customer Type'] = airline_data['Customer Type'].map({'Loyal Customer': 1, 'disloyal Customer': 0})
else:
    print("'Customer Type' column does not exist in the DataFrame.")

# Calculate the correlation matrix

# Remove Non-Numeric Columns
numeric_data = airline_data.select_dtypes(include=[np.number])
corr_matrix = numeric_data.corr()

# Convert Categorical Data to Numeric
airline_data['Gender'] = airline_data['Gender'].map({'Male': 1, 'Female': 0})
airline_data['Type of Travel'] = airline_data['Type of Travel'].map({'Business travel': 1, 'Personal Travel': 0})
airline_data['Class'] = airline_data['Class'].map({'Business': 1, 'Eco': 0})
airline_data['satisfaction'] = airline_data['Class'].map({'satisfied': 1, 'neutral or dissatisfied': 0})

# Drop columns with 'unnamed' and 'id number' in their names
columns_to_drop = [col for col in airline_data.columns if 'Unnamed' in col.lower() or 'id' in col.lower()]
airline_data = airline_data.drop(columns=columns_to_drop)

Index(['Unnamed: 0.1', 'Unnamed: 0', 'id', 'Gender', 'Customer Type', 'Age',
       'Type of Travel', 'Class', 'Flight Distance', 'Inflight wifi service',
       'Departure/Arrival time convenient', 'Ease of Online booking',
       'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort',
       'Inflight entertainment', 'On-board service', 'Leg room service',
       'Baggage handling', 'Checkin service', 'Inflight service',
       'Cleanliness', 'Departure Delay in Minutes', 'Arrival Delay in Minutes',
       'satisfaction'],
      dtype='object')


## Drop unnecessary columns

In [88]:
airline_data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,0,1,1,13,0,,460,neutral,satisfied,...,very satisfied,satisfied,neutral,satisfied,satisfied,very satisfied,very satisfied,25.0,18.0,
1,1,1,1,0,25,1,1.0,235,neutral,dissatisfied,...,very dissatisfied,very dissatisfied,very satisfied,neutral,very dissatisfied,satisfied,very dissatisfied,1.0,6.0,
2,2,2,0,1,26,1,1.0,1142,dissatisfied,dissatisfied,...,very satisfied,satisfied,neutral,satisfied,satisfied,satisfied,very satisfied,0.0,0.0,
3,3,3,0,1,25,1,1.0,562,dissatisfied,very satisfied,...,dissatisfied,dissatisfied,very satisfied,neutral,very dissatisfied,satisfied,dissatisfied,11.0,9.0,
4,4,4,1,1,61,1,1.0,214,neutral,neutral,...,neutral,neutral,satisfied,satisfied,neutral,neutral,neutral,0.0,0.0,


## Splitting data and target

In [89]:
X = airline_data.drop(["satisfaction"], axis=1)
y = airline_data["satisfaction"]

In [90]:
categorical_columns = X.select_dtypes("object").columns
numerical_columns = X.select_dtypes(["int", "float"]).columns

In [91]:
numerical_columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'Gender', 'Customer Type', 'Age',
       'Type of Travel', 'Class', 'Flight Distance',
       'Departure Delay in Minutes', 'Arrival Delay in Minutes'],
      dtype='object')

In [92]:
categorical_columns

Index(['Inflight wifi service', 'Departure/Arrival time convenient',
       'Ease of Online booking', 'Gate location', 'Food and drink',
       'Online boarding', 'Seat comfort', 'Inflight entertainment',
       'On-board service', 'Leg room service', 'Baggage handling',
       'Checkin service', 'Inflight service', 'Cleanliness'],
      dtype='object')

In [93]:
#Check for NaN values in y
print(y.isnull().sum())

129880


In [94]:
#If there are NaN values, you can choose to drop them
# Drop rows with NaN values in 'y'
valid_indices = y.dropna().index
X = X.loc[valid_indices]
y = y.loc[valid_indices]


In [95]:
#Check the Shape of X and y: This will give us an idea of how many samples 
#are left in the dataset after handling NaN values.
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

#Check for NaN Values Again: Let's ensure that there are no NaN values left in y
print(y.isnull().sum())


Shape of X: (0, 24)
Shape of y: (0,)
0


In [96]:
#Check the Unique Values in y: If y is a categorical target variable, 
#it's essential to ensure that there are at least two unique categories left after handling NaN values. 
#This is because the stratify parameter in train_test_split requires at least two categories to stratify the data.
print(y.unique())

[]


In [97]:
def print_stratified_percentages(data):
    classes = data.value_counts()
    for class_ in classes.keys():
        print(f"Class percentage: {class_} - ", f"{math.ceil((classes[class_] / data.shape[0])*100)}%")

In [98]:
print("For Original :")
print_stratified_percentages(y)

print("\nFor train :")
print_stratified_percentages(y_train)
print("\nFor test :")
print_stratified_percentages(y_test)


For Original :

For train :
Class percentage: neutral or dissatisfied -  57%
Class percentage: satisfied -  44%

For test :
Class percentage: neutral or dissatisfied -  57%
Class percentage: satisfied -  44%


## Encoding variables

### Categorical Variabes

#### There are two options for encoding

1. Label Encoding 
2. One-hot Encoding



#### Among the categorical columns - we are further splitting into two sets as per the dataset. 
1. One set will have columns related to the satisfaction of different services. 
2. Another set will have remaining categorical columns like age, customer type, etc. 

In [99]:
label_encoding_variables = ['Inflight wifi service', 'Departure/Arrival time convenient',
       'Ease of Online booking', 'Gate location', 'Food and drink',
       'Online boarding', 'Seat comfort', 'Inflight entertainment',
       'On-board service', 'Leg room service', 'Baggage handling',
       'Checkin service', 'Inflight service', 'Cleanliness']

In [100]:
onehot_encoding_variables = ['Gender', 'Customer Type', 'Type of Travel', 'Class']

### Label Encoding

Columns which is related to satisfaction of different services could be encoded as label encoding than One-hot encoding because it has some kine of implicit ordering like below:

["not applicable", "very dissatisfied", "dissatisfied", "neutral", "satisfied", "very satisfied"]

It can be encoded to 0 to 5.

Also we have 15 columns related to satisfaction alone. we have six categories. So one hot encoding will scale the features to 15*6 = 90 satisfaction columns. 

Thus, we will go with LabelEncoding to take advantage of implicit ordering and to keep columns in small numbers. 

In [101]:
label_encoding_df = X_train[label_encoding_variables]

In [102]:
for column in label_encoding_variables:
    label_encoder = LabelEncoder()
    label_encoding_df[column] = label_encoder.fit_transform(label_encoding_df[column])
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_encoding_df[column] = label_encoder.fit_transform(label_encoding_df[column])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_encoding_df[column] = label_encoder.fit_transform(label_encoding_df[column])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_encoding_df[column] = label_encod

In [103]:
label_encoding_df.head()

Unnamed: 0,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness
56347,5,5,5,5,0,1,0,0,3,0,4,1,3,0
103345,1,3,1,1,4,1,2,4,5,3,4,4,5,4
75976,5,1,1,1,5,3,1,3,3,5,2,1,3,4
121763,0,2,0,1,5,0,4,5,3,0,4,2,5,5
107028,2,4,2,3,4,0,2,0,0,2,0,1,0,0


In [104]:
X_train[label_encoding_variables].head()

Unnamed: 0,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness
56347,very satisfied,very satisfied,very satisfied,very satisfied,dissatisfied,neutral,dissatisfied,dissatisfied,satisfied,dissatisfied,very satisfied,neutral,satisfied,dissatisfied
103345,neutral,satisfied,neutral,neutral,very dissatisfied,neutral,satisfied,very dissatisfied,very satisfied,satisfied,very satisfied,very satisfied,very satisfied,very dissatisfied
75976,very satisfied,neutral,neutral,neutral,very satisfied,satisfied,neutral,satisfied,satisfied,very satisfied,satisfied,neutral,satisfied,very dissatisfied
121763,dissatisfied,not applicable,dissatisfied,neutral,very satisfied,dissatisfied,very satisfied,very satisfied,satisfied,dissatisfied,very satisfied,satisfied,very satisfied,very satisfied
107028,not applicable,very dissatisfied,not applicable,satisfied,very dissatisfied,dissatisfied,satisfied,dissatisfied,dissatisfied,not applicable,dissatisfied,neutral,dissatisfied,dissatisfied


If we compare label encoded data and original data, we could see that ordering is not followed for the values. For example, neutral=1, dissatisfied=0. 

So, It encoded from 0 to 5 but not in the order we wanted. Thus, we will manually enforce the order like below:

["not applicable", "very dissatisfied", "dissatisfied", "neutral", "satisfied", "very satisfied"] <br>
[0, 1, 2, 3, 4, 5]

### Label Encoding with order

In [105]:
label_encoding_df_train = X_train[label_encoding_variables]
label_encoding_df_test = X_test[label_encoding_variables]

In [106]:
satisfy_value_order = ["not applicable", "very dissatisfied", "dissatisfied", "neutral", "satisfied", "very satisfied"]

def apply_order(value):
    if value is None or pd.isna(value):
        return
    
    return satisfy_value_order.index(value)

In [107]:
for column in label_encoding_variables:
    
    label_encoding_df_train[column] = label_encoding_df_train[column].apply(apply_order)
    label_encoding_df_train[column] = label_encoding_df_train[column].astype("float")
    
    label_encoding_df_test[column] = label_encoding_df_test[column].apply(apply_order)
    label_encoding_df_test[column] = label_encoding_df_test[column].astype("float")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_encoding_df_train[column] = label_encoding_df_train[column].apply(apply_order)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_encoding_df_train[column] = label_encoding_df_train[column].astype("float")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_encoding_df_test[column] = label

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_encoding_df_train[column] = label_encoding_df_train[column].apply(apply_order)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_encoding_df_train[column] = label_encoding_df_train[column].astype("float")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_encoding_df_test[column] = label

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_encoding_df_test[column] = label_encoding_df_test[column].apply(apply_order)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_encoding_df_test[column] = label_encoding_df_test[column].astype("float")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_encoding_df_train[column] = label_en

In [108]:
label_encoding_df_train.head()

Unnamed: 0,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness
56347,5.0,5.0,5.0,5.0,2.0,3.0,2.0,2.0,4.0,2.0,5.0,3.0,4.0,2.0
103345,3.0,4.0,3.0,3.0,1.0,3.0,4.0,1.0,5.0,4.0,5.0,5.0,5.0,1.0
75976,5.0,3.0,3.0,3.0,5.0,4.0,3.0,4.0,4.0,5.0,4.0,3.0,4.0,1.0
121763,2.0,0.0,2.0,3.0,5.0,2.0,5.0,5.0,4.0,2.0,5.0,4.0,5.0,5.0
107028,0.0,1.0,0.0,4.0,1.0,2.0,4.0,2.0,2.0,0.0,2.0,3.0,2.0,2.0


In [109]:
X_train[label_encoding_variables].head()

Unnamed: 0,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness
56347,very satisfied,very satisfied,very satisfied,very satisfied,dissatisfied,neutral,dissatisfied,dissatisfied,satisfied,dissatisfied,very satisfied,neutral,satisfied,dissatisfied
103345,neutral,satisfied,neutral,neutral,very dissatisfied,neutral,satisfied,very dissatisfied,very satisfied,satisfied,very satisfied,very satisfied,very satisfied,very dissatisfied
75976,very satisfied,neutral,neutral,neutral,very satisfied,satisfied,neutral,satisfied,satisfied,very satisfied,satisfied,neutral,satisfied,very dissatisfied
121763,dissatisfied,not applicable,dissatisfied,neutral,very satisfied,dissatisfied,very satisfied,very satisfied,satisfied,dissatisfied,very satisfied,satisfied,very satisfied,very satisfied
107028,not applicable,very dissatisfied,not applicable,satisfied,very dissatisfied,dissatisfied,satisfied,dissatisfied,dissatisfied,not applicable,dissatisfied,neutral,dissatisfied,dissatisfied


### One-hot Encoding

One hot encoding can be done via both the ways. One using sklearn one hot encoder and another using pandas.get_dummies. Pandas function is prefered because it will return one-hot encoded features with proper name, on the other hand OneHotEncoder from sklearn will just return resultant feature names as numbers

In [110]:
onehot_encoding_df_train = X_train[onehot_encoding_variables]
onehot_encoding_df_test = X_test[onehot_encoding_variables]

In [111]:
onehot_encoding_df_test

Unnamed: 0,Gender,Customer Type,Type of Travel,Class
126586,Male,Loyal Customer,Personal Travel,Eco
12418,Male,Loyal Customer,Business travel,Business
128897,Female,Loyal Customer,Business travel,Business
2429,Female,Loyal Customer,Personal Travel,Eco
43539,Male,Loyal Customer,Business travel,Eco
...,...,...,...,...
31144,Male,Loyal Customer,Business travel,Business
113580,Female,Loyal Customer,Business travel,Eco
46451,Male,Loyal Customer,Personal Travel,Eco
118724,Male,disloyal Customer,Business travel,Eco


In [112]:
onehot_encoded_df_train = pd.DataFrame()
onehot_encoded_df_test = pd.DataFrame()


for column in onehot_encoding_variables:
    dummies_train = pd.get_dummies(onehot_encoding_df_train[column])
    onehot_encoded_df_train = pd.concat([onehot_encoded_df_train, dummies_train], axis =1)
    
    dummies_test = pd.get_dummies(onehot_encoding_df_test[column])
    onehot_encoded_df_test = pd.concat([onehot_encoded_df_test, dummies_test], axis =1)

In [113]:
onehot_encoded_df_train.head()

Unnamed: 0,Female,Male,Loyal Customer,disloyal Customer,Business travel,Personal Travel,Business,Eco,Eco Plus
56347,False,True,True,False,True,False,True,False,False
103345,False,True,True,False,False,True,False,True,False
75976,True,False,True,False,True,False,False,True,False
121763,False,True,False,True,True,False,True,False,False
107028,True,False,True,False,False,True,False,False,True


In [114]:
onehot_encoded_df_test.head()

Unnamed: 0,Female,Male,Loyal Customer,disloyal Customer,Business travel,Personal Travel,Business,Eco,Eco Plus
126586,False,True,True,False,False,True,False,True,False
12418,False,True,True,False,True,False,True,False,False
128897,True,False,True,False,True,False,True,False,False
2429,True,False,True,False,False,True,False,True,False
43539,False,True,True,False,True,False,False,True,False


### Encode Target Variable

In [115]:
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

In [116]:
y_train[0:10]

array([1, 0, 1, 0, 1, 1, 0, 1, 0, 0])

In [117]:
y_test[0:10]

array([0, 0, 1, 0, 1, 0, 1, 0, 1, 0])

### Put everything back into original dataframe

In [118]:
X_train.drop(categorical_columns, inplace=True, axis=1)
X_test.drop(categorical_columns, inplace=True, axis=1)

In [119]:
X_train = pd.concat([X_train,label_encoding_df_train, onehot_encoded_df_train], axis=1)
X_test = pd.concat([X_test,label_encoding_df_test, onehot_encoded_df_test], axis=1)

In [120]:
X_train.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Departure Delay in Minutes,...,Cleanliness,Female,Male,Loyal Customer,disloyal Customer,Business travel,Personal Travel,Business,Eco,Eco Plus
56347,56347,56347,80724,Male,Loyal Customer,31,Business travel,Business,448,0.0,...,2.0,False,True,True,False,True,False,True,False,False
103345,103345,103345,95334,Male,Loyal Customer,47,Personal Travel,Eco,189,12.0,...,1.0,False,True,True,False,False,True,False,True,False
75976,75976,75976,43290,Female,Loyal Customer,56,Business travel,Eco,436,0.0,...,1.0,True,False,True,False,True,False,False,True,False
121763,17859,17859,94446,Male,disloyal Customer,25,Business travel,Business,562,6.0,...,5.0,False,True,False,True,True,False,True,False,False
107028,3124,3124,35724,Female,Loyal Customer,49,Personal Travel,Eco Plus,2465,9.0,...,2.0,True,False,True,False,False,True,False,False,True


In [121]:
X_test.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Departure Delay in Minutes,...,Cleanliness,Female,Male,Loyal Customer,disloyal Customer,Business travel,Personal Travel,Business,Eco,Eco Plus
126586,22682,22682,40602,Male,Loyal Customer,49,Personal Travel,Eco,391,0.0,...,3.0,False,True,True,False,False,True,False,True,False
12418,12418,12418,6801,Male,Loyal Customer,56,Business travel,Business,3803,3.0,...,2.0,False,True,True,False,True,False,True,False,False
128897,24993,24993,95486,Female,Loyal Customer,57,Business travel,Business,3013,0.0,...,3.0,True,False,True,False,True,False,True,False,False
2429,2429,2429,46352,Female,Loyal Customer,17,Personal Travel,Eco,977,20.0,...,2.0,True,False,True,False,False,True,False,True,False
43539,43539,43539,48118,Male,Loyal Customer,39,Business travel,Eco,414,89.0,...,5.0,False,True,True,False,True,False,False,True,False


In [122]:
X_train.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'id', 'Gender', 'Customer Type', 'Age',
       'Type of Travel', 'Class', 'Flight Distance',
       'Departure Delay in Minutes', 'Arrival Delay in Minutes',
       'Inflight wifi service', 'Departure/Arrival time convenient',
       'Ease of Online booking', 'Gate location', 'Food and drink',
       'Online boarding', 'Seat comfort', 'Inflight entertainment',
       'On-board service', 'Leg room service', 'Baggage handling',
       'Checkin service', 'Inflight service', 'Cleanliness', 'Female', 'Male',
       'Loyal Customer', 'disloyal Customer', 'Business travel',
       'Personal Travel', 'Business', 'Eco', 'Eco Plus'],
      dtype='object')

In [123]:
X_test.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'id', 'Gender', 'Customer Type', 'Age',
       'Type of Travel', 'Class', 'Flight Distance',
       'Departure Delay in Minutes', 'Arrival Delay in Minutes',
       'Inflight wifi service', 'Departure/Arrival time convenient',
       'Ease of Online booking', 'Gate location', 'Food and drink',
       'Online boarding', 'Seat comfort', 'Inflight entertainment',
       'On-board service', 'Leg room service', 'Baggage handling',
       'Checkin service', 'Inflight service', 'Cleanliness', 'Female', 'Male',
       'Loyal Customer', 'disloyal Customer', 'Business travel',
       'Personal Travel', 'Business', 'Eco', 'Eco Plus'],
      dtype='object')

## Missing Value Fix

In [124]:
X_train.isna().sum()

Unnamed: 0.1                            0
Unnamed: 0                              0
id                                      0
Gender                                  0
Customer Type                           0
Age                                     0
Type of Travel                          0
Class                                   0
Flight Distance                         0
Departure Delay in Minutes           5311
Arrival Delay in Minutes             4675
Inflight wifi service                   0
Departure/Arrival time convenient       0
Ease of Online booking               9377
Gate location                           0
Food and drink                          0
Online boarding                         0
Seat comfort                            0
Inflight entertainment                  0
On-board service                        0
Leg room service                     9028
Baggage handling                        0
Checkin service                         0
Inflight service                  

In [125]:
X_test.isna().sum()

Unnamed: 0.1                            0
Unnamed: 0                              0
id                                      0
Gender                                  0
Customer Type                           0
Age                                     0
Type of Travel                          0
Class                                   0
Flight Distance                         0
Departure Delay in Minutes           1307
Arrival Delay in Minutes             1136
Inflight wifi service                   0
Departure/Arrival time convenient       0
Ease of Online booking               2352
Gate location                           0
Food and drink                          0
Online boarding                         0
Seat comfort                            0
Inflight entertainment                  0
On-board service                        0
Leg room service                     2335
Baggage handling                        0
Checkin service                         0
Inflight service                  

In [126]:
catogorical_columns_missing_values = ["Ease of Online booking", "Leg room service"]
numerical_columns_missing_values = ["Departure Delay in Minutes", "Arrival Delay in Minutes"]

### Impute Categorical columns

In [127]:
for column in catogorical_columns_missing_values:
    
    categorical_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

    X_train[column] = categorical_imputer.fit_transform(X_train[column].values.reshape(-1, 1))
    X_test[column] = categorical_imputer.transform(X_test[column].values.reshape(-1, 1))

### Impute Numerical columns

In [128]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer

# Separate numerical and categorical columns
X_train_num = X_train.select_dtypes(include=['float64', 'int64'])
X_train_cat = X_train.select_dtypes(exclude=['float64', 'int64'])

X_test_num = X_test.select_dtypes(include=['float64', 'int64'])
X_test_cat = X_test.select_dtypes(exclude=['float64', 'int64'])

# Impute numerical columns
imp_mean = IterativeImputer(random_state=0)
X_train_num_imputed = pd.DataFrame(imp_mean.fit_transform(X_train_num), columns=X_train_num.columns)
X_test_num_imputed = pd.DataFrame(imp_mean.transform(X_test_num), columns=X_test_num.columns)

# Impute categorical columns (if needed)
imp_freq = SimpleImputer(strategy='most_frequent')
X_train_cat_imputed = pd.DataFrame(imp_freq.fit_transform(X_train_cat), columns=X_train_cat.columns)
X_test_cat_imputed = pd.DataFrame(imp_freq.transform(X_test_cat), columns=X_test_cat.columns)

# Combine imputed numerical and categorical columns
X_train_imputed = pd.concat([X_train_num_imputed, X_train_cat_imputed], axis=1)
X_test_imputed = pd.concat([X_test_num_imputed, X_test_cat_imputed], axis=1)

# Now, X_train_imputed and X_test_imputed are your imputed datasets


In [129]:
X_train.isna().sum()

Unnamed: 0.1                            0
Unnamed: 0                              0
id                                      0
Gender                                  0
Customer Type                           0
Age                                     0
Type of Travel                          0
Class                                   0
Flight Distance                         0
Departure Delay in Minutes           5311
Arrival Delay in Minutes             4675
Inflight wifi service                   0
Departure/Arrival time convenient       0
Ease of Online booking                  0
Gate location                           0
Food and drink                          0
Online boarding                         0
Seat comfort                            0
Inflight entertainment                  0
On-board service                        0
Leg room service                        0
Baggage handling                        0
Checkin service                         0
Inflight service                  

## Standardizing / Scaling

In [130]:
from sklearn.preprocessing import MinMaxScaler

# Separate numerical columns
X_train_num = X_train.select_dtypes(include=['float64', 'int64'])
X_test_num = X_test.select_dtypes(include=['float64', 'int64'])

# Apply MinMaxScaler to numerical columns
scaler = MinMaxScaler()
X_train_num_scaled = pd.DataFrame(scaler.fit_transform(X_train_num), columns=X_train_num.columns)
X_test_num_scaled = pd.DataFrame(scaler.transform(X_test_num), columns=X_test_num.columns)

# Combine scaled numerical columns with original categorical columns
X_train_scaled = pd.concat([X_train_num_scaled, X_train.select_dtypes(exclude=['float64', 'int64'])], axis=1)
X_test_scaled = pd.concat([X_test_num_scaled, X_test.select_dtypes(exclude=['float64', 'int64'])], axis=1)


In [131]:
#One-Hot Encoding
import pandas as pd

# Assuming X_train is a DataFrame
X_train_encoded = pd.get_dummies(X_train)
X_test_encoded = pd.get_dummies(X_test)

# Print the encoded data
print("Encoded Training Data:")
print(X_train_encoded.head())  # Displaying the first few rows for brevity
print("\nEncoded Test Data:")
print(X_test_encoded.head())  # Displaying the first few rows for brevity

Encoded Training Data:
        Unnamed: 0.1  Unnamed: 0     id  Age  Flight Distance  \
56347          56347       56347  80724   31              448   
103345        103345      103345  95334   47              189   
75976          75976       75976  43290   56              436   
121763         17859       17859  94446   25              562   
107028          3124        3124  35724   49             2465   

        Departure Delay in Minutes  Arrival Delay in Minutes  \
56347                          0.0                       3.0   
103345                        12.0                       7.0   
75976                          0.0                       0.0   
121763                         6.0                       0.0   
107028                         9.0                       0.0   

        Inflight wifi service  Departure/Arrival time convenient  \
56347                     5.0                                5.0   
103345                    3.0                                4.0 

In [132]:
#Min-Max Scaling
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Fit the scaler on the encoded training data and transform it
X_train_scaled = scaler.fit_transform(X_train_encoded)

# Transform the encoded test data using the same scaler
X_test_scaled = scaler.transform(X_test_encoded)

# Print the scaled data
print("\nScaled Training Data (First 5 rows):")
for row in X_train_scaled[:5]:  # Displaying the first 5 rows for brevity
    print(row)

print("\nScaled Test Data (First 5 rows):")
for row in X_test_scaled[:5]:  # Displaying the first 5 rows for brevity
    print(row)



Scaled Training Data (First 5 rows):
[0.54230388 0.54230388 0.62151882 0.30769231 0.0842084  0.
 0.00189394 1.         1.         1.         1.         0.4
 0.6        0.25       0.4        0.8        0.4        1.
 0.5        0.8        0.4        0.         1.         1.
 0.         1.         0.         1.         0.         0.
 0.         1.         1.         0.         1.         0.
 1.         0.         0.        ]
[0.99462961 0.99462961 0.73400987 0.51282051 0.0319063  0.0091954
 0.00441919 0.6        0.8        0.6        0.6        0.2
 0.6        0.75       0.2        1.         0.8        1.
 1.         1.         0.2        0.         1.         1.
 0.         0.         1.         0.         1.         0.
 0.         1.         1.         0.         0.         1.
 0.         1.         0.        ]
[0.73122047 0.73122047 0.33329227 0.62820513 0.08178514 0.
 0.         1.         0.6        0.6        0.6        1.
 0.8        0.5        0.8        0.8        1.         0

In [133]:
#Standard Scaling
from sklearn.preprocessing import StandardScaler

# Assuming X_train and X_test are your data splits
scaler = StandardScaler()

# Fit the scaler on the training data and transform it
X_train_scaled = scaler.fit_transform(X_train_encoded)

# Transform the test data using the same scaler
X_test_scaled = scaler.transform(X_test_encoded)

print("\nScaled Training Data (First 5 rows):")
for row in X_train_scaled[:5]:  # Displaying the first 5 rows for brevity
    print(row)



Scaled Training Data (First 5 rows):
[ 0.39039688  0.39039688  0.42073865 -0.55588889 -0.74445713 -0.38881705
 -0.31650112  1.70829964  1.27186294  1.66143963  1.58079347 -0.90475974
 -0.18545704 -1.09175691 -1.01723482  0.47952429 -1.10898473  1.15880708
 -0.24233373  0.30428728 -0.98009233 -1.01532374  1.01532374  0.47331516
 -0.47331516  0.66807572 -0.66807572  1.04350396 -0.90209561 -0.27982305
 -1.01532374  1.01532374  0.47331516 -0.47331516  0.66807572 -0.66807572
  1.04350396 -0.90209561 -0.27982305]
[ 1.89581296  1.89581296  0.81018624  0.5015158  -1.00402545 -0.0738459
 -0.21202241  0.20419689  0.6164821   0.16628139  0.0152928  -1.65624184
 -0.18545704  0.42395175 -1.76655081  1.25601795  0.46476985  1.15880708
  1.33710697  1.15417479 -1.74124603 -1.01532374  1.01532374  0.47331516
 -0.47331516 -1.49683632  1.49683632 -0.95830974  1.10852995 -0.27982305
 -1.01532374  1.01532374  0.47331516 -0.47331516 -1.49683632  1.49683632
 -0.95830974  1.10852995 -0.27982305]
[ 1.0191430

In [134]:
X_train_scaled[0]

array([ 0.39039688,  0.39039688,  0.42073865, -0.55588889, -0.74445713,
       -0.38881705, -0.31650112,  1.70829964,  1.27186294,  1.66143963,
        1.58079347, -0.90475974, -0.18545704, -1.09175691, -1.01723482,
        0.47952429, -1.10898473,  1.15880708, -0.24233373,  0.30428728,
       -0.98009233, -1.01532374,  1.01532374,  0.47331516, -0.47331516,
        0.66807572, -0.66807572,  1.04350396, -0.90209561, -0.27982305,
       -1.01532374,  1.01532374,  0.47331516, -0.47331516,  0.66807572,
       -0.66807572,  1.04350396, -0.90209561, -0.27982305])

In [135]:
X_test_scaled[0]

array([-0.68794331, -0.68794331, -0.64876279,  0.63369139, -0.80158221,
       -0.38881705, -0.39486014,  0.20419689,  0.6164821 ,  1.66143963,
        0.79804313, -0.15327764,  1.29614593, -0.33390258, -0.26791883,
        1.25601795, -0.32210744,  1.15880708,  0.54738662,  0.30428728,
       -0.21893864, -1.01532374,  1.01532374,  0.47331516, -0.47331516,
       -1.49683632,  1.49683632, -0.95830974,  1.10852995, -0.27982305,
       -1.01532374,  1.01532374,  0.47331516, -0.47331516, -1.49683632,
        1.49683632, -0.95830974,  1.10852995, -0.27982305])

In [136]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

In [137]:
# Seperate the features (x) and target variable (y) in your train and test data
X_train = X_train_scaled
X_test = X_test_scaled

X_train

array([[ 0.39039688,  0.39039688,  0.42073865, ...,  1.04350396,
        -0.90209561, -0.27982305],
       [ 1.89581296,  1.89581296,  0.81018624, ..., -0.95830974,
         1.10852995, -0.27982305],
       [ 1.01914305,  1.01914305, -0.57711083, ..., -0.95830974,
         1.10852995, -0.27982305],
       ...,
       [ 0.08882038,  0.08882038,  1.72473499, ...,  1.04350396,
        -0.90209561, -0.27982305],
       [-0.91290048, -0.91290048, -1.6503708 , ..., -0.95830974,
         1.10852995, -0.27982305],
       [ 1.07253954,  1.07253954,  0.74069344, ...,  1.04350396,
        -0.90209561, -0.27982305]])

In [144]:
# Before imputation
print("NaN values in X_train before imputation:", np.isnan(X_train).sum().sum())

# Create an imputer object with a mean filling strategy
imputer = SimpleImputer(strategy='mean')

# Apply the imputer to our data
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# After imputation
print("NaN values in X_train_imputed after imputation:", np.isnan(X_train_imputed).sum().sum())


# Now, you can use X_train_imputed for training
# We need to specify a range of values for the hyperparameters we want to tune.
param_grid = {
    'alpha': [0.0001, 0.001, 0.01, 0.1],
    'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge']
}

# Before grid search
print("NaN values in X_train_imputed before grid search:", np.isnan(X_train_imputed).sum().sum())

classifier = SGDClassifier(random_state=42)
# GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(classifier, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# fiting the grid search to the training data
grid_search.fit(X_train_imputed, y_train)  # Use X_train_imputed here

# Best Hyperparameters and estimator
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

print("Best Hyperparameters:", best_params)
print("\nBest Estimator:", best_estimator)


NaN values in X_train before imputation: 9986
NaN values in X_train_imputed after imputation: 0
NaN values in X_train_imputed before grid search: 0




Best Hyperparameters: {'alpha': 0.01, 'loss': 'hinge'}

Best Estimator: SGDClassifier(alpha=0.01, random_state=42)
