## **Analysis of airline passenger satisfaction using regression models including decision tree, linear regression, random forest, and XGBoost. Dataset comprises 120,000+ passenger evaluations on cleanliness, comfort, service, and overall experience.**

Data set description

**Importing the libraries**

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# **Import Data set**

In [2]:
# Read the raw CSV, then treat the final column as the target and every
# column before it as a feature.
dataset = pd.read_csv("/content/Airplane.csv")
last_col = dataset.shape[1] - 1
X, y = dataset.iloc[:, :last_col], dataset.iloc[:, last_col]

In [None]:
dataset.head(10)

In [None]:
X.head(10)

In [None]:
y.head(10)

In [None]:
X.shape

In [None]:
X.describe()

In [None]:
dataset.isnull().sum()

`Arrival Delay in Minutes` has 310 missing values, which we have to fill or drop before modelling.

# **DROP ID COLUMN**

In [None]:
print(X.columns)

In [10]:
# Drop the `id` column from the working frame.
# Reassigning instead of using inplace=True, and ignoring an already-missing
# column, keeps this cell safe to re-run: the in-place version raised
# KeyError on a second execution.
dataset = dataset.drop(columns="id", errors="ignore")

In [None]:
dataset

# **Detecting Outliers in Age column**

In [12]:
def detect_outliers(data, threshold=3.0):
    """Return the values in ``data`` whose absolute z-score exceeds ``threshold``.

    Parameters
    ----------
    data : array-like of numbers
        Sample to screen (list, NumPy array, pandas Series, ...).
    threshold : float, default 3.0
        Number of standard deviations from the mean beyond which a value is
        reported as an outlier.

    Returns
    -------
    list
        The outlying values, in their original order. A fresh list is built
        on every call, so repeated calls never accumulate earlier results.
        Empty when ``data`` is empty or has no spread (std of 0 or NaN),
        because z-scores are undefined in that case.
    """
    values = np.asarray(data)
    if values.size == 0:
        return []

    std = values.std()
    # `not std > 0` also catches a NaN standard deviation.
    if not std > 0:
        return []

    # Vectorised z-score: distance from the mean in units of std.
    z_scores = (values - values.mean()) / std
    return values[np.abs(z_scores) > threshold].tolist()

In [13]:
chunk_size = 100000

# Stream the CSV in chunks and keep only the Age values. `usecols` makes
# pandas parse just that one column instead of loading every column of every
# chunk only to discard all but one.
column_data = []
for chunk in pd.read_csv("/content/Airplane.csv", usecols=["Age"], chunksize=chunk_size):
    column_data.extend(chunk["Age"].values)

# Convert the collected values to a NumPy array
column_array = np.array(column_data)

print(column_array)

[13 25 26 ... 30 22 27]


In [14]:
# Run the outlier screen on the Age values; detect_outliers returns a list.
outlier_pt=detect_outliers(column_array)

In [None]:
outlier_pt

# **One Hot Encoding Categorical Columns**

In [16]:
# Re-derive features and target now that `id` has been dropped from `dataset`.
# The explicit .copy() detaches X from `dataset`, so later modifications of X
# cannot raise SettingWithCopyWarning or write through to the source frame.
X = dataset.iloc[:, :-1].copy()
y = dataset.iloc[:, -1]

In [None]:
type(X)

In [None]:
X.head(10)

In [None]:
type(X)

In [20]:
# Drop `Customer Type`, then one-hot encode the remaining categorical columns.
# Building a new frame instead of mutating X in place keeps the cell
# re-runnable: errors="ignore" tolerates the column already being gone, and
# get_dummies leaves an already-encoded (all-numeric) frame unchanged.
# NOTE(review): `Customer Type` is discarded rather than encoded; confirm
# this is intentional.
X = pd.get_dummies(X.drop(columns="Customer Type", errors="ignore"))

In [None]:
X

In [None]:
type(X)

In [None]:
X.info()

# **Missing Values**



In [None]:
X.isnull().sum()

In [None]:
# Discard every row that has a missing value in any column, then confirm that
# no nulls remain.
# NOTE(review): X was sliced from `dataset` earlier and is not touched here,
# so it still contains the rows removed by this cell.
dataset = dataset.dropna(axis=0, how="any")
dataset.isna().sum()

In [43]:
# Renumber the rows 0..n-1 and discard the old index labels.
dataset = dataset.reset_index(drop=True)

In [44]:
X.shape

(103904, 25)

# **Scaling the Flight Distance column, Departure delay in minutes, arrival delay in minutes**

In [45]:
from sklearn.preprocessing import StandardScaler

# Standardise one feature column with its own freshly fitted scaler.
# NOTE(review): the scaler is fitted on all of X, before the train/test split
# further down, so test rows contribute to the fitted mean and std.
scaled_col = 'Flight Distance'
scaler = StandardScaler()
X[scaled_col] = scaler.fit_transform(X[[scaled_col]])


In [46]:
print(X['Flight Distance'])

0        -0.731539
1        -0.957184
2        -0.047584
3        -0.629246
4        -0.978244
            ...   
103899   -1.000307
103900    1.160869
103901    0.807860
103902   -0.189991
103903    0.535081
Name: Flight Distance, Length: 103904, dtype: float64


In [47]:
from sklearn.preprocessing import StandardScaler

# Standardise one feature column with its own freshly fitted scaler.
# NOTE(review): the scaler is fitted on all of X, before the train/test split
# further down, so test rows contribute to the fitted mean and std.
scaled_col = 'Departure Delay in Minutes'
scaler = StandardScaler()
X[scaled_col] = scaler.fit_transform(X[[scaled_col]])

In [48]:
print(X['Departure Delay in Minutes'])

0         0.266393
1        -0.361375
2        -0.387532
3        -0.099805
4        -0.387532
            ...   
103899   -0.309061
103900   -0.387532
103901   -0.204433
103902   -0.387532
103903   -0.387532
Name: Departure Delay in Minutes, Length: 103904, dtype: float64


In [49]:
from sklearn.preprocessing import StandardScaler

# Standardise one feature column with its own freshly fitted scaler.
# NOTE(review): the scaler is fitted on all of X, before the train/test split
# further down, so test rows contribute to the fitted mean and std.
scaled_col = 'Arrival Delay in Minutes'
scaler = StandardScaler()
X[scaled_col] = scaler.fit_transform(X[[scaled_col]])

In [50]:
print(X['Arrival Delay in Minutes'])

0         0.072905
1        -0.237184
2        -0.392229
3        -0.159662
4        -0.392229
            ...   
103899   -0.392229
103900   -0.392229
103901   -0.030458
103902   -0.392229
103903   -0.392229
Name: Arrival Delay in Minutes, Length: 103904, dtype: float64


# **Satisfaction column Encoding**

In [51]:
print(y)

0         0
1         0
2         1
3         0
4         1
         ..
103589    0
103590    1
103591    0
103592    0
103593    0
Name: satisfaction, Length: 103594, dtype: int64


In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the values of the `satisfaction` column with integer codes.
label_encoder = LabelEncoder()
dataset['satisfaction'] = label_encoder.fit_transform(dataset['satisfaction'])

# Show which code each original label received. A label's code is its
# position in classes_, so enumerate() yields the same pairs as transform().
print("Mapping of encoded labels:")
for encoded_label, label in enumerate(label_encoder.classes_):
    print(f"{label}: {encoded_label}")

In [54]:
# Refresh y from the last column of `dataset` so it reflects the encoded labels.
y = dataset.iloc[:, -1]

In [55]:
type(y)

In [56]:
y.head(10)

0    0
1    0
2    1
3    0
4    1
5    0
6    0
7    1
8    0
9    0
Name: satisfaction, dtype: int64

# **Splitting the dataset into the Training set and Test set**

In [59]:
# Remove rows with missing values from the features and from the target.
X = X.dropna()
y = y.dropna()

# X and y are cleaned independently, so verify they still describe the same
# number of rows before they are split and passed to the models.
assert len(X) == len(y), f"X has {len(X)} rows but y has {len(y)}"

In [60]:
# Fill any remaining gaps with the mean: per column for X, overall for y.
# X.mean() is used instead of np.mean(X) because NumPy forwards axis=None to
# DataFrame.mean. Older pandas warns about that (the warning this cell used
# to print), and pandas >= 2.0 collapses it to one scalar for the whole
# frame, which would fill every column with the same number.
# NOTE(review): the preceding cell already drops rows with NaN, so this is
# normally a no-op.
X = X.fillna(X.mean())
y = y.fillna(y.mean())

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


In [61]:
from sklearn.model_selection import train_test_split

# Hold out one third of the rows for testing; the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1/3,
    random_state=0,
)

# **Inspecting the Training and Test sets**

In [62]:
print(X_train.dtypes)
print(y_train.dtypes)

Age                                    int64
Flight Distance                      float64
Inflight wifi service                  int64
Departure/Arrival time convenient      int64
Ease of Online booking                 int64
Gate location                          int64
Food and drink                         int64
Online boarding                        int64
Seat comfort                           int64
Inflight entertainment                 int64
On-board service                       int64
Leg room service                       int64
Baggage handling                       int64
Checkin service                        int64
Inflight service                       int64
Cleanliness                            int64
Departure Delay in Minutes           float64
Arrival Delay in Minutes             float64
Gender_Female                          uint8
Gender_Male                            uint8
Type of Travel_Business travel         uint8
Type of Travel_Personal Travel         uint8
Class_Busi

In [63]:
X_train.shape

(69062, 25)

In [64]:
X_test.shape

(34532, 25)

In [65]:
y_train.shape

(69062,)

In [66]:
y_test.shape

(34532,)

## Training the Simple Linear Regression model on the Training set

In [67]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on all feature columns.
# NOTE(review): the target is a 0/1 label, so the fitted model outputs
# unbounded real numbers rather than class labels.
r1 = LinearRegression()
r1.fit(X=X_train, y=y_train)

In [68]:
# Predict the held-out rows with the fitted linear model.
y_pred = r1.predict(X_test)

# **Predicting the Test results**

In [69]:
y_test

6952     1
18144    1
76397    0
81118    0
46902    0
        ..
85126    1
75383    0
67794    0
48415    0
22462    0
Name: satisfaction, Length: 34532, dtype: int64

In [70]:
y_pred.reshape(-1,1)

array([[ 0.91941589],
       [ 0.85473876],
       [ 0.24651402],
       ...,
       [ 0.08533903],
       [-0.19011691],
       [ 0.39226218]])

# **Evaluation**

In [71]:
# Score the current y_pred against the held-out targets with R2
# (coefficient of determination).
from sklearn.metrics import r2_score
print(r2_score(y_test, y_pred))

0.5159464032044744


# **Decision Tree**

In [72]:
from sklearn.tree import DecisionTreeRegressor

# Seed the tree so that ties between equally good splits are broken the same
# way on every run. Unseeded, the R2 score drifts between executions: the
# notebook's captured output and its conclusion quote different values for
# this model.
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X_train,y_train)

# **Predicting the Test set results**

In [73]:
# Overwrite y_pred with the predictions of the current `regressor` for the test set.
y_pred = regressor.predict(X_test)

In [74]:
y_test

6952     1
18144    1
76397    0
81118    0
46902    0
        ..
85126    1
75383    0
67794    0
48415    0
22462    0
Name: satisfaction, Length: 34532, dtype: int64

In [75]:
y_pred.reshape(-1,1)

array([[1.],
       [1.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]])

# **Evaluation of the Regression Model**

In [76]:
# Score the current y_pred against the held-out targets with R2
# (coefficient of determination).
from sklearn.metrics import r2_score
print(r2_score(y_test, y_pred))

0.7346382136798042


# **Random Forest Regression**

In [None]:
from sklearn.ensemble import RandomForestRegressor

# 150 trees. The forest draws random bootstrap samples, so it is seeded to
# make the fitted model, and therefore the reported metrics, reproducible
# across runs.
regressor = RandomForestRegressor(n_estimators=150, random_state=0)
regressor.fit(X_train, y_train)

In [None]:
# Overwrite y_pred with the predictions of the current `regressor` and print them.
y_pred = regressor.predict(X_test)
print(y_pred)

In [None]:
print(y_test.values)

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Print MSE, MAE and R2, in that order, for the current y_pred against the
# held-out targets.
for metric in (mean_squared_error, mean_absolute_error, r2_score):
    print(metric(y_test, y_pred))

# **XGBoost**

In [77]:
from xgboost import XGBRegressor

# Gradient-boosted trees with the library's default hyper-parameters.
regressor = XGBRegressor()
regressor.fit(X=X_train, y=y_train)

In [None]:
# Overwrite y_pred with the predictions of the current `regressor` and print them.
y_pred = regressor.predict(X_test)
print(y_pred)

In [79]:
print(y_test.values)

[1 1 0 ... 0 0 0]


In [80]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Print MSE, MAE and R2, in that order, for the current y_pred against the
# held-out targets.
for metric in (mean_squared_error, mean_absolute_error, r2_score):
    print(metric(y_test, y_pred))

0.0357799241360237
0.0977755098775146
0.8544101349445542


# **Conclusion**

R2 scores of the four models on the test set:

Linear Regression Model has an R2 score of 0.5159464032044744

Decision Tree Model has an R2 score of 0.7346382136798042

Random Forest Model has an R2 score of 0.8670522786770244

XGBoost Model has an R2 score of 0.8544101349445542

So we can conclude that the best of these models is the Random Forest Model, with XGBoost a close second.