# Airline Passenger Satisfaction

__This dataset contains an airline passenger satisfaction survey. The goal is to predict passenger satisfaction.__

# **READING DATA**

In [7]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [8]:
# Both splits are read from the same input directory.
DATA_DIR = "../input/airline-passenger-satisfaction"
train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")

In [9]:
# Preview the first rows of the training set to see which columns are available.
train.head()

In [10]:
# Preview the first rows of the test set to compare its layout with the training set.
test.head()

# __EDA__

In [11]:
# Column dtypes and non-null counts for the training set.
train.info()

In [12]:
# Column dtypes and non-null counts for the test set.
test.info()

-  train dataset has 103904 entries and 25 columns
-  test dataset has 25976 entries and 25 columns

* As seen, the `Unnamed: 0` column carries no information, so we drop it.
* The `id` column also has no effect on the target, so we drop it as well.

In [13]:
# Remove the leftover index column and the passenger identifier from both splits.
for frame in (train, test):
    frame.drop(columns=['Unnamed: 0', 'id'], inplace=True)

In [14]:
# Re-inspect both frames after the drop, separated by a two-line banner.
separator = "********************************************************************************"
train.info()
for _ in range(2):
    print(separator)
test.info()

In [15]:
# Number of missing values per column in the training set.
train.isnull().sum()

In [16]:
# Number of missing values per column in the test set.
test.isnull().sum()

1. train dataset null values:
    - there are 310 null values in the `Arrival Delay in Minutes` column <BR> <BR>

2. test dataset null values:
    - there are 83 null values in the `Arrival Delay in Minutes` column

###### Num of Null values are small so we can drop them

In [17]:
# Look at the training rows again now that the two identifier columns are gone.
train.head()

In [18]:
# Independent copy taken before `train` is label-encoded below; the plotting
# cells use `train2` so they still see the original string labels.
train2 = train.copy()

In [19]:
# Every column whose dtype is not `object`, copied so later edits to `train` do not affect it.
numerical = train.select_dtypes(exclude="object").copy()
numerical

In [20]:
# Every `object` (string) column, copied so later edits to `train` do not affect it.
categorical = train.select_dtypes(include="object").copy()
categorical

In [21]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
numerical.describe()

- #### A large gap between the mean and the median (the 50% row) indicates skew or outliers, while a mean close to the median indicates a roughly symmetric distribution

- ##### The mean and median are almost equal in all the columns except:
    -  Flight Distance 
    - Departure Delay in Minutes
    - Arrival Delay in Minutes

**so we can expect outliers in those columns.**

In [22]:
# For string columns describe() reports the count, the number of distinct
# values, the most frequent value and how often it occurs.
categorical.describe()

In [23]:
# Encode the categorical (string) columns as integers so they can be used in the
# correlation matrix and by the models. One shared table guarantees that train
# and test receive exactly the same codes. Any label that is missing from a
# mapping becomes NaN (Series.map behaviour), so this cell must run only once.
# NOTE(review): "Class" is coded Business=-1, Eco=0, Eco Plus=1, which does not
# follow the service order Eco < Eco Plus < Business — confirm this is intended.
label_encodings = {
    "Gender": {"Male": 1, "Female": 0},
    "Customer Type": {"Loyal Customer": 1, "disloyal Customer": 0},
    "Type of Travel": {"Personal Travel": 1, "Business travel": 0},
    "Class": {"Eco Plus": 1, "Eco": 0, "Business": -1},
    "satisfaction": {"satisfied": 1, "neutral or dissatisfied": 0},
}
for frame in (train, test):
    for column, mapping in label_encodings.items():
        frame[column] = frame[column].map(mapping)



In [24]:
# Pairwise correlation between all columns of the encoded training set
# (computed before `Arrival Delay in Minutes` is dropped below).
corr=train.corr()

In [25]:
# Mask the upper triangle: the matrix is symmetric, so it only repeats the lower half.
# The mask is shaped from the `corr` matrix computed above instead of recomputing it.
mask = np.triu(np.ones_like(corr, dtype=bool))

fig, ax = plt.subplots(figsize=(18, 16))
sns.heatmap(corr, mask=mask, annot=True, cmap="vlag", ax=ax)

- **High correlation between:**
    - `Departure Delay in Minutes` & `Arrival Delay in Minutes`  >> 0.97
    - `Ease of Online booking` & `Inflight wifi service`  >> 0.72
    - `Cleanliness` & `Inflight entertainment`  >> 0.69
    - `Cleanliness` & `Seat comfort`  >> 0.68
    - `Cleanliness` & `Food and drink`  >> 0.66
    - `Inflight service` & `Baggage handling`  >> 0.63

- __As correlation between Departure Delay in Minutes & Arrival Delay in Minutes is so high then we can drop one of them__

In [26]:
# Keep only one of the two near-duplicate delay columns, in both splits.
for frame in (train, test):
    frame.drop(columns=['Arrival Delay in Minutes'], inplace=True)

In [27]:
# Rank every column by its correlation with the encoded target, largest first.
corr["satisfaction"].sort_values(ascending=False)

- __the most satisfaction correlation is with "Online boarding" (0.5)__

In [28]:
# Kernel density estimate of passenger age.
sns.displot(data=train, x="Age", kind="kde")

In [29]:
# Histogram of passenger age, to read off the most common age ranges.
sns.histplot(data=train, x= "Age")

The distribution of passenger age: 
- most passengers are between 25 and 35 or between 40 and 60 
(so we should focus more on the needs of these age groups) 
- For children aged 1 to 10 
- For people older than 65 >> provide helpers for them. 

In [30]:
 # Share of each satisfaction label as a percentage of all training rows.
 satisfaction_share = round(train2['satisfaction'].value_counts() / train.shape[0] * 100, 2)
 satisfaction_share.plot.pie(autopct='%1.1f%%', shadow=True, startangle=180, explode=[0.06, 0])

- Data is Balanced
- ratio between satisfied and dissatisfied people 
( our target to increase the ratio of satisfied people ) 

In [31]:
 # Share of each customer type as a percentage of all training rows.
 customer_type_share = round(train2['Customer Type'].value_counts() / train.shape[0] * 100, 2)
 customer_type_share.plot.pie(autopct='%1.1f%%', shadow=True, startangle=90, explode=[0.2, 0])

- ratio between loyal and disloyal customers


In [32]:
# Violin plot of satisfaction against the "Online boarding" rating, drawn from
# the un-encoded copy `train2`.
ax = sns.violinplot(x="Online boarding", y="satisfaction",
                    data=train2, palette="muted")

- relation between online boarding and satisfaction
(as the online boarding rating increases, the overall satisfaction increases, and vice versa)
- so try to improve online boarding further.

In [33]:
# Age distribution for each "Online boarding" rating.
sns.boxplot(x="Online boarding", y="Age" , data=train2)

In [34]:
# Age distribution for each "Ease of Online booking" rating.
sns.boxplot(x="Ease of Online booking", y="Age" , data=train2)

In [None]:
# One small scatter plot per numeric feature against the encoded satisfaction label.
fig = plt.figure(figsize=(12, 18))
for position, column in enumerate(numerical.columns, start=1):
    ax = fig.add_subplot(9, 4, position)
    # x and y must be passed by keyword: seaborn >= 0.12 rejects them as
    # positional arguments (the first positional parameter is `data`).
    sns.scatterplot(x=numerical[column], y=train['satisfaction'], ax=ax)
plt.tight_layout()
plt.show()

In [None]:
# Categorical descriptors followed by the service-rating columns.
categorics = [
    'Gender', 'Customer Type', 'Type of Travel', 'Class', 'Inflight wifi service',
    'Departure/Arrival time convenient', 'Ease of Online booking',
    'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort',
    'Inflight entertainment', 'On-board service', 'Leg room service',
    'Baggage handling', 'Checkin service', 'Inflight service',
    'Cleanliness',
]
for feature in categorics:
    fig, (ax_total, ax_split) = plt.subplots(1, 2, figsize=(16, 6))

    # Left panel: overall count per category, labelled just below the bar top.
    sns.countplot(x=train2[feature], palette="Pastel1", ax=ax_total)
    for bar in ax_total.patches:
        height = bar.get_height()
        ax_total.annotate(f'{height:.1f}', (bar.get_x() + 0.3, height - 5000))

    # Right panel: the same counts split by satisfaction, labelled mid-bar.
    sns.countplot(x=train2[feature], hue=train2.satisfaction,
                  palette=["#f08080", "#87cefa"], ax=ax_split)
    for bar in ax_split.patches:
        height = bar.get_height()
        ax_split.annotate(f'{height:.1f}', (bar.get_x() + 0.15, height / 2), rotation=90)
    plt.show()

1. make a study of why some customers become disloyal (survey or feedback)
2. business travel is more common than personal travel (offer trips with good deals to increase personal travel) 
    - most personal travelers travel in eco class, which doesn't have good service (shown below)
3. improve the service in Eco class and make offers in eco plus.
4. improve the wifi service because it affects the overall satisfaction. 
 
5. improve the online booking as it has an effect on overall satisfaction.

6.  work on the food and drinks quality.
7. online boarding (affects the satisfaction very much).
8. provide more ways of Inflight entertainment.

In [None]:
# Number of passengers per type of travel, split by travel class.
sns.countplot(x=train2["Type of Travel"],hue=train2["Class"],palette="Pastel1") 

In [None]:
# Rating counts for a few key services, split by travel class.
service_features = ["Seat comfort", "Cleanliness", "On-board service",
                    "Inflight entertainment", "Inflight wifi service"]
for feature in service_features:
    # A single axes per figure: the previous plt.subplot(1, 2, 2) drew only in
    # the right half of a 16x6 figure and left the left half empty.
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.countplot(x=train2[feature], hue=train2["Class"], palette="Pastel1", ax=ax)
    plt.show()


- most of the people who are satisfied with the different services travel in Business class

In [None]:
# Flight-distance distribution drawn as polygons, one per satisfaction label.
sns.histplot(x='Flight Distance',hue="satisfaction",data=train2,element="poly")

 as the distance increases, the satisfaction increases!!!

In [None]:
# Service-rating columns only. Gender, Customer Type, Type of Travel and Class
# are listed out directly instead of being added and then removed again.
categorics = [
    'Inflight wifi service', 'Departure/Arrival time convenient',
    'Ease of Online booking', 'Gate location', 'Food and drink',
    'Online boarding', 'Seat comfort', 'Inflight entertainment',
    'On-board service', 'Leg room service', 'Baggage handling',
    'Checkin service', 'Inflight service', 'Cleanliness',
]

# Horizontal bars of the mean rating per service, highest mean first in the series.
mean_ratings = train[categorics].mean().sort_values(ascending=False)
ax = mean_ratings.plot(kind="barh", ylabel="Features", colormap="Pastel1",
                       xticks=np.arange(0, 5.5, 0.5), figsize=(14, 6))
plt.title('Average satisfaction ratings of services', fontsize=16)

# Write each mean just past the end of its bar.
for p in ax.patches:
    count = '{:.1f}'.format(p.get_width())
    x, y = p.get_x() + p.get_width() + 0.15, p.get_y()
    ax.annotate(count, (x, y), ha='right')

- we need to work on the features which have lower average satisfaction ratings.

# Preprocessing

- Missing values
- Outliers

__Missing values__ : As shown above, the number of null values is small, so we can drop them. 
    

- the missing values were all in the `Arrival Delay in Minutes` column, and that column has already been dropped.
- so we now have no null values

In [None]:
# Confirm that no missing values remain in the training set.
train.isnull().sum()

In [None]:
# Confirm that no missing values remain in the test set.
test.isnull().sum()

_______________________________

__Dealing with Outliers__

In [None]:
# Box plot of every training column on one axes to spot features with extreme values.
fig, ax = plt.subplots(figsize=(10, 8))
train.boxplot(rot=90, ax=ax)

- ##### As we detect above outliers are in :
    - Flight Distance 
    - Departure Delay in Minutes
    - Arrival Delay in Minutes(was dropped)


In [None]:
def find_outliers(x, whisker=1.5):
    """Return the IQR-based outlier bounds of ``x`` as ``(ceiling, floor)``.

    Parameters
    ----------
    x : array-like of numbers
        Values to analyse.
    whisker : float, default 1.5
        Multiple of the inter-quartile range added above Q3 and subtracted
        below Q1. The default reproduces the usual 1.5 * IQR rule.

    Returns
    -------
    tuple
        ``(ceiling, floor)`` — note the order: the upper bound comes first.
        Values above ``ceiling`` or below ``floor`` are treated as outliers.
    """
    q1, q3 = np.percentile(x, [25, 75])
    iqr = q3 - q1
    floor = q1 - whisker * iqr
    ceiling = q3 + whisker * iqr

    return ceiling, floor

__For train__

In [None]:
feature = "Flight Distance"
ul, ll = find_outliers(train[feature])
print("Outliers lower bound= ", ll)
print("Outliers upper bound= ", ul)
# Count the rows lying outside the inclusive [ll, ul] interval.
d = int((~train[feature].between(ll, ul)).sum())
print("length of outlier data= ", d)

In [None]:
feature = "Departure Delay in Minutes"
ul, ll = find_outliers(train[feature])
print("Outliers lower bound= ", ll)
print("Outliers upper bound= ", ul)
# Count the rows lying outside the inclusive [ll, ul] interval.
d = int((~train[feature].between(ll, ul)).sum())
print("length of outlier data= ", d)

In [None]:
#median imputation
def before_after_median_imputation(x):
    """Replace the IQR outliers of ``train[x]`` with the column median.

    A box plot of the column is shown before and after the replacement.

    Parameters
    ----------
    x : str
        Name of a numeric column of the global ``train`` DataFrame.

    Notes
    -----
    The bounds (from ``find_outliers``) and the median are computed once, from
    the column as it is on entry. ``train`` is modified in place; nothing is
    returned.
    """
    sns.boxplot(x=train[x])
    plt.title("Box Plot before median imputation")
    plt.show()

    Upper_tail, Lower_tail = find_outliers(train[x])
    med = np.median(train[x])
    # One vectorised pass: the previous version called Series.replace once per
    # outlying row, rescanning the whole column each time.
    is_outlier = (train[x] > Upper_tail) | (train[x] < Lower_tail)
    train[x] = train[x].mask(is_outlier, med)

    sns.boxplot(x=train[x])
    plt.title("Box Plot after median imputation")
    plt.show()

In [None]:
# Apply median imputation to the two training columns identified as having outliers.
col=["Flight Distance",'Departure Delay in Minutes']
for column in col:
    before_after_median_imputation(column)

For test

In [None]:
# Box plot of every test column on one axes to spot features with extreme values.
fig, ax = plt.subplots(figsize=(10, 8))
test.boxplot(rot=90, ax=ax)

In [None]:
feature = "Flight Distance"
ul, ll = find_outliers(test[feature])
print("Outliers lower bound= ", ll)
print("Outliers upper bound= ", ul)
# Count the rows lying outside the inclusive [ll, ul] interval.
d = int((~test[feature].between(ll, ul)).sum())
print("length of outlier data= ", d)

In [None]:
feature = "Departure Delay in Minutes"
ul, ll = find_outliers(test[feature])
print("Outliers lower bound= ", ll)
print("Outliers upper bound= ", ul)
# Count the rows lying outside the inclusive [ll, ul] interval.
d = int((~test[feature].between(ll, ul)).sum())
print("length of outlier data= ", d)

In [None]:
#median imputation
def before_after_median_imputation_test(x):
    """Replace the IQR outliers of ``test[x]`` with the column median.

    A box plot of the column is shown before and after the replacement.

    Parameters
    ----------
    x : str
        Name of a numeric column of the global ``test`` DataFrame.

    Notes
    -----
    The bounds (from ``find_outliers``) and the median are computed once, from
    the test column as it is on entry. ``test`` is modified in place; nothing
    is returned.
    NOTE(review): the bounds and median come from the test split itself rather
    than from the training split — confirm this is intended.
    """
    sns.boxplot(x=test[x])
    plt.title("Box Plot before median imputation")
    plt.show()

    Upper_tail, Lower_tail = find_outliers(test[x])
    med = np.median(test[x])
    # One vectorised pass: the previous version called Series.replace once per
    # outlying row, rescanning the whole column each time.
    is_outlier = (test[x] > Upper_tail) | (test[x] < Lower_tail)
    test[x] = test[x].mask(is_outlier, med)

    sns.boxplot(x=test[x])
    plt.title("Box Plot after median imputation")
    plt.show()

In [None]:
# Apply median imputation to the same two columns of the test set.
col=["Flight Distance",'Departure Delay in Minutes']
for column in col:
    before_after_median_imputation_test(column)

## Modeling

**Train Test Split**

In [None]:
# Separate the features from the target column for both splits.
TARGET = "satisfaction"
X_train, Y_train = train.drop(columns=TARGET), train[TARGET].values
X_test, Y_test = test.drop(columns=TARGET), test[TARGET].values

**Scaling**

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training data only,
# then apply that same transformation to the test data. Refitting on the test
# set would scale the two splits differently and leak test statistics.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

In [None]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

try:
    # plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
    # guard the import so this cell still runs on current releases.
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    plot_confusion_matrix = None

### __Models:__
    - LogisticRegression
    - Decision tree
    - SVM
    - KNN with pca
    - RandomForest
    - XGBoost

**LogisticRegression**

In [None]:
# Logistic regression with default settings, fitted in one chained call.
lr = LogisticRegression().fit(X_train, Y_train)
y_pred_lr = lr.predict(X_test)

In [None]:
# Accuracy of the logistic-regression predictions on the test set.
lr_accuracy = accuracy_score(Y_test, y_pred_lr)
print(f'Model accuracy score: {lr_accuracy:0.4f}')

In [None]:
# Check for overfitting and underfitting: a training score well above the test
# score suggests overfitting; two similarly low scores suggest underfitting.
print(f"Training Data Score: {lr.score(X_train, Y_train)}")
print(f"Testing Data Score: {lr.score(X_test, Y_test)}")

In [None]:
print(classification_report(Y_test, y_pred_lr))

# plot_confusion_matrix was removed in scikit-learn 1.2. Build the display from
# the predictions already computed, normalised over all samples as before.
cm_lr = confusion_matrix(Y_test, y_pred_lr, normalize="all")
ConfusionMatrixDisplay(cm_lr).plot(cmap=plt.cm.Blues)


**Decision tree**

In [None]:
# Depth-limited tree (max_depth=13) with a fixed seed, fitted in one chained call.
DT = DecisionTreeClassifier(random_state=42, max_depth=13).fit(X_train, Y_train)
y_pred_DT = DT.predict(X_test)

In [None]:
# Accuracy of the decision-tree predictions on the test set.
dt_accuracy = accuracy_score(Y_test, y_pred_DT)
print(f'Model accuracy score: {dt_accuracy:0.4f}')

In [None]:
# Check for overfitting and underfitting: a training score well above the test
# score suggests overfitting; two similarly low scores suggest underfitting.
print('Training set score: {:.4f}'.format(DT.score(X_train, Y_train)))
print('Test set score: {:.4f}'.format(DT.score(X_test, Y_test)))

In [None]:
print(classification_report(Y_test, y_pred_DT))

# plot_confusion_matrix was removed in scikit-learn 1.2. Build the display from
# the predictions already computed, normalised over all samples as before.
cm_dt = confusion_matrix(Y_test, y_pred_DT, normalize="all")
ConfusionMatrixDisplay(cm_dt).plot(cmap=plt.cm.Blues)

__SVM__

In [None]:
# RBF-kernel support vector classifier with C=0.1, fitted in one chained call.
svc = SVC(kernel='rbf', C=0.1, random_state=0).fit(X_train, Y_train)
y_pred_svc = svc.predict(X_test)

In [None]:
# Accuracy of the SVM predictions on the test set.
svc_accuracy = accuracy_score(Y_test, y_pred_svc)
print(f'Model accuracy score: {svc_accuracy:0.4f}')

In [None]:
# Check for overfitting and underfitting: a training score well above the test
# score suggests overfitting; two similarly low scores suggest underfitting.
print(f"Training Data Score: {svc.score(X_train, Y_train)}")
print(f"Testing Data Score: {svc.score(X_test, Y_test)}")

In [None]:
# Per-class precision/recall/F1 for the SVM, then its confusion matrix
# normalised over all samples (each cell is a fraction of the whole test set).
print(classification_report(Y_test, y_pred_svc))
cm2 = confusion_matrix(Y_test, y_pred_svc,normalize="all")
disp = ConfusionMatrixDisplay(cm2).plot(cmap=plt.cm.Blues)

__KNN with pca__

In [None]:
# Project the scaled features onto 15 principal components. PCA is fitted on
# the training data only, and the same projection is then applied to the test data.
pca = PCA(n_components=15)
pca_train = pca.fit_transform(X_train)
pca_test= pca.transform(X_test)

In [None]:
# Total number of elements in the projected training array (rows x components).
pca_train.size

In [None]:
# Wrap the projected arrays in DataFrames; columns get default integer labels.
pca_train_df=pd.DataFrame(pca_train)
pca_test_df=pd.DataFrame(pca_test)


In [None]:
# (rows, components) of the projected training data.
pca_train_df.shape

In [None]:
# (rows, components) of the projected test data.
pca_test_df.shape

In [None]:
# KNN: 70-nearest-neighbours classifier trained on the PCA-projected features.
knn = KNeighborsClassifier(n_neighbors=70).fit(pca_train_df, Y_train)
y_pred_knn = knn.predict(pca_test_df)

In [None]:
# Accuracy of the KNN predictions on the test set.
knn_accuracy = accuracy_score(Y_test, y_pred_knn)
print(f'Model accuracy score: {knn_accuracy:0.4f}')

In [None]:
# Check for overfitting and underfitting: a training score well above the test
# score suggests overfitting; two similarly low scores suggest underfitting.
print(f"Training Data Score: {knn.score(pca_train_df, Y_train)}")
print(f"Testing Data Score: {knn.score(pca_test_df, Y_test)}")

In [None]:
# Report the KNN predictions. This cell previously printed the report and drew
# the confusion matrix of the logistic-regression model by mistake.
print(classification_report(Y_test, y_pred_knn))
cm_knn = confusion_matrix(Y_test, y_pred_knn, normalize="all")
ConfusionMatrixDisplay(cm_knn).plot(cmap=plt.cm.Blues)

__RandomForest__

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# 1200 trees of depth <= 25. n_jobs=-1 trains and predicts on all CPU cores;
# with random_state fixed the fitted forest is the same as a single-core run.
rf = RandomForestClassifier(n_estimators=1200, max_depth=25, random_state=0, n_jobs=-1)
rf.fit(X_train, Y_train)
y_pred_rf = rf.predict(X_test)
print('Model accuracy score: {0:0.4f}'. format(accuracy_score(Y_test, y_pred_rf)))


In [None]:
# Check for overfitting and underfitting: a training score well above the test
# score suggests overfitting; two similarly low scores suggest underfitting.
print(f"Training Data Score: {rf.score(X_train, Y_train)}")
print(f"Testing Data Score: {rf.score(X_test, Y_test)}")


In [None]:
print(classification_report(Y_test, y_pred_rf))
cm3 = confusion_matrix(Y_test, y_pred_rf, normalize="all")
# Plot the random-forest matrix (cm3). This cell previously plotted cm2, the
# SVM confusion matrix.
disp = ConfusionMatrixDisplay(cm3).plot(cmap=plt.cm.Blues)

__XGBoost__

In [None]:
from xgboost import XGBClassifier

In [None]:
# Gradient-boosted trees with default hyper-parameters, fitted in one chained call.
xgb = XGBClassifier().fit(X_train, Y_train)
y_pred_xgb = xgb.predict(X_test)

In [None]:
# Accuracy of the XGBoost predictions on the test set.
xgb_accuracy = accuracy_score(Y_test, y_pred_xgb)
print(f'Model accuracy score: {xgb_accuracy:0.4f}')

In [None]:
# Check for overfitting and underfitting: a training score well above the test
# score suggests overfitting; two similarly low scores suggest underfitting.
print(f"Training Data Score: {xgb.score(X_train, Y_train)}")
print(f"Testing Data Score: {xgb.score(X_test, Y_test)}")


In [None]:
# Report the XGBoost predictions. The classification report previously used
# the random-forest predictions (y_pred_rf) by mistake.
print(classification_report(Y_test, y_pred_xgb))
cm3 = confusion_matrix(Y_test, y_pred_xgb, normalize="all")
disp = ConfusionMatrixDisplay(cm3).plot(cmap=plt.cm.Blues)