# Airline Passenger Satisfaction factors analysis

In [None]:
from PIL import Image

# Cover picture for the notebook, read from an absolute path on the author's machine.
cover_image_path = r'C:\Users\LENOVO\Desktop\data analyst\New folder\pexels-pixabay-164589.jpg'
aeroplane = Image.open(cover_image_path)
aeroplane  # last expression of the cell, so the image is rendered inline

In [None]:
# Data handling and static plotting
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Show matplotlib figures inline in the notebook output
%matplotlib inline
# Plotly / cufflinks for the interactive `.iplot` chart used later in the notebook
import chart_studio.plotly as plty
import cufflinks as cf
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot
# Switch plotly and cufflinks to offline notebook mode
init_notebook_mode(connected=True)
cf.go_offline()
# Feature scaler; in the visible part of this notebook it is only referenced from commented-out code
from sklearn.preprocessing import StandardScaler

# Data Collection

In [None]:
# Raw passenger survey export; absolute path on the author's machine.
survey_csv_path = r"C:\Users\LENOVO\Desktop\data analyst\datasets\air passenger\air passengers.csv"
airline_df = pd.read_csv(survey_csv_path)

In [None]:
# Preview the first rows to see the columns and typical values
airline_df.head()

In [None]:
# (rows, columns) of the table as loaded
airline_df.shape

In [None]:
# Last seven rows of the table
airline_df.tail(7)

# Analysing and Cleaning Data

In [None]:
# Column names as loaded, before any column is dropped
airline_df.columns

In [None]:
# Remove the 'Unnamed: 0' column (presumably a leftover row index from the CSV export).
# Rebinding the result with errors='ignore' keeps this cell idempotent: re-running it
# after the column is already gone no longer raises KeyError.
airline_df = airline_df.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Column dtypes and non-null counts
airline_df.info()

In [None]:
# Summary statistics of the table
airline_df.describe()

In [None]:
# Number of missing values per column
airline_df.isna().sum()

In [None]:
# Mean arrival delay; the next cell uses it to fill the missing
# 'Arrival Delay in Minutes' entries
arrival_delay_minutes = airline_df['Arrival Delay in Minutes']
Arrival_Delay_mean = arrival_delay_minutes.mean()
Arrival_Delay_mean

In [None]:
# Fill missing arrival delays with the column mean computed above.
# The result is assigned back to the DataFrame: calling fillna(..., inplace=True) on a
# selected column acts on an intermediate object, which is deprecated in recent pandas
# and leaves the DataFrame unchanged when Copy-on-Write is enabled.
airline_df['Arrival Delay in Minutes'] = airline_df['Arrival Delay in Minutes'].fillna(Arrival_Delay_mean)

In [None]:
# Re-check the missing-value counts after filling the arrival delays
airline_df.isna().sum()

In [None]:
# Age distribution, stacked by satisfaction level
fig, ax = plt.subplots(figsize=(7, 5))
sns.histplot(
    data=airline_df,
    x='Age',
    hue='satisfaction',
    multiple="stack",
    bins=10,
    palette='coolwarm',
    edgecolor=".3",
    linewidth=.5,
    ax=ax,
)
sns.despine()
# Author's observations:
#   - passengers aged 25-55 are the most dissatisfied
#   - most flyers are between 25 and 60 years old

In [None]:
# Number of passengers per satisfaction label
sns.countplot(
    data=airline_df,
    x='satisfaction',
    palette='coolwarm',
    saturation=2.0,
)
sns.despine()
# Author's observation: more passengers are dissatisfied than satisfied

In [None]:
# Exact number of passengers per satisfaction label
airline_df['satisfaction'].value_counts()

In [None]:
# Pie-chart inputs. The counts are read from the data instead of being typed in by
# hand, so the chart stays correct if the dataset or the cleaning steps change.
# This cell must run while 'satisfaction' still holds its text labels (i.e. before the
# numeric conversion further down); otherwise the label lookup raises KeyError.
satisfaction_counts = airline_df['satisfaction'].value_counts()
data = [
    int(satisfaction_counts['neutral or dissatisfied']),
    int(satisfaction_counts['satisfied']),
]
# Slice labels, in the same order as `data`
keys = ['unsatisfied passengers', 'satisfied passengers']
palette_color = sns.color_palette('bright')

In [None]:
# Share of unsatisfied vs. satisfied passengers, labelled with whole percentages
plt.pie(x=data, labels=keys, colors=palette_color, autopct='%.0f%%')

In [None]:
# Column names after 'Unnamed: 0' has been dropped
airline_df.columns

In [None]:
# Mean of the 'Cleanliness' rating over all passengers
airline_df['Cleanliness'].mean()
# Author's conclusion: cleanliness on average is poor
# NOTE(review): compare the printed mean with the rating scale (1 = low, 5 = good) before relying on this

In [None]:
# Mean of the 'Flight Distance' column
airline_df['Flight Distance'].mean()
# Author's note: average flight distance is 1189 km
# NOTE(review): the figure is typed in by hand and the unit (km) is not shown by the data here; confirm both

In [None]:
sns.set_style("ticks")
# Satisfaction split per customer type, bars drawn side by side
sns.histplot(
    data=airline_df,
    x='Customer Type',
    hue='satisfaction',
    bins=2,
    multiple="dodge",
    shrink=0.5,
    palette='coolwarm',
    edgecolor=".3",
    linewidth=.5,
)
sns.despine()
# Author's observation: disloyal customers are more dissatisfied

In [None]:
# Number of passengers per gender
airline_df['Gender'].value_counts()

In [None]:
# Bar chart of the passenger count per gender
sns.countplot(
    data=airline_df,
    x='Gender',
    palette='magma',
)

In [None]:
sns.set_style('white')
# Satisfaction split per gender, bars drawn side by side
fig, ax = plt.subplots(figsize=(7, 4))
sns.histplot(
    data=airline_df,
    x="Gender",
    hue="satisfaction",
    multiple="dodge",
    shrink=0.5,
    edgecolor=".3",
    palette='coolwarm',
    linewidth=.5,
    ax=ax,
)
# Anchor the legend to the right of the axes (x = 1.5 in axes coordinates)
sns.move_legend(ax, "upper right", bbox_to_anchor=(1.5, 1))
sns.despine()

In [None]:
# Column names, listed again before the next set of plots
airline_df.columns

In [None]:
# Flight-distance distribution in 20 bins, stacked by satisfaction level
sns.histplot(
    data=airline_df,
    x='Flight Distance',
    hue='satisfaction',
    bins=20,
    multiple="stack",
    edgecolor=".3",
    palette='coolwarm',
    linewidth=.5,
)
sns.despine()
# Author's takeaway: shorter flights need more attention

In [None]:
# Thus we can see that shorter-distance flights have more dissatisfied customers

In [None]:
# Cleanliness rating distribution, stacked by satisfaction level
# (1 indicates low cleanliness, 5 indicates good cleanliness in flight)
sns.histplot(
    data=airline_df,
    x='Cleanliness',
    hue='satisfaction',
    multiple="stack",
    bins=5,
    edgecolor=".3",
    palette='coolwarm',
    linewidth=.5,
)
# Author's observations: satisfied passengers become more frequent as cleanliness
# increases, so flights must be kept clean to ensure satisfaction

In [None]:
# Satisfaction split per travel class, bars drawn side by side
fig, ax = plt.subplots(figsize=(7, 5))
sns.histplot(
    data=airline_df,
    x='Class',
    hue='satisfaction',
    multiple="dodge",
    edgecolor=".3",
    palette='coolwarm',
    linewidth=.5,
    bins=2,
    ax=ax,
)
# Anchor the legend to the right of the axes (x = 1.5 in axes coordinates)
sns.move_legend(ax, "upper right", bbox_to_anchor=(1.5, 1))
sns.despine()
# Author's takeaway: economy class passengers need more attention

In [None]:
# Thus Business class passengers are the most satisfied, while Eco class passengers are the least satisfied

In [None]:
# Peek at the first 'Type of Travel' values
airline_df['Type of Travel'].head()

In [None]:
# Passenger count per type of travel, stacked by satisfaction level
sns.histplot(
    data=airline_df,
    x='Type of Travel',
    hue='satisfaction',
    multiple="stack",
    palette='coolwarm',
    linewidth=.5,
    edgecolor=".3",
)
# Author's takeaway: passengers on personal travel need more attention

In [None]:
# Interactive histogram of the 'satisfaction' column via the `.iplot` accessor
airline_df['satisfaction'].iplot(kind="hist",bins=5,bargap=0)

# Training and Testing Data

In [None]:
# Conversion to numeric values
#
# Labels are matched exactly instead of through `replace(regex=...)`:
#   * a regex pattern matches substrings, so the pattern "Eco" also matched
#     "Eco Plus" and both classes ended up encoded as 1;
#   * calling `replace(..., inplace=True)` on a selected column acts on an
#     intermediate object, which is deprecated in recent pandas.


def _encode_labels(column, mapping):
    """Return `column` with every label found in `mapping` replaced by its code.

    Values that are not keys of `mapping` (including codes produced by an earlier
    run of this cell) are passed through unchanged, so the cell can be re-run.
    """
    return column.map(lambda label: mapping.get(label, label))


label_codes = {
    'Type of Travel': {"Personal Travel": 0, "Business travel": 1},
    'Class': {"Eco": 1, "Eco Plus": 2, "Business": 3},
    'satisfaction': {"neutral or dissatisfied": 0, "satisfied": 1},
    'Customer Type': {"disloyal Customer": 0, "Loyal Customer": 1},
}
for column_name, mapping in label_codes.items():
    airline_df[column_name] = _encode_labels(airline_df[column_name], mapping)


# Columns examined in the correlation heatmap.
# 'Gender' is not encoded above, so it is still a text column at this point.
hm=airline_df[['Gender', 'Customer Type', 'Age', 'Type of Travel',
       'Class', 'Flight Distance',
       'Cleanliness', 'Departure Delay in Minutes', 'Arrival Delay in Minutes',
       'satisfaction']]

In [None]:
# Pairwise correlations between the numeric columns of `hm`.
# Text columns (such as 'Gender', which is never encoded) are filtered out explicitly:
# pandas 2.0+ raises on non-numeric data in corr() instead of silently skipping it.
tc=hm.select_dtypes(include='number').corr()
plt.subplots(figsize=(7,7))
# NOTE(review): vmin=0 colours every negative correlation like 0; the annotations
# still show the real values.
sns.heatmap(tc,annot=True,cmap="YlGnBu",vmin=0,vmax=1,linewidths=0.7)

In [None]:
# Thus we can see the most important factors for satisfaction are
# Type of travel(R)
# Class(R)
# Cleanliness(R)
# Flight Distance(R)
# Customer Type(R)
# Age(R)

In [None]:
# Model inputs: the factors listed after the correlation heatmap
feature_columns = [
    'Type of Travel',
    'Class',
    'Cleanliness',
    'Flight Distance',
    'Customer Type',
    'Age',
]
X = airline_df[feature_columns]
# Target: the 'satisfaction' column
y = airline_df['satisfaction']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [None]:
 # Hold out 30% of the rows as the test set; random_state=42 fixes the split
 # NOTE(review): this line begins with a stray space; consider removing it so the cell also runs as a plain script
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Applying Logistic Regression

In [None]:
# Logistic-regression classifier using the lbfgs solver with up to 1000 iterations
logmodel=LogisticRegression(solver='lbfgs', max_iter=1000)

# Feature scaling is disabled; the scaler is left commented out:
# scaler=StandardScaler()

In [None]:
# Fit the classifier on the training split
logmodel.fit(X_train,y_train)  

In [None]:
# Optional feature scaling (disabled). If enabled, it must run before logmodel.fit:
# X_train=scaler.fit_transform(X_train)
# X_test=scaler.transform(X_test)

In [None]:
# Predicted satisfaction class for every row of the held-out test set
predictions=logmodel.predict(X_test)
predictions

# Accuracy Check

In [None]:
# Text report of the classification metrics on the test set
print(classification_report(y_test,predictions))

In [None]:
# Confusion matrix of true vs. predicted labels on the test set
confusion_matrix(y_test,predictions)

In [None]:
# Test-set accuracy expressed as a percentage
accuracy_score(y_test,predictions)*100
# Author's recorded result: 82 percent accuracy
# NOTE(review): this figure is typed in by hand; re-check it against the cell output after any change upstream