In [2]:
#import all necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the Kaggle airline passenger satisfaction CSV into the working DataFrame
df = pd.read_csv(filepath_or_buffer='airline_passenger_satisfaction.csv')

In [None]:
# Verify the loaded data by previewing the first 10 rows.
# A bare last expression uses the notebook's rich table display, which stays
# readable for wide frames where print() wraps the columns as plain text.
df.head(10)


In [None]:
# Check number of rows and columns
print(df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
df.info()

In [None]:
# Count the missing values in every column of the dataset
missing_per_column = df.isna().sum()
missing_per_column

In [6]:
# Fill the missing 'Arrival Delay' values with 0.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the df['Arrival Delay'] selection: that chained
# in-place call emits a FutureWarning on pandas 2.x and, with Copy-on-Write
# enabled, only modifies a temporary object and leaves df unchanged.
df['Arrival Delay'] = df['Arrival Delay'].fillna(0)

# Verify right here that no nulls remain in the column, so nobody has to
# re-run an earlier cell out of order to check.
assert df['Arrival Delay'].notna().all(), "'Arrival Delay' still contains nulls"

In [None]:
# Inspect the distinct values (with frequencies) of each categorical column
# to spot unexpected or misspelled categories.
for column in ['Gender', 'Customer Type', 'Type of Travel', 'Class']:
    print(df[column].value_counts(), '\n')
# The last column is printed without the trailing blank-line separator
print(df['Satisfaction'].value_counts())

In [None]:
# Sanity-check the Age column by listing the ages above 80 and below 10
# together with how often each one occurs.
age = df['Age']
print(age[age > 80].value_counts(), '\n')
print(age[age < 10].value_counts())

In [None]:
#Exploratory Data Analysis Starts here
#First checking the distribution of Satisfied vs Neutral or Dissatisfied passengers on a pie plot
fig1=plt.figure(figsize=(8,4))
ax1=fig1.add_axes([0.1,0.1,0.9,0.9])

colors = [ '#DD7596', '#8EB897']
# Take the wedge sizes AND the labels from the same value_counts() result.
# value_counts() is ordered by frequency while unique() is ordered by first
# appearance, so mixing the two can attach a label to the wrong wedge.
satisfaction_counts = df['Satisfaction'].value_counts()
ax1.pie(satisfaction_counts, autopct='%1.0f%%', labels=satisfaction_counts.index,
        shadow=True, startangle=90, colors=colors)
ax1.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
ax1.set_title('Satisfied people vs Neutral/Dissatisfied')
plt.show()

In [None]:
# Stacked bar charts of passenger counts per category, split by Satisfaction.
# One entry per chart: (grouping column, chart title, figure size).
satisfaction_breakdowns = [
    ('Customer Type', 'Relation between Customer Type and Satisfaction', (8, 5)),
    ('Class', 'Relation between Passenger class and Satisfaction', (8, 5)),
    ('Gender', 'Relation between Gender and Satisfaction', (10, 10)),
]

for column, title, figsize in satisfaction_breakdowns:
    # Rows: categories of `column`; columns: Satisfaction levels; values: row counts
    counts = df.groupby([column, 'Satisfaction']).size().unstack()
    # The size is passed per plot instead of through plt.rcParams, so it does
    # not leak into figures created by later cells.
    ax = counts.plot(kind='bar', stacked=True, figsize=figsize)
    ax.set_title(title)
    # The bar height is a number of passengers, not a satisfaction value
    ax.set_ylabel('Number of passengers')
    ax.legend(title='Satisfaction', loc='upper right')

plt.show()

In [None]:
#Box plot to understand the age distribution of passengers
fig5=plt.figure(figsize=(6,4))
ax5=fig5.add_axes([0.1,0.1,0.9,0.9])

ax5.boxplot(df['Age'])
ax5.set_title('Demography of Flight Travelers')
# The box plot is vertical, so the ages are on the y-axis; label that axis.
ax5.set_ylabel('Age')
# There is a single box, so the default "1" tick on the x-axis carries no information.
ax5.set_xticks([])
plt.show()

In [None]:
# Collect the rating columns (positions 9 to 22) in their own dataframe for reuse
all_ratings = df.iloc[:, 9:23]

# Two adjacent bar charts: how many 5-star (left) and 1-star (right) ratings
# each service received.
fig, axs = plt.subplots(1, 2, figsize=(15, 8))
panels = [
    (axs[0], 5, 'g', "Number of 5 star rating for each service"),
    (axs[1], 1, 'r', 'Number of 1 star rating for each service'),
]
for panel, stars, bar_color, heading in panels:
    # Per column, the number of rows whose rating equals `stars`
    votes = (all_ratings == stars).sum()
    panel.bar(all_ratings.columns, votes, color=bar_color)
    panel.title.set_text(heading)

# Rotate the long service names so they stay legible, then pad the layout
fig.autofmt_xdate(rotation=80)
fig.tight_layout(pad=5.0)

plt.show()

In [None]:
# Scatter plot of arrival delay against flight distance
fig4, ax3 = plt.subplots(1, 1, figsize=(10, 8))
ax3.scatter(x=df['Flight Distance'], y=df['Arrival Delay'], marker='*')
ax3.set_title("Relation between Flight Distance and Arrival Delay")
ax3.set_xlabel("Flight Distance")
ax3.set_ylabel("Arrival Delay")
plt.show()

In [None]:
# Unsupervised clustering with KMeans: imports and feature preparation
from sklearn import cluster, metrics
from sklearn.preprocessing import LabelEncoder, scale

# Satisfaction labels, kept aside to evaluate the clusters in a later cell
y_class = df['Satisfaction']

# Features to cluster on: the rating columns (positions 9 to 22)
x_ratings = df.iloc[:, 9:23]

# Standardise the features before clustering
x_scaled = scale(x_ratings)
x_scaled

In [14]:
# Calculate the inertia for k = 2..11 to identify the elbow point, i.e. the
# optimum number of clusters.
k_range = range(2, 12)
sse = []
for k in k_range:
    # KMeans starts from random centroids; a fixed random_state makes the
    # inertia curve identical on every Restart & Run All.
    km = cluster.KMeans(n_clusters=k, random_state=42)
    km.fit(x_scaled)
    sse.append(km.inertia_)

In [None]:
# Elbow plot: inertia as a function of the number of clusters K
plt.plot(k_range, sse)
plt.ylabel('Inertia')
plt.xlabel('K')
plt.show()

In [None]:
# Build the clustering model for K from 2 to 6 (the range suggested by the
# elbow plot) and print, for each K: K itself, the silhouette score, and the
# completeness and homogeneity of the clusters against the Satisfaction labels.
Y2 = LabelEncoder().fit_transform(y_class)
for k in range(2, 7):
    # Fixed random_state so the reported scores are reproducible
    kmeans = cluster.KMeans(n_clusters=k, random_state=42)
    kmeans.fit(x_scaled)
    print(k)
    # The silhouette must be measured in the same (scaled) feature space the
    # labels were fitted in, not on the raw x_ratings.
    # sample_size: the score needs all pairwise distances, which is very slow
    # on the full dataset, so it is estimated on a seeded random sub-sample.
    print(metrics.silhouette_score(x_scaled, kmeans.labels_,
                                   sample_size=10000, random_state=42))
    print(metrics.completeness_score(Y2, kmeans.labels_))
    print(metrics.homogeneity_score(Y2, kmeans.labels_))

In [None]:
# Supervised learning section: imports and feature matrix
from sklearn import metrics, model_selection
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

# Feature matrix: the rating columns (positions 9 to 22)
all_ratings = df.iloc[:, 9:23]
# Preview the first 10 feature rows
all_ratings.head(n=10)

In [20]:
#Convert Satisfaction column to numeric values
label_mapping = {'Satisfied': 1, 'Neutral or Dissatisfied': 0}
target_satisfaction = df['Satisfaction'].map(label_mapping)

# Series.map() yields NaN for every value that is not a key of the mapping.
# Fail here with a clear message rather than letting NaN targets reach the
# train/test split and the classifier.
unmapped = df.loc[target_satisfaction.isna(), 'Satisfaction'].unique()
if len(unmapped) > 0:
    raise ValueError(f'Unmapped Satisfaction labels: {unmapped.tolist()}')

# Compare the counts before and after encoding; they must match per class
print(df['Satisfaction'].value_counts())
print(target_satisfaction.value_counts())

Neutral or Dissatisfied    73452
Satisfied                  56428
Name: Satisfaction, dtype: int64
0    73452
1    56428
Name: Satisfaction, dtype: int64


In [21]:
# Split the data into training (80%) and test (20%) sets.
# random_state fixes the shuffle so the split, and every metric derived from
# it, is reproducible; stratify keeps the class proportions of the target the
# same in both parts.
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    all_ratings,
    target_satisfaction,
    test_size=0.20,
    random_state=42,
    stratify=target_satisfaction,
)

In [None]:
#Decision Tree
# random_state: the tree builder permutes the features at each split, so the
# fitted tree (and its scores) can differ between runs unless it is seeded.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, Y_train)
y_predicted = model.predict(X_test)

# The former model.score(X_test, Y_test) call was dropped: its result was
# discarded, and it is the same accuracy that is printed below.
print('Accuracy score',metrics.accuracy_score(Y_test,y_predicted))
print(metrics.classification_report(Y_test, y_predicted))
print(metrics.confusion_matrix(Y_test, y_predicted))

In [None]:
#Visualize the confusion matrix
import seaborn as sns
cm=metrics.confusion_matrix(Y_test, y_predicted)
fig1=plt.figure(figsize=(8,4))
ax1=fig1.add_axes([0.1,0.1,0.9,0.9])
# Draw on ax1 explicitly instead of relying on the implicit current axes
sns.heatmap(cm, annot=True, fmt='g', ax=ax1)
# confusion_matrix puts the true classes on the rows and the predicted classes
# on the columns; without labels the two axes cannot be told apart.
ax1.set_xlabel('Predicted label')
ax1.set_ylabel('True label')
ax1.set_title('Decision tree confusion matrix')

plt.show()