# Hypotézy

1. Loajálnym zákazníkom menej prekáža horší inflight service ako neloajálnym zákazníkom.
2. Ľuďom nad 50 rokov menej prekáža zlé pokrytie wifi počas letu.
3. Ak je pasažier počas dlhého letu nespokojný s komfortom, tak je nespokojný aj s celým letom.
4. Počas dlhých letov je pasažierom ponúkané lepšie jedlo.
5. Čím dlhšie je meškanie odletu, tým dlhšie je meškanie príletu (hypotéza so spojitými dátami).


In [None]:
import pandas as pd
import seaborn as sns
import scipy.stats as stats
from scipy.stats import chisquare
from scipy.stats import chi2_contingency
from scipy.stats import chi2

In [None]:
# Load dataset/train.csv, using its 'Unnamed: 0' column as the row index,
# and display the resulting frame as the cell output.
df = pd.read_csv(
    filepath_or_buffer='dataset/train.csv',
    index_col='Unnamed: 0',
)
df

# 1. Loajálnym zákazníkom menej prekáža horší inflight service ako neloajálnym zákazníkom. 

In [None]:
# 'satisfaction' values of passengers who rated 'Inflight service' 3 or lower,
# split by 'Customer Type'.
poor_inflight_service = df['Inflight service'] <= 3
is_loyal = df['Customer Type'] == 'Loyal Customer'
is_disloyal = df['Customer Type'] == 'disloyal Customer'

loyal_customer = df.loc[is_loyal & poor_inflight_service, 'satisfaction']
disloyal_customer = df.loc[is_disloyal & poor_inflight_service, 'satisfaction']

In [None]:
# Frequency of each satisfaction value among the selected loyal customers,
# ordered by label so the bars line up with the disloyal-customer chart.
loyal = loyal_customer.value_counts().sort_index()
loyal.plot.bar()

# `jano` holds the group size; the cell output is each value's share in percent.
jano = loyal.sum()
loyal.div(jano).mul(100)

In [None]:
# Frequency of each satisfaction value among the selected disloyal customers,
# ordered by label so the bars line up with the loyal-customer chart.
disloyal = disloyal_customer.value_counts().sort_index()
disloyal.plot.bar()

# `jano` holds the group size; the cell output is each value's share in percent.
jano = disloyal.sum()
disloyal.div(jano).mul(100)

In [None]:
# Compare the satisfaction distributions of the two customer groups with a
# chi-square test on their 2 x k contingency table.
#
# chisquare() is a goodness-of-fit test: f_exp must be expected frequencies
# with the same total as the observed ones. Passing the raw counts of a
# differently sized sample violates that (recent SciPy raises ValueError) and
# would also ignore the sampling variability of the second group.
#
# concat aligns the two count Series on the satisfaction label; a label missing
# from one group becomes a zero count instead of silently shifting positions.
# NOTE(review): the test is non-directional; the direction claimed by the
# hypothesis has to be read from the percentage tables.
group_counts = pd.concat(
    [loyal, disloyal], axis=1, keys=['loyal', 'disloyal']
).fillna(0)
chi2_contingency(group_counts.T.to_numpy())

# 2. Ľuďom nad 50 rokov prekáža zlé pokrytie wifi počas letu menej, ako ľuďom pod 50 rokov.

### Alt. Ľuďom nad 50 rokov prekáža zlé pokrytie wifi počas letu rovnako/viac, ako ľuďom pod 50 rokov.

In [None]:
# 'satisfaction' values of passengers who rated 'Inflight wifi service' 3 or
# lower, split at an 'Age' of 50 (50 itself belongs to the older group).
poor_wifi = df['Inflight wifi service'] <= 3
fifty_or_older = df['Age'] >= 50
under_fifty = df['Age'] < 50

older_than_fifty = df.loc[fifty_or_older & poor_wifi, 'satisfaction']
younger_than_fifty = df.loc[under_fifty & poor_wifi, 'satisfaction']

In [None]:
# Sort by satisfaction label (as the other charts in this notebook do) so the
# bar order is fixed; value_counts() alone orders bars by frequency, which can
# put the categories in a different order than in the younger-group chart.
older_than_fifty.value_counts().sort_index().plot(kind='bar')

In [None]:
# Sort by satisfaction label (as the other charts in this notebook do) so the
# bar order is fixed; value_counts() alone orders bars by frequency, which can
# put the categories in a different order than in the older-group chart.
younger_than_fifty.value_counts().sort_index().plot(kind='bar')

In [None]:
# Satisfaction counts of the 50+ group, ordered by label.
older = older_than_fifty.value_counts().sort_index()

# `jano` holds the group size; the cell output is each value's share in percent.
jano = older.sum()
older.div(jano).mul(100)

In [None]:
# Satisfaction counts of the under-50 group, ordered by label.
younger = younger_than_fifty.value_counts().sort_index()

# `jano` holds the group size; the cell output is each value's share in percent.
jano = younger.sum()
younger.div(jano).mul(100)

In [None]:
# Compare the satisfaction distributions of the two age groups with a
# chi-square test on their 2 x k contingency table.
#
# chisquare() is a goodness-of-fit test: f_exp must be expected frequencies
# with the same total as the observed ones. Passing the raw counts of a
# differently sized sample violates that (recent SciPy raises ValueError) and
# would also ignore the sampling variability of the second group.
#
# concat aligns the two count Series on the satisfaction label; a label missing
# from one group becomes a zero count instead of silently shifting positions.
# NOTE(review): the test is non-directional; the direction claimed by the
# hypothesis has to be read from the percentage tables.
age_group_counts = pd.concat(
    [older, younger], axis=1, keys=['older', 'younger']
).fillna(0)
chi2_contingency(age_group_counts.T.to_numpy())

# 3 Ak je pasažier počas dlhého letu spokojný s komfortom, tak je spokojný aj s celým letom.

### Alt. Ak je pasažier počas dlhého letu nespokojný s komfortom, tak je spokojný aj s celým letom.

In [None]:
# 'satisfaction' values on flights longer than 843 distance units, split by
# whether 'Seat comfort' was rated above 3 or not.
# NOTE(review): 843 is presumably a summary statistic of 'Flight Distance'
# (e.g. the median) - confirm where the threshold comes from.
is_long_flight = df['Flight Distance'] > 843
low_seat_comfort = df['Seat comfort'] <= 3
high_seat_comfort = df['Seat comfort'] > 3

long_flights_dissatisfied = df.loc[is_long_flight & low_seat_comfort, 'satisfaction']
long_flights_satisfied = df.loc[is_long_flight & high_seat_comfort, 'satisfaction']

In [None]:
# Overall-satisfaction counts for long-flight passengers with seat comfort
# above 3, ordered by label and shown as a bar chart.
satisfied = long_flights_satisfied.value_counts().sort_index()
satisfied.plot.bar()

# `jano` holds the group size; the cell output is each value's share in percent.
jano = satisfied.sum()
satisfied.div(jano).mul(100)

In [None]:
# Overall-satisfaction counts for long-flight passengers with seat comfort of
# 3 or lower, ordered by label and shown as a bar chart.
dissatisfied = long_flights_dissatisfied.value_counts().sort_index()
dissatisfied.plot.bar()

# `jano` holds the group size; the cell output is each value's share in percent.
jano = dissatisfied.sum()
dissatisfied.div(jano).mul(100)

In [None]:
# Show the raw counts that form the two rows of the contingency table.
print(satisfied.tolist(), dissatisfied.tolist())

In [None]:
# Chi-square test of independence between the seat-comfort group (rows) and
# overall satisfaction (columns).
contingecny_table = [satisfied.tolist(), dissatisfied.tolist()]
stat, p, dof, expected = chi2_contingency(contingecny_table)

prob = 0.95
alpha = 1.0 - prob
critical = chi2.ppf(prob, dof)

# Decision by comparing the test statistic with the critical value.
print(f'probability={prob:.3f}, critical={critical:.3f}, stat={stat:.3f}')
print(
    'Dependent (reject H0)'
    if abs(stat) >= critical
    else 'Independent (fail to reject H0)'
)

# Decision by comparing the p-value with the significance level.
print(f'significance={alpha:.3f}, p={p:.3f}')
print(
    'Dependent (reject H0)'
    if p <= alpha
    else 'Independent (fail to reject H0)'
)

# 4 Počas dlhých letov je pasažierom ponúkané lepšie jedlo ako je ponúkané počas krátkych letov.

### Alt. Počas dlhých letov je pasažierom ponúkané horšie/rovnako kvalitné jedlo ako je ponúkané počas krátkych letov.

In [None]:
# 'Food and drink' ratings split at a flight distance of 843 (exactly 843
# counts as a short flight).
# NOTE(review): 843 is presumably a summary statistic of 'Flight Distance'
# (e.g. the median) - confirm where the threshold comes from.
flight_distance = df['Flight Distance']

long_flights = df.loc[flight_distance > 843, 'Food and drink']
short_flights = df.loc[flight_distance <= 843, 'Food and drink']

In [None]:
# Bar chart of 'Food and drink' rating counts on long flights, ordered by rating.
long_flights.value_counts().sort_index().plot.bar()

In [None]:
# Bar chart of 'Food and drink' rating counts on short flights, ordered by rating.
short_flights.value_counts().sort_index().plot.bar()

In [None]:
# 'Food and drink' rating counts on short flights, ordered by rating.
short = short_flights.value_counts().sort_index()

# `jano` holds the group size; the cell output is each rating's share in percent.
jano = short.sum()
short.div(jano).mul(100)

In [None]:
# 'Food and drink' rating counts on long flights, ordered by rating.
long_f = long_flights.value_counts().sort_index()

# `jano` holds the group size; the cell output is each rating's share in percent.
jano = long_f.sum()
long_f.div(jano).mul(100)

In [None]:
# Compare the 'Food and drink' rating distributions of long and short flights
# with a chi-square test on their 2 x k contingency table.
#
# chisquare() is a goodness-of-fit test: f_exp must be expected frequencies
# with the same total as the observed ones. Passing the raw counts of a
# differently sized sample violates that (recent SciPy raises ValueError) and
# would also ignore the sampling variability of the second group.
#
# concat aligns the two count Series on the rating value; a rating missing
# from one group becomes a zero count instead of silently shifting positions.
# NOTE(review): the test is non-directional; whether food is rated *better* on
# long flights has to be read from the percentage tables.
rating_counts = pd.concat(
    [long_f, short], axis=1, keys=['long', 'short']
).fillna(0)
chi2_contingency(rating_counts.T.to_numpy())

# 5 

Rozdiel v dĺžke meškania odletu medzi pasažiermi, ktorí sú spokojní s letom, a pasažiermi, ktorí sú nespokojní s letom, je signifikantný.

In [None]:
# Departure delays of passengers whose 'satisfaction' is
# 'neutral or dissatisfied', with a 30-bin distribution plot.
delay_dissatisfied = df.loc[
    df['satisfaction'] == 'neutral or dissatisfied',
    'Departure Delay in Minutes',
]
sns.distplot(delay_dissatisfied, bins=30)

# Shapiro-Wilk normality check on THIS sample. The previously commented-out
# call named delay_satisfied, which is a different sample and is not defined
# until a later cell, so normality was only ever checked for one group.
# NOTE(review): SciPy documents that the Shapiro-Wilk p-value may be inaccurate
# for samples larger than 5000 - confirm the sample size.
stats.shapiro(delay_dissatisfied)

In [None]:
# Departure delays of passengers whose 'satisfaction' is 'satisfied', with a
# 30-bin distribution plot.
satisfied_rows = df['satisfaction'] == 'satisfied'
delay_satisfied = df.loc[satisfied_rows, 'Departure Delay in Minutes']
sns.distplot(delay_satisfied, bins=30)

# Shapiro-Wilk normality check; its result is the cell output.
stats.shapiro(delay_satisfied)

In [None]:
# Mann-Whitney U test for a difference in departure delay between the two
# satisfaction groups.
# The hypothesis is two-sided ("the difference is significant"), so request
# that explicitly: the default for `alternative` changed between SciPy
# releases (older versions returned a one-sided p-value), and spelling it out
# keeps the reported p-value independent of the installed version.
stats.mannwhitneyu(delay_dissatisfied, delay_satisfied, alternative='two-sided')

In [None]:
# Bar chart of 'Departure Delay in Minutes' per 'satisfaction' group, drawn
# with capped error bars and seaborn's "Blues" palette.
blues = sns.color_palette("Blues")
sns.barplot(
    data=df,
    x='satisfaction',
    y='Departure Delay in Minutes',
    palette=blues,
    capsize=0.1,
    errwidth=2,
)