In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the dataset used by every later cell. The path is relative to the
# notebook's working directory.
df = pd.read_csv('data/Clean_Dataset.csv')

# Preview 20 random rows. The fixed seed keeps the displayed rows identical
# across "Restart Kernel -> Run All" so the preview is reproducible.
df.sample(20, random_state=42)

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Count of missing (NaN) entries per column.
df.isna().sum()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) using pandas'
# default column selection for describe().
df.describe()

In [None]:
# Two-column text table: column name (left-aligned, width 20) followed by
# the number of distinct values in that column.
print(f"{'column':<20}{'# unique values'}")
print("-" * 40)

# DataFrame.nunique() yields one count per column, in column order.
for column, n_unique in df.nunique().items():
    print(f"{column:<20}{n_unique}")

In [None]:
# Data type pandas inferred for each column.
df.dtypes

In [8]:
# Columns treated as categorical in the plots and group-bys below.
CAT_COLS = [
    'airline', 'flight',
    'source_city', 'departure_time',
    'stops',
    'arrival_time', 'destination_city',
    'class',
]

# Columns treated as numeric in the plots and correlations below.
NUM_COLS = ['duration', 'days_left', 'price']

In [None]:
# Print every distinct value of each categorical column, one block per
# column: an upper-cased header line followed by a comma-separated list.
print("Cat Cols values:")
print()

for column in CAT_COLS:
    print(f"{column.upper()}:")
    # Convert each value with str() first: str.join raises TypeError on any
    # non-string entry (for example NaN from a missing value). For columns
    # that only contain strings the output is unchanged.
    print(", ".join(map(str, df[column].unique())))
    print()

In [None]:
# Categorical columns without "flight"; the result is a new list, so
# CAT_COLS itself is left untouched.
CAT_COLS_SMALL = [name for name in CAT_COLS if name != "flight"]
print(CAT_COLS_SMALL)

In [None]:
# One horizontal count plot per categorical column, stacked vertically on a
# shared x-axis. Each panel's height is proportional to the number of
# categories it shows, so bars have comparable thickness across panels.
heights = [df[col].nunique() for col in CAT_COLS_SMALL]
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2']

fig, axs = plt.subplots(
    ncols=1,
    nrows=len(CAT_COLS_SMALL),
    figsize=(10, len(CAT_COLS_SMALL)),
    sharex=True,
    squeeze=False,  # always return a 2-D array, even when there is one panel
    gridspec_kw={'height_ratios': heights},  # per-row relative heights
)

for index, (column, ax) in enumerate(zip(CAT_COLS_SMALL, axs.flatten())):
    # Most frequent category first.
    order = df[column].value_counts().index
    sns.countplot(
        data=df,  # keyword form: older seaborn does not accept data positionally
        y=column,
        ax=ax,
        order=order,
        # Cycle through the palette so no panel is skipped when there are
        # more columns than colors.
        color=colors[index % len(colors)],
    )

fig.tight_layout()
# Applied after tight_layout so the panels end up touching each other.
fig.subplots_adjust(hspace=0.0, wspace=0)

In [None]:
# Number of categorical columns that get a panel/row in the figures.
len(CAT_COLS_SMALL)

In [None]:
# Grid of horizontal box plots: one row per categorical column and one
# column per numeric column. Row heights are proportional to the number of
# categories in that row's categorical column.
heights = [df[col].nunique() for col in CAT_COLS_SMALL]
total_height = sum(heights)
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2']

fig, axs = plt.subplots(
    nrows=len(CAT_COLS_SMALL),         # one row per categorical column
    ncols=len(NUM_COLS),               # one column per numeric column
    figsize=(15, total_height * 0.3),  # figure height scales with total category count
    sharex='col',                      # panels in a grid column share the numeric axis
    sharey='row',                      # panels in a grid row share the category axis
    squeeze=False,                     # keep axs 2-D even if a dimension has length 1
    gridspec_kw={'height_ratios': heights},
)

for row_index, cat_col in enumerate(CAT_COLS_SMALL):
    # One color per row; cycle through the palette so rows beyond its length
    # are still drawn instead of being silently skipped.
    color = colors[row_index % len(colors)]

    for col_index, num_col in enumerate(NUM_COLS):
        ax = axs[row_index, col_index]

        sns.boxplot(
            data=df,
            x=num_col,  # numeric values on the x-axis
            y=cat_col,  # categories on the y-axis
            ax=ax,
            color=color,
        )

# Remove the gaps between panels.
fig.subplots_adjust(hspace=0.0, wspace=0)

In [None]:
# One histogram per numeric column, stacked vertically; bars are split by
# the "class" column via hue.
fig, axs = plt.subplots(nrows=len(NUM_COLS), ncols=1, figsize=(10, 12))

for ax, column in zip(axs.flatten(), NUM_COLS):
    sns.histplot(data=df, x=column, hue="class", bins=100, ax=ax)

fig.tight_layout()

In [None]:
# Pair plot colored by "class", drawn on a random subset of at most 2000 rows.
# The sample is seeded so the figure is reproducible on re-run, and its size
# is capped at len(df) because DataFrame.sample raises when asked for more
# rows than exist (without replacement).
pairplot_sample = df.sample(n=min(2000, len(df)), random_state=42)
sns.pairplot(data=pairplot_sample, hue="class")

In [None]:

# Number of rows for every (source_city, destination_city) pair.
crosstab_result = pd.crosstab(df['source_city'], df['destination_city'])

# Draw the counts as an annotated heatmap and label it.
sns.heatmap(crosstab_result, annot=True, fmt=".5g", cmap="Blues")
plt.title('Count of flights from Source to Destination City')
plt.xlabel('Destination City')
plt.ylabel('Source City')
plt.show()

In [None]:

# Mean duration for every (source_city, destination_city) pair.
# The aggregation is passed by name ("mean") rather than as np.mean: recent
# pandas versions emit a FutureWarning when a NumPy callable is supplied and
# recommend the string form to keep the current behavior.
crosstab_result = pd.crosstab(
    index=df['source_city'],
    columns=df['destination_city'],
    values=df['duration'],
    aggfunc="mean",
)

# Plot the heatmap
sns.heatmap(crosstab_result, cmap="Blues", annot=True, fmt=".5g")
plt.title('Mean Duration from Source to Destination City')
plt.xlabel('Destination City')
plt.ylabel('Source City')
plt.show()

In [None]:
# Mean price for every (source_city, destination_city) pair.
# The aggregation is passed by name ("mean") rather than as np.mean: recent
# pandas versions emit a FutureWarning when a NumPy callable is supplied and
# recommend the string form to keep the current behavior.
crosstab_result = pd.crosstab(
    index=df['source_city'],
    columns=df['destination_city'],
    values=df['price'],
    aggfunc="mean",
)

# Plot the heatmap
sns.heatmap(crosstab_result, cmap="Blues", annot=True, fmt=".5g")
plt.title('Mean Price from Source to Destination City')
plt.xlabel('Destination City')
plt.ylabel('Source City')
plt.show()

In [None]:
# Mean price per category, printed for each categorical column in turn and
# followed by a 25-character separator line.
for column in CAT_COLS:
    mean_price = df.groupby(column)['price'].mean()
    print(mean_price, '=' * 25, sep='\n')

In [None]:
# Mean price per (class, category) pair, printed for each categorical column
# in turn and followed by a 25-character separator line.
for column in CAT_COLS:
    mean_price_by_class = df.groupby(['class', column])['price'].mean()
    print(mean_price_by_class, '=' * 25, sep='\n')

In [None]:
# Histogram of price (50 bins) drawn once per value of "class".
price_by_class = df.groupby('class')['price']
price_by_class.plot(kind='hist', bins=50)

In [None]:
# Correlation matrix of duration, days_left and price, shown as an annotated
# heatmap.
numeric_corr = df[['duration', 'days_left', 'price']].corr()
sns.heatmap(data=numeric_corr, annot=True)

In [None]:
# Summary statistics restricted to rows whose class is "Economy".
mask_eco = df['class'] == 'Economy'
df[mask_eco].describe()

In [None]:
# Summary statistics restricted to rows whose class is "Business".
mask_bis = df['class'] == 'Business'
df[mask_bis].describe()