# L1 - EDA
**CH01A Finding a Good Deal among Hotels: Data Collection**

hotels-vienna dataset

#### First look at the hotels data


In [None]:
import os
import sys
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
warnings.filterwarnings("ignore")

In [None]:
# Read the cleaned, tidy Vienna hotels file; `df` is the working frame used by the cells below
df = pd.read_csv(filepath_or_buffer="hotels-vienna.csv")

In [None]:
# Show the loaded frame as the cell's rich output
df

In [None]:
# Summary statistics of the loaded frame
df.describe()

In [None]:
# Column labels present in the loaded file
df.columns

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# --------------------------------------------
# First look: narrow the frame to the columns used in this walkthrough
# --------------------------------------------
df = df.filter(
    items=[
        "hotel_id", "accommodation_type",
        "country", "city", "city_actual", "neighbourhood",
        "center1label", "distance",
        "center2label", "distance_alter",
        "stars", "rating", "rating_count", "ratingta", "ratingta_count",
        "year", "month", "weekend", "holiday", "nnights",
        "price", "scarce_room", "offer", "offer_cat",
    ]
)

In [None]:
# Dtypes and non-null counts after the column selection
df.info()

# Finding numerical variables and feature engineering for numerical variables

In [None]:
# Numerical variables = every column whose dtype is not object ('O')
numerical = [var for var, col_dtype in df.dtypes.items() if col_dtype != 'O']
print('There are {} numerical variables\n'.format(len(numerical)))
print('The numerical variables are :', numerical)

In [None]:
# Preview the first rows of the numerical columns only
df.loc[:, numerical].head()

In [None]:
# Numerical-only working frame used by the imputation and outlier cells
df_num = df.loc[:, numerical]
df_num

In [None]:
# Dtypes and non-null counts of the numerical columns
df_num.info()

In [None]:
# Number of missing cells per numerical column
df_num.isna().sum()

In [None]:
# Mean imputation: each missing cell is replaced by the mean of its own column
column_means = df_num.mean()
df_num_imputed = df_num.fillna(value=column_means)
df_num_imputed.info()

In [None]:
# Alternative to imputation: keep only the rows that have no missing value at all
df_num_dropped = df_num.dropna(axis=0, how='any')
df_num_dropped.info()

In [None]:
# Summary statistics of the mean-imputed numerical data
df_num_imputed.describe()

In [None]:
# Summary statistics of the numerical data after dropping incomplete rows
df_num_dropped.describe()

In [None]:
# Pairwise correlations of the numerical variables (rows with missing values dropped)
correlation_matrix = df_num_dropped.corr()

# Draw the matrix as an annotated heatmap on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Coefficient Matrix')

plt.show()


In [None]:
# Pairwise correlations of the numerical variables (mean-imputed data)
correlation_matrix = df_num_imputed.corr()

# Draw the matrix as an annotated heatmap on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Coefficient Matrix')

plt.show()

The correlation matrix shows no values for variables such as "year", "month" and "weekend" because each of them takes a single value in this dataset. A constant column has zero variance, so its correlation with any other variable is undefined and is reported as NaN.

Correlation coefficients are typically used to measure the strength and direction of linear relationships between continuous variables. Since categorical variables do not have a natural ordering or numeric representation, it is not meaningful to calculate correlation coefficients between them.

If you have categorical or binary variables in your dataset, it is common to calculate other types of association measures specific to categorical variables. Some commonly used measures for categorical variables include chi-square test, Cramer's V, and point biserial correlation.

In [None]:
# Count distinct values per numerical column to spot integer-valued variables that are
# really categorical (column list taken from df_num, counts from the imputed frame)
distinct_counts = df_num_imputed[list(df_num)].nunique()
for column, num_categories in distinct_counts.items():
    print(f"Number of categories in {column}: {num_categories}")

In [None]:
# These numeric variables have only one distinct value, so they are removed.
# Note: this rebinds df_num_dropped; from here on it is the imputed frame minus these columns.
columns_to_remove = ['year', 'month', 'weekend', 'holiday', 'nnights']
df_num_dropped = df_num_imputed.drop(labels=columns_to_remove, axis=1)
df_num_dropped

In [None]:
# Dtypes and non-null counts after removing the single-valued columns
df_num_dropped.info()

In [None]:
# Pairwise correlations of the numerical variables left in df_num_dropped
correlation_matrix = df_num_dropped.corr()

# Draw the matrix as an annotated heatmap on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Coefficient Matrix')

plt.show()

# Outliers in numerical variables

In [None]:
# Columns available for the outlier analysis
df_num_dropped.columns

In [None]:
# Numerical variables to plot
numerical_vars = ['distance', 'distance_alter', 'stars', 'rating', 'rating_count',
                  'ratingta', 'ratingta_count', 'price']

# A single row of panels, one histogram per variable
fig, axs = plt.subplots(nrows=1, ncols=len(numerical_vars), figsize=(12, 4))

for ax, var in zip(axs, numerical_vars):
    ax.hist(df_num_dropped[var], edgecolor='white', align='mid')
    ax.set_xlabel(var)
    ax.set_ylabel('Frequency')

# Tighten the spacing, then render
plt.tight_layout()
plt.show()


In [None]:
# Numerical variables to plot
numerical_vars = ['distance', 'distance_alter', 'stars', 'rating', 'rating_count',
                  'ratingta', 'ratingta_count', 'price']

# Lay the histograms out two per row. The row count is rounded up so that an odd
# number of variables still gets a panel for the last one.
num_plots = len(numerical_vars)
num_cols = 2
num_rows = max(1, (num_plots + num_cols - 1) // num_cols)

# squeeze=False always returns a 2-D axes array, so axs[row, col] works for a single row too
fig, axs = plt.subplots(num_rows, num_cols, figsize=(12, 4 * num_rows), squeeze=False)

# Plot one histogram per variable, filling the grid row by row
for i, var in enumerate(numerical_vars):
    row, col = divmod(i, num_cols)
    axs[row, col].hist(df_num_dropped[var], edgecolor='white', align='mid')
    axs[row, col].set_xlabel(var)
    axs[row, col].set_ylabel('Frequency')

# Hide any panel that has no variable assigned to it
for unused_ax in axs.ravel()[num_plots:]:
    unused_ax.set_visible(False)

# Adjust spacing between subplots
plt.tight_layout()

# Display the plot
plt.show()


In [None]:
# Numerical variables to plot
numerical_vars = ['distance', 'distance_alter', 'stars', 'rating', 'rating_count',
                  'ratingta', 'ratingta_count', 'price']

# One row per variable: histogram on the left, horizontal box plot on the right
n_vars = len(numerical_vars)
fig, axs = plt.subplots(nrows=n_vars, ncols=2, figsize=(12, 4 * n_vars))

for (hist_ax, box_ax), var in zip(axs, numerical_vars):
    values = df_num_dropped[var]

    hist_ax.hist(values, edgecolor='white', align='mid')
    hist_ax.set_xlabel(var)
    hist_ax.set_ylabel('Frequency')

    box_ax.boxplot(values, vert=False)
    box_ax.set_xlabel(var)
    box_ax.set_yticklabels([])

# Tighten the spacing, then render
plt.tight_layout()
plt.show()


In [None]:
# Numerical variables to plot
numerical_vars = ['distance', 'distance_alter', 'stars', 'rating', 'rating_count',
                  'ratingta', 'ratingta_count', 'price']

# Box plots on a 3 x 3 grid, one panel per variable
fig = plt.figure(figsize=(15, 10))

for position, var in enumerate(numerical_vars, start=1):
    ax = fig.add_subplot(3, 3, position)
    df_num_dropped.boxplot(column=var, ax=ax)
    ax.set_title(var)
    ax.set_ylabel('')

# Tighten the spacing, then render
plt.tight_layout()
plt.show()


In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# Numerical variables to winsorize and scale
numerical_vars = ['distance', 'distance_alter', 'stars', 'rating', 'rating_count',
                  'ratingta', 'ratingta_count', 'price']

# Winsorize: cap every variable at its own 1st and 99th percentile
winsorized_df = df_num_dropped.copy()
for var in numerical_vars:
    column = df_num_dropped[var]
    q_low, q_high = column.quantile(0.01), column.quantile(0.99)
    winsorized_df[var] = column.clip(lower=q_low, upper=q_high)

# Scale the winsorized variables; the result carries only the columns in numerical_vars
scaler = MinMaxScaler()  # or StandardScaler()
scaled_values = scaler.fit_transform(winsorized_df[numerical_vars])
scaled_df = pd.DataFrame(scaled_values, columns=numerical_vars)

# Display the scaled DataFrame
print(scaled_df)

In [None]:
# Numerical variables to plot
numerical_vars = ['distance', 'distance_alter', 'stars', 'rating', 'rating_count',
                  'ratingta', 'ratingta_count', 'price']

# 2 x 4 grid of histograms of the scaled data, addressed as a flat sequence of axes
fig, axs = plt.subplots(nrows=2, ncols=4, figsize=(12, 8))
axs = axs.ravel()

for ax, var in zip(axs, numerical_vars):
    ax.hist(scaled_df[var], bins=10, edgecolor='white', align='mid')
    ax.set_xlabel(var)
    ax.set_ylabel('Frequency')

# Tighten the spacing, then render
plt.tight_layout()
plt.show()


In [None]:
# Numerical variables to plot
numerical_vars = ['distance', 'distance_alter', 'stars', 'rating', 'rating_count',
                  'ratingta', 'ratingta_count', 'price']

# Box plots of the scaled data on a 3 x 3 grid, one panel per variable
fig = plt.figure(figsize=(15, 10))

for position, var in enumerate(numerical_vars, start=1):
    ax = fig.add_subplot(3, 3, position)
    scaled_df.boxplot(column=var, ax=ax)
    ax.set_title(var)
    ax.set_ylabel('')

# Tighten the spacing, then render
plt.tight_layout()
plt.show()

In [None]:
# IQR rule: a value counts as an outlier when it lies more than 1.5 * IQR below the first
# quartile or above the third quartile of its column. Rows holding any such value are removed.
# scaled_df is rebound to the filtered frame, so re-running this cell filters again.
Q1, Q3 = scaled_df.quantile(0.25), scaled_df.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
has_outlier = (scaled_df.lt(lower_fence) | scaled_df.gt(upper_fence)).any(axis=1)
scaled_df = scaled_df[~has_outlier]

In [None]:
# Box plots again, now on the data left after dropping the outlier rows
numerical_vars = ['distance', 'distance_alter', 'stars', 'rating', 'rating_count',
                  'ratingta', 'ratingta_count', 'price']

# 3 x 3 grid, one panel per variable
fig = plt.figure(figsize=(15, 10))

for position, var in enumerate(numerical_vars, start=1):
    ax = fig.add_subplot(3, 3, position)
    scaled_df.boxplot(column=var, ax=ax)
    ax.set_title(var)
    ax.set_ylabel('')

# Tighten the spacing, then render
plt.tight_layout()
plt.show()

In [None]:
# Row count and dtypes of the scaled data
scaled_df.info()

In [None]:
# Show the scaled frame as the cell's rich output
scaled_df

In [None]:
# Discard the current index and number the rows afresh from 0
scaled_df = scaled_df.reset_index(level=None, drop=True)
scaled_df

In [None]:
# 2 x 4 grid of histograms of the current scaled_df, addressed as a flat sequence of axes
# (numerical_vars comes from an earlier cell)
fig, axs = plt.subplots(nrows=2, ncols=4, figsize=(12, 8))
axs = axs.ravel()

for ax, var in zip(axs, numerical_vars):
    ax.hist(scaled_df[var], bins=10, edgecolor='white', align='mid')
    ax.set_xlabel(var)
    ax.set_ylabel('Frequency')

# Tighten the spacing, then render
plt.tight_layout()
plt.show()

In [None]:
# Scatter plot of scaled price against scaled distance
ax = sns.scatterplot(data=scaled_df, x='distance', y='price')
ax.set_xlabel('Distance')
ax.set_ylabel('Price')
ax.set_title('Relationship between Price and Distance')
plt.show()


In [None]:
# Same relationship with a fitted regression line (semi-transparent points, red line)
point_style = {'alpha': 0.5}
line_style = {'color': 'red'}
ax = sns.regplot(data=scaled_df, x='distance', y='price',
                 scatter_kws=point_style, line_kws=line_style)
ax.set_xlabel('Distance')
ax.set_ylabel('Price')
ax.set_title('Relationship between Price and Distance')
plt.show()


In [None]:
# Joint plot (kind='reg') of price against distance
grid = sns.jointplot(data=scaled_df, x='distance', y='price', kind='reg', height=6, ratio=3)

# Label through the JointGrid so the text lands on the joint axes no matter which
# axes pyplot currently treats as active
grid.set_axis_labels('Distance', 'Price')

# A figure-level title sits above the top marginal plot instead of being drawn over it
joint_fig = grid.ax_joint.figure
joint_fig.suptitle('Relationship between Price and Distance')
joint_fig.subplots_adjust(top=0.93)  # leave room for the title

plt.show()


In [None]:
# Joint plot (kind='reg') of price against rating
grid = sns.jointplot(data=scaled_df, x='rating', y='price', kind='reg', height=6, ratio=3)

# Label through the JointGrid so the text lands on the joint axes no matter which
# axes pyplot currently treats as active
grid.set_axis_labels('Rating', 'Price')

# A figure-level title sits above the top marginal plot instead of being drawn over it
joint_fig = grid.ax_joint.figure
joint_fig.suptitle('Relationship between Price and Rating')
joint_fig.subplots_adjust(top=0.93)  # leave room for the title

plt.show()


In [None]:
# Joint plots (kind='reg') of price against each of the two rating-count variables.
# Labels go through the JointGrid and the title is figure-level, so neither depends on
# which axes pyplot treats as active and the title is not drawn over the top marginal plot.
for x_var, x_label in [('rating_count', 'Rating Count'), ('ratingta_count', 'RatingTA Count')]:
    grid = sns.jointplot(data=scaled_df, x=x_var, y='price', kind='reg', height=6, ratio=3)
    grid.set_axis_labels(x_label, 'Price')

    joint_fig = grid.ax_joint.figure
    joint_fig.suptitle(f'Relationship between Price and {x_label}')
    joint_fig.subplots_adjust(top=0.93)  # leave room for the title

    plt.show()


In [None]:
# Joint plot (kind='reg') of price against stars
grid = sns.jointplot(data=scaled_df, x='stars', y='price', kind='reg', height=6, ratio=3)

# Label through the JointGrid so the text lands on the joint axes no matter which
# axes pyplot currently treats as active
grid.set_axis_labels('Stars', 'Price')

# A figure-level title sits above the top marginal plot instead of being drawn over it
joint_fig = grid.ax_joint.figure
joint_fig.suptitle('Relationship between Price and Stars')
joint_fig.subplots_adjust(top=0.93)  # leave room for the title

plt.show()

# Finding categorical variables and feature engineering for categorical variables

In [None]:
# Object-dtype columns are treated as the categorical variables
categorical_columns = df.select_dtypes(include='object').columns
print(categorical_columns)

In [None]:
# Same selection as a plain list, printed one name per line
categorical_columns = df.select_dtypes(include='object').columns.tolist()
for column in categorical_columns:
    print(column)

In [None]:
# Categorical variables = every column whose dtype is object ('O')
categorical = [var for var, col_dtype in df.dtypes.items() if col_dtype == 'O']
print('There are {} categorical variables\n'.format(len(categorical)))
print('The categorical variables are :', categorical)

In [None]:
# `categorical` is a plain list of column names, so select those columns from df
# before asking for dtypes and non-null counts
df[categorical].info()

In [None]:
# `categorical` is a list of column names and cannot be called; preview the columns it names
df[categorical].head()

# To create a new DataFrame, df_cat, containing only the categorical variables from your original DataFrame, you can use the loc accessor to select the desired columns.

In [None]:
# Categorical-only frame: all rows, object-dtype columns
categorical_columns = df.select_dtypes(include='object').columns
df_cat = df[categorical_columns]
df_cat

# In the code above, categorical_columns is assigned the column names of the categorical variables using select_dtypes(include=['object']). Then, df.loc[:, categorical_columns] selects all rows (:) and the columns specified in categorical_columns, creating a new DataFrame, df_cat, containing only the categorical variables.

In [None]:
# Dtypes and non-null counts of the categorical columns
df_cat.info()

In [None]:
# Number of distinct categories in each categorical column
for column, num_categories in df_cat.nunique().items():
    print(f"Number of categories in {column}: {num_categories}")

In [None]:
# Remove the categorical variables that have a single distinct category
columns_to_remove = ['country', 'city', 'center1label', 'center2label']
df_cat_dropped = df_cat.drop(labels=columns_to_remove, axis=1)
df_cat_dropped

In [None]:
# Step 1: per-column category counts and their share of all rows (in percent).
# value_counts is applied column by column, so the result has one column per variable.
n_rows_total = len(df_cat_dropped)
category_counts = df_cat_dropped.apply(lambda column: column.value_counts())
category_percentages = category_counts / n_rows_total * 100

In [None]:
#Step 2: Plot the frequency of each category, one panel per categorical variable.
import matplotlib.pyplot as plt

cat_columns = list(df_cat_dropped.columns)

# squeeze=False keeps axs two-dimensional even when there is only one variable
fig, axs = plt.subplots(len(cat_columns), 1, figsize=(10, 6 * len(cat_columns)), squeeze=False)

for ax, column in zip(axs[:, 0], cat_columns):
    # Count this column's own categories only. A count table shared by all columns is
    # indexed by the union of every column's categories, which would put unrelated
    # labels (with empty bars) on each panel.
    counts = df_cat_dropped[column].value_counts()
    shares = counts / len(df_cat_dropped) * 100
    labels = counts.index.astype(str)

    # Bars: absolute counts (left axis)
    ax.bar(labels, counts.values)
    ax.set_xlabel(column)
    ax.set_ylabel('Count')
    ax.set_title(f'Category Frequencies - {column}')
    ax.tick_params(axis='x', rotation=90)

    # Red line: the same categories as a percentage of all rows (right axis)
    ax2 = ax.twinx()
    ax2.plot(labels, shares.values, color='red', marker='o')
    ax2.set_ylabel('Percentage')
    ax2.set_ylim(0, 100)
    ax2.grid(False)

plt.tight_layout()
plt.show()

In [None]:
#A Different way to visualize the distribution for categorical variables
cat_columns = list(df_cat_dropped.columns)

# squeeze=False keeps axs two-dimensional even when there is only one variable
fig, axs = plt.subplots(len(cat_columns), 1, figsize=(10, 6 * len(cat_columns)), squeeze=False)

for ax, column in zip(axs[:, 0], cat_columns):
    # Counts per category and their share of all rows (in percent)
    value_counts = df_cat_dropped[column].value_counts()
    percentages = value_counts / len(df_cat_dropped) * 100

    # Bar chart of the counts
    ax.bar(value_counts.index.astype(str), value_counts.values)
    ax.set_xlabel(column)
    ax.set_ylabel('Count')
    ax.set_title(f'Category Frequencies - {column}')

    # Annotate each bar with "count (percent)". Counts and percentages share the same
    # order, so they are paired positionally rather than by an integer lookup on a
    # Series that is indexed by category label.
    for position, (count, share) in enumerate(zip(value_counts, percentages)):
        ax.text(position, count, f'{count} ({share:.2f}%)', ha='center', va='bottom')

# Adjust spacing between subplots
plt.tight_layout()

# Show the plot
plt.show()


In [None]:
#Another way: seaborn count plots with "count (percent)" labels above the bars
cat_columns = list(df_cat_dropped.columns)
total_count = len(df_cat_dropped)

# squeeze=False keeps axs two-dimensional even when there is only one variable
fig, axs = plt.subplots(len(cat_columns), 1, figsize=(10, 6 * len(cat_columns)), squeeze=False)

for ax, column in zip(axs[:, 0], cat_columns):
    # One bar per category, bar height = number of rows in that category
    sns.countplot(data=df_cat_dropped, x=column, ax=ax)
    ax.set_xlabel(column)
    ax.set_ylabel('Count')
    ax.set_title(f'Category Frequencies - {column}')

    # Label each bar. Bar heights are floats, so convert to int to print "123"
    # rather than "123.0".
    for p in ax.patches:
        height = p.get_height()
        count = int(round(height))
        ax.text(p.get_x() + p.get_width() / 2,
                height + 3,
                f'{count} ({count / total_count * 100:.2f}%)',
                ha='center')

# Adjust spacing between subplots
plt.tight_layout()

# Show the plot
plt.show()


In [None]:
# One-hot encode the nominal variables.
# dtype=int makes every dummy column an integer 0/1 column on all pandas versions, so
# the numeric plots and summaries in the following cells can use them directly.
df_encoded = pd.get_dummies(
    df_cat_dropped,
    columns=['accommodation_type', 'city_actual', 'neighbourhood', 'offer_cat'],
    dtype=int,
)
df_encoded.info()

In [None]:
# Set the style for the plot
sns.set(style='ticks')

# Histogram of every dummy variable. DataFrame.hist only accepts numeric/datetime
# columns, so the dummies are cast to int first (a no-op when they already are).
df_encoded.astype(int).hist(figsize=(10, 8), bins=10, edgecolor='black', grid=False)

# Adjust the layout
plt.tight_layout()

# Show the plot
plt.show()


In [None]:
# Share of 0s and 1s in every dummy variable, to see which categories are common and which are rare
percentage_zeros = df_encoded.eq(0).mean() * 100
percentage_ones = df_encoded.eq(1).mean() * 100

sns.set(style='ticks')

# Stacked bars: the zeros share at the bottom, the ones share on top of it
fig, ax = plt.subplots(figsize=(10, 8))
ax.bar(percentage_zeros.index, percentage_zeros, label='Zeros', alpha=0.7)
ax.bar(percentage_ones.index, percentage_ones, bottom=percentage_zeros, label='Ones', alpha=0.7)

ax.set_xlabel('Variables')
ax.set_ylabel('Percentage')
ax.set_title('Percentage Distribution of Zeros and Ones for Each Categorical Variable')

# Slant the variable names on the x-axis
plt.xticks(rotation=45)

ax.legend()

plt.tight_layout()
plt.show()


In [None]:
# Example: relate two dummy variables with seaborn's regplot, using the mean of y
# at each x value as the point estimate (x_estimator=np.mean)
sns.set(style='ticks')

ax = sns.regplot(data=df_encoded,
                 x="accommodation_type_Hotel",
                 y="city_actual_Fischamend",
                 x_estimator=np.mean)
ax.set_title('Relationship between city_actual_Fischamend and accommodation_type')

plt.show()


In [None]:
# Example: distribution of 0s and 1s in a single dummy variable,
# here the 'offer_cat_15-50% offer' column
fig, ax = plt.subplots()
ax.hist(df_encoded['offer_cat_15-50% offer'], bins=10)

ax.set_xlabel('offer_cat_15-50% offer')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of offer_cat_15-50% offer')

plt.show()

In [None]:
# Example: distribution of 0s and 1s in a single dummy variable,
# here the 'neighbourhood_Wahring' column
fig, ax = plt.subplots()
ax.hist(df_encoded['neighbourhood_Wahring'], bins=10)

ax.set_xlabel('neighbourhood_Wahring')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of neighbourhood_Wahring')

plt.show()

# Finding outliers for variables

In [None]:
# Summary statistics of the one-hot encoded variables (all columns included)
encoded_summary = df_encoded.describe(include='all')
print(encoded_summary)

In [None]:
# Box plot of every encoded variable on a grid that is 8 panels wide.
# The number of rows follows from the number of columns in df_encoded, so the grid
# always has room for every variable.
encoded_columns = list(df_encoded.columns)
n_grid_cols = 8
n_grid_rows = max(1, -(-len(encoded_columns) // n_grid_cols))  # ceiling division

# squeeze=False keeps axs two-dimensional even for a single row
fig, axs = plt.subplots(n_grid_rows, n_grid_cols, figsize=(15, 2 * n_grid_rows), squeeze=False)
fig.suptitle('Boxplots for Each Variable')
flat_axes = axs.ravel()

for ax, column in zip(flat_axes, encoded_columns):
    # Cast to float so boolean dummies are accepted by the quartile computation
    ax.boxplot(df_encoded[column].astype(float))
    ax.set_title(column, fontsize=8)
    ax.set_xticklabels([])
    ax.tick_params(axis='x', which='both', bottom=False, top=False)

# Hide the panels that have no variable assigned to them
for unused_ax in flat_axes[len(encoded_columns):]:
    unused_ax.set_visible(False)

# Leave room at the top for the figure title
fig.tight_layout(rect=[0, 0, 1, 0.95])

# Show the plot
plt.show()


In [None]:
# Narrow df to the columns shown in Table 1.1; this rebinds df for all cells below
df = df.filter(
    items=[
        "hotel_id", "accommodation_type",
        "country", "city", "city_actual",
        "center1label", "distance",
        "stars", "rating", "price",
    ]
)

### Table 1.1

In [None]:
# First rows of the reduced frame
df.head()

In [None]:
# Quartiles and interquartile range of price
price = df['price']
q1, q3 = price.quantile(0.25), price.quantile(0.75)
IQR = q3 - q1

# Fences at 1.5 * IQR beyond the quartiles
Lower_fence = q1 - (IQR * 1.5)
Upper_fence = q3 + (IQR * 1.5)

# Rows whose price falls outside the fences
outliers = df[(price < Lower_fence) | (price > Upper_fence)]

# Report the fences and the outlying rows
print("Price outliers are values < {lowerboundary} or > {upperboundary}".format(
    lowerboundary=Lower_fence, upperboundary=Upper_fence))
print(outliers)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Fences at 1.5 * IQR beyond the quartiles of price, and the rows outside them
price = df['price']
q1, q3 = price.quantile(0.25), price.quantile(0.75)
IQR = q3 - q1
Lower_fence = q1 - (IQR * 1.5)
Upper_fence = q3 + (IQR * 1.5)
outliers = df[(price < Lower_fence) | (price > Upper_fence)]

# Horizontal box plot of price with the outlying prices over-plotted as red dots at y = 1
fig, ax = plt.subplots()
ax.boxplot(price, vert=False, sym='r')
ax.plot(outliers['price'], [1] * len(outliers), 'ro', label='Outliers')

ax.set_xlabel('Price')
ax.set_title('Distribution of Price with Outliers')
ax.legend()

plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Select every numeric column. 'number' covers all integer and float widths, whereas
# 'int' / 'float' only match the platform's default widths.
numeric_columns = df.select_dtypes(include='number').columns

# One horizontal box plot per numeric column, each on its own figure
for column in numeric_columns:
    fig, ax = plt.subplots()

    # Missing values are dropped first: a NaN in the data makes the box plot come out empty
    ax.boxplot(df[column].dropna(), vert=False, sym='r')

    ax.set_xlabel(column)
    ax.set_title(f"Distribution of {column} with Outliers")

    plt.show()



In [None]:
# Display the first few rows of the DataFrame
print(df.head())

# Summary statistics
print(df.describe())

# Data types and missing values.
# info() prints its report itself and returns None, so it is not wrapped in print().
df.info()

# Distribution of accommodation types
plt.figure(figsize=(8, 6))
sns.countplot(data=df, x='accommodation_type')
plt.title('Distribution of Accommodation Types')
plt.show()

# Distribution of stars
plt.figure(figsize=(8, 6))
sns.countplot(data=df, x='stars')
plt.title('Distribution of Stars')
plt.show()

# Correlation heatmap.
# df still holds text columns (e.g. accommodation_type, city), which cannot be
# correlated, so the matrix is computed on the numeric columns only.
correlation = df.select_dtypes(include='number').corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

# Scatter plot of price vs. rating
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='price', y='rating')
plt.title('Price vs. Rating')
plt.show()


In [None]:
# Display the data types of each column in the DataFrame
print(df.dtypes)

# Separate categorical (object dtype) and numerical variables.
# 'number' matches every integer and float width; 'int' / 'float' only match the
# platform's default widths and can silently skip columns.
categorical_vars = df.select_dtypes(include='object').columns
numerical_vars = df.select_dtypes(include='number').columns

# Print the categorical variables
print('Categorical Variables:')
print(categorical_vars)

# Print the numerical variables
print('Numerical Variables:')
print(numerical_vars)


In [None]:
# Missing values per categorical (object-dtype) column
categorical_vars = df.select_dtypes(include='object').columns
missing_counts = df[categorical_vars].isnull().sum()
print('Missing Value Counts:')
print(missing_counts)

# Count plot per categorical variable on a 2 x 3 grid
fig = plt.figure(figsize=(12, 8))
for position, var in enumerate(categorical_vars, start=1):
    ax = fig.add_subplot(2, 3, position)
    sns.countplot(data=df, x=var, ax=ax)
    ax.set_title('Frequency Distribution of {}'.format(var))
    plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Flag a variable with more than two categories when at least one category occurs
# less than 95% as often as the most frequent one (value_counts is sorted descending,
# so its first entry is the most frequent category)
outlier_vars = []
for var in categorical_vars:
    value_counts = df[var].value_counts()
    if len(value_counts) > 2:
        outlier_threshold = value_counts.iloc[0] * 0.95
        if (value_counts < outlier_threshold).any():
            outlier_vars.append(var)

print('Potential Outlier Variables:')
print(outlier_vars)


In [None]:
# Display the count of missing values in each numerical variable.
# 'number' matches every integer and float width, not only the platform defaults.
numerical_vars = df.select_dtypes(include='number').columns
missing_counts = df[numerical_vars].isnull().sum()
print('Missing Value Counts:')
print(missing_counts)

# Summary statistics
summary_stats = df[numerical_vars].describe()
print('Summary Statistics:')
print(summary_stats)

# Visualize the distribution of each numerical variable, three panels per row.
# The row count follows from the number of variables, so every variable gets a panel.
n_panel_cols = 3
n_panel_rows = max(1, -(-len(numerical_vars) // n_panel_cols))  # ceiling division
fig, axs = plt.subplots(n_panel_rows, n_panel_cols,
                        figsize=(12, 4 * n_panel_rows), squeeze=False)
flat_axes = axs.ravel()
for ax, var in zip(flat_axes, numerical_vars):
    # histplot with a density scale and KDE overlay replaces the deprecated distplot
    sns.histplot(df[var].dropna(), kde=True, stat='density', ax=ax)
    ax.set_title('Distribution of {}'.format(var))
# Hide the panels that have no variable assigned to them
for unused_ax in flat_axes[len(numerical_vars):]:
    unused_ax.set_visible(False)
plt.tight_layout()
plt.show()

# Identify potential outliers in numerical variables: a variable is flagged when any
# of its values lies more than 1.5 * IQR outside the quartiles
outlier_vars = []
for var in numerical_vars:
    q1 = df[var].quantile(0.25)
    q3 = df[var].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    outliers = df[(df[var] < lower_bound) | (df[var] > upper_bound)]
    if len(outliers) > 0:
        outlier_vars.append(var)

print('Potential Outlier Variables:')
print(outlier_vars)


In [None]:
# Keep only the rows whose price is at most 600; this rebinds df for the cells below
price_within_cap = df['price'].le(600)
df = df[price_within_cap]

# Summary statistics of price after the cut
print(df['price'].describe())


In [None]:
# Visualize the new distribution of 'price'.
# histplot with a density scale and KDE overlay replaces the deprecated distplot.
plt.figure(figsize=(8, 6))
sns.histplot(df['price'], kde=True, stat='density')
plt.title('Distribution of Price (Outliers Removed)')
plt.show()


In [None]:
# Select the numerical columns from the DataFrame.
# 'number' matches every integer and float width, not only the platform defaults.
# NOTE(review): hotel_id is in df and, if numeric, is standardized too. Confirm whether
# the identifier should be excluded from scaling.
numeric_columns = df.select_dtypes(include='number')

# Standardize the numerical variables
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_columns)

# Convert the scaled_data to a DataFrame. The source index is carried over so each
# scaled row can still be matched to its row in df, which was row-filtered above and
# therefore no longer has a contiguous 0..n-1 index.
scaled_df = pd.DataFrame(
    data=scaled_data,
    columns=numeric_columns.columns,
    index=numeric_columns.index,
)

# Print the scaled DataFrame
print(scaled_df.head())
