## Summary

In [None]:
import pandas as pd

# Normalise `df` into a pandas DataFrame via DataFrame.from_dict.
# NOTE(review): `df` is not assigned anywhere above this cell in the visible
# notebook, so it presumably comes from an earlier or injected cell — confirm
# that the notebook still works under "Restart Kernel -> Run All".
df=pd.DataFrame.from_dict(df)

In [None]:
# Report the dimensions of the loaded frame as returned by DataFrame.shape.
frame_shape = df.shape
print("Shape of the Dataframe: ", frame_shape)

In [None]:
# Preview the top of the frame: caption and table are emitted by a single
# print call, separated by a newline.
print("First 5 rows:", df.head(), sep="\n")

In [None]:
# Summary statistics produced by DataFrame.describe().
summary_stats = df.describe()
print("Descriptive Statistics:")
print(summary_stats)

In [None]:
# Count the null entries in every column.
null_counts = df.isnull().sum()
print("Missing Values:", null_counts, sep="\n")

### Calculate the mean price of each town

In [None]:
# Group key: the last whitespace-separated token of each `town_location` value.
town_key = df['town_location'].str.split().str[-1]

# Mean price per group key.
df_mean_price_by_town = df['price'].groupby(town_key).mean()
df_mean_price_by_town.head()

# Data Cleaning

## Handling outliers
### Visualize distribution and boxplots to identify outliers

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Boxplot of the raw prices, used to eyeball outliers before filtering.
boxplot_fig, boxplot_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=df['price'], ax=boxplot_ax)
boxplot_ax.set_title('Boxplot of Price')
plt.show()

### Remove outliers

In [None]:
import numpy as np

# Drop price outliers using 1.5 * IQR fences around the quartiles.
price_column = df['price']
Q1 = price_column.quantile(0.25)
Q3 = price_column.quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
# The lower fence is floored at 0 when the computed value is not positive.
lower_bound = lower_bound if lower_bound > 0 else 0
upper_bound = Q3 + 1.5 * IQR

# True for rows whose price lies outside [lower_bound, upper_bound].
outliers = (price_column < lower_bound) | (price_column > upper_bound)

# Mean of the *unfiltered* prices, captured before the rows are dropped.
price_mean = df['price'].mean()

# .copy() makes the filtered result an independent frame. Without it `df` is a
# slice of the original, and the column assignments made in later cells
# (e.g. df['price_category'] = ...) raise SettingWithCopyWarning.
df = df[~outliers].copy()

In [None]:
# Show the outlier fences together with the pre-filter mean price.
bounds_report = f"Upper bound is: {upper_bound}, lower bound is: {lower_bound}. Mean is: {price_mean}"
print(bounds_report)

In [None]:
# Mean price of the current (outlier-filtered) frame, shown as the cell output.
df.loc[:, 'price'].mean()

In [None]:
import plotly.express as px

# Overlay the price-over-time lines of every town, but only when the notebook
# is processing the combined ("all") dataset.
if df_name == "all":
    colors = ['blue', 'red', 'green']
    towns = df["town_location"].unique()

    # One plotly figure per town. The palette is cycled so that more than
    # len(colors) towns no longer raises IndexError.
    trace = []
    for i, town in enumerate(towns):
        town_fig = px.line(df[df['town_location']==town], x="date", y="price", title='Hotel Demands', line_group='date', labels={'price': town})
        town_fig.update_traces(line=dict(color=colors[i % len(colors)]))
        trace.append(town_fig)

    # Guard against an empty frame (no towns) instead of failing on towns[0].
    if trace:
        # Title lists every town; for three towns this is exactly
        # "Hotel Demands in A vs B vs C".
        combined_title = 'Hotel Demands in ' + ' vs '.join(str(town) for town in towns)

        # The first town forms the base figure (default plotly styling).
        fig = px.line(df[df['town_location']==towns[0]], x="date", y="price", title=combined_title)

        # Every remaining town is layered on top, however many there are.
        for town_fig in trace[1:]:
            fig.add_traces(town_fig.data)

        fig.show()

In [None]:
import plotly.express as px

# Price over time for the dataset currently being processed.
demand_title = f'Hotel Demands in {df_name}'
fig = px.line(df, x="date", y="price", title=demand_title)
fig.show()

In [None]:
# Price against distance from the centre, with points coloured by star rating.
distance_fig, distance_ax = plt.subplots(figsize=(10, 6))
star_points = distance_ax.scatter(
    df['km_from_center'],
    df['price'],
    c=df['stars'],
    cmap='viridis',
    alpha=0.5,
    edgecolors='w',
)

distance_ax.set_xlabel('Distance from Center (km)')
distance_ax.set_ylabel('Price EUR')
distance_ax.set_title('Price vs Distance from Center')

# Colour bar explains the star-rating colour scale.
cbar = distance_fig.colorbar(star_points, ax=distance_ax)
cbar.set_label('Stars')

plt.show()

In [None]:
# Same scatter as before, restricted to rows less than 10 km from the centre.
filtered_df = df.loc[df['km_from_center'] < 10]

near_fig, near_ax = plt.subplots(figsize=(10, 6))
near_points = near_ax.scatter(
    filtered_df['km_from_center'],
    filtered_df['price'],
    c=filtered_df['stars'],
    cmap='viridis',
    alpha=0.8,
    edgecolors='w',
)

near_ax.set_xlabel('Distance from Center (km)')
near_ax.set_ylabel('Price EUR')
near_ax.set_title('Price vs Distance from Center')

# Colour bar explains the star-rating colour scale.
cbar = near_fig.colorbar(near_points, ax=near_ax)
cbar.set_label('Stars')

plt.show()

## Binning
Binning 'price' into categories (low, medium, high).

In [None]:
# Median price of the filtered frame.
median_price = df['price'].median()

# Backward-compatible alias: the original cell stored the median under the
# misleading name `mean_price`. It is kept so that any cell still reading that
# name keeps working; prefer `median_price` in new code.
mean_price = median_price

median_price

In [None]:
# Bin edges and their labels. With right=False each bin is closed on the left:
# [0, 80) -> Low, [80, 140) -> Medium, [140, inf) -> High.
bins = [0, 80, 140, np.inf]
labels = ['Low', 'Medium', 'High']

price_bands = pd.cut(
    df['price'],
    bins=bins,
    labels=labels,
    right=False,
)
df['price_category'] = price_bands

print(df[['price', 'price_category']])

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of how many rows fall into each price category.
category_fig, category_ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='price_category', ax=category_ax)
category_ax.set_title('Distribution of Price Categories')

# The same counts as text, printed before the figure is rendered.
category_counts = df['price_category'].value_counts()
print("\nCount of Data Points in Each Category:")
print(category_counts)

plt.show()

In [None]:
# Per-category summary: mean/median price plus mean stars and user rating.
# `price_category` is a categorical column (created by pd.cut), so `observed`
# is passed explicitly: this keeps the existing behaviour (every category is
# listed, even an empty one) and silences the pandas FutureWarning about the
# default of `observed` changing.
df.groupby('price_category', observed=False).agg(
    {'price': ['mean', 'median'], 'stars': 'mean', 'avg_user_ratings': 'mean'}
)

# EDA (Exploratory Data Analysis)
## Visualize relationships between variables

In [None]:
# Coerce the distance column to numbers; unparseable values become NaN.
df['km_from_center'] = pd.to_numeric(df['km_from_center'], errors='coerce')

# Rows ordered by distance from the centre.
df_sorted = df.sort_values(by='km_from_center')

# Wide canvas: price against distance, coloured by star rating.
plt.figure(figsize=(40, 10))
sns.scatterplot(data=df_sorted, x='km_from_center', y='price', hue='stars')
plt.title('Scatterplot of Price vs. Distance from Center with Star Rating')
plt.show()

### The boxplot and the exploratory analysis above show that there are outliers we need to handle, namely in `price` and `km_from_center`

In [None]:
# Keep only rows whose price lies inside the outlier fences (both inclusive).
price_in_bounds = (df['price'] >= lower_bound) & (df['price'] <= upper_bound)

# .copy() detaches the result from the frame it was sliced from, so the column
# assignments made in later cells (df['date'] = ..., df['month'] = ...) do not
# raise SettingWithCopyWarning.
df = df[price_in_bounds].copy()

In [None]:
# Distance cut-off (km): rows at or beyond it are excluded from df_filtered.
threshold = 20
within_threshold = df['km_from_center'] < threshold
df_filtered = df.loc[within_threshold]

In [None]:
# Price against distance for the distance-filtered rows, coloured by stars.
filtered_fig, filtered_ax = plt.subplots(figsize=(20, 10))
sns.scatterplot(data=df_filtered, x='km_from_center', y='price', hue='stars', ax=filtered_ax)
filtered_ax.set_title('Scatterplot of Price vs. Distance from Center with Star Rating (Outliers Removed)')
plt.show()

## We can also see what the most frequent price is

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Histogram of the filtered prices (30 bins, no KDE overlay).
# sns.histplot is the axes-level function, so it draws into the figure created
# here. The previous sns.displot call is figure-level: it opened its own
# figure, leaving this 10x6 figure blank and ignoring the requested size.
plt.figure(figsize=(10, 6))
sns.histplot(x=df['price'], bins=30, kde=False, color='blue')
plt.title('Distribution of Price without Outliers')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Parse the dates once, then derive the full month name (e.g. "January").
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates
df['month'] = parsed_dates.dt.strftime('%B')

plt.figure(figsize=(14, 8))
sns.set(style="whitegrid")

# One line per town; sort=False keeps the months in the order they appear in
# the data rather than sorting them.
sns.lineplot(
    data=df,
    x='month',
    y='price',
    hue='town_location',
    sort=False,
    marker='o',
    color='red',
)
plt.title('Price vs. Month')
plt.xlabel('Month')
plt.ylabel('Price')
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Violin plot of price per town, split by star rating (hue), with quartile
# lines drawn inside each violin.
plt.figure(figsize=(14, 8))
sns.set(style="whitegrid")

sns.violinplot(x='town_location', y='price', hue='stars', data=df, palette='viridis', inner='quartile')
# The plot is grouped by location (x) and stars (hue); the previous title said
# "by Location and Price", which mislabelled the grouping.
plt.title('Price Distribution by Location and Stars')
plt.xlabel('Location')
plt.ylabel('Price')
# Rotate the town labels so long names do not overlap.
plt.xticks(rotation=25, ha='right')
# Place the legend outside the axes, to the right of the plot.
plt.legend(title='Stars', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()


In [None]:
# Write the preprocessed frame to a per-dataset CSV file, without the index.
output_path = 'hotels_preprocessed_' + df_name + '.csv'
df.to_csv(output_path, index=False)