In [None]:
import yaml
import wandb
import numpy as np
import pandas as pd
import seaborn as sns
import pandas_profiling
import matplotlib.pyplot as plt

# Apply seaborn's default theme so every figure below shares the same styling.
sns.set()

In [None]:
# Load the shared configuration file (path is resolved relative to the
# notebook's working directory).
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open("../../config.yaml", "r", encoding="utf-8") as config_file:
    config = yaml.safe_load(config_file)

In [None]:
# Start a Weights & Biases run for this EDA session. The project and group
# names are read from the "main" section of the loaded configuration.
main_cfg = config['main']

run = wandb.init(
    project=main_cfg['project_name'],
    group=main_cfg['experiment_name'],
    job_type="EDA",
    save_code=True,
)

In [None]:
# Resolve the latest raw-data artifact through the active run and get the
# local path of its file.
artifact = run.use_artifact("nyc_airbnb/raw_data.csv:latest")
raw_csv_path = artifact.file()

# Read the CSV, then convert the review date column to datetime values.
df = pd.read_csv(raw_csv_path)
df['last_review'] = pd.to_datetime(df['last_review'])

# Preview the first rows.
df.head()

In [None]:
# Build an automated profiling report for the loaded data and render it as an
# interactive widget inside the notebook.
profile = pandas_profiling.ProfileReport(
    df,
    title="Pandas Profiling Report",
)
profile.to_widgets()

#### **Exploratory Data Analysis**

##### Numerical Data

In [None]:
# Summary statistics for the numeric columns and the datetime column.
# The `datetime_is_numeric` keyword only exists in pandas 1.1-1.5; pandas 2.0
# removed it and summarises datetime columns numerically by default. Fall back
# to the plain call when the keyword is rejected so the cell runs on both.
try:
    numeric_summary = df.describe(datetime_is_numeric=True)
except TypeError:
    numeric_summary = df.describe()

numeric_summary

In [None]:
# Grid dimensions for the box-plot figure. If ROWS * COLS is smaller than
# len(numeric_columns), the trailing columns are left unplotted.
ROWS = 3
COLS = 3

# Numeric features to inspect; reused by later cells.
numeric_columns = [
    'latitude',
    'longitude',
    'price',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365'
]

# squeeze=False always yields a 2-D array of Axes, so flattening also works
# when the grid has a single row or a single column.
fig, axes = plt.subplots(ROWS, COLS, figsize=(20, 12), squeeze=False)
axes = list(axes.flat)

# One box plot per numeric column, filling the grid row by row.
for ax, col in zip(axes, numeric_columns):
    sns.boxplot(x=col, ax=ax, data=df)

# Hide the grid cells that received no column so no empty axes are drawn.
for ax in axes[len(numeric_columns):]:
    ax.set_visible(False)

In [None]:
# Pairwise scatter plots and per-column distributions for the numeric features.
numeric_df = df[numeric_columns]
sns.pairplot(numeric_df)

##### Categorical Data

In [None]:
# Summary of every column that is not numeric.
non_numeric_summary = df.describe(exclude=[np.number])
non_numeric_summary

#### Check Missing Data

In [None]:
# Boolean mask marking rows that contain at least one missing value.
has_missing = df.isna().any(axis=1)

# Keep only those rows and preview them.
df_missing = df[has_missing]
df_missing.head()

In [None]:
# Rows with missing values whose review count is nevertheless non-zero.
has_reviews = df_missing['number_of_reviews'] != 0
df_missing[has_reviews]

#### Fix Outliers in Pricing Feature

In [None]:
# Box plot of the full, unfiltered price column.
sns.boxplot(x='price', data=df)

In [None]:
# Inclusive price bounds used to drop extreme values before plotting.
MIN_PRICE, MAX_PRICE = 10, 350

# Box plot of only the prices that fall inside the bounds.
price_in_range = df['price'].between(MIN_PRICE, MAX_PRICE)
sns.boxplot(x=df.loc[price_in_range, 'price'])

#### Final Check

In [None]:
# Print a concise summary of the DataFrame (column dtypes and non-null counts).
df.info()

In [None]:
# Mark the W&B run started earlier in this notebook as finished.
run.finish()