# 1) Explore the data

## Import Required Libraries

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Load the Data

In [0]:
# Name of the table to read
table_name = "housing"

# Read the table into a DataFrame through the `spark` session object.
# NOTE(review): `spark` is not defined in this notebook — presumably injected
# by the notebook runtime; confirm before running elsewhere.
df = spark.read.table(table_name)

## Display the DataFrame

In [0]:
# Render the DataFrame with the notebook's `display` helper.
# NOTE(review): `display` is not defined here — presumably a runtime built-in.
display(df)

## Check the dataframe type

In [0]:
# Show the class of the object returned by spark.read.table(...)
type(df)

## Convert the PySpark DataFrame to a pandas DataFrame

In [0]:
# Convert the Spark DataFrame into a pandas DataFrame so the pandas-based
# exploration below can run on it. toPandas() pulls the full table into local
# memory, so this is only suitable for tables that fit on the driver.
#
# `df` is rebound in place, so guard the call: once `df` is already a pandas
# DataFrame it has no `toPandas` attribute, and re-running this cell would
# otherwise fail with AttributeError.
if hasattr(df, "toPandas"):
    df = df.toPandas()

In [0]:
# Re-check the class of `df` after the toPandas() conversion
type(df)

## Display the first few rows of the dataset

In [0]:
# Preview the first rows (head() without an argument returns 5 rows)
df.head()

In [0]:
# Preview the first 10 rows
df.head(10)

## Determine the dimensions

In [0]:
# (number of rows, number of columns)
df.shape

## Summary of the DataFrame

In [0]:
# Print a concise summary: index range, columns, non-null counts, dtypes, memory usage
df.info()

## Number of non-null values in each column

In [0]:
# Per-column count of non-null entries
df.count()

## Retrieve the column names

In [0]:
# Index holding the column labels
df.columns

## Examine the data types of each column

In [0]:
# Data type of each column
df.dtypes

## Compute the pairwise correlation of columns

In [0]:
# Pairwise correlation between columns.
# Restrict the frame to numeric/boolean columns first: since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises if
# any column cannot be converted to float. select_dtypes() works on old and
# new pandas versions alike and yields the same matrix when all columns are
# already numeric.
df.select_dtypes(include=["number", "bool"]).corr()

## Remove duplicate rows

In [0]:
# Remember the dimensions before de-duplication so the removed count can be reported
original_shape = df.shape

# Discard rows that are exact repeats of an earlier row
df = df.drop_duplicates()

# Rows removed = row count before minus row count after
num_duplicate_rows = original_shape[0] - len(df)

# Report how many duplicate rows were dropped
print("The number of duplicate rows removed :", num_duplicate_rows)

In [0]:
# Small illustrative dataset that intentionally contains repeated rows
# (the John and Alice records each appear twice) to demonstrate
# drop_duplicates(). Each key is listed exactly once: repeating a key in a
# dict literal silently overwrites the earlier entry.
data = {
    'Name': ['John', 'Alice', 'John', 'Bob', 'Alice'],
    'Age': [25, 30, 25, 35, 30],
    'City': ['New York', 'London', 'New York', 'Paris', 'London'],
}

In [0]:
# Build a pandas DataFrame from the `data` dict (one column per key) and print it
df_1 = pd.DataFrame(data)
print(df_1)

In [0]:
# Remember the dimensions before de-duplication so the removed count can be reported
original_shape = df_1.shape

# Discard rows that are exact repeats of an earlier row
df_1 = df_1.drop_duplicates()

# Rows removed = row count before minus row count after
num_duplicate_rows = original_shape[0] - len(df_1)

# Report how many duplicate rows were dropped
print("The number of duplicate rows removed :", num_duplicate_rows)

## Summary statistics of numerical columns

In [0]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

## Check for missing values

In [0]:
# True if at least one cell anywhere in df is missing
df.isnull().values.any()

In [0]:
# Total number of missing cells across the whole DataFrame
df.isnull().values.sum()

In [0]:
# Number of missing values in each column
df.isnull().sum()

# 2) Visualize the Data

## Correlation matrix and Heatmap

In [0]:
# Quick look at the data before plotting
df.head()

In [0]:
# Feature-only frame: every column except `median_house_value`.
# drop() returns a new DataFrame, so `df` itself still contains that column.
df_2 = df.drop(columns="median_house_value")

In [0]:
# Confirm that `median_house_value` is no longer among the columns
df_2.head()

In [0]:
# Correlate each feature with the target column `median_house_value`.
# Only numeric/boolean features are used: since pandas 2.0, corrwith() no
# longer skips non-numeric columns and fails on them instead.
numeric_features = df_2.select_dtypes(include=["number", "bool"])
correlation_values = numeric_features.corrwith(df["median_house_value"])

# Order from most positive to most negative so the bars read left to right
sorted_correlation_values = correlation_values.sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(12, 8))
ax.bar(
    sorted_correlation_values.index,
    sorted_correlation_values.values,
    # Blue for positive correlations, orange for everything else
    color=['#1f77b4' if c > 0 else '#ff7f0e' for c in sorted_correlation_values.values]
)

ax.set_xlabel('Features')
ax.set_ylabel('Correlation')
ax.set_title('Correlation with Median House Value')

# Tilt the feature names so long labels do not overlap
plt.xticks(rotation=45)
ax.grid(True)

plt.show()

In [0]:
# Pairwise correlation matrix of the numeric/boolean columns.
# Non-numeric columns are excluded explicitly because, since pandas 2.0,
# DataFrame.corr() raises on them instead of dropping them silently.
corr = df.select_dtypes(include=["number", "bool"]).corr()

plt.figure(figsize=(12, 8))
# annot/fmt print each coefficient in its cell, rounded to two decimals
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)

plt.title('Correlation Heatmap')
plt.xticks(rotation=45)
plt.yticks(rotation=0)

plt.show()

## Histogram of a numerical variable

In [0]:
# Create a 10x6 inch figure with a single set of axes
hist_fig, hist_ax = plt.subplots(figsize=(10, 6))

# Distribution of the median house value, split into 20 bins
hist_ax.hist(df['median_house_value'], bins=20, color='skyblue', edgecolor='black')

# Axis labels and title
hist_ax.set_xlabel('Median House Value')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Histogram of Median House Value')

# Light dashed gridlines
hist_ax.grid(True, linestyle='--', alpha=0.5)

# Render the figure
plt.show()

## Scatter plot of two numerical variables

In [0]:
# Create a 10x6 inch figure with a single set of axes
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))

# Median house value on the x-axis against median income on the y-axis
scatter_ax.scatter(
    df['median_house_value'],
    df['median_income'],
    alpha=0.5,
    color='skyblue',
    edgecolors='black',
)

# Axis labels and title
scatter_ax.set_xlabel('Median House Value')
scatter_ax.set_ylabel('Median Income')
scatter_ax.set_title('Scatter Plot of Median House Value vs Median Income')

# Light dashed gridlines
scatter_ax.grid(True, linestyle='--', alpha=0.5)

# Render the figure
plt.show()

# 3) Pandas Profiling

In [0]:
# Build an automated profiling report for `df`.
# NOTE(review): the `pandas_profiling` package has been renamed upstream to
# `ydata-profiling`; confirm which one is installed on the target cluster.
from pandas_profiling import ProfileReport
# Every listed correlation measure is switched on explicitly; the progress
# bar is disabled and dtype inference is turned off (infer_dtypes=False).
# NOTE(review): the set of accepted correlation keys depends on the installed
# profiling version — confirm that all six are recognised.
df_profile = ProfileReport(df,
                           correlations={
                               "auto": {"calculate": True},
                               "pearson": {"calculate": True},
                               "spearman": {"calculate": True},
                               "kendall": {"calculate": True},
                               "phi_k": {"calculate": True},
                               "cramers": {"calculate": True},
                           }, title="Profiling Report", progress_bar=False, infer_dtypes=False)
# Render the report to a single HTML string
profile_html = df_profile.to_html()

# Show the HTML inline in the notebook.
# NOTE(review): `displayHTML` is not defined here — presumably a runtime built-in.
displayHTML(profile_html)