In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the King County house-sales CSV and parse its 'date' column into
# pandas datetimes in a single chained expression.
df = pd.read_csv('house_sales_in_king_county_usa.csv').assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

In [None]:
# Part 1: Display the 5 Largest Sales Volumes by Year
# Count the sales recorded in each calendar year and keep the five largest counts.
sales_per_year = df['date'].dt.year.value_counts()
sales_by_year = sales_per_year.nlargest(5)

# Draw the bars in chronological order on a fresh 10x5 figure.
plt.figure(figsize=(10, 5))
year_ax = sales_by_year.sort_index().plot.bar()
year_ax.set_title('Top 5 Years with the Largest Sales Volumes')
year_ax.set_xlabel('Year')
year_ax.set_ylabel('Number of Sales')
plt.show()

In [None]:
# Part 2: Display the Price Distribution for Houses with vs. Without a Water View
# Split the prices by the 'waterfront' flag (0 = without, 1 = with).
prices_without_water = df.loc[df['waterfront'] == 0, 'price']
prices_with_water = df.loc[df['waterfront'] == 1, 'price']

# Bin both groups over the same overall price range. Without a shared
# `range`, each hist() call derives its 30 bins from its own subset, so the
# two overlaid histograms would have different bin widths and could not be
# compared bar for bar.
shared_price_range = (df['price'].min(), df['price'].max())

plt.figure(figsize=(10, 5))
plt.hist(prices_without_water, bins=30, range=shared_price_range,
         alpha=0.5, label='Without Water View')
plt.hist(prices_with_water, bins=30, range=shared_price_range,
         alpha=0.5, label='With Water View')
plt.title('Price Distribution for Houses with and without Water View')
plt.xlabel('Price')
plt.ylabel('Number of Houses')
# Log-scaled counts keep small bars visible next to much larger ones.
plt.yscale('log')
plt.legend()
plt.show()

In [None]:
# Part 3: Correlation Diagram
# Columns whose pairwise correlations are shown in the heatmap.
correlation_columns = ['price', 'sqft_living', 'grade', 'sqft_above', 'sqft_living15']
correlation_data = df[correlation_columns]
correlation_matrix = correlation_data.corr()

# Annotated heatmap of the correlation matrix on a 10x8 figure.
plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(data=correlation_matrix, annot=True, cmap='coolwarm')
heatmap_ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Part 4: Boxplot of Price by Grade
# One box per 'grade' value, showing the spread of 'price' within that grade.
plt.figure(figsize=(12, 6))
grade_ax = sns.boxplot(data=df, x='grade', y='price')
grade_ax.set_title('Boxplot of Price by Grade')
plt.show()

In [None]:
# Part 5: Scatterplot of 'Price' vs. 'Sqft_Living' with Linear Regression
# lmplot creates its own figure (sized through height/aspect) and draws the
# scatter points together with a fitted regression line.
sns.lmplot(data=df, x='sqft_living', y='price', height=6, aspect=2)

# Title and axis labels are applied to the axes lmplot just drew on.
regression_ax = plt.gca()
regression_ax.set(
    title='Scatterplot of Price vs. Sqft_Living with Linear Regression',
    xlabel='Sqft Living',
    ylabel='Price',
)
plt.show()