In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import json

# Read the listings CSV into a DataFrame. The path is relative to the
# notebook's working directory, so the file must sit next to the notebook.
df = pd.read_csv(filepath_or_buffer='AB_NYC_2019.csv')

# Preview the first five rows as the cell's rich output.
df.head(n=5)

ImportError: C extension: pandas.compat._constants not built. If you want to import pandas from the source directory, you may need to run 'python setup.py build_ext' to build the C extensions first.

In [None]:
# Count missing entries per column (one row of output per column).
missing_per_column = df.isna().sum()
missing_per_column

In [None]:
# Handle missing values: impute each numeric column with its own mean.
# `numeric_only=True` is required because the frame loaded from CSV also holds
# text columns; on pandas >= 2.0 a bare `df.mean()` raises TypeError when it
# meets them. Non-numeric columns are left untouched, so any gaps they contain
# will still be reported by the check below.
df = df.fillna(df.mean(numeric_only=True))

# Re-count missing values per column to confirm the numeric gaps are filled.
df.isnull().sum()

In [None]:
# Exploratory Data Analysis: descriptive statistics of the frame, shown as
# the cell's output.
summary_stats = df.describe()
summary_stats

In [None]:
# Correlation matrix, drawn as an annotated heatmap.
# Correlation is only defined for numeric data. The frame also carries text
# columns, and on pandas >= 2.0 `df.corr()` raises on them instead of silently
# dropping them, so restrict the frame to numeric columns first.
numeric_corr = df.select_dtypes(include='number').corr()

plt.figure(figsize=(10, 8))
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Histogram of the `price` column with a kernel-density overlay.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['price'], kde=True, color='blue', ax=ax)
ax.set_title('Distribution of Property Prices')
ax.set_xlabel('Price')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Scatter plot relating `sqft` (x-axis) to `price` (y-axis).
# NOTE(review): confirm the loaded CSV really provides a 'sqft' column; no
# earlier cell shows the frame's schema, and this cell fails with a missing-
# column error if it is absent.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='sqft', y='price', ax=ax)
ax.set_title('Price vs. Square Footage')
ax.set_xlabel('Square Footage')
ax.set_ylabel('Price')
plt.show()

In [None]:
# One box of `price` values per distinct `neighborhood` value.
# NOTE(review): confirm the column is spelled 'neighborhood' in the CSV (it
# may use the British spelling 'neighbourhood') — verify against df.columns.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=df, x='neighborhood', y='price', ax=ax)
ax.set_title('Price Distribution by Neighborhood')
# Category labels are rotated vertically so they do not overlap.
plt.xticks(rotation=90)
plt.show()

In [None]:
# Price prediction model: a linear regression of `price` on three features,
# scored by RMSE on a held-out split.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Feature matrix and regression target.
# NOTE(review): confirm that 'sqft', 'bedrooms' and 'bathrooms' exist in the
# loaded CSV; no earlier cell shows the schema, and the selection below raises
# KeyError if any of them is missing.
X = df[['sqft', 'bedrooms', 'bathrooms']]
y = df['price']

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit on the training portion only (`fit` returns the estimator itself).
model = LinearRegression().fit(X_train, y_train)

# Predict prices for the held-out rows.
y_pred = model.predict(X_test)

# RMSE is the square root of the mean squared error, so it is expressed in
# the same units as `price`.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error: {rmse}')