In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from folium.plugins import MarkerCluster
import reverse_geocoder as rg
import plotly.express as px

In [None]:
# Load the dataset from data.csv, using the file's first column as the row index.
df = pd.read_csv('data.csv',index_col=0)


In [None]:
# Total number of elements in the frame (rows x columns).
df.size

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Per-column dtype and non-null count, plus overall memory usage.
df.info()


In [None]:
# Preview the first five rows (head() defaults to n=5).
df.head()


In [None]:
# Summary statistics (count, mean, std, quartiles, min/max); by default only numeric columns are summarised.
df.describe()


### Data Preprocessing

In [None]:
# Count the missing (NaN/None) entries in every column.
df.isna().sum()

In [None]:
# Rows that exactly repeat an earlier row (the first occurrence is not flagged).
duplicate_rows = df.loc[df.duplicated()]
print("Duplicate Rows:", duplicate_rows, sep="\n")

In [None]:
# Store these columns as integers.
# NOTE(review): astype(int) drops any fractional part and fails on missing
# values -- confirm that truncation is intended if these columns hold
# non-integer values.
for column in ('bathrooms', 'floors', 'price'):
    df[column] = df[column].astype(int)


In [None]:
# Parse the 'date' column into pandas datetime values (format is inferred by pandas).
df['date'] = pd.to_datetime(df['date'])

In [None]:
# Re-inspect the first five rows after the dtype conversions.
df.head(5)

In [None]:
# Remove the 'id' column; it is not used in the analysis cells of this notebook.
# errors='ignore' keeps the cell idempotent: re-running it after 'id' has
# already been dropped is a no-op instead of raising KeyError.
df = df.drop(columns='id', errors='ignore')


In [None]:
# Distinct values present in the 'bedrooms' column.
df['bedrooms'].unique()

## Exploratory Data Analysis


### Frequency Distribution of the Target (Price)

In [None]:
# Histogram (50 bins) of sale prices with a KDE curve overlaid.
# The x-axis is capped at 4,000,000 so the bulk of the distribution stays readable.
plt.figure(figsize=(10, 6))
ax = sns.histplot(df['price'], bins=50, kde=True)
ax.set(title='Distribution of Prices', xlabel='Price', xlim=(0, 4_000_000))
plt.show()



### A Base Map Showing the Locations of the Houses in the Dataset

In [None]:

# Centre the map on the mean latitude/longitude of all rows.
map_center = [df[axis].mean() for axis in ('lat', 'long')]
base_map = folium.Map(location=map_center, zoom_start=10)

# One marker per row, grouped through a MarkerCluster layer; each popup
# shows the row's zipcode.
markerCluster = MarkerCluster()
markerCluster.add_to(base_map)
for lat, lon, zipcode in zip(df['lat'], df['long'], df['zipcode']):
    marker = folium.Marker([lat, lon], popup=f"Zipcode: {zipcode}")
    marker.add_to(markerCluster)

# Last expression of the cell: renders the map inline.
base_map


### Function to Obtain a House's Location from Its Latitude and Longitude (Feature Engineering)

In [None]:

def location(lat, lon):
    """Return the place name that reverse_geocoder reports for a coordinate pair.

    Calls ``rg.search`` with the ``(lat, lon)`` tuple and ``mode=1`` and returns
    the ``'name'`` field of the first result.

    Parameters
    ----------
    lat, lon
        Latitude and longitude of the point to look up.

    Returns
    -------
    str
        The ``'name'`` of the first match, or ``"Unknown"`` when the search
        returns nothing.
    """
    matches = rg.search((lat, lon), mode=1)
    if not matches:
        return "Unknown"
    return matches[0]['name']

def _location_for_row(row):
    """Look up the place name for one DataFrame row via its 'lat'/'long' values."""
    return location(row['lat'], row['long'])


# Row-wise lookup: one reverse-geocoding call per row.
# NOTE(review): rg.search reportedly also accepts a list of coordinate tuples;
# a single batched call may be faster on large frames -- verify before changing.
df['location'] = df.apply(_location_for_row, axis=1)
df.head()


### Average House prices by their location

In [None]:
# Mean price per location, most expensive first.
avg_priceByLoc = (
    df.groupby('location', as_index=False)['price']
    .mean()
    .sort_values(by='price', ascending=False)
)

# One bar per location; x labels are rotated so the names do not overlap.
plt.figure(figsize=(20, 15))
ax = sns.barplot(data=avg_priceByLoc, x='location', y='price', palette='viridis')
ax.set(
    title='Average House Prices by location',
    xlabel='location',
    ylabel='Average Price',
)
plt.xticks(rotation=45, ha='right')
plt.show()


### Top 10 locations with the highest prices

In [None]:
# avg_priceByLoc is already sorted by descending mean price, so the first
# ten rows are the ten most expensive locations.
top10Loc = avg_priceByLoc.iloc[:10]

bar_labels = {'location': 'Location', 'price': 'Average Price'}
fig = px.bar(
    top10Loc,
    x='location',
    y='price',
    color='location',
    labels=bar_labels,
    title='Average House Prices by Location (Top 10)',
)
fig.update_layout(
    xaxis_tickangle=-45,
    xaxis_title=None,
    yaxis_title='Average Price',
)
fig.show()


In [None]:
# Distinct values present in the 'yr_built' column.
df['yr_built'].unique()

### Price vs. Year Built

In [None]:
# Price against construction year. Rows sharing a 'yr_built' value are
# aggregated by seaborn's default estimator (mean, with a confidence band).
# x and y are passed by keyword: recent seaborn releases make them
# keyword-only, so the positional form sns.lineplot(x_series, y_series)
# raises TypeError there.
plt.figure(figsize=(10, 6))
sns.lineplot(data=df, x='yr_built', y='price', alpha=0.5)
plt.title('Relation between Year Built and Price')
plt.xlabel('Year Built')
plt.ylabel('Price')
plt.grid(True)
plt.show()


### Comparison of Price with Waterfront and View


In [None]:

# Two side-by-side bar charts of price: one grouped by 'waterfront',
# one grouped by 'view'.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

panels = zip(axes, ('waterfront', 'view'), ('Waterfront', 'View'))
for ax, column, label in panels:
    sns.barplot(data=df, x=column, y='price', ax=ax)
    ax.set_title(f'Relation between {label} and Price')
    ax.set_xlabel(label)
    ax.set_ylabel('Price')

plt.tight_layout()
plt.show()



### Comparing Construction Grade and Property Condition with Prices

In [None]:
# describe() summaries of the three columns; they are computed here but
# not used by the plot in this cell.
construction_grade_stats, property_condition_stats, house_price_stats = (
    df[column].describe() for column in ('grade', 'condition', 'price')
)

# Line plot of price against construction grade, y-axis anchored at 0.
plt.figure(figsize=(10, 6))
ax = sns.lineplot(data=df, x='grade', y='price')
ax.set(
    title='Construction Grade vs. House Prices',
    xlabel='Construction Grade',
    ylabel='House Price',
)
ax.set_ylim(bottom=0)
plt.show()







In [None]:
# Line plot of price against property condition, y-axis anchored at 0
# and x ticks pinned to the values 1..5.
plt.figure(figsize=(10, 6))
ax = sns.lineplot(data=df, x='condition', y='price')
ax.set(
    title='Property Condition vs. House Prices',
    xlabel='Property Condition',
    ylabel='House Price',
)
ax.set_ylim(bottom=0)
ax.set_xticks(list(range(1, 6)))
plt.show()


### Price vs. Floors

In [None]:
# Line plot of price against number of floors on a logarithmic price axis,
# with x ticks pinned to 1, 2 and 3.
plt.figure(figsize=(10, 6))
ax = sns.lineplot(data=df, x='floors', y='price')
ax.set_yscale('log')
ax.set(title='Price vs. Floors', xlabel='Floors', ylabel='Price')
ax.set_xticks([1, 2, 3])
plt.show()

### Price Ranges Categorized by Number of Bedrooms

In [None]:
# Overlaid price histograms, one colour per 'bedrooms' value. The x-axis
# is clipped at the 95th percentile of price so the long upper tail does
# not squash the plot.
axis_labels = {'price': 'Price', 'bedrooms': 'Number of Bedrooms'}
price_p95 = df['price'].quantile(0.95)

fig = px.histogram(
    df,
    x='price',
    color='bedrooms',
    barmode='overlay',
    title='Price Distribution by Number of Bedrooms',
    labels=axis_labels,
)
fig.update_xaxes(range=[0, price_p95])
# Kept as the cell's final expression: update_layout returns the figure,
# which is what makes the notebook render it.
fig.update_layout(
    xaxis_title='Price',
    yaxis_title='Frequency',
    height=400,
    width=600,
)



In [None]:
# Write the processed frame to dataframe_final.csv; index=False leaves the row index out of the file.
df.to_csv('dataframe_final.csv', index=False)
