In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [2]:
df = pd.read_csv('gurgaon_properties_missing_value_imputation.csv')

In [3]:
df.shape

(3554, 18)

In [4]:
df.head()

Unnamed: 0,property_type,society,sector,price,price_per_sqft,bedRoom,bathroom,balcony,floorNum,agePossession,built_up_area,study room,servant room,store room,pooja room,others,furnishing_type,luxury_score
0,flat,signature global park 4,sector 36,0.82,7586.0,3.0,2.0,2,2.0,New Property,850.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0
1,flat,smart world gems,sector 89,0.95,8597.0,2.0,2.0,2,4.0,New Property,1226.0,1.0,1.0,0.0,0.0,0.0,0.0,38.0
2,flat,breez global hill view,sohna road,0.32,5470.0,2.0,2.0,1,17.0,New Property,1000.0,0.0,0.0,0.0,0.0,0.0,0.0,49.0
3,flat,bestech park view sanskruti,sector 92,1.6,8020.0,3.0,4.0,3+,10.0,Relatively New,1615.0,0.0,1.0,0.0,0.0,1.0,1.0,174.0
4,flat,suncity avenue,sector 102,0.48,9023.0,2.0,2.0,1,5.0,Relatively New,582.0,0.0,0.0,1.0,0.0,0.0,0.0,159.0


In [5]:
latlong = pd.read_csv('latlong.csv')

In [6]:
latlong

Unnamed: 0,sector,coordinates
0,sector 1,"28.3663° N, 76.9456° E"
1,sector 2,"28.5095° N, 77.0320° E"
2,sector 3,"28.4909° N, 77.0176° E"
3,sector 4,"28.4738° N, 77.0107° E"
4,sector 5,"28.4794° N, 77.0176° E"
...,...,...
124,sector 113,"28.5287° N, 77.0233° E"
125,sector 114,"28.5334° N, 77.0118° E"
126,sector 115,"28.5385° N, 77.0061° E"
127,gwal pahari,"28.4484° N, 77.0210° E"


In [7]:
latlong['latitude'] = latlong['coordinates'].str.split(',').str.get(0).str.split('°').str.get(0).astype('float')

In [8]:
latlong['longitude'] = latlong['coordinates'].str.split(',').str.get(1).str.split('°').str.get(0).astype('float')

In [9]:
latlong.head()

Unnamed: 0,sector,coordinates,latitude,longitude
0,sector 1,"28.3663° N, 76.9456° E",28.3663,76.9456
1,sector 2,"28.5095° N, 77.0320° E",28.5095,77.032
2,sector 3,"28.4909° N, 77.0176° E",28.4909,77.0176
3,sector 4,"28.4738° N, 77.0107° E",28.4738,77.0107
4,sector 5,"28.4794° N, 77.0176° E",28.4794,77.0176


In [10]:
new_df = df.merge(latlong, on='sector')

In [11]:
new_df.columns

Index(['property_type', 'society', 'sector', 'price', 'price_per_sqft',
       'bedRoom', 'bathroom', 'balcony', 'floorNum', 'agePossession',
       'built_up_area', 'study room', 'servant room', 'store room',
       'pooja room', 'others', 'furnishing_type', 'luxury_score',
       'coordinates', 'latitude', 'longitude'],
      dtype='object')

In [12]:
print(new_df[['price', 'price_per_sqft', 'built_up_area', 'latitude', 'longitude']].dtypes)


price             float64
price_per_sqft    float64
built_up_area     float64
latitude          float64
longitude         float64
dtype: object


In [13]:
print(new_df[['price', 'price_per_sqft', 'built_up_area', 'latitude', 'longitude']].isnull().sum())


price             0
price_per_sqft    0
built_up_area     0
latitude          0
longitude         0
dtype: int64


In [14]:
print(new_df['sector'].unique())


['sector 36' 'sector 89' 'sector 92' 'sector 102' 'gwal pahari'
 'sector 108' 'sector 105' 'sector 26' 'sector 109' 'sector 28'
 'sector 65' 'sector 12' 'sector 85' 'sector 30' 'sector 107' 'sector 3'
 'sector 2' 'sector 41' 'sector 4' 'sector 62' 'sector 49' 'sector 81'
 'sector 66' 'sector 86' 'sector 48' 'sector 51' 'sector 37' 'sector 111'
 'sector 67' 'sector 113' 'sector 13' 'sector 61' 'sector 69' 'sector 67a'
 'sector 37d' 'sector 82' 'sector 53' 'sector 74' 'sector 52' 'sector 43'
 'sector 14' 'sector 25' 'sector 95' 'sector 56' 'sector 83' 'sector 104'
 'sector 88a' 'sector 55' 'sector 50' 'sector 84' 'sector 91' 'sector 76'
 'sector 82a' 'sector 78' 'manesar' 'sector 93' 'sector 7' 'sector 71'
 'sector 110' 'sector 33' 'sector 70' 'sector 103' 'sector 90' 'sector 38'
 'sector 79' 'sector 112' 'sector 22' 'sector 59' 'sector 99' 'sector 9'
 'sector 58' 'sector 77' 'sector 1' 'sector 57' 'sector 106' 'sector 63'
 'sector 5' 'sector 40' 'sector 23' 'sector 6' 'sector 72' 'secto

In [17]:
# Check for any duplicates
print(new_df.duplicated().sum())

# Check the structure and some sample data
#print(new_df.head())


4


In [16]:
group_df = new_df.groupby('sector').mean()[['price','price_per_sqft','built_up_area','latitude','longitude']]

TypeError: agg function failed [how->mean,dtype->object]

In [None]:
# Ensure you are not overwriting group_df unintentionally
group_df = new_df.groupby('sector').mean()[['price', 'price_per_sqft', 'built_up_area', 'latitude', 'longitude']]


In [None]:
group_df

In [None]:
fig = px.scatter_mapbox(group_df, lat="latitude", lon="longitude", color="price_per_sqft", size='built_up_area',
                  color_continuous_scale=px.colors.cyclical.IceFire, zoom=10,
                  mapbox_style="open-street-map",text=group_df.index)
fig.show()

In [None]:
new_df.to_csv('data_viz1.csv',index=False)

In [None]:
df1 = pd.read_csv('gurgaon_properties.csv')

In [None]:
df1.head()

In [None]:
wordcloud_df = df1.merge(df, left_index=True, right_index=True)[['features','sector']]

In [None]:
wordcloud_df.head()

In [None]:
import ast
main = []
for item in wordcloud_df['features'].dropna().apply(ast.literal_eval):
    main.extend(item)

In [None]:
main

In [None]:
from wordcloud import WordCloud

In [None]:
feature_text = ' '.join(main)

In [None]:
import pickle
pickle.dump(feature_text, open('feature_text.pkl','wb'))

In [None]:
feature_text

In [None]:
plt.rcParams["font.family"] = "Arial"

wordcloud = WordCloud(width = 800, height = 800, 
                      background_color ='white', 
                      stopwords = set(['s']),  # Any stopwords you'd like to exclude
                      min_font_size = 10).generate(feature_text)

plt.figure(figsize = (8, 8), facecolor = None) 
plt.imshow(wordcloud, interpolation='bilinear') 
plt.axis("off") 
plt.tight_layout(pad = 0) 
plt.show() # st.pyplot()

In [None]:
data = dict(
    names=["A", "B", "C", "D", "E", "F"],
    parents=["", "", "", "A", "A", "C"],
    values=[10, 20, 30, 40, 50, 60],
)

fig = px.sunburst(
    df1,
    names='property_type',
    values='price_per_sqft',
    parents='bedRoom',
    title="Sample Sunburst Chart"
)
fig.show()

In [None]:
fig = px.scatter(df, x="built_up_area", y="price", color="bedRoom", title="Area Vs Price")

# Show the plot
fig.show()

In [None]:
fig = px.pie(df, names='bedRoom', title='Total Bill Amount by Day')

# Show the plot
fig.show()

In [None]:
temp_df = df[df['bedRoom'] <= 4]
# Create side-by-side boxplots of the total bill amounts by day
fig = px.box(temp_df, x='bedRoom', y='price', title='BHK Price Range')

# Show the plot
fig.show()


In [None]:
sns.distplot(df[df['property_type'] == 'house']['price'])
sns.distplot(df[df['property_type'] == 'flat']['price'])

In [None]:
new_df['sector'].unique().tolist().insert(0,'overall')