In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score
import tensorflow as tf
from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
ad = pd.read_csv("housing_prices.csv")
df = ad.copy()
df.head(5)

Unnamed: 0,ID,State,City,Locality,Property_Type,BHK,Size_in_SqFt,Price_in_Lakhs,Price_per_SqFt,Year_Built,...,Age_of_Property,Nearby_Schools,Nearby_Hospitals,Public_Transport_Accessibility,Parking_Space,Security,Amenities,Facing,Owner_Type,Availability_Status
0,1,Tamil Nadu,Chennai,Locality_84,Apartment,1,4740,489.76,0.1,1990,...,35,10,3,High,No,No,"Playground, Gym, Garden, Pool, Clubhouse",West,Owner,Ready_to_Move
1,2,Maharashtra,Pune,Locality_490,Independent House,3,2364,195.52,0.08,2008,...,17,8,1,Low,No,Yes,"Playground, Clubhouse, Pool, Gym, Garden",North,Builder,Under_Construction
2,3,Punjab,Ludhiana,Locality_167,Apartment,2,3642,183.79,0.05,1997,...,28,9,8,Low,Yes,No,"Clubhouse, Pool, Playground, Gym",South,Broker,Ready_to_Move
3,4,Rajasthan,Jodhpur,Locality_393,Independent House,2,2741,300.29,0.11,1991,...,34,5,7,High,Yes,Yes,"Playground, Clubhouse, Gym, Pool, Garden",North,Builder,Ready_to_Move
4,5,Rajasthan,Jaipur,Locality_466,Villa,4,4823,182.9,0.04,2002,...,23,4,9,Low,No,Yes,"Playground, Garden, Gym, Pool, Clubhouse",East,Builder,Ready_to_Move


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 23 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   ID                              250000 non-null  int64  
 1   State                           250000 non-null  object 
 2   City                            250000 non-null  object 
 3   Locality                        250000 non-null  object 
 4   Property_Type                   250000 non-null  object 
 5   BHK                             250000 non-null  int64  
 6   Size_in_SqFt                    250000 non-null  int64  
 7   Price_in_Lakhs                  250000 non-null  float64
 8   Price_per_SqFt                  250000 non-null  float64
 9   Year_Built                      250000 non-null  int64  
 10  Furnished_Status                250000 non-null  object 
 11  Floor_No                        250000 non-null  int64  
 12  Total_Floors    

In [4]:
df['Price']=df['Price_per_SqFt']*df['Size_in_SqFt']

In [5]:
df.drop(columns=['ID','Size_in_SqFt','Price_per_SqFt','Locality'],inplace=True)

In [6]:
for i in df.select_dtypes(include='object').columns:
    print(df[i].value_counts())
    print('****'*20)

State
Odisha            12681
Tamil Nadu        12629
West Bengal       12622
Gujarat           12578
Delhi             12552
Telangana         12539
Maharashtra       12533
Punjab            12516
Uttar Pradesh     12508
Uttarakhand       12501
Assam             12496
Kerala            12487
Jharkhand         12480
Andhra Pradesh    12462
Chhattisgarh      12456
Madhya Pradesh    12451
Karnataka         12424
Rajasthan         12402
Bihar             12369
Haryana           12314
Name: count, dtype: int64
********************************************************************************
City
Coimbatore        6461
Ahmedabad         6411
Silchar           6404
Durgapur          6387
Cuttack           6358
Vijayawada        6344
Jamshedpur        6335
Bhubaneswar       6323
New Delhi         6306
Kochi             6305
Ludhiana          6295
Bhopal            6284
Hyderabad         6273
Noida             6271
Haridwar          6267
Warangal          6266
Bilaspur          6263
Dwarka     

In [7]:
df = pd.get_dummies(df, columns=['State'])

In [8]:
df = pd.get_dummies(df, columns=['City'])

In [9]:
df = pd.get_dummies(df, columns=['Property_Type'])

In [10]:
df = pd.get_dummies(df, columns=['Furnished_Status'])

In [11]:
df = pd.get_dummies(df, columns=['Public_Transport_Accessibility'])

In [12]:
df = pd.get_dummies(df, columns=['Parking_Space'])

In [13]:
df=df.drop(["Parking_Space_No"],axis=1)

In [14]:
df = pd.get_dummies(df, columns=['Security'])

In [15]:
df=df.drop(["Security_No"],axis=1)

In [16]:
df = pd.get_dummies(df, columns=['Amenities'])

In [17]:
df = pd.get_dummies(df, columns=['Facing'])

In [18]:
df = pd.get_dummies(df, columns=['Owner_Type'])

In [19]:
df = pd.get_dummies(df, columns=['Availability_Status'])

In [20]:
df=df.drop(["Availability_Status_Ready_to_Move"],axis=1)

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Columns: 415 entries, BHK to Availability_Status_Under_Construction
dtypes: bool(406), float64(2), int64(7)
memory usage: 114.0 MB


In [22]:
corr_matrix=df.corr()
corr_matrix["Price"].sort_values()

Amenities_Gym, Garden, Clubhouse, Playground, Pool   -0.005858
Amenities_Clubhouse, Pool, Playground, Gym, Garden   -0.005239
City_Cuttack                                         -0.004394
Amenities_Garden, Clubhouse, Gym, Pool, Playground   -0.004307
Amenities_Playground, Pool, Clubhouse, Gym           -0.004237
                                                        ...   
Amenities_Pool, Playground, Garden, Gym, Clubhouse    0.005078
Amenities_Pool, Clubhouse, Garden, Gym                0.005150
Amenities_Pool, Clubhouse, Playground, Gym, Garden    0.006041
Price_in_Lakhs                                        0.998053
Price                                                 1.000000
Name: Price, Length: 415, dtype: float64

In [23]:
sorted_corr=corr_matrix["Price"].sort_values()

In [24]:
filtered_corr = sorted_corr[(sorted_corr > 0.0035) | (sorted_corr < -0.0035)]
print(filtered_corr)

Amenities_Gym, Garden, Clubhouse, Playground, Pool   -0.005858
Amenities_Clubhouse, Pool, Playground, Gym, Garden   -0.005239
City_Cuttack                                         -0.004394
Amenities_Garden, Clubhouse, Gym, Pool, Playground   -0.004307
Amenities_Playground, Pool, Clubhouse, Gym           -0.004237
Property_Type_Villa                                  -0.004077
Amenities_Playground, Garden                         -0.004030
City_Trivandrum                                      -0.003714
Amenities_Gym, Playground                            -0.003696
Amenities_Gym, Playground, Clubhouse, Pool, Garden   -0.003598
Amenities_Gym, Garden, Pool, Playground, Clubhouse   -0.003569
Amenities_Garden, Pool, Clubhouse, Playground         0.003588
City_Bangalore                                        0.003606
Amenities_Playground, Gym, Clubhouse, Garden, Pool    0.003611
Security_Yes                                          0.003802
Property_Type_Independent House                       0

In [25]:
filtered_columns = filtered_corr.index.tolist()

df= df[filtered_columns]

df.head()

Unnamed: 0,"Amenities_Gym, Garden, Clubhouse, Playground, Pool","Amenities_Clubhouse, Pool, Playground, Gym, Garden",City_Cuttack,"Amenities_Garden, Clubhouse, Gym, Pool, Playground","Amenities_Playground, Pool, Clubhouse, Gym",Property_Type_Villa,"Amenities_Playground, Garden",City_Trivandrum,"Amenities_Gym, Playground","Amenities_Gym, Playground, Clubhouse, Pool, Garden",...,"Amenities_Clubhouse, Pool, Gym, Garden","Amenities_Playground, Clubhouse, Gym","Amenities_Gym, Playground, Pool, Clubhouse",State_Karnataka,"Amenities_Gym, Playground, Garden, Pool, Clubhouse","Amenities_Pool, Playground, Garden, Gym, Clubhouse","Amenities_Pool, Clubhouse, Garden, Gym","Amenities_Pool, Clubhouse, Playground, Gym, Garden",Price_in_Lakhs,Price
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,489.76,474.0
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,195.52,189.12
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,183.79,182.1
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,300.29,301.51
4,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,182.9,192.92


In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 31 columns):
 #   Column                                              Non-Null Count   Dtype  
---  ------                                              --------------   -----  
 0   Amenities_Gym, Garden, Clubhouse, Playground, Pool  250000 non-null  bool   
 1   Amenities_Clubhouse, Pool, Playground, Gym, Garden  250000 non-null  bool   
 2   City_Cuttack                                        250000 non-null  bool   
 3   Amenities_Garden, Clubhouse, Gym, Pool, Playground  250000 non-null  bool   
 4   Amenities_Playground, Pool, Clubhouse, Gym          250000 non-null  bool   
 5   Property_Type_Villa                                 250000 non-null  bool   
 6   Amenities_Playground, Garden                        250000 non-null  bool   
 7   City_Trivandrum                                     250000 non-null  bool   
 8   Amenities_Gym, Playground                           250000 non-n

In [27]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=['Price'])  # Bağımsız değişkenler
y = df['Price']  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (200000, 30)
X_test shape: (50000, 30)
y_train shape: (200000,)
y_test shape: (50000,)


In [28]:
ann = tf.keras.models.Sequential()

In [29]:
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))

In [30]:
ann.add(tf.keras.layers.Dense(units=6, activation='relu'))

In [31]:
ann.add(tf.keras.layers.Dense(units=1))

In [32]:
ann.compile(optimizer = 'adam', loss = 'mean_squared_error')

In [33]:
ann.fit(X_train, y_train, batch_size = 32, epochs = 20)

Epoch 1/20
[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 943us/step - loss: 8632.7734
Epoch 2/20
[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 993us/step - loss: 78.5332
Epoch 3/20
[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1ms/step - loss: 78.5232
Epoch 4/20
[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1ms/step - loss: 78.1375
Epoch 5/20
[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 963us/step - loss: 78.8427
Epoch 6/20
[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 973us/step - loss: 78.8643
Epoch 7/20
[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 995us/step - loss: 78.7330
Epoch 8/20
[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 975us/step - loss: 78.6793
Epoch 9/20
[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 986us/step - loss: 78.4087
Epoch 10/20
[1m6250/6250[0m [32m━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x1e5a0267150>

In [34]:
y_pred = ann.predict(X_test)

r2_value = r2_score(y_test, y_pred)
print("(R^2) Value:", r2_value)

[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 546us/step
(R^2) Value: 0.9961148675044068
