In [33]:
import pandas as pd
import numpy as np
import seaborn as sns

In [34]:
df = pd.read_csv('gurgaon_properties_v3.csv')

In [35]:
df.isna().sum()

society                   2
property_type             1
sector                    0
price                    19
price_per_sq             19
area                     19
bedRoom                   1
bathroom                  1
balcony                   1
floorNum                 21
facing                    1
agePossession             1
furnish_score             1
features_score            1
study room                1
servant room              1
store room                1
pooja room                1
others                    1
super_built_up_area    1902
built_up_area          2629
carpet_area            1873
dtype: int64

In [36]:
df.drop(3816 ,inplace=True)

In [37]:
df.society.fillna('independent' ,inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.society.fillna('independent' ,inplace=True)


# ***FloorNum***

In [38]:
df[df.property_type == 'houses'].floorNum.median()

3.0

In [39]:
df.floorNum.fillna(3.0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.floorNum.fillna(3.0, inplace=True)


In [40]:
df.isnull().sum()

society                   0
property_type             0
sector                    0
price                    18
price_per_sq             18
area                     18
bedRoom                   0
bathroom                  0
balcony                   0
floorNum                  0
facing                    0
agePossession             0
furnish_score             0
features_score            0
study room                0
servant room              0
store room                0
pooja room                0
others                    0
super_built_up_area    1901
built_up_area          2628
carpet_area            1872
dtype: int64

In [41]:
df = df[~df.price.isnull()]

In [42]:
df.corr(numeric_only=True)['price'].sort_values(ascending=True)

floorNum              -0.095749
carpet_area           -0.010145
others                -0.007593
built_up_area          0.005692
area                   0.020427
features_score         0.100788
balcony                0.221979
study room             0.258324
store room             0.310369
pooja room             0.344105
price_per_sq           0.397428
servant room           0.402216
furnish_score          0.424391
bedRoom                0.524497
bathroom               0.593133
super_built_up_area    0.770986
price                  1.000000
Name: price, dtype: float64

# ***Super Built Up Area***

# ***Carpet Area***

# ***Built up area***

In [43]:
df.corr(numeric_only=True)['built_up_area'].sort_values(ascending=True)

price_per_sq          -0.022774
others                -0.010799
study room            -0.007613
features_score        -0.004119
servant room          -0.003542
pooja room            -0.002807
store room            -0.002167
bedRoom                0.001138
bathroom               0.004885
price                  0.005692
balcony                0.005815
floorNum               0.008603
furnish_score          0.024130
super_built_up_area    0.954530
carpet_area            0.971476
area                   0.997758
built_up_area          1.000000
Name: built_up_area, dtype: float64

In [44]:
df1 = df[~(df.built_up_area.isnull())]
df2 = df[df.built_up_area.isnull()]

In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Assuming your data is in a pandas DataFrame called df
# df = pd.read_csv('your_data.csv')  # Replace with your actual data loading method

# Define the features and target
X = df1.drop(columns=['built_up_area'])
y = df1['built_up_area']

# Define which columns are categorical and which are numerical
categorical_features = ['sector' ,'society', 'property_type', 'facing', 'agePossession']
numerical_features = ['price', 'price_per_sq', 'bedRoom', 'bathroom', 'balcony', 
                      'floorNum', 'furnish_score', 'features_score', 'study room', 
                      'servant room', 'store room', 'pooja room', 'others' 
                      ]

# Create the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Create the pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor',RandomForestRegressor(n_estimators=100 , random_state=42))
])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# K-fold cross-validation
# kfold = KFold(n_splits=10, shuffle=True, random_state=42)
# scores = cross_val_score(model, X, y, cv=kfold, scoring='r2')

# Fit the model
model.fit(X_train, y_train)

# Check the model's performance
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f'Training Score: {train_score:.4f}')
print(f'Test Score: {test_score:.4f}')

Training Score: 0.7880
Test Score: 0.7126


In [46]:
x = df2.drop(columns='built_up_area')

resetIndex_x = x.reset_index()

predicted = model.predict(x)

y = pd.Series(predicted)

merged_df = pd.concat([resetIndex_x,y] ,axis=1, join='outer')

merged_df.rename(columns={
    0 : 'built_up_area'
},inplace=True)

merged_df.set_index('index' , inplace=True)

merged_df.index.name = None

df2.update(merged_df)

df.update(df2)


In [47]:
df.corr(numeric_only=True)['price'].sort_values()

floorNum              -0.095749
built_up_area         -0.018977
carpet_area           -0.010145
others                -0.007593
area                   0.020427
features_score         0.100788
balcony                0.221979
study room             0.258324
store room             0.310369
pooja room             0.344105
price_per_sq           0.397428
servant room           0.402216
furnish_score          0.424391
bedRoom                0.524497
bathroom               0.593133
super_built_up_area    0.770986
price                  1.000000
Name: price, dtype: float64

In [48]:
df.drop(columns=['super_built_up_area' , 'carpet_area' , 'built_up_area'], inplace=True)

In [53]:
df.agePossession = df.agePossession.str.lower()

In [54]:
df.to_csv('gurgaon_v4.csv' , index=False)

In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3798 entries, 0 to 3815
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   society         3798 non-null   object 
 1   property_type   3798 non-null   object 
 2   sector          3798 non-null   object 
 3   price           3798 non-null   float64
 4   price_per_sq    3798 non-null   float64
 5   area            3798 non-null   float64
 6   bedRoom         3798 non-null   float64
 7   bathroom        3798 non-null   float64
 8   balcony         3798 non-null   float64
 9   floorNum        3798 non-null   float64
 10  facing          3798 non-null   object 
 11  agePossession   3798 non-null   object 
 12  furnish_score   3798 non-null   float64
 13  features_score  3798 non-null   float64
 14  study room      3798 non-null   float64
 15  servant room    3798 non-null   float64
 16  store room      3798 non-null   float64
 17  pooja room      3798 non-null   float6