In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [2]:
df1 = pd.read_csv('chennai_real_estate_data.csv')

In [3]:
df1

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Carpet Area,Ready To Move,Anna Nagar,1 BHK,Society1,1117,2,0,49.91
1,Carpet Area,Under Construction,Anna Nagar,4 BHK,Society3,3725,4,0,101.91
2,Plot Area,Ready To Move,Adyar,2 BHK,Society1,1915,1,3,90.78
3,Carpet Area,Ready To Move,Velachery,2 BHK,Society1,2308,2,3,132.87
4,Carpet Area,Under Construction,Tambaram,3 BHK,Society5,2526,3,2,70.73
...,...,...,...,...,...,...,...,...,...
4995,Carpet Area,Ready To Move,Medavakkam,4 BHK,Society1,2144,4,3,131.00
4996,Plot Area,Ready To Move,Velachery,5 BHK,Society4,3288,4,0,135.49
4997,Carpet Area,Under Construction,Thiruvanmiyur,1 BHK,Society4,615,2,1,20.86
4998,Carpet Area,Under Construction,Medavakkam,3 BHK,Society1,1569,2,2,90.90


# Data Cleaning

In [4]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     5000 non-null   object 
 1   availability  5000 non-null   object 
 2   location      5000 non-null   object 
 3   size          5000 non-null   object 
 4   society       5000 non-null   object 
 5   total_sqft    5000 non-null   int64  
 6   bath          5000 non-null   int64  
 7   balcony       5000 non-null   int64  
 8   price         5000 non-null   float64
dtypes: float64(1), int64(3), object(5)
memory usage: 351.7+ KB


In [5]:
df1.describe()

Unnamed: 0,total_sqft,bath,balcony,price
count,5000.0,5000.0,5000.0,5000.0
mean,2556.4742,2.4006,1.5086,121.464538
std,1364.658724,1.266349,1.110573,75.03744
min,502.0,1.0,0.0,13.82
25%,1374.75,1.0,1.0,61.9825
50%,2354.0,2.0,1.0,107.15
75%,3486.0,3.0,3.0,164.72
max,5993.0,5.0,3.0,400.79


In [6]:
df1.isnull().sum()

area_type       0
availability    0
location        0
size            0
society         0
total_sqft      0
bath            0
balcony         0
price           0
dtype: int64

In [7]:
df1.shape

(5000, 9)

In [8]:
df2 = df1.drop (['area_type', 'society','balcony','availability'], axis = 'columns')
df2

Unnamed: 0,location,size,total_sqft,bath,price
0,Anna Nagar,1 BHK,1117,2,49.91
1,Anna Nagar,4 BHK,3725,4,101.91
2,Adyar,2 BHK,1915,1,90.78
3,Velachery,2 BHK,2308,2,132.87
4,Tambaram,3 BHK,2526,3,70.73
...,...,...,...,...,...
4995,Medavakkam,4 BHK,2144,4,131.00
4996,Velachery,5 BHK,3288,4,135.49
4997,Thiruvanmiyur,1 BHK,615,2,20.86
4998,Medavakkam,3 BHK,1569,2,90.90


In [9]:
df2['size'].unique()

array(['1 BHK', '4 BHK', '2 BHK', '3 BHK', '5 BHK'], dtype=object)

In [10]:
df2['bhk'] = df2['size'].apply(lambda x: int(x.split(' ')[0]))
df2.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Anna Nagar,1 BHK,1117,2,49.91,1
1,Anna Nagar,4 BHK,3725,4,101.91,4
2,Adyar,2 BHK,1915,1,90.78,2
3,Velachery,2 BHK,2308,2,132.87,2
4,Tambaram,3 BHK,2526,3,70.73,3


In [11]:
df2.drop("size" ,axis="columns")

Unnamed: 0,location,total_sqft,bath,price,bhk
0,Anna Nagar,1117,2,49.91,1
1,Anna Nagar,3725,4,101.91,4
2,Adyar,1915,1,90.78,2
3,Velachery,2308,2,132.87,2
4,Tambaram,2526,3,70.73,3
...,...,...,...,...,...
4995,Medavakkam,2144,4,131.00,4
4996,Velachery,3288,4,135.49,5
4997,Thiruvanmiyur,615,2,20.86,1
4998,Medavakkam,1569,2,90.90,3


In [12]:
dummies = pd.get_dummies(df2.location)
dummies

Unnamed: 0,Adyar,Anna Nagar,Kodambakkam,Medavakkam,Nungambakkam,Sholinganallur,T. Nagar,Tambaram,Thiruvanmiyur,Velachery
0,False,True,False,False,False,False,False,False,False,False
1,False,True,False,False,False,False,False,False,False,False
2,True,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,True
4,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...
4995,False,False,False,True,False,False,False,False,False,False
4996,False,False,False,False,False,False,False,False,False,True
4997,False,False,False,False,False,False,False,False,True,False
4998,False,False,False,True,False,False,False,False,False,False


In [13]:
dummies_df=dummies.astype(int)
dummies_df

Unnamed: 0,Adyar,Anna Nagar,Kodambakkam,Medavakkam,Nungambakkam,Sholinganallur,T. Nagar,Tambaram,Thiruvanmiyur,Velachery
0,0,1,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
4995,0,0,0,1,0,0,0,0,0,0
4996,0,0,0,0,0,0,0,0,0,1
4997,0,0,0,0,0,0,0,0,1,0
4998,0,0,0,1,0,0,0,0,0,0


In [14]:
merged = pd.concat([df2,dummies],axis='columns')
merged

Unnamed: 0,location,size,total_sqft,bath,price,bhk,Adyar,Anna Nagar,Kodambakkam,Medavakkam,Nungambakkam,Sholinganallur,T. Nagar,Tambaram,Thiruvanmiyur,Velachery
0,Anna Nagar,1 BHK,1117,2,49.91,1,False,True,False,False,False,False,False,False,False,False
1,Anna Nagar,4 BHK,3725,4,101.91,4,False,True,False,False,False,False,False,False,False,False
2,Adyar,2 BHK,1915,1,90.78,2,True,False,False,False,False,False,False,False,False,False
3,Velachery,2 BHK,2308,2,132.87,2,False,False,False,False,False,False,False,False,False,True
4,Tambaram,3 BHK,2526,3,70.73,3,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,Medavakkam,4 BHK,2144,4,131.00,4,False,False,False,True,False,False,False,False,False,False
4996,Velachery,5 BHK,3288,4,135.49,5,False,False,False,False,False,False,False,False,False,True
4997,Thiruvanmiyur,1 BHK,615,2,20.86,1,False,False,False,False,False,False,False,False,True,False
4998,Medavakkam,3 BHK,1569,2,90.90,3,False,False,False,True,False,False,False,False,False,False


In [15]:
final = merged.drop(['location','size'], axis='columns')
final.astype(int)

Unnamed: 0,total_sqft,bath,price,bhk,Adyar,Anna Nagar,Kodambakkam,Medavakkam,Nungambakkam,Sholinganallur,T. Nagar,Tambaram,Thiruvanmiyur,Velachery
0,1117,2,49,1,0,1,0,0,0,0,0,0,0,0
1,3725,4,101,4,0,1,0,0,0,0,0,0,0,0
2,1915,1,90,2,1,0,0,0,0,0,0,0,0,0
3,2308,2,132,2,0,0,0,0,0,0,0,0,0,1
4,2526,3,70,3,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,2144,4,131,4,0,0,0,1,0,0,0,0,0,0
4996,3288,4,135,5,0,0,0,0,0,0,0,0,0,1
4997,615,2,20,1,0,0,0,0,0,0,0,0,1,0
4998,1569,2,90,3,0,0,0,1,0,0,0,0,0,0


In [16]:
final = final.drop(['Velachery'], axis='columns')
final.astype(int)

Unnamed: 0,total_sqft,bath,price,bhk,Adyar,Anna Nagar,Kodambakkam,Medavakkam,Nungambakkam,Sholinganallur,T. Nagar,Tambaram,Thiruvanmiyur
0,1117,2,49,1,0,1,0,0,0,0,0,0,0
1,3725,4,101,4,0,1,0,0,0,0,0,0,0
2,1915,1,90,2,1,0,0,0,0,0,0,0,0
3,2308,2,132,2,0,0,0,0,0,0,0,0,0
4,2526,3,70,3,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,2144,4,131,4,0,0,0,1,0,0,0,0,0
4996,3288,4,135,5,0,0,0,0,0,0,0,0,0
4997,615,2,20,1,0,0,0,0,0,0,0,0,1
4998,1569,2,90,3,0,0,0,1,0,0,0,0,0


In [17]:
X = final.drop('price', axis='columns')
X.astype(int)

Unnamed: 0,total_sqft,bath,bhk,Adyar,Anna Nagar,Kodambakkam,Medavakkam,Nungambakkam,Sholinganallur,T. Nagar,Tambaram,Thiruvanmiyur
0,1117,2,1,0,1,0,0,0,0,0,0,0
1,3725,4,4,0,1,0,0,0,0,0,0,0
2,1915,1,2,1,0,0,0,0,0,0,0,0
3,2308,2,2,0,0,0,0,0,0,0,0,0
4,2526,3,3,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
4995,2144,4,4,0,0,0,1,0,0,0,0,0
4996,3288,4,5,0,0,0,0,0,0,0,0,0
4997,615,2,1,0,0,0,0,0,0,0,0,1
4998,1569,2,3,0,0,0,1,0,0,0,0,0


In [18]:
y = final.price

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=10)

In [20]:
reg=LinearRegression()

In [21]:
reg.fit(X_train,y_train)

In [22]:
reg.predict(X_test)

array([121.83723992,  93.58350693, 117.70621934, 183.17767403,
        39.60740835, 168.48914197,  27.59431332, 131.37600202,
       208.49392557, 200.04138602, 169.5980508 , 197.58967598,
        82.81717284, 204.15669487, 120.93185753,  52.57179168,
       176.67482769,  91.23833589, 279.28252159, 224.18480611,
        97.99638712, 114.66070108, 123.55703812, 224.15699992,
        50.97603999, 199.72680284, 128.38447808,  25.74555665,
        37.43468625,  81.72025098, 143.51426833,  85.58805045,
        47.23118596,  54.24713678,  99.33742563, 136.82708732,
        79.34773109,  85.64569912, 130.88362713,  33.02672789,
        94.87182374, 227.14421417, 141.72203068,  77.79819339,
        99.30616585,  39.32467198,  77.65038238, 215.2520538 ,
       114.90390746, 144.05771024, 143.070231  , 116.56110515,
       196.25079514, 191.96456313,  44.81685173, 219.20063197,
       107.57541303,  33.68722751,  25.00054531, 100.57696669,
       239.90168465, 150.60954931, 233.44828136,  71.40

In [23]:
X_train

Unnamed: 0,total_sqft,bath,bhk,Adyar,Anna Nagar,Kodambakkam,Medavakkam,Nungambakkam,Sholinganallur,T. Nagar,Tambaram,Thiruvanmiyur
2733,4428,3,4,False,False,True,False,False,False,False,False,False
447,3209,1,3,False,False,False,False,False,True,False,False,False
970,2688,1,3,False,False,False,True,False,False,False,False,False
125,3473,2,4,False,False,False,False,True,False,False,False,False
798,2877,4,5,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
1180,2378,4,3,False,False,True,False,False,False,False,False,False
3441,2092,1,2,False,False,False,False,False,False,False,True,False
1344,4469,1,4,False,False,False,False,False,False,False,True,False
4623,1644,1,3,False,False,False,False,False,False,False,False,False


In [24]:
y_train

2733    190.40
447      96.99
970     124.49
125     114.02
798     190.69
         ...  
1180    162.91
3441    101.69
1344    247.22
4623     87.58
1289    357.92
Name: price, Length: 4000, dtype: float64

In [25]:
reg.score(X_train,y_train)

0.7497372653296503

In [26]:
reg.predict([[4428,3,4,0,0,1,0,0,0,0,0,0]])



array([211.99880895])

In [27]:
reg.predict([[3209,1,3,0,0,0,0,0,1,0,0,0]])



array([152.9177874])

In [28]:
reg.predict([[2688,1,3,0,0,0,1,0,0,0,0,0]])



array([128.75418694])

In [29]:
reg.predict([[3473,2,4,0,0,0,0,1,0,0,0,0]])



array([164.47457373])

In [30]:
from sklearn.ensemble import RandomForestRegressor

In [31]:
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)

In [32]:
rf_model.predict(X_test)

array([119.0479    , 107.245     ,  97.1882    , 129.4431    ,
        32.7072    , 151.9269    ,  32.07214667, 128.0218    ,
       194.9141    , 186.4062    , 147.1088    , 151.4976    ,
        88.131     , 195.0755    , 157.971     ,  42.4099    ,
       172.1541    ,  96.0661    , 231.2479    , 275.1101    ,
        91.9605    , 120.5487    , 128.4149    , 245.8088    ,
        56.1377    , 158.3176    , 158.4615    ,  26.3794    ,
        40.06245833,  68.8637    , 164.4407    ,  81.6384    ,
        46.7518    ,  56.1086    ,  92.83911333, 126.1038    ,
        75.8947    ,  95.0639    , 104.8751    ,  35.35653333,
       125.8396    , 283.5857    , 143.34      ,  65.1926    ,
        88.0677    ,  46.4295    ,  97.3028    , 225.2311    ,
       109.2086    , 157.0759    , 128.1421    , 103.2721    ,
       184.9951    , 195.8563    ,  51.6075    , 187.5117    ,
       112.0901    ,  39.9845    ,  25.67286   , 113.2629    ,
       260.8077    , 157.3488    , 217.1974    ,  79.77

In [33]:
rf_model.score(X_train,y_train)

0.9590717572825167

In [34]:
rf_model.predict([[4428,3,4,0,0,1,0,0,0,0,0,0]])



array([212.0991])

In [35]:
rf_model.predict([[3209,1,3,0,0,0,0,0,1,0,0,0]])



array([108.0501])

In [36]:
rf_model.predict([[2688,1,3,0,0,0,1,0,0,0,0,0]])



array([128.3548])

In [37]:
rf_model.predict([[3473,2,4,0,0,0,0,1,0,0,0,0]])



array([127.4127])

In [38]:
from sklearn.tree import DecisionTreeRegressor

In [52]:
reg_model = DecisionTreeRegressor()
reg_model.fit(X_train, y_train)

In [54]:
reg_model.predict(X_test)

array([107.29 ,  64.41 ,  68.89 , 115.07 ,  29.87 , 213.65 ,  35.23 ,
        76.3  , 196.82 , 254.83 , 130.9  , 114.56 ,  78.21 , 113.43 ,
       174.93 ,  38.96 , 122.27 , 125.48 , 185.59 , 232.49 , 115.12 ,
        94.92 , 161.32 , 235.06 ,  69.94 , 155.07 , 184.69 ,  26.4  ,
        55.14 ,  45.73 , 180.44 ,  69.73 ,  57.03 ,  49.32 , 121.29 ,
       165.99 ,  51.45 , 115.03 ,  86.19 ,  36.91 , 134.65 , 328.85 ,
       134.88 ,  45.36 ,  71.31 ,  50.46 , 105.66 , 275.25 ,  93.71 ,
       200.93 ,  96.51 ,  82.25 , 153.75 , 193.48 ,  55.72 , 155.55 ,
       143.45 ,  44.09 ,  28.93 ,  88.2  , 335.97 , 192.36 , 161.23 ,
        89.41 , 136.99 , 239.1  , 223.97 , 184.65 ,  43.595,  68.2  ,
       147.46 , 134.65 , 111.65 , 158.09 , 320.89 , 236.68 ,  33.27 ,
       119.43 , 154.03 ,  88.55 , 357.92 , 176.84 , 113.17 , 269.99 ,
        66.6  , 122.11 , 150.24 ,  48.16 , 237.55 ,  72.12 ,  91.91 ,
       232.49 , 173.33 ,  94.57 ,  23.71 , 175.35 , 124.01 , 121.44 ,
        90.2  , 139.

In [55]:
reg_model.score(X_train,y_train)

0.9997350937633047

In [56]:
reg_model.predict([[4428,3,4,0,0,1,0,0,0,0,0,0]])



array([190.4])

In [57]:
reg_model.predict([[3209,1,3,0,0,0,0,0,1,0,0,0]])



array([96.99])

In [58]:
reg_model.predict([[2688,1,3,0,0,0,1,0,0,0,0,0]])



array([124.49])

In [59]:
reg_model.predict([[3473,2,4,0,0,0,0,1,0,0,0,0]])



array([114.02])

In [60]:
X, y = make_regression(n_samples=100, n_features=5, noise=0.1, random_state=42)

model = LinearRegression()

# Define cross-validation strategy
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation
cv_scores = cross_val_score(model, X, y, cv=kf, scoring='r2')  # 'r2' for regression

# Display the results
print("Cross-Validation Scores (R^2):", cv_scores)
print("Mean R^2 Score:", np.mean(cv_scores))
print("Standard Deviation of R^2 Scores:", np.std(cv_scores))

Cross-Validation Scores (R^2): [0.99999944 0.99999958 0.99999954 0.99999952 0.9999994 ]
Mean R^2 Score: 0.9999994951564535
Standard Deviation of R^2 Scores: 6.758292405189928e-08


In [61]:


# Generate synthetic dataset (replace with your real estate dataset)
X, y = make_regression(n_samples=100, n_features=5, noise=0.1, random_state=42)

# Define the model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Define cross-validation strategy
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation
cv_scores = cross_val_score(rf_model, X, y, cv=kf, scoring='r2')  # 'r2' for regression

# Display results
print(f"R^2 Score for each fold: {cv_scores}")
print(f"Mean R^2 Score: {cv_scores.mean():.2f}")
print(f"Standard Deviation of R^2 Scores: {cv_scores.std():.2f}")

R^2 Score for each fold: [0.81623017 0.52052244 0.70495914 0.74947337 0.85752566]
Mean R^2 Score: 0.73
Standard Deviation of R^2 Scores: 0.12


In [62]:
# Generate a sample classification dataset
X, y = make_classification(n_samples=100, n_features=5, n_classes=2, random_state=42)

# Instantiate the model
model =  DecisionTreeRegressor()

# Use Stratified K-Fold for classification
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation for accuracy
accuracy_scores = cross_val_score(model, X, y, cv=skf, scoring='accuracy')

print(f"Accuracy for each fold: {accuracy_scores}")
print(f"Mean Accuracy: {accuracy_scores.mean():.2f}")
print(f"Standard Deviation of Accuracy: {accuracy_scores.std():.2f}")

Accuracy for each fold: [0.95 0.85 1.   0.85 1.  ]
Mean Accuracy: 0.93
Standard Deviation of Accuracy: 0.07


In [48]:
import pickle
with open('model3.pkl', 'wb') as f:
    pickle.dump(rf_model, f)