In [274]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [275]:
#Copying Dataset from drive to the current session storage
!cp /content/drive/MyDrive/Dataset.zip .

In [276]:
#Programmatically gettingthe dataset
import zipfile
import pandas as pd
import numpy as np
#from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import root_mean_squared_error, r2_score

# Path to your zip file
zip_path = '/content/Dataset.zip' #local session path for colab
with zipfile.ZipFile(zip_path, 'r') as z:
        dfs = {f: pd.read_csv(z.open(f), encoding='latin-1') for f in z.namelist() if f.endswith('.csv')}
train, test, sub = dfs['Dataset/Train.csv'], dfs['Dataset/Test.csv'],  dfs['Dataset/sample_submission.csv']

In [277]:
train.shape, test.shape,sub.shape

((13320, 9), (1480, 8), (1480, 1))

In [278]:
#Make a backup
train_orig = train.copy()
test_orig = test.copy()

In [279]:
train.isna().sum()

Unnamed: 0,0
area_type,0
availability,0
location,1
size,16
society,5502
total_sqft,0
bath,73
balcony,609
price,0


In [280]:
train.head(1)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07


In [281]:
#train['availability'] = df['column_name'].replace({'old_value1': 'new_value1', 'old_value2': 'new_value2'})
# to replace availability column to Quarter 1 if Jan, Feb, Mar present in the column value
train['availability']
train.head(10)
train['availability'] = np.where(train['availability'].str.contains('Oct|Nov|Dec'), 'Quarter 4', train['availability'])

train['availability'] = np.where(train['availability'].str.contains('Jan|Feb|Mar'), 'Quarter 1', train['availability'])

train['availability'] = np.where(train['availability'].str.contains('Apr|May|Jun'), 'Quarter 2', train['availability'])

train['availability'] = np.where(train['availability'].str.contains('Jul|Aug|Sep'), 'Quarter 3', train['availability'])

train['availability']
train['availability'].unique()


array(['Quarter 4', 'Ready To Move', 'Quarter 2', 'Quarter 1',
       'Quarter 3', 'Immediate Possession'], dtype=object)

In [282]:
train['size'].unique()


array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', nan, '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [283]:
train['size'] = train['size'].str.replace('Bedroom', 'BHK')
train['size'].unique()

array(['2 BHK', '4 BHK', '3 BHK', '6 BHK', '1 BHK', '1 RK', '8 BHK',
       '7 BHK', '5 BHK', '11 BHK', '9 BHK', nan, '27 BHK', '10 BHK',
       '19 BHK', '16 BHK', '43 BHK', '14 BHK', '12 BHK', '13 BHK',
       '18 BHK'], dtype=object)

In [284]:
display(train.head(10))


Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,Quarter 4,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 BHK,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0
5,Super built-up Area,Ready To Move,Whitefield,2 BHK,DuenaTa,1170,2.0,1.0,38.0
6,Super built-up Area,Quarter 2,Old Airport Road,4 BHK,Jaades,2732,4.0,,204.0
7,Super built-up Area,Ready To Move,Rajaji Nagar,4 BHK,Brway G,3300,4.0,,600.0
8,Super built-up Area,Ready To Move,Marathahalli,3 BHK,,1310,3.0,1.0,63.25
9,Plot Area,Ready To Move,Gandhi Bazar,6 BHK,,1020,6.0,,370.0


In [285]:
train.isna().sum()


Unnamed: 0,0
area_type,0
availability,0
location,1
size,16
society,5502
total_sqft,0
bath,73
balcony,609
price,0


In [286]:
#Dropping Null values
train = train.dropna()
train.shape

(7496, 9)

In [287]:
#Checking duplicates
train.duplicated().sum()


np.int64(386)

In [288]:
#Dropping duplicates
train = train.drop_duplicates()

In [265]:
train.shape

(7110, 9)

In [289]:
# Separate X and y
X = train.drop(columns=['price'])
y = train['price']
X.shape, y.shape

((7110, 8), (7110,))

In [290]:
#Identify the categorical columns and numerical columns
cat_cols = train.select_dtypes(include=['object']).columns
num_cols = train.select_dtypes(exclude=['object']).columns
#cat_cols, num_cols

In [291]:
# label encoder
#label_encoder = LabelEncoder()



if isinstance(X, pd.DataFrame):
    label_encoders = {}
    for col in cat_cols:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col].astype(str))
        label_encoders[col] = le
X.head(2)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony
0,3,3,210,3,443,63,2.0,1.0
1,2,4,149,5,2353,1128,5.0,3.0


In [292]:
#Scaling
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
pd.DataFrame(X_scaled).head(1)

Unnamed: 0,0,1,2,3,4,5,6,7
0,1.0,0.75,0.331754,0.333333,0.170976,0.037478,0.125,0.333333


In [293]:
#Splitting train and test from the train dataset
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
X_train.shape, X_val.shape, y_train.shape, y_val.shape


((5688, 8), (1422, 8), (5688,), (1422,))

In [294]:
test.isna().sum()


Unnamed: 0,0
area_type,0
availability,0
location,0
size,2
society,626
total_sqft,0
bath,7
balcony,69


In [295]:
# Imputing values for null columns
test['availability']

test['availability'] = np.where(test['availability'].str.contains('Oct|Nov|Dec'), 'Quarter 4', test['availability'])

test['availability'] = np.where(test['availability'].str.contains('Jan|Feb|Mar'), 'Quarter 1', test['availability'])

test['availability'] = np.where(test['availability'].str.contains('Apr|May|Jun'), 'Quarter 2', test['availability'])

test['availability'] = np.where(test['availability'].str.contains('Jul|Aug|Sep'), 'Quarter 3', test['availability'])

test['availability']

test['availability'].unique()
test['size'] = test['size'].str.replace('Bedroom', 'BHK')
test['size'].unique()
# # Create and train the Ridge Regression model
# ridge = Ridge(alpha=1.0)
# ridge.fit(X_train, y_train)
# val_pred = ridge.predict(X_val)
# print("Root Mean Squared Error:", root_mean_squared_error(y_val, val_pred))
# print("R2 Score:", r2_score(y_val, val_pred))
# comp_res_df = pd.DataFrame({'Actual':y_val, 'Predicted':val_pred})
# comp_res_df['Difference'] = comp_res_df['Actual'] - comp_res_df['Predicted']
# comp_res_df.head(10)

array(['2 BHK', '9 BHK', '4 BHK', '3 BHK', '1 BHK', '5 BHK', '7 BHK',
       '8 BHK', '6 BHK', nan, '1 RK', '10 BHK', '16 BHK'], dtype=object)

In [245]:
# #KNN algorithm
# KNN = KNeighborsRegressor()
# KNN = KNN.fit(X_train, y_train)
# val_pred = KNN.predict(X_val)
# error = root_mean_squared_error(y_val,val_pred) # Checking error
# r2 = r2_score(y_val,val_pred) # Checking accuracy
# print("avg error difference [Avg]", error) # Avg value difference between the original val and the predicted val
# print("prediction accuracy [%]", r2) # Able to predict only 56%

In [296]:
#Preprocessing the test data
test[cat_cols].isna().sum()

Unnamed: 0,0
area_type,0
availability,0
location,0
size,2
society,626
total_sqft,0


In [297]:
test.isna().sum()

Unnamed: 0,0
area_type,0
availability,0
location,0
size,2
society,626
total_sqft,0
bath,7
balcony,69


In [298]:
# Imputing values for null columns
test['bath'] = test['bath'].fillna(0)
test['balcony'] = test['balcony'].fillna(0)

In [299]:
# Inputing values for null columns
test['size'] = test['size'].fillna(test['size'].mode()[0])
test['society'] = test['society'].fillna(test['society'].mode()[0])

In [300]:
test[cat_cols].isna().sum()

Unnamed: 0,0
area_type,0
availability,0
location,0
size,0
society,0
total_sqft,0


In [301]:
# Encoding test cat cols
if isinstance(test, pd.DataFrame):
    label_encoders = {}
    for col in cat_cols:
        le = LabelEncoder()
        test[col] = le.fit_transform(test[col].astype(str))
        label_encoders[col] = le
X.head(2)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony
0,3,3,210,3,443,63,2.0,1.0
1,2,4,149,5,2353,1128,5.0,3.0


In [302]:
test_scaled= scaler.fit_transform(test)
#Linear regression
reg = LinearRegression()
reg.fit(X_train, y_train)
val_pred = reg.predict(X_val)
print("Root Mean Squared Error:", root_mean_squared_error(y_val, val_pred))
print("R2 Score:", r2_score(y_val, val_pred))
comp_res_df = pd.DataFrame({'Actual':y_val, 'Predicted':val_pred})
comp_res_df['Difference'] = comp_res_df['Actual'] - comp_res_df['Predicted']
comp_res_df.tail(10)
pd.DataFrame(test_scaled).head(1)

Root Mean Squared Error: 92.83123186244204
R2 Score: 0.30605346029104774


Unnamed: 0,0,1,2,3,4,5,6,7
0,1.0,1.0,0.192308,0.363636,0.681282,0.181955,0.125,0.666667


In [308]:
test_predictions = reg.predict(test_scaled)
test_predictions

sub['price'].unique()
sub['price'].nunique()
sub['price'] = test_predictions
sub['price']

Unnamed: 0,price
0,51.500327
1,343.460239
2,177.085782
3,94.964967
4,53.373332
...,...
1475,60.644381
1476,111.282059
1477,50.096808
1478,115.126042


In [309]:
sub['price'].nunique()
len(test_predictions)

1480

In [310]:
sub.to_csv('Linear_Regressionprediction.csv', index=False)