In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Copying Dataset from drive to the current session storage
!cp /content/drive/MyDrive/Dataset.zip .

In [None]:
#Programmatically gettingthe dataset
import zipfile
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor # to predict numeric values
from sklearn.metrics import root_mean_squared_error, r2_score

# Path to your zip file
zip_path = '/content/Dataset.zip' #local session path for colab
with zipfile.ZipFile(zip_path, 'r') as z:
        dfs = {f: pd.read_csv(z.open(f), encoding='latin-1') for f in z.namelist() if f.endswith('.csv')}
train, test, sub = dfs['Dataset/Train.csv'], dfs['Dataset/Test.csv'],  dfs['Dataset/sample_submission.csv']

In [None]:
train.shape, test.shape,sub.shape

((13320, 9), (1480, 8), (1480, 1))

In [None]:
#Make a backup
train_orig = train.copy()
test_orig = test.copy()

In [None]:
#checking null values
train.isna().sum()

Unnamed: 0,0
area_type,0
availability,0
location,1
size,16
society,5502
total_sqft,0
bath,73
balcony,609
price,0


In [None]:
#dropping null values
train = train.dropna()
train.shape

(7496, 9)

In [None]:
#checking duplicates
train.duplicated()



Unnamed: 0,0
0,False
1,False
3,False
5,False
11,False
...,...
13313,True
13314,False
13315,False
13317,False


In [None]:
#To show the number of duplicates
train.duplicated().sum()


np.int64(352)

In [None]:
#Dropping duplicates
train = train.drop_duplicates()
train.shape

(7144, 9)

In [None]:
#Separate predictors/independent/features(x) and predicted/dependent/target(y)
X = train.drop(columns=['price'])
y = train['price']
X.shape, y.shape


((7144, 8), (7144,))

In [None]:
display(X.head(2), y.head(2))

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0


Unnamed: 0,price
0,39.07
1,120.0


In [None]:
#Identify the categorical columns and numerical columns
cat_cols = train.select_dtypes(include=['object']).columns
num_cols = train.select_dtypes(exclude=['object']).columns
cat_cols,num_cols



(Index(['area_type', 'availability', 'location', 'size', 'society',
        'total_sqft'],
       dtype='object'),
 Index(['bath', 'balcony', 'price'], dtype='object'))

In [None]:
#Create an encoding object to apply encoding to categorical columns
#any algorithm cant accept string values
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)


In [None]:
X[cat_cols] = encoder.fit_transform(X[cat_cols])
X.head(2)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony
0,3.0,35.0,210.0,3.0,443.0,63.0,2.0,1.0
1,2.0,73.0,149.0,8.0,2353.0,1128.0,5.0,3.0


In [None]:
#Initialize Scaling
scaler = StandardScaler()

In [None]:
# Scale the features
X_scaled = scaler.fit_transform(X)
pd.DataFrame(X_scaled).head(1)
display(train.head(1), X.head(1), pd.DataFrame(X_scaled).head(1))

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07


Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony
0,3.0,35.0,210.0,3.0,443.0,63.0,2.0,1.0


Unnamed: 0,0,1,2,3,4,5,6,7
0,0.482989,-1.556075,-0.71818,-0.644897,-1.208053,-1.176862,-0.522612,-0.797179


In [None]:
# While scaling column names also changed to integers
# after scaling we can assign the original colmn name
scaled_df = pd.DataFrame(X_scaled, columns=X.columns)
scaled_df.head(1)


Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony
0,0.482989,-1.556075,-0.71818,-0.644897,-1.208053,-1.176862,-0.522612,-0.797179


In [None]:
# Split the data sets into train and validations Training = 80%, validation=20%
# random state to say give the same rows of data for train and test
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
X_train.shape, X_val.shape, y_train.shape, y_val.shape


((5715, 8), (1429, 8), (5715,), (1429,))

In [None]:
# Initialize KNN
KNN = KNeighborsRegressor()

In [None]:
# Fit the model with the training data (5715 rows 80%) - giving  both input(x) and what is the output(y)
KNN = KNN.fit(X_train, y_train)

In [None]:
# Make a Prediction on the validation data
val_pred = KNN.predict(X_val)

In [None]:
#Calculate error - Difference between the actual val and predicted val
error = root_mean_squared_error(y_val,val_pred) # Checking error
r2 = r2_score(y_val,val_pred) # Checking accuracy
print("avg error difference [Avg]", error) # Avg value difference between the original val and the predicted val
print("prediction accuracy [%]", r2) # Able to predict only 56%

avg error difference [Avg] 64.65498285365297
prediction accuracy [%] 0.5654555945447639


In [None]:
comp_res_df = pd.DataFrame({'Actual':y_val, 'Predicted':val_pred})
comp_res_df['Difference'] = comp_res_df['Actual'] - comp_res_df['Predicted']
comp_res_df.head(10)

Unnamed: 0,Actual,Predicted,Difference
6735,135.0,106.0,29.0
8498,65.0,52.09,12.91
550,260.0,206.8,53.2
11064,65.0,60.0,5.0
850,58.935,54.278,4.657
1794,55.0,91.8,-36.8
11284,96.5,116.978,-20.478
11722,46.29,57.88,-11.59
7449,74.0,68.8,5.2
917,53.0,68.3,-15.3


In [None]:
# Apply diffrent encoding, scaling
#perform various feature transformations like converting availability column - date into under snstruction [ready to move, immediate pssession]
#or categorize date into Q1, Q2, Q3 and Q4
#BHK into numeric data
#80/20, 85/15, 70/30,
# Instead of removing duplicates, apply impute values
# remove outliers
#try to find the best k
#randon forst, decision free

In [None]:
train[train.index==6735]

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
6735,Super built-up Area,Ready To Move,Koramangala,3 BHK,Ansna K,1500,3.0,1.0,135.0


In [None]:
# applying teh same process to test colun as encoding and scaling
test.columns

Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony'],
      dtype='object')

In [None]:
cat_cols

Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft'],
      dtype='object')

In [None]:
# finding null values
#test.isna().sum()
test[cat_cols].isna().sum()

Unnamed: 0,0
area_type,0
availability,0
location,0
size,2
society,626
total_sqft,0


In [None]:

test.isna().sum()

Unnamed: 0,0
area_type,0
availability,0
location,0
size,0
society,0
total_sqft,0
bath,0
balcony,0


In [None]:
# Imputing values for null columns
test['bath'] = test['bath'].fillna(0)
test['balcony'] = test['balcony'].fillna(0)

In [None]:
# Inputing values for null columns
test['size'] = test['size'].fillna(test['size'].mode()[0])
test['society'] = test['society'].fillna(test['society'].mode()[0])

In [None]:
#test.dtypes
test[cat_cols].isna().sum()

Unnamed: 0,0
area_type,0
availability,0
location,0
size,0
society,0
total_sqft,0


In [None]:
test[cat_cols] = encoder.transform(test[cat_cols])

#test.head(2)
#test.head(2)

In [None]:
test_scaled = scaler.transform(test)

In [None]:
pd.DataFrame(test_scaled).head(1)

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.482989,0.52014,-1.16945,-0.644897,0.56843,-0.784926,-0.522612,0.474321


In [None]:
test_predictions = KNN.predict(test_scaled)
test_predictions

array([ 57.34, 371.  , 346.4 , ...,  79.2 ,  92.8 ,  69.8 ])

In [None]:
len(test_predictions)

1480

In [None]:
test.shape

(1480, 8)

In [None]:
#csv file sub contains only price column, all values are 0
sub['price']
sub['price'].unique()
sub['price'].nunique()

1

In [None]:
sub['price'] = test_predictions
sub['price']

Unnamed: 0,price
0,57.34
1,371.00
2,346.40
3,89.00
4,60.95
...,...
1475,66.74
1476,105.00
1477,79.20
1478,92.80


In [None]:
sub['price'].nunique()

1113

In [None]:
sub.to_csv('my_first_prediction.csv', index=False)