# Day 2 Simple Data Cleaning for regression analysis

Importing necessary libraries

In [165]:
import pandas as pd
import numpy as np
import random
import math
from sklearn.preprocessing import OneHotEncoder, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression

We will be doing per city analysis, so we don't need area/locality

In [166]:
df = pd.read_csv("House_Rent_Dataset.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4746 entries, 0 to 4745
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Posted On          4746 non-null   object
 1   BHK                4746 non-null   int64 
 2   Rent               4746 non-null   int64 
 3   Size               4746 non-null   int64 
 4   Floor              4746 non-null   object
 5   Area Type          4746 non-null   object
 6   Area Locality      4746 non-null   object
 7   City               4746 non-null   object
 8   Furnishing Status  4746 non-null   object
 9   Tenant Preferred   4746 non-null   object
 10  Bathroom           4746 non-null   int64 
 11  Point of Contact   4746 non-null   object
dtypes: int64(4), object(8)
memory usage: 445.1+ KB


In [167]:
# we will be dropping Area Type, Area Locality, Tenant Preferred, Point of Contact, Posted on 
df = df.drop(["Area Type", "Area Locality", "Tenant Preferred", "Point of Contact", "Posted On"], axis = 1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4746 entries, 0 to 4745
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   BHK                4746 non-null   int64 
 1   Rent               4746 non-null   int64 
 2   Size               4746 non-null   int64 
 3   Floor              4746 non-null   object
 4   City               4746 non-null   object
 5   Furnishing Status  4746 non-null   object
 6   Bathroom           4746 non-null   int64 
dtypes: int64(4), object(3)
memory usage: 259.7+ KB


We seperate our Y from X, i.e, rent from the rest

In [168]:
X = df[["BHK", "Size", "Floor", "City", "Furnishing Status", "Bathroom"]]
y = df["Rent"]
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4746 entries, 0 to 4745
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   BHK                4746 non-null   int64 
 1   Size               4746 non-null   int64 
 2   Floor              4746 non-null   object
 3   City               4746 non-null   object
 4   Furnishing Status  4746 non-null   object
 5   Bathroom           4746 non-null   int64 
dtypes: int64(3), object(3)
memory usage: 222.6+ KB


Lets Analyze X

In [169]:
X.head()

Unnamed: 0,BHK,Size,Floor,City,Furnishing Status,Bathroom
0,2,1100,Ground out of 2,Kolkata,Unfurnished,2
1,2,800,1 out of 3,Kolkata,Semi-Furnished,1
2,2,1000,1 out of 3,Kolkata,Semi-Furnished,1
3,2,800,1 out of 2,Kolkata,Unfurnished,1
4,2,850,1 out of 2,Kolkata,Unfurnished,1


We need to make Floor into categorical feature of 0 to n, before that we need to replace 'ground' to 0 and everything
else to its number respectively

In [170]:
X_new = X
X_new.loc[:,"Floor"] = X_new.loc[:,"Floor"].str.replace(r"\W*out.*","", regex=True)
X_new.loc[:,"Floor"] = X_new.loc[:,"Floor"].str.replace("Ground", "0", regex=True)
X_new.loc[:,"Floor"] = X_new.loc[:,"Floor"].str.replace("Upper Basement", "0", regex=True)
X_new.loc[:,"Floor"] = X_new.loc[:,"Floor"].str.replace("Lower Basement", "0", regex=True)
X_new.loc[:,"Floor"]= X_new.loc[:,"Floor"].map(lambda x: int(x))
X_new.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_new.loc[:,"Floor"] = X_new.loc[:,"Floor"].str.replace(r"\W*out.*","", regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_new.loc[:,"Floor"] = X_new.loc[:,"Floor"].str.replace("Ground", "0", regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_new.loc[:,"Floor"] = X_new.loc[:,"Fl

Unnamed: 0,BHK,Size,Floor,City,Furnishing Status,Bathroom
0,2,1100,0,Kolkata,Unfurnished,2
1,2,800,1,Kolkata,Semi-Furnished,1
2,2,1000,1,Kolkata,Semi-Furnished,1
3,2,800,1,Kolkata,Unfurnished,1
4,2,850,1,Kolkata,Unfurnished,1



Now we need to One hot encode categorical features

In [191]:
OH = OneHotEncoder()
Cities = X_new["City"].unique().tolist()
Furnished_states = X_new["Furnishing Status"].unique().tolist()
Cities.sort()
Furnished_states.sort()
X_encoded = OH.fit_transform(X_new[["City"]]).toarray()
X_encoded2 = OH.fit_transform(X_new[["Furnishing Status"]]).toarray()
X_new[Cities] = X_encoded
X_new[Furnished_states] = X_encoded2
X_new = X_new.drop(["City", "Furnishing Status"], axis=1)
X_new.head()

Unnamed: 0,BHK,Size,Floor,Bathroom,Bangalore,Chennai,Delhi,Hyderabad,Kolkata,Mumbai,Furnished,Semi-Furnished,Unfurnished
0,2,1100,0,2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,2,800,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,2,1000,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,2,800,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,2,850,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
