In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw housing dataset; the relative path is resolved against the notebook's working directory.
data=pd.read_csv('Bengaluru_House_Data.csv')

In [None]:
# Preview the first five rows of the raw frame.
data.head()

In [None]:
# (rows, columns) of the raw frame.
data.shape

In [None]:
# Column dtypes and non-null counts of the raw frame.
data.info()

In [None]:
# Print the frequency table of every column, each followed by a 20-asterisk divider line.
for column, column_values in data.items():
    print(column_values.value_counts(), "*"*20, sep="\n")

In [None]:
# Count the missing values in each column.
data.isna().sum()

In [None]:
# Discard the two columns that nothing later in this notebook reads.
data = data.drop(columns=['society', 'balcony'])

In [None]:
# Summary statistics of the numeric columns.
data.describe()

In [None]:
# Re-check dtypes and non-null counts now that two columns have been dropped.
data.info()

In [None]:
# Number of listings per location value.
data['location'].value_counts()

In [None]:
# Replace missing locations with a fixed label.
# NOTE(review): 'Sarjapur Road' is presumably the most frequent location in this dataset — confirm.
data['location'] = data['location'].fillna('Sarjapur Road')

In [None]:
# Number of listings per area_type value.
data['area_type'].value_counts()

In [None]:
# Replace missing sizes with a fixed label.
# NOTE(review): '2 BHK' is presumably the most common size in this dataset — confirm.
data['size'] = data['size'].fillna('2 BHK')

In [None]:
# Replace missing bathroom counts with the median of the column.
data['bath'] = data['bath'].fillna(data['bath'].median())

In [None]:
# Re-check non-null counts after filling location, size and bath.
data.info()

In [None]:
# Build an integer `bhk` column from the first whitespace-separated token of `size`.
data['bhk'] = data['size'].str.split().str.get(0).astype(int)

In [None]:
# Show the rows whose bhk value exceeds 20.
data[data.bhk > 20]

In [None]:
# List the distinct raw total_sqft values.
data['total_sqft'].unique()

In [None]:
def convertRange(x):
    """Convert a raw ``total_sqft`` entry to a float.

    Parameters
    ----------
    x : str or number
        Either a plain number (``"1200"``) or a two-sided range written with a
        single hyphen (``"1133 - 1384"``).

    Returns
    -------
    float or None
        The parsed number, the midpoint of the range, or ``None`` when the
        value cannot be interpreted as either.
    """
    if not isinstance(x, str):
        # Numeric (or missing) input has no .split(); try a direct conversion.
        try:
            return float(x)
        except (TypeError, ValueError):
            return None

    parts = x.split('-')
    if len(parts) == 2:
        try:
            return (float(parts[0]) + float(parts[1])) / 2   # midpoint of the range
        except ValueError:
            # Not a numeric range (e.g. "-5" or "abc - def"); fall through to a plain parse.
            pass
    try:
        return float(x)
    except ValueError:
        return None

In [None]:
# Run convertRange over every total_sqft entry; entries for which it returns None end up as missing values.
data['total_sqft']=data['total_sqft'].apply(convertRange)

In [None]:
# Preview the frame after the total_sqft conversion.
data.head()

Price per square foot
This column is used only for removing outliers.

In [None]:
# Price per square foot of each listing.
# NOTE(review): the 100000 factor presumably converts `price` from lakhs to rupees — confirm the unit.
data['price_per_sqft'] = data['price'] *100000/data['total_sqft']

In [None]:
# Display the newly computed column.
data['price_per_sqft']

In [None]:
# Summary statistics, now including the derived bhk and price_per_sqft columns.
data.describe()

In [None]:
# Number of listings per location value.
data['location'].value_counts()

In [None]:
# Trim surrounding whitespace from the text columns so that labels differing only
# by stray spaces are counted together. The vectorised .str accessor leaves missing
# values untouched, whereas calling .strip() from a Python-level lambda raises on them.
for text_column in ['location', 'availability', 'area_type']:
    data[text_column] = data[text_column].str.strip()
location_count = data['location'].value_counts()
location_count


In [None]:
# Locations with at most 10 listings (the comparison is <=, so a count of exactly 10 is included).
location_count_less_10=location_count[location_count<=10]
location_count_less_10

In [None]:
# Relabel every location that appears in the index of location_count_less_10 as 'other'.
data['location'] = data['location'].mask(
    data['location'].isin(location_count_less_10.index),
    'other',
)
# Only the final expression of a cell is rendered, so the first table below is not shown.
data['location'].value_counts()
data['availability'].value_counts()

Outlier detection and removal

In [None]:
# Summary statistics before any outlier filtering.
data.describe()

In [None]:
# Distribution of total_sqft divided by bhk.
(data['total_sqft']/data['bhk']).describe()

In [None]:
# Keep only listings with at least 300 total_sqft per bhk. Rows whose ratio is NaN
# fail the comparison and are therefore dropped too.
data = data.loc[data['total_sqft'].div(data['bhk']).ge(300)]
data.describe()

In [None]:
# (rows, columns) remaining after the sqft-per-bhk filter.
data.shape

In [None]:
# Distribution of price_per_sqft.
data.price_per_sqft.describe()

In [None]:
def remove_outliers_sqft(df):
    """Drop price_per_sqft outliers separately within each location.

    For every ``location`` group, only rows whose ``price_per_sqft`` lies in the
    half-open band ``(mean - std, mean + std]`` of that group are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``location`` and ``price_per_sqft`` columns.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, grouped by location, with a fresh 0..n-1 index.
        An input with no groups yields an empty frame that keeps ``df``'s columns.
    """
    kept_groups = []
    for _, subdf in df.groupby('location'):
        prices = subdf.price_per_sqft
        m = prices.mean()
        st = prices.std(ddof=0)  # population std, the same value np.std() returns
        within_band = (prices > (m - st)) & (prices <= (m + st))
        kept_groups.append(subdf[within_band])

    if not kept_groups:
        # pd.concat() refuses an empty list; return an empty frame with the same schema.
        return df.iloc[0:0].reset_index(drop=True)

    # Concatenate once: concatenating inside the loop copies the accumulated frame every iteration.
    return pd.concat(kept_groups, ignore_index=True)
# Apply the per-location filter and summarise what is left: only rows whose
# price_per_sqft lies within one standard deviation of their location's mean remain.
data = remove_outliers_sqft(data)
data.describe()

In [None]:
def bhk_outlier_remover(df):
    """Drop listings priced below the mean of the next-smaller bhk in the same location.

    Within each ``location``, a row with ``bhk == k`` is removed when its
    ``price_per_sqft`` is lower than the mean ``price_per_sqft`` of the
    ``bhk == k - 1`` rows, provided that smaller group has more than 5 rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``location``, ``bhk`` and ``price_per_sqft`` columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` without the excluded rows; surviving index labels are unchanged.
    """
    # A plain list keeps the labels in their own type; accumulating them in a float
    # array would coerce integer labels to float.
    exclude_labels = []
    for _, location_df in df.groupby('location'):
        # Materialise the groups once so the stats pass and the filter pass share them.
        bhk_groups = list(location_df.groupby('bhk'))
        bhk_stats = {
            bhk: {'mean': bhk_df.price_per_sqft.mean(), 'count': bhk_df.shape[0]}
            for bhk, bhk_df in bhk_groups
        }
        for bhk, bhk_df in bhk_groups:
            stats = bhk_stats.get(bhk - 1)
            if stats is not None and stats['count'] > 5:
                cheaper = bhk_df[bhk_df.price_per_sqft < stats['mean']]
                exclude_labels.extend(cheaper.index.tolist())
    return df.drop(index=exclude_labels)

In [None]:
# Apply the bhk-based price filter defined above.
data=bhk_outlier_remover(data)

In [None]:
# (rows, columns) remaining after both outlier filters.
data.shape

In [None]:
# Display the frame after outlier removal.
data

In [None]:
# `size` has been superseded by `bhk`, and `price_per_sqft` was only needed for the
# outlier filters, so remove both in one step.
data = data.drop(columns=['size', 'price_per_sqft'])


Cleaned Data

In [None]:
# Preview the cleaned frame.
data.head()

In [None]:
# Save the cleaned frame. index=False keeps the gapped row labels left over from
# filtering out of the file; otherwise they come back as an unnamed column on reload.
data.to_csv("Cleaned_data.csv", index=False)


In [None]:
# Separate the target (price) from the predictor columns.
y = data['price']
X = data.drop(columns='price')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge  #LinearRegression ,
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score# used to evaluate the performance of a regression model.

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0) 

In [None]:
# Sizes of the training and test feature matrices.
print(X_train.shape)
print(X_test.shape)

Applying Regression

In [None]:
def _dense_one_hot():
    """Return a dense-output OneHotEncoder that works on both old and new scikit-learn."""
    try:
        # scikit-learn >= 1.2 names the flag `sparse_output`; the old `sparse` keyword was removed in 1.4.
        return OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        # Older releases only accept the original `sparse` keyword.
        return OneHotEncoder(sparse=False, handle_unknown='ignore')


# One-hot encode the three categorical columns; every remaining column is passed
# through unchanged. handle_unknown='ignore' encodes categories unseen during fit
# as all zeros instead of raising at predict time.
columns_trans = make_column_transformer(
    (_dense_one_hot(), ['location']),
    (_dense_one_hot(), ['availability']),
    (_dense_one_hot(), ['area_type']),
    remainder='passthrough',
)


In [None]:
# Standardising step (default settings) used after the column transformer in the pipelines below.
scaler= StandardScaler()

Applying Lasso

In [None]:
# Lasso regression estimator with scikit-learn's default hyper-parameters.
from sklearn.linear_model import Lasso
lasso = Lasso()

In [None]:
# Chain encoding, scaling and the Lasso estimator into a single pipeline.
pipe=make_pipeline(columns_trans,scaler,lasso)

In [None]:
# Fit the Lasso pipeline on the training split.
pipe.fit(X_train,y_train)

In [None]:
# Predict on the held-out split and report the R^2 score of the Lasso pipeline.
y_pred_lasso=pipe.predict(X_test)
r2_score(y_test,y_pred_lasso)

Applying Ridge

In [None]:
# Ridge regression estimator with scikit-learn's default hyper-parameters.
ridge=Ridge()

In [None]:
# Rebind `pipe` to a Ridge pipeline; it is built from the same columns_trans and scaler objects as the Lasso pipeline.
pipe=make_pipeline(columns_trans,scaler,ridge)

In [None]:
# Fit the Ridge pipeline on the training split.
pipe.fit(X_train,y_train)

In [None]:
# Predict on the held-out split and report the R^2 score of the Ridge pipeline.
y_pred_ridge = pipe.predict(X_test)
r2_score(y_test,y_pred_ridge)

In [None]:
# Print the held-out R^2 of both models one after the other for comparison.
for label, predictions in (("Lasso: ", y_pred_lasso), ("Ridge : ", y_pred_ridge)):
    print(label, r2_score(y_test, predictions))

In [None]:
import pickle 

In [None]:
# Persist the pipeline currently bound to `pipe`. The context manager closes the
# file handle, so the pickle is flushed to disk even if dump() raises.
with open('RidgeModel.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)