In [5]:
# EDA Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
data = pd.read_csv('Bengaluru_House_Data.csv')
data.head()

AttributeError: module 'pandas' has no attribute 'read_csv'

In [7]:
data.shape

NameError: name 'data' is not defined

In [None]:
data.info()

In [None]:
data.isna().sum()

In [None]:
# Print the frequency table of every column, one column at a time.
for _, column_values in data.items():
    print(column_values.value_counts())
    

In [None]:
# Discard the columns that are not referenced anywhere in the rest of the notebook.
unused_columns = ['area_type', 'availability', 'society', 'balcony']
data = data.drop(columns=unused_columns)

In [None]:
data.describe()

In [None]:
data.info()

In [None]:
data['location'].value_counts()

In [None]:
data['location'].isna().sum()

In [None]:
# Impute the missing `location` with a fixed place name; a constant is acceptable
# because only one row is missing.
# NOTE(review): the literal has TWO spaces between the words — presumably this
# matches the spelling used in the raw CSV; confirm before normalising it.
data['location']=data['location'].fillna('Sarjapur  Road')

In [None]:
# Fill the missing `size` entries with the fixed label '2 BHK'.
# NOTE(review): presumably '2 BHK' is the most frequent size — confirm against value_counts().
data['size']=data['size'].fillna('2 BHK')

In [None]:
# Fill the missing bathroom counts with the median of the column.
data['bath']=data['bath'].fillna(data['bath'].median())

In [None]:
data.info()

In [None]:
# The first whitespace-separated token of `size` is taken as the integer `bhk` value.
size_tokens = data['size'].str.split()
data['bhk'] = size_tokens.str[0].astype(int)

In [None]:
data[data.bhk > 20]

In [None]:
data['total_sqft'].unique()

In [None]:
def convertRange(x):
    """Convert a raw ``total_sqft`` entry to a single float.

    Parameters
    ----------
    x : str or number
        Either a plain number (``"1200"``), a range written as
        ``"low - high"`` (``"1133 - 1384"``), or an already numeric value.

    Returns
    -------
    float or None
        The number itself, the midpoint of the range, or ``None`` when the
        entry cannot be interpreted as either (e.g. ``"34.46Sq. Meter"``).
    """
    # Non-string input (an already converted float, NaN, None) has no range
    # syntax to parse; this also keeps the conversion safe to apply twice.
    if not isinstance(x, str):
        try:
            return float(x)
        except (TypeError, ValueError):
            return None

    bounds = x.split('-')
    if len(bounds) == 2:
        try:
            # Midpoint of the range: add both bounds first, then halve the sum.
            return (float(bounds[0]) + float(bounds[1])) / 2
        except ValueError:
            # Not a numeric range (e.g. "1e-5"); fall through to the plain parse.
            pass
    try:
        return float(x)
    except ValueError:
        return None

In [None]:
# Normalise every `total_sqft` entry to a float; entries that convertRange
# cannot parse come back as None.
data['total_sqft']=data['total_sqft'].apply(convertRange)

In [None]:
data.head()

In [None]:
# Price per square foot; used further down only to filter outliers.
# NOTE(review): the factor 100000 presumably converts `price` from lakhs to
# rupees — confirm the unit of the raw column.
data['price_per_sqft'] = data['price']*100000 / data['total_sqft']

In [None]:
data['price_per_sqft']

In [None]:
data.describe()

In [None]:
data['location'].value_counts()

In [None]:
# Trim surrounding whitespace so one location is not counted under several spellings.
data['location'] = data['location'].str.strip()
location_counts = data['location'].value_counts()
location_counts

In [None]:
# Locations with at most 10 listings (the comparison is an inclusive `<=`,
# despite the `less_10` in the variable name).
location_counts_less_10 = location_counts[location_counts<=10]
location_counts_less_10

In [None]:
# Collapse every sparsely represented location into a single 'other' bucket.
is_rare_location = data['location'].isin(location_counts_less_10.index)
data['location'] = data['location'].mask(is_rare_location, 'other')

In [None]:
data['location'].value_counts()

In [None]:
(data['total_sqft']/data['bhk']).describe()

In [None]:
# Keep only the listings with at least 300 sqft per bhk.
sqft_per_bhk = data['total_sqft'] / data['bhk']
data = data[sqft_per_bhk >= 300]
data.describe()

In [None]:
data.price_per_sqft.describe()

In [None]:
def remove_outliers_sqft(df):
    """Drop listings whose price per sqft is far from their location's mean.

    For every ``location`` group, only the rows whose ``price_per_sqft`` lies
    within one standard deviation (``np.std``) of the group mean are kept.
    Both bounds are inclusive, so a group with no spread at all (a single
    row, or identical prices) is kept instead of being discarded entirely.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``location`` and ``price_per_sqft``.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, grouped in ``groupby`` order of ``location``,
        with a fresh 0..n-1 index.
    """
    kept_groups = []
    for _, subdf in df.groupby('location'):
        m = np.mean(subdf.price_per_sqft)
        st = np.std(subdf.price_per_sqft)
        # Symmetric, inclusive band around the mean.
        in_band = subdf.price_per_sqft.between(m - st, m + st)
        kept_groups.append(subdf[in_band])
    if not kept_groups:
        # No location groups at all (e.g. empty input): keep the column layout.
        return df.iloc[0:0].reset_index(drop=True)
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(kept_groups, ignore_index=True)
data = remove_outliers_sqft(data)
data.describe()

In [None]:
def bhk_outlier_removal(df):
    """Drop listings priced below the mean of the next-smaller ``bhk`` group.

    Within each ``location``, a row with ``bhk == n`` is removed when its
    ``price_per_sqft`` is lower than the mean ``price_per_sqft`` of the
    ``bhk == n - 1`` rows of the same location, provided that smaller group
    has more than 5 rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``location``, ``bhk`` and ``price_per_sqft``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` without the excluded rows; the index labels of the
        remaining rows are unchanged.
    """
    # Plain list of original index labels: no float coercion, no repeated
    # array reallocation.
    labels_to_drop = []
    for _, location_df in df.groupby('location'):
        # Group once per location and reuse the groups for both passes.
        bhk_groups = list(location_df.groupby('bhk'))
        bhk_stats = {
            bhk: {
                'mean': np.mean(bhk_df.price_per_sqft),
                'count': bhk_df.shape[0],
            }
            for bhk, bhk_df in bhk_groups
        }
        for bhk, bhk_df in bhk_groups:
            smaller = bhk_stats.get(bhk - 1)
            if smaller is not None and smaller['count'] > 5:
                is_cheaper = (bhk_df.price_per_sqft < smaller['mean']).to_numpy()
                labels_to_drop.extend(bhk_df.index[is_cheaper])
    return df.drop(index=labels_to_drop)

In [None]:
data = bhk_outlier_removal(data)

In [None]:
data.shape

In [None]:
data

In [None]:
# `price_per_sqft` existed only to support the outlier filtering, and `size`
# has already been turned into the numeric `bhk` column.
data = data.drop(columns=['size', 'price_per_sqft'])

### Cleaned Data

In [None]:
data.head()

In [None]:
# Persist the cleaned frame. The DataFrame index is written as an extra leading
# column because `index=False` is not passed.
# NOTE(review): confirm whether whatever reads this file expects that column.
data.to_csv('Cleaned_data.csv')

In [None]:
# Features are every remaining column except `price`; `price` is the target.
X = data.drop(columns=['price'])
y = data['price']

In [None]:
# Sampling Library
from sklearn.model_selection import train_test_split
# Machine Learning Models and Evaluation Metrics
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.metrics import r2_score
# Data Transformation Libraries
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [None]:
# Sanity-check the shapes of the four pieces produced by the split.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

In [None]:
# One-hot encode `location` and pass every other column through unchanged.
# handle_unknown='ignore' encodes a location that was absent from the training
# split as all zeros instead of raising an error at predict time.
column_trans = make_column_transformer(
    (OneHotEncoder(sparse_output=False, handle_unknown='ignore'), ['location']),
    remainder='passthrough',
)

In [None]:
# Baseline model: encode the columns, standardise all resulting features, then fit ordinary least squares.
lr = make_pipeline(column_trans, StandardScaler(), LinearRegression())

In [None]:
lr.fit(X_train,y_train)

In [None]:
y_pred = lr.predict(X_test)
y_pred

In [None]:
r2_score_value = r2_score(y_test, y_pred)

print("R-squared score:", r2_score_value)

### Applying Lasso

In [None]:
lasso = Lasso()

In [None]:
scaler = StandardScaler()
pipe = make_pipeline(column_trans,scaler,lasso)

In [None]:
pipe.fit(X_train,y_train)

In [None]:
y_pred_lasso = pipe.predict(X_test)
r2_score(y_test,y_pred_lasso)

### Applying Ridge

In [None]:
ridge = Ridge()

In [None]:
pipe = make_pipeline(column_trans,scaler,ridge)

In [None]:
pipe.fit(X_train,y_train)

In [None]:
y_pred_ridge = pipe.predict(X_test)
r2_score(y_test,y_pred_ridge)

In [None]:
print("R-squared score:", r2_score_value)
print("Lasso:",r2_score(y_test,y_pred_lasso))
print("Ridge:",r2_score(y_test,y_pred_ridge))

In [None]:
import pickle

# Persist the fitted Ridge pipeline (the object most recently bound to `pipe`).
# The context manager closes the file handle even if pickling fails, so the
# file is not left open or partially flushed.
with open('RidgeModel.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)

In [None]:
data['location'].unique()