In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Reading the CSV
train_df = pd.read_csv("C:/Users/YANTRA/Desktop/Python projects/House_price_prediction_INDIA/train.csv")

# Dropping the columns that are not going to be needed
train_df = train_df.drop(['LATITUDE' , 'LONGITUDE' , 'POSTED_BY'] , axis='columns')

# Dropping the rows that have RK instead of BHK 
train_df = train_df.drop(train_df[train_df['BHK_OR_RK'] == 'RK'].index)
train_df.drop(columns=['BHK_OR_RK'])

# Splitting the City and Area aqnd then dropping the Address Column
train_df['Area'] = train_df['ADDRESS'].apply(lambda x: str(x.split(",")[0]))
train_df['City'] = train_df['ADDRESS'].apply(lambda y: str(y.split(",")[1]))
train_df = train_df.drop(columns=['ADDRESS' , 'City' , 'BHK_OR_RK'])

In [20]:
# Feature Engineering
# train_df.Area = train_df.Area.apply(lambda x: x.strip())
# Area_Stats = train_df.groupby('Area')['Area'].agg('count').sort_values(ascending=False)
# Area_Less_Than_10 = Area_Stats[Area_Stats < 10]
# train_df.Area = train_df.Area.apply(lambda x: "Other" if x in Area_Less_Than_10 else x)

In [21]:
# Outlier Detection and Removal
train_df['Cost_Per_Square_Ft'] = ((train_df['TARGET(PRICE_IN_LACS)']) / train_df['SQUARE_FT'])
train_df = train_df[~(train_df.SQUARE_FT / train_df['BHK_NO.'] < 300)]

# Function for outlier removal
def remove_outliers(df):
    df_out = pd.DataFrame()
    for key, sub_df in df.groupby('Area'):
        mean = np.mean(sub_df.Cost_Per_Square_Ft)
        SD = np.std(sub_df.Cost_Per_Square_Ft)
        reduced_df = sub_df[
            (sub_df.Cost_Per_Square_Ft > (mean - (SD))) & (sub_df.Cost_Per_Square_Ft < (mean + (SD)))]
        df_out = pd.concat([df_out, reduced_df], ignore_index=True)
    return df_out

mod_train_df = remove_outliers(train_df)

#Function to remove outliers 
def remove_outliers_1(df):
    exc_indices = np.array([])
    for area , area_df in df.groupby('Area'):
        bhk_stats = {}
        for bhk , bhk_df in area_df.groupby('BHK_NO.'):
            bhk_stats[bhk] = {
                'mean' : np.mean(bhk_df.Cost_Per_Square_Ft),
                'std' : np.std(bhk_df.Cost_Per_Square_Ft),
                'count' : bhk_df.shape[0]
            }
        for bhk , bhk_df in area_df.groupby('BHK_NO.'):
            stats = bhk_stats.get(bhk - 1)
            if stats and stats['count'] > 5:
                exc_indices = np.append(exc_indices , bhk_df[bhk_df.Cost_Per_Square_Ft < (stats['mean'])].index.values)
        return df.drop(exc_indices , axis = 'index')
    
mod_train_df = remove_outliers_1(mod_train_df)

In [22]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

scaler_X = MinMaxScaler(feature_range = (0 , 1))
scaler_Y = MinMaxScaler(feature_range = (0 , 1))

# Model Building 
model_train_data = mod_train_df.drop(['Cost_Per_Square_Ft' , 'UNDER_CONSTRUCTION'] , axis = 'columns')

# Creating Dummies or One hot encoding the entire dataset
ohe = OneHotEncoder()
ohe.fit(model_train_data.loc[: , ['Area']])
model_train_data_OHE = ohe.transform(model_train_data.loc[: , ['Area']]).toarray()
categories = ohe.categories_
model_train_data_OHE = pd.DataFrame(model_train_data_OHE , columns = categories)
model_train_data_OHE = pd.concat([model_train_data.reset_index() , model_train_data_OHE] , axis='columns')

X = model_train_data_OHE.drop(['Area' ] , axis = 'columns')
Y = X['TARGET(PRICE_IN_LACS)']
X = X.drop(["TARGET(PRICE_IN_LACS)"] , axis = 'columns')

X_Fin = scaler_X.fit_transform(X)
Y_Fin = np.array(Y).reshape(-1 , 1)
Y_Fin = scaler_Y.fit_transform(Y_Fin)

X_Fin = pd.DataFrame(X_Fin)
Y_Fin = pd.DataFrame(Y_Fin)

In [23]:
# Decision Tree Model 
from sklearn.tree import DecisionTreeRegressor

X_train , X_test , Y_train , Y_test = train_test_split(X_Fin , Y_Fin , test_size = 0.2)

In [24]:
reg1 = DecisionTreeRegressor()

reg1.fit(X_train , Y_train)

reg1.score(X_test , Y_test)

0.837027350160049

In [26]:
# Ultimately the model is trained on Decision Tree Algorithmn and the paramters on which it is trained are as follows : 
# 1. RERA
# 2. BHK_NO.
# 3. Square Feet 
# 4. Ready To Move
# 5. Resale 
# 6. Area

# Creating a predict function and checking how well the model performs on the test and on some instances of train datasets
def user_dict_to_df():
    ''' This function will take the entire inputs from the User and returns an input dictionary '''
    
    print(" Enter the following details for the prediction : ")
    
    print("REPLY (1/0)")
    r = input(" Enter the wheather the property is under RERA : ")
    
    bhkn = input(" Enter the BHK number : ")
    
    sft = input(" Enter the square Feet : ")
    
    print("REPLY (1/0)")
    rtm = input(" Is the place ready to move : ")
    
    print("REPLY (1/0)")
    rs = input(" Is the place for resale : ")
    
    area = input(" Area : ")
    
    inp_dict = {'RERA' : r , 'BHK_NO.' : bhkn , 'SQUARE_FT' : sft , 'READY_TO_MOVE' : rtm , 'RESALE' : rs , 'Area' : area} 
    df = pd.DataFrame.from_dict([inp_dict])
    
    return df

inp_dict = user_dict_to_df()

 Enter the following details for the prediction : 
REPLY (1/0)
 Enter the wheather the property is under RERA : s
 Enter the BHK number : lkj
 Enter the square Feet : fdg
REPLY (1/0)
 Is the place ready to move : sdxfdf
REPLY (1/0)
 Is the place for resale : dsfdfg
 Area : fdg


In [9]:
# A function that will take the input dictionary as dataframe , preprocess it to make it prediction ready.
def pre_dict(inp_dict):
    
    new_df_1 = inp_dict.copy()
    new_df_2 = ohe.transform(new_df_1.loc[: , ['Area']]).toarray()
    new_df_2 = pd.DataFrame(new_df_2 , columns = categories)
    new_df_2 = pd.concat([new_df_1.reset_index() , new_df_2] , axis='columns')
    new_df_2 = new_df_2.drop('Area' , axis='columns')
    new_df_2 = scaler_X.transform(new_df_2)
    pred_val = reg1.predict(new_df_2)
    pred_val = scaler_Y.inverse_transform([pred_val])
    print(pred_val)
    
pre_dict(inp_dict)

[[110.]]


In [28]:
# A function that generates Target price Column for the test dataset
def predict_test_csv():
    
    # Reading the CSV
    test_df = pd.read_csv("C:/Users/YANTRA/Desktop/Python projects/House_price_prediction_INDIA/test.csv")
    
    # Dropping the columns that are not going to be needed
    test_df = test_df.drop(['LATITUDE' , 'LONGITUDE' , 'POSTED_BY'] , axis='columns')
    
    # Dropping the rows that have RK instead of BHK 
    test_df = test_df.drop(test_df[test_df['BHK_OR_RK'] == 'RK'].index)
    test_df.drop(columns=['BHK_OR_RK'])
    
    # Splitting the City and Area aqnd then dropping the Address Column
    test_df['Area'] = test_df['ADDRESS'].apply(lambda x: str(x.split(",")[0]))
    test_df['City'] = test_df['ADDRESS'].apply(lambda y: str(y.split(",")[1]))
    test_df = test_df.drop(columns=['ADDRESS' , 'City' , 'BHK_OR_RK'])
    
#     # Feature Engineering
#     test_df.Area = test_df.Area.apply(lambda x: x.strip())
#     Area_Stats = test_df.groupby('Area')['Area'].agg('count').sort_values(ascending=False)
#     Area_Less_Than_10 = Area_Stats[Area_Stats < 10]
#     test_df.Area = test_df.Area.apply(lambda x: "Other" if x in Area_Less_Than_10 else x)

    # Code to make number of features in test dataset and train dataset same 
    unique_area_train = list(model_train_data_OHE['Area'].unique())
    test_df = test_df.loc[(test_df['Area'].isin(unique_area_train)) , :]
    
    # Normalizing and other operations to make testing data prediction ready
    scaler_X = MinMaxScaler(feature_range = (0 , 1))
    test_df = test_df.drop(['UNDER_CONSTRUCTION'] , axis = 'columns')
    ohe.fit(test_df.loc[: , ['Area']])
    temp = ohe.transform(test_df.loc[: , ['Area']]).toarray()
    categories = ohe.categories_
    temp = pd.DataFrame(temp , columns = categories)
    test_df_OHE = pd.concat([test_df.reset_index() , temp] , axis='columns')
    test_df_Fin = test_df_OHE.drop(['Area'] , axis = 'columns')
    test_df_Fin = scaler_X.fit_transform(test_df_Fin)
    test_df_Fin = pd.DataFrame(test_df_Fin)
    print(test_df_Fin.shape)
    
    # Prediction on test Dataset
    Target_Price = reg1.predict(test_df_Fin)
    
    print(Target_Price)
    
predict_test_csv()

(54880, 2067)


ValueError: Number of features of the model must match the input. Model n_features is 2125 and input n_features is 2067 

In [29]:
X_Fin

(16778, 2125)