In [33]:
from functools import partial

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_selection import SelectKBest,mutual_info_regression
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [34]:
# Load the raw CSV into `data`; every later cell derives from this frame.
data= pd.read_csv('../Data/Chennai houseing sale.csv')

In [35]:
# Preview the first rows of the freshly loaded table.
data.head()

Unnamed: 0,PRT_ID,AREA,INT_SQFT,DATE_SALE,DIST_MAINROAD,N_BEDROOM,N_BATHROOM,N_ROOM,SALE_COND,PARK_FACIL,...,UTILITY_AVAIL,STREET,MZZONE,QS_ROOMS,QS_BATHROOM,QS_BEDROOM,QS_OVERALL,REG_FEE,COMMIS,SALES_PRICE
0,P03210,Karapakkam,1004,04-05-2011,131,1.0,1.0,3,AbNormal,Yes,...,AllPub,Paved,A,4.0,3.9,4.9,4.33,380000,144400,7600000
1,P09411,Anna Nagar,1986,19-12-2006,26,2.0,1.0,5,AbNormal,No,...,AllPub,Gravel,RH,4.9,4.2,2.5,3.765,760122,304049,21717770
2,P01812,Adyar,909,04-02-2012,70,1.0,1.0,3,AbNormal,Yes,...,ELO,Gravel,RL,4.1,3.8,2.2,3.09,421094,92114,13159200
3,P05346,Velachery,1855,13-03-2010,14,3.0,2.0,5,Family,No,...,NoSewr,Paved,I,4.7,3.9,3.6,4.01,356321,77042,9630290
4,P06210,Karapakkam,1226,05-10-2009,84,1.0,1.0,3,AbNormal,Yes,...,AllPub,Gravel,C,3.0,2.5,4.1,3.29,237000,74063,7406250


In [36]:
# TOTAL_PRICE is the row-wise sum of REG_FEE, COMMIS and SALES_PRICE.
data['TOTAL_PRICE'] = data['REG_FEE'].add(data['COMMIS']).add(data['SALES_PRICE'])

In [37]:
# Remove the three columns that were summed into TOTAL_PRICE, together with PRT_ID.
data = data.drop(columns=['REG_FEE', 'COMMIS', 'PRT_ID', 'SALES_PRICE'])

In [38]:
# Preview the table after TOTAL_PRICE was added and the source columns were dropped.
data.head()

Unnamed: 0,AREA,INT_SQFT,DATE_SALE,DIST_MAINROAD,N_BEDROOM,N_BATHROOM,N_ROOM,SALE_COND,PARK_FACIL,DATE_BUILD,BUILDTYPE,UTILITY_AVAIL,STREET,MZZONE,QS_ROOMS,QS_BATHROOM,QS_BEDROOM,QS_OVERALL,TOTAL_PRICE
0,Karapakkam,1004,04-05-2011,131,1.0,1.0,3,AbNormal,Yes,15-05-1967,Commercial,AllPub,Paved,A,4.0,3.9,4.9,4.33,8124400
1,Anna Nagar,1986,19-12-2006,26,2.0,1.0,5,AbNormal,No,22-12-1995,Commercial,AllPub,Gravel,RH,4.9,4.2,2.5,3.765,22781941
2,Adyar,909,04-02-2012,70,1.0,1.0,3,AbNormal,Yes,09-02-1992,Commercial,ELO,Gravel,RL,4.1,3.8,2.2,3.09,13672408
3,Velachery,1855,13-03-2010,14,3.0,2.0,5,Family,No,18-03-1988,Others,NoSewr,Paved,I,4.7,3.9,3.6,4.01,10063653
4,Karapakkam,1226,05-10-2009,84,1.0,1.0,3,AbNormal,Yes,13-10-1979,Others,AllPub,Gravel,C,3.0,2.5,4.1,3.29,7717313


In [39]:
# Features are every column up to and including QS_OVERALL; TOTAL_PRICE is the target.
# random_state pins the shuffle so the 85/15 split (and everything fitted on it
# further down) is identical on every Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    data.loc[:, :'QS_OVERALL'],
    data['TOTAL_PRICE'],
    test_size=0.15,
    random_state=42,
)

In [40]:
# (rows, columns) of the training and test feature frames.
x_train.shape, x_test.shape

((6042, 18), (1067, 18))

## -- Fixing inconsistent spellings in object-dtype features --

In [41]:
# Names of the object-dtype columns of `data`, leaving out any column whose
# name contains 'DATE' (those are handled separately as dates).
obj_features = [
    column
    for column, dtype in data.dtypes.items()
    if dtype == 'object' and 'DATE' not in column
]

In [42]:
# Spelling corrections per categorical column, laid out as
# {column: {canonical label: [misspelled variants]}}.
val_dict = {
    'AREA': {
        'Karapakkam': ['Karapakam'],
        'Anna Nagar': ['Ana Nagar', 'Ann Nagar'],
        'Adyar': ['Adyr'],
        'Velachery': ['Velchery'],
        'Chrompet': ['Chrompt', 'Chrmpet', 'Chormpet'],
        'KK Nagar': ['KKNagar'],
        'T Nagar': ['TNagar'],
    },
    'SALE_COND': {
        'AbNormal': ['Ab Normal'],
        'Partial': ['Partiall', 'PartiaLl'],
        'AdjLand': ['Adj Land'],
    },
    'PARK_FACIL': {
        'No': ['Noo'],
    },
    'BUILDTYPE': {
        'Commercial': ['Comercial'],
        'Others': ['Other'],
    },
    'UTILITY_AVAIL': {
        'AllPub': ['All Pub'],
        # The first variant ends with a space; that whitespace is part of the value.
        'NoSewa': ['NoSewr ', 'NoSwer', 'NoSeWa'],
    },
    'STREET': {
        'Paved': ['Pavd'],
        'No Access': ['NoAccess'],
    },
}

In [43]:
# Replacing all misspelled values with original values
def reduce_redundant_val(data,val_dict):
    """Replace misspelled category labels with their canonical spelling, in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are cleaned. It is modified in place.
    val_dict : dict
        Nested mapping ``{column: {canonical_label: [misspelling, ...]}}``.
        Every column named here must exist in ``data`` when it has at least
        one misspelling listed; otherwise a ``KeyError`` is raised.

    Returns
    -------
    None
    """
    for feature, canonical_to_variants in val_dict.items():
        # Flatten to {misspelling: canonical} so the column is rewritten in a
        # single pass instead of once per misspelling.
        corrections = {
            variant: canonical
            for canonical, variants in canonical_to_variants.items()
            for variant in variants
        }
        if corrections:
            data[feature] = data[feature].replace(corrections)

In [44]:
# Apply the same spelling corrections to both splits (each frame is edited in place).
for frame in (x_train, x_test):
    reduce_redundant_val(frame, val_dict)

In [48]:
# List the distinct labels left in each object column of the training split,
# to confirm that no misspelled variants remain.
for column in obj_features:
    distinct_labels = x_train[column].unique()
    print(column, ":", distinct_labels)

AREA : ['Chrompet' 'Adyar' 'KK Nagar' 'T Nagar' 'Karapakkam' 'Anna Nagar'
 'Velachery']
SALE_COND : ['AdjLand' 'AbNormal' 'Partial' 'Normal Sale' 'Family']
PARK_FACIL : ['Yes' 'No']
BUILDTYPE : ['Commercial' 'House' 'Others']
UTILITY_AVAIL : ['NoSewa' 'AllPub' 'ELO']
STREET : ['No Access' 'Gravel' 'Paved']
MZZONE : ['RL' 'RM' 'A' 'RH' 'I' 'C']


## -- Performing Label Encoding over Object Datatype Features --

In [None]:
# Integer-encode every object column into a new '<name>_enc' column.
# The single encoder is refit on the training column for each feature, and that
# same fitted mapping is then applied to the matching test column.
# NOTE(review): transform() on x_test presumably fails for a label that never
# occurs in x_train — confirm this cannot happen with the chosen split.
label_encoder = LabelEncoder()
for feature in obj_features:
    encoded_column = feature + '_enc'
    x_train[encoded_column] = label_encoder.fit(x_train[feature]).transform(x_train[feature])
    x_test[encoded_column] = label_encoder.transform(x_test[feature])

In [None]:
# The '<name>_enc' columns replace the original object columns, so drop the originals.
x_train = x_train.drop(columns=obj_features)
x_test = x_test.drop(columns=obj_features)

In [None]:
# Preview the training features after the object columns were swapped for their *_enc codes.
x_train.head()

## -- Converting temporal data to a numeric feature --

In [None]:
# Converting DATE_SALE and DATE_BUILD feature to House_age feature
# The dates are day-month-year strings. The month directive is lowercase %m;
# uppercase %M means minutes, which parsed the month field as minutes and
# left every date in January.

x_train['DATE_BUILD']=pd.to_datetime(x_train['DATE_BUILD'],format='%d-%m-%Y')
x_train['DATE_SALE']=pd.to_datetime(x_train['DATE_SALE'],format='%d-%m-%Y')

# Age is the difference between the calendar years of sale and construction.
x_train['HOUSE_AGE']=x_train['DATE_SALE'].dt.year-x_train['DATE_BUILD'].dt.year

# The raw date columns are no longer needed once HOUSE_AGE exists.
x_train.drop(['DATE_BUILD', 'DATE_SALE'],axis=1,inplace=True)

In [None]:
# Preview the training features with HOUSE_AGE in place of the two date columns.
x_train.head()

In [None]:
# Derive HOUSE_AGE for the test split exactly as for the training split.
# The dates are day-month-year strings. The month directive is lowercase %m;
# uppercase %M means minutes, which parsed the month field as minutes and
# left every date in January.
x_test['DATE_BUILD']=pd.to_datetime(x_test['DATE_BUILD'],format='%d-%m-%Y')
x_test['DATE_SALE']=pd.to_datetime(x_test['DATE_SALE'],format='%d-%m-%Y')

# Age is the difference between the calendar years of sale and construction.
x_test['HOUSE_AGE']=x_test['DATE_SALE'].dt.year-x_test['DATE_BUILD'].dt.year

# The raw date columns are no longer needed once HOUSE_AGE exists.
x_test.drop(['DATE_BUILD', 'DATE_SALE'],axis=1,inplace=True)

In [None]:
# Preview the test features with HOUSE_AGE in place of the two date columns.
x_test.head()

## -- Imputing Missing values in training and test data --

In [None]:
# KNN-based imputer, created with its default settings.
imputer=KNNImputer()

In [None]:
# Fit on the training features only; the test split is transformed with this same fit.
imputer.fit(x_train)

In [None]:
# Rebuild the frames from the imputer output, restoring both the column names
# and the original row index. Without index= the rows are renumbered from 0 and
# no longer share labels with y_train / y_test, which keep the shuffled index.
x_train=pd.DataFrame(imputer.transform(x_train),columns=x_train.columns,index=x_train.index)
x_test=pd.DataFrame(imputer.transform(x_test),columns=x_test.columns,index=x_test.index)

In [None]:
# Preview the test features after imputation.
x_test.head()

## -- Feature Scaling --

In [None]:
# Standard scaler, created with its default settings.
scale=StandardScaler()

In [None]:
# Fit on the training features only; the test split is transformed with this same fit.
scale.fit(x_train)

In [None]:
# Rebuild the frames from the scaler output, carrying over the column names and
# the row index of the inputs so the scaled rows keep the same labels as the
# frames they were computed from.
scale_x_train=pd.DataFrame(scale.transform(x_train),columns=x_train.columns,index=x_train.index)
scale_x_test=pd.DataFrame(scale.transform(x_test),columns=x_test.columns,index=x_test.index)

## -- Feature Selection --

In [None]:
# Keep the 8 features with the highest mutual information with the target.
# mutual_info_regression uses random numbers internally, so its seed is pinned
# to make the selected feature set repeatable from run to run.
k_features=SelectKBest(partial(mutual_info_regression,random_state=42),k=8)

In [None]:
# Score the features on the scaled training data against y_train.
k_features.fit(scale_x_train,y_train)

In [None]:
# Keep only the selected columns in both splits. Selecting by name keeps the
# results as DataFrames; the former bare k_features.transform(...) call was
# dropped because its result was never used.
selected_features=k_features.get_feature_names_out()
scale_x_train=scale_x_train[selected_features]
scale_x_test=scale_x_test[selected_features]

In [None]:
# Preview the scaled training data restricted to the selected features.
scale_x_train.head()

In [None]:
# Export of the processed splits is currently disabled; uncomment the lines
# below to write them as CSV files (without the index) under ../Data/processed_data/.
# scale_x_train.to_csv('../Data/processed_data/x_train.csv',index=False)
# scale_x_test.to_csv('../Data/processed_data/x_test.csv',index=False)
# y_train.to_csv('../Data/processed_data/y_train.csv',index=False)
# y_test.to_csv('../Data/processed_data/y_test.csv',index=False)