In [6]:
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import OneHotEncoder, StandardScaler 
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline 
from sklearn.linear_model import Ridge 
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score 


In [8]:
data = pd.read_csv("C:/Users/HP/OneDrive/Documents/Desktop/SummerTrainnigProject/Train.csv")



In [9]:
print(data.head())

  POSTED_BY  UNDER_CONSTRUCTION  RERA  BHK_NO. BHK_OR_RK    SQUARE_FT  \
0     Owner                   0     0        2       BHK  1300.236407   
1    Dealer                   0     0        2       BHK  1275.000000   
2     Owner                   0     0        2       BHK   933.159722   
3     Owner                   0     1        2       BHK   929.921143   
4    Dealer                   1     0        2       BHK   999.009247   

   READY_TO_MOVE  RESALE                      ADDRESS  LONGITUDE   LATITUDE  \
0              1       1        Ksfc Layout,Bangalore  12.969910  77.597960   
1              1       1    Vishweshwara Nagar,Mysore  12.274538  76.644605   
2              1       1             Jigani,Bangalore  12.778033  77.632191   
3              1       1  Sector-1 Vaishali,Ghaziabad  28.642300  77.344500   
4              0       1             New Town,Kolkata  22.592200  88.484911   

   TARGET(PRICE_IN_LACS)  
0                   55.0  
1                   51.0  
2    

In [11]:
#Define the independent features and dependent feature

x = data.drop("TARGET(PRICE_IN_LACS)",axis = 1) #axis = 1 ==> indicates that it is column
y = data['TARGET(PRICE_IN_LACS)'] #Dependent Variable

#  NOW x has independent variables and y has dependent variable


In [14]:
print(type(x))
print(type(y))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


In [16]:
#Identify the DATA TYPES

print(data.dtypes)

POSTED_BY                 object
UNDER_CONSTRUCTION         int64
RERA                       int64
BHK_NO.                    int64
BHK_OR_RK                 object
SQUARE_FT                float64
READY_TO_MOVE              int64
RESALE                     int64
ADDRESS                   object
LONGITUDE                float64
LATITUDE                 float64
TARGET(PRICE_IN_LACS)    float64
dtype: object


In [17]:
# Separate the numerical(int and float) variables and Object (String) variables from the our data

categorical_columns = ['POSTED_BY', 'BHK_OR_RK','ADDRESS']

numeric_columns = ['UNDER_CONSTRUCTION','RERA','BHK_NO.','SQUARE_FT','READY_TO_MOVE','RESALE','LONGITUDE','LATITUDE']


In [18]:
# Preprocessing for numeric data

numerical_transformer = StandardScaler() # mean = 0 & variance = 1


In [19]:
# Preprocessing for categorical data

categorical_transformer = OneHotEncoder(handle_unknown = 'ignore')



In [20]:
# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers = [
        ('num',numerical_transformer,numeric_columns),
        ('cat',categorical_transformer,categorical_columns)
])

In [21]:
# Apply Transformation to the features

x_preprocessed = preprocessor.fit_transform(x)

print(x_preprocessed)

  (0, 0)	-0.46813430643038767
  (0, 1)	-0.6827145550519697
  (0, 2)	-0.446239615511022
  (0, 3)	-0.009731188221234862
  (0, 4)	0.4681343064303876
  (0, 5)	0.2752399359068451
  (0, 6)	-1.3424778708261196
  (0, 7)	0.0720113564310989
  (0, 10)	1.0
  (0, 11)	1.0
  (0, 2857)	1.0
  (1, 0)	-0.46813430643038767
  (1, 1)	-0.6827145550519697
  (1, 2)	-0.446239615511022
  (1, 3)	-0.009744461440641035
  (1, 4)	0.4681343064303876
  (1, 5)	0.2752399359068451
  (1, 6)	-1.454540636953881
  (1, 7)	-0.018289274499596865
  (1, 9)	1.0
  (1, 11)	1.0
  (1, 6589)	1.0
  (2, 0)	-0.46813430643038767
  (2, 1)	-0.6827145550519697
  (2, 2)	-0.446239615511022
  :	:
  (29448, 9)	1.0
  (29448, 11)	1.0
  (29448, 163)	1.0
  (29449, 0)	-0.46813430643038767
  (29449, 1)	-0.6827145550519697
  (29449, 2)	-0.446239615511022
  (29449, 3)	-0.009927452293719854
  (29449, 4)	0.4681343064303876
  (29449, 5)	0.2752399359068451
  (29449, 6)	-1.3537200528022024
  (29449, 7)	0.3211170351302008
  (29449, 10)	1.0
  (29449, 11)	1.0
  (

In [22]:
# split the data into train and test  

x_train, x_test, y_train, y_test = train_test_split(x_preprocessed,y,test_size=0.2,random_state=16)



In [23]:
# Train Ridge Regression Model

ridge_reg = Ridge(random_state=56)

ridge_reg.fit(x_train,y_train)


In [24]:
#Make Predictions with Ridge Regression 

y_pred_train = ridge_reg.predict(x_train)


In [25]:
y_pred_test = ridge_reg.predict(x_test)


In [26]:
# Evaluating the Model

def evaluate_model(y_true,y_pred):
    mae = mean_absolute_error(y_true,y_pred)
    mse = mean_squared_error(y_true,y_pred)
    rmse = mean_squared_error(y_true,y_pred)
    r2 = r2_score(y_true,y_pred)
    return mae,mse,rmse,r2


In [27]:
# training evaluation
train_evaluation = evaluate_model(y_train,y_pred_train)
test_evaluation = evaluate_model(y_test,y_pred_test)


In [28]:
evaluation_result = pd.DataFrame({
    'Metric':['MAE','MSE','RMSE','R2'],
    'Train': train_evaluation,
    'Test' : test_evaluation
})


In [29]:
print(evaluation_result)

  Metric          Train           Test
0    MAE     117.019433     150.339571
1    MSE  253645.836922  425478.225418
2   RMSE  253645.836922  425478.225418
3     R2       0.373066       0.210481
