In [3]:
# Import necessary libraries
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Load dataset
df = pd.read_pickle('dataframe.pkl')

### Selecting features and target

In [4]:
features = df[['area', 'parking', 'furnishingstatus', 'bedrooms', 'stories']]

target = df[['price']]

### Using Standard Scaler on the features to avoid higher values prejudicing the model

In [5]:
F_scl = StandardScaler().fit_transform(features)

### Spliting the data into training and test sets

In [7]:
X_train, X_test, y_train, y_test = train_test_split(F_scl, target, test_size=0.2, random_state=42)


In [11]:
import pickle

with open('X_train.pkl', 'wb') as file:
    pickle.dump(X_train, file)

with open('X_test.pkl', 'wb') as file:
    pickle.dump(X_test, file)

with open('y_train.pkl', 'wb') as file:
    pickle.dump(y_train, file)

with open('y_test.pkl', 'wb') as file:
    pickle.dump(y_test, file)

In [12]:
## Importing the models

# Linear Regression
from sklearn.linear_model import LinearRegression

# Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor

# Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor

# Gradient Boosting Regressor
from sklearn.ensemble import GradientBoostingRegressor

# Support Vector Regressor
from sklearn.svm import SVR

In [13]:
## Initializing the models

# Linear Regression
linear_reg_model = LinearRegression()

# Decision Tree Regressor
decision_tree_model = DecisionTreeRegressor(random_state=42)

# Random Forest Regressor
random_forest_model = RandomForestRegressor(random_state=42)

# Gradient Boosting Regressor
gradient_boosting_model = GradientBoostingRegressor(random_state=42)

# Support Vector Regressor
support_vector_model = SVR()




In [14]:
# Train the Linear Regression Model
linear_reg_model.fit(X_train, y_train)

# Train the Decision Tree Regressor Model
decision_tree_model.fit(X_train, y_train)

# Train the Random Forest Regressor Model
random_forest_model.fit(X_train, y_train)

# Train the Gradient Boosting Regressor Model
gradient_boosting_model.fit(X_train, y_train)

# Train the Support Vector Regressor Model
support_vector_model.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [15]:
# Predictions
# Make Predictions using Linear Regression Model
linear_reg_predictions = linear_reg_model.predict(X_test)

# Make Predictions using Decision Tree Regressor Model
decision_tree_predictions = decision_tree_model.predict(X_test)

# Make Predictions using Random Forest Regressor Model
random_forest_predictions = random_forest_model.predict(X_test)

# Make Predictions using Gradient Boosting Regressor Model
gradient_boosting_predictions = gradient_boosting_model.predict(X_test)

# Make Predictions using Support Vector Regressor Model
support_vector_predictions = support_vector_model.predict(X_test)



In [17]:
from sklearn.metrics import mean_squared_error, r2_score

# Evaluate Linear Regression Model
linear_reg_mse = mean_squared_error(y_test, linear_reg_predictions)
linear_reg_r2 = r2_score(y_test, linear_reg_predictions)
print(f'Linear Regression - MSE: {linear_reg_mse}, R2 Score: {linear_reg_r2}')
print('Linear Regression' + ' ' + str(linear_reg_model.score(X_test, y_test)))

# Evaluate Decision Tree Regressor Model
decision_tree_mse = mean_squared_error(y_test, decision_tree_predictions)
decision_tree_r2 = r2_score(y_test, decision_tree_predictions)
print(f'Decision Tree - MSE: {decision_tree_mse}, R2 Score: {decision_tree_r2}')
print('Decision tree' + ' ' + str(decision_tree_model.score(X_test, y_test)))

# Evaluate Random Forest Regressor Model
random_forest_mse = mean_squared_error(y_test, random_forest_predictions)
random_forest_r2 = r2_score(y_test, random_forest_predictions)
print(f'Random Forest - MSE: {random_forest_mse}, R2 Score: {random_forest_r2}')
print('Random Forest' + ' ' + str(random_forest_model.score(X_test, y_test)))

# Evaluate Gradient Boosting Regressor Model
gradient_boosting_mse = mean_squared_error(y_test, gradient_boosting_predictions)
gradient_boosting_r2 = r2_score(y_test, gradient_boosting_predictions)
print(f'Gradient Boosting - MSE: {gradient_boosting_mse}, R2 Score: {gradient_boosting_r2}')
print('Gradient Boosting' + ' ' + str(gradient_boosting_model.score(X_test, y_test)))

# Evaluate Support Vector Regressor Model
support_vector_mse = mean_squared_error(y_test, support_vector_predictions)
support_vector_r2 = r2_score(y_test, support_vector_predictions)
print(f'Support Vector - MSE: {support_vector_mse}, R2 Score: {support_vector_r2}')
print('Support Vector' + ' ' + str(support_vector_model.score(X_test, y_test)))

Linear Regression - MSE: 1300084658363.6882, R2 Score: 0.5254755666665837
Linear Regression 0.5254755666665837
Decision Tree - MSE: 2256984560061.0, R2 Score: 0.17621186242348996
Decision tree 0.17621186242348996
Random Forest - MSE: 1400505318096.4143, R2 Score: 0.48882252538339865
Random Forest 0.48882252538339865
Gradient Boosting - MSE: 1307159385427.59, R2 Score: 0.522893326479847
Gradient Boosting 0.522893326479847
Support Vector - MSE: 2846593241213.9355, R2 Score: -0.03899237332581951
Support Vector -0.03899237332581951


### Hypertunnning

In [31]:
from sklearn.metrics import make_scorer
scorer = make_scorer(mean_squared_error)

In [32]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
parameters_rf = {
 'bootstrap': [True, False],
 'max_depth': [10, 20, None],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [25, 50]
}

# Create the grid search object
grid_obj_rf = GridSearchCV(random_forest_model, parameters_rf, cv=5, scoring=scorer, n_jobs=-1)

# Fit the grid search object to the data
grid_fit_rf = grid_obj_rf.fit(X_train, y_train)

# Get the estimator with the best parameters
best_reg_rf = grid_fit_rf.best_estimator_

# Fit the best model to the data
best_reg_rf.fit(X_train, y_train)


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


In [33]:
# Model performance on TRAIN data
best_reg_rf.score(X_train, y_train)

0.9926158450552337

In [34]:
# Model performance on TRAIN data
best_reg_rf.score(X_test, y_test)

0.1609110186996563

In [35]:
parameters_dt = {
 'max_depth': [10, 20, None],
 'min_samples_split': [2, 5, 10]
}

# Create the grid search object
grid_obj_dt = GridSearchCV(decision_tree_model, parameters_dt, cv=5, scoring=scorer, n_jobs=-1)

# Fit the grid search object to the data
grid_fit_dt = grid_obj_dt.fit(X_train, y_train)

# Get the estimator with the best parameters
best_reg_dt = grid_fit_dt.best_estimator_

# Fit the best model to the data
best_reg_dt.fit(X_train, y_train)

In [36]:
# Model performance on TRAIN data
best_reg_dt.score(X_train, y_train)

0.9926158450552337

In [37]:
# Model performance on TRAIN data
best_reg_dt.score(X_test, y_test)

0.17621186242348996

In [38]:
parameters_gb = {
 'learning_rate': [0.01, 0.1, 0.2],
 'n_estimators': [50, 100, 150],
 'max_depth': [3, 5, 7]
}

# Create the grid search object
grid_obj_gb = GridSearchCV(gradient_boosting_model, parameters_gb, cv=5, scoring=scorer, n_jobs=-1)

# Fit the grid search object to the data
grid_fit_gb = grid_obj_gb.fit(X_train, y_train)

# Get the estimator with the best parameters
best_reg_gb = grid_fit_gb.best_estimator_

# Fit the best model to the data
best_reg_gb.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [39]:
# Model performance on TRAIN data
best_reg_gb.score(X_train, y_train)

0.9889546673946583

In [40]:
# Model performance on TRAIN data
best_reg_gb.score(X_test, y_test)

0.36865993457011104

In [41]:
parameters_svr = {
 'C': [0.1, 1, 10],
 'kernel': ['linear', 'rbf'],
 'epsilon': [0.1, 0.2, 0.3]
}

# Create the grid search object
grid_obj_svr = GridSearchCV(support_vector_model, parameters_svr, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the grid search object to the data
grid_fit_svr = grid_obj_svr.fit(X_train, y_train)

# Get the estimator with the best parameters
best_reg_svr = grid_fit_svr.best_estimator_

# Fit the best model to the data
best_reg_svr.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [42]:
# Model performance on TRAIN data
best_reg_svr.score(X_train, y_train)

-0.02809728183005067

In [43]:
# Model performance on TEST data
best_reg_svr.score(X_test, y_test)

-0.0353428600639738

In [44]:
# Predictions

# Make Predictions using Decision Tree Regressor Model
best_reg_dt = decision_tree_model.predict(X_test)

# Make Predictions using Random Forest Regressor Model
best_reg_rf = random_forest_model.predict(X_test)

# Make Predictions using Gradient Boosting Regressor Model
best_reg_gb = gradient_boosting_model.predict(X_test)

# Make Predictions using Support Vector Regressor Model
best_reg_svr = support_vector_model.predict(X_test)



In [45]:
# Evaluate Decision Tree Regressor Model
decision_tree_mse = mean_squared_error(y_test, decision_tree_predictions)
decision_tree_r2 = r2_score(y_test, decision_tree_predictions)
print(f'Decision Tree - MSE: {decision_tree_mse}, R2 Score: {decision_tree_r2}')
print('Decision tree' + ' ' + str(decision_tree_model.score(X_test, y_test)))

# Evaluate Random Forest Regressor Model
random_forest_mse = mean_squared_error(y_test, random_forest_predictions)
random_forest_r2 = r2_score(y_test, random_forest_predictions)
print(f'Random Forest - MSE: {random_forest_mse}, R2 Score: {random_forest_r2}')
print('Random Forest' + ' ' + str(random_forest_model.score(X_test, y_test)))

# Evaluate Gradient Boosting Regressor Model
gradient_boosting_mse = mean_squared_error(y_test, gradient_boosting_predictions)
gradient_boosting_r2 = r2_score(y_test, gradient_boosting_predictions)
print(f'Gradient Boosting - MSE: {gradient_boosting_mse}, R2 Score: {gradient_boosting_r2}')
print('Gradient Boosting' + ' ' + str(gradient_boosting_model.score(X_test, y_test)))

# Evaluate Support Vector Regressor Model
support_vector_mse = mean_squared_error(y_test, support_vector_predictions)
support_vector_r2 = r2_score(y_test, support_vector_predictions)
print(f'Support Vector - MSE: {support_vector_mse}, R2 Score: {support_vector_r2}')
print('Support Vector' + ' ' + str(support_vector_model.score(X_test, y_test)))

Decision Tree - MSE: 2256984560061.0, R2 Score: 0.17621186242348996
Decision tree 0.17621186242348996
Random Forest - MSE: 1400505318096.4143, R2 Score: 0.48882252538339865
Random Forest 0.48882252538339865
Gradient Boosting - MSE: 1307159385427.59, R2 Score: 0.522893326479847
Gradient Boosting 0.522893326479847
Support Vector - MSE: 2846593241213.9355, R2 Score: -0.03899237332581951
Support Vector -0.03899237332581951


In [47]:
# Let's assume you have a new_data DataFrame or a single row of the DataFrame with the same features.
new_data = pd.DataFrame({
    'area': [7420],
    'parking': [2],
    'furnishingstatus': ['2'],
    'bedrooms': [4],
    'stories': [3]
})

# Ensure that any preprocessing (like encoding or scaling) that was applied to the training data is also applied to this new data.

# Make Predictions using Decision Tree Regressor Model
prediction_dt = decision_tree_model.predict(new_data)

# Make Predictions using Random Forest Regressor Model
prediction_rf = random_forest_model.predict(new_data)

# Make Predictions using Gradient Boosting Regressor Model
prediction_gb = gradient_boosting_model.predict(new_data)

# Make Predictions using Support Vector Regressor Model
prediction_svr = support_vector_model.predict(new_data)

# Print Predictions
print(f"Decision Tree Prediction: {prediction_dt[0]}")
print(f"Random Forest Prediction: {prediction_rf[0]}")
print(f"Gradient Boosting Prediction: {prediction_gb[0]}")
print(f"Support Vector Regressor Prediction: {prediction_svr[0]}")

Decision Tree Prediction: 4550000.0
Random Forest Prediction: 5457480.0
Gradient Boosting Prediction: 5346366.950239121
Support Vector Regressor Prediction: 4200015.6697568465


