In [None]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.metrics import mean_squared_error

from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the raw dataset from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='regression.csv')

In [None]:
# Step 1: replace the string categories in 'ocean_proximity' with integer codes
# NOTE(review): integer codes imply an ordering between categories; confirm this
# is acceptable for the models trained later, or consider one-hot encoding.
encoder = LabelEncoder().fit(df['ocean_proximity'])
encoded = encoder.transform(df['ocean_proximity'])
df['ocean_proximity'] = encoded

In [None]:
# Step 2: report every column that holds at least one missing value
columns_with_nulls = [name for name in df.columns if df[name].isnull().any()]
for name in columns_with_nulls:
  print(name)

total_bedrooms


In [None]:
# Step 2 (continued): replace missing 'total_bedrooms' values with the column median.
# The result is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: that form mutates a temporary selection,
# raises a FutureWarning on pandas 2.x and leaves `df` unchanged under Copy-on-Write.
df['total_bedrooms'] = df['total_bedrooms'].fillna(df['total_bedrooms'].median())

In [None]:
# Step 3: remove the 'No.' column in place
# (presumably a row counter with no predictive value -- verify against the data)
df.drop(columns=['No.'], inplace=True)

In [None]:
# Separate the feature matrix (every column except the target) from the regression target
X = df.drop(columns=['median_house_value'])
Y = df['median_house_value']

In [None]:
# Step 4: split the data, then scale the features.
# The split happens BEFORE scaling so that the scaler's min/max are learned from
# the training rows only; fitting it on the full dataset would leak test-set
# statistics into training. A fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, shuffle=True, random_state=42
)

minmax = MinMaxScaler()
X_train = minmax.fit_transform(X_train)  # learn the feature ranges on the training split
X_test = minmax.transform(X_test)        # reuse those ranges; values may fall outside [0, 1]

In [None]:
#Train each baseline model and report its error on the test split
def find_accuracy(ml):
  """Fit ``ml`` on the training split and return its test-set mean squared error.

  Despite the function's name, the returned value is an error, not an
  accuracy: lower is better.

  Parameters
  ----------
  ml : estimator
      Object exposing ``fit(X, y)`` (returning the fitted estimator) and
      ``predict(X)``. It is fitted in place on ``X_train`` / ``y_train``.

  Returns
  -------
  float
      Mean squared error between ``y_test`` and the predictions for ``X_test``.
  """
  predictions = ml.fit(X_train, y_train).predict(X_test)
  # scikit-learn metrics expect (y_true, y_pred) in that order
  return mean_squared_error(y_test, predictions)

svm = SVR()
# Fixed seed so the baseline forest gives the same score on every run
rforest = RandomForestRegressor(random_state=42)
lregression= LinearRegression()

# The reported numbers are mean squared errors, so they are labelled as such
print("SVR MSE:", find_accuracy(svm))
print("RandomFR MSE:", find_accuracy(rforest))
print("LinearR MSE:", find_accuracy(lregression))

SVR accuracy: 11572519266.144312
RandomFR accuracy: 4542544629.960737
LinerR accuracy: 5241344123.82143


In [None]:
#Fine tune the random forest with an exhaustive grid search
#3 * 3 * 3 * 3 = 81 parameter combinations, each evaluated with 5-fold CV
params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
# Fixed seed: otherwise forest randomness adds noise to the CV scores and the
# selected parameters can change from run to run
rf = RandomForestRegressor(random_state=42)
# n_jobs=-1 spreads the 405 fits over all available cores.
# Scoring is negated MSE because GridSearchCV maximises the score.
grid_search = GridSearchCV(
    rf,
    params,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}


In [None]:
# GridSearchCV was built with its default refit=True, so best_estimator_ is
# already fitted on the whole training split with the best parameters.
# Fitting it again would only repeat the work and, for an unseeded forest,
# replace the selected model with a different random realisation.
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)

In [None]:
# Final result: mean squared error of y_pred against the held-out targets
mean_squared_error(y_true=y_test, y_pred=y_pred)

4530452807.747878