In [None]:
# Initial imports
import pandas as pd
from pathlib import Path
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, r2_score
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt

### Step 1: Read the data from the `Resources` folder into a Pandas DataFrame.

In [None]:
# Load the housing CSV from the Resources folder and replace every missing
# value with 0, so the cells below work on a frame without NaNs.
# NOTE(review): 0 may not be a meaningful stand-in for every column — confirm.
df = pd.read_csv(Path("../Resources/housing.csv")).fillna(0)
# Preview the first rows of the filled DataFrame
df.head()

In [None]:
# Look at the correlation matrix.
# Only numeric columns are correlated: `ocean_proximity` is treated as a
# categorical column in this notebook (it is one-hot encoded later), and
# pandas >= 2.0 raises on non-numeric columns instead of silently dropping them.
# The frame is left as the cell's last expression so Jupyter renders it as a table.
df.select_dtypes(include="number").corr()

In [None]:
# Plot the correlation matrix as an annotated heatmap.
# Restrict to numeric columns first: calling .corr() on a frame that still
# contains the non-numeric `ocean_proximity` column raises on pandas >= 2.0.
correlations = df.select_dtypes(include="number").corr()
fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(correlations, cmap="coolwarm", annot=True, ax=ax)
ax.set_title("Correlation matrix of numeric columns")
plt.show()

In [None]:
# One-hot encode `ocean_proximity`: the column is replaced by one indicator
# column per distinct value, every other column is carried over unchanged.
df_encoded = pd.get_dummies(data=df, columns=['ocean_proximity'])
# Preview the encoded frame
df_encoded.head()

In [None]:
# Show the dtype of every column of the un-encoded frame `df`
# (this inspects `df`, not `df_encoded`).
print(df.dtypes)

In [None]:
# Feature matrix: every column of the encoded frame except the target and the
# two coordinate columns. `data_x` and `X` are two names for the same frame.
X = df_encoded.drop(columns=['median_house_value', 'latitude', 'longitude'])
data_x = X
# Target: median house value as a single-column 2-D array
# (flattened again with .ravel() wherever a model is fitted).
y = df['median_house_value'].to_numpy().reshape(-1, 1)

In [None]:
# Partition features and target into training and testing subsets.
# The fixed random_state makes the shuffle (and so the split) repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)

In [None]:
# Create the StandardScaler instance (unfitted at this point; it is fitted on
# the training features in the next cell)
scaler = StandardScaler()

In [None]:
# Learn the scaling statistics from the training features only, so nothing
# about the test set influences the scaling.
scaler.fit(X_train)
# fit() returns the estimator itself, so X_scaler is the same fitted object
X_scaler = scaler

In [None]:
# Apply the scaler fitted on the training data to both the training and the
# testing features.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [None]:
# Coarse sweep over forest sizes: fit one forest per n_estimators value and
# record its score on the held-out test set.
# For a regressor, .score() returns R^2 (coefficient of determination), so the
# values collected in `accuracy` are R^2 scores, not classification accuracy.
n_estimators = [20, 50, 100, 200, 300, 400, 500]
accuracy = []
for n in n_estimators:
    # n_jobs=-1 builds the trees on all cores; with a fixed random_state the
    # fitted forest does not depend on the number of workers.
    rf_model = RandomForestRegressor(n_estimators=n, random_state=78, n_jobs=-1)
    rf_model = rf_model.fit(X_train_scaled, y_train.ravel())
    # score() predicts internally, so no separate predict() pass is needed here
    acc = rf_model.score(X_test_scaled, y_test)
    accuracy.append(acc)

In [None]:
# Show the scores returned by rf_model.score() in the sweep above, in the same
# order as the n_estimators list
print(accuracy)

In [None]:
# Finer sweep over forest sizes between 400 and 500 trees: fit one forest per
# value and record its score on the held-out test set.
# For a regressor, .score() returns R^2 (coefficient of determination), so the
# values collected in `accuracy` are R^2 scores, not classification accuracy.
n_estimators = [400, 425, 450, 475, 500]
accuracy = []
for n in n_estimators:
    # n_jobs=-1 builds the trees on all cores; with a fixed random_state the
    # fitted forest does not depend on the number of workers.
    rf_model = RandomForestRegressor(n_estimators=n, random_state=78, n_jobs=-1)
    rf_model = rf_model.fit(X_train_scaled, y_train.ravel())
    # score() predicts internally, so no separate predict() pass is needed here
    acc = rf_model.score(X_test_scaled, y_test)
    accuracy.append(acc)

In [None]:
# Show the scores returned by rf_model.score() in the sweep above, in the same
# order as the n_estimators list
print(accuracy)

In [None]:
# Hyperparameter grid for the exhaustive search:
# 2 * 4 * 3 * 3 * 2 = 144 candidate combinations.
param_grid = {
    'n_estimators': [400, 410],
    'max_depth': [10, 20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
}
# Base estimator for the search. It is seeded like the other forests in this
# notebook so that repeated searches rank the candidates the same way.
rf = RandomForestRegressor(random_state=78)

In [None]:
# Exhaustive 5-fold cross-validated search over every combination in param_grid.
# n_jobs=-1 spreads the candidate fits over all cores; the grid is large enough
# that a serial run is very slow.
# The search is run on the scaled training features so the hyperparameters are
# tuned on the same representation the other models in this notebook are fitted on.
grid_search = GridSearchCV(rf, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train.ravel())
# best_score_ is the mean cross-validated score of the best candidate
print("Best hyperparameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

In [None]:
# Final model: a 400-tree random forest regressor with a fixed seed
rf_model = RandomForestRegressor(n_estimators=400, random_state=78)
# Fit on the scaled training features; .ravel() flattens the single-column
# "y_train" array into the 1-D shape the regressor expects
rf_model = rf_model.fit(X_train_scaled, y_train.ravel())
# Predict the target for the held-out test features
predictions = rf_model.predict(X_test_scaled)
# Score the model on the test set. For a regressor, .score() returns R^2
# (coefficient of determination), not a classification accuracy, so the output
# is labelled accordingly. The variable keeps the name `accuracy` because the
# plotting cell reads it under that name.
accuracy = rf_model.score(X_test_scaled, y_test)
print("R^2 score:", accuracy)

In [None]:
# Predicted vs. true values on the test set. Points on the dashed diagonal
# are perfect predictions.
fig, ax = plt.subplots()
ax.plot(y_test, predictions, "o")
ax.plot(y_test, y_test, "--")
ax.set_xlabel("True Values")
ax.set_ylabel("Predictions")
# `accuracy` holds the value returned by the regressor's .score(), i.e. R^2,
# so the title labels it as R^2 rather than "Accuracy"
ax.set_title("$R^2$: %.2f" % accuracy)
plt.show()