In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the dataset and print the first rows as a quick sanity check.
# NOTE(review): the file is called Salary_Data.csv, yet the columns selected
# below (Area, Bedrooms, Location, Price) look like housing data - confirm
# that this is the intended file.
data = pd.read_csv('Salary_Data.csv')

print(data.head())

# Discard every row that has a missing value in any column.
data = data.dropna()

# Feature matrix and regression target.
X = data[['Area', 'Bedrooms', 'Location']]
y = data['Price']

# One-hot encode the categorical column; drop_first keeps k-1 indicator
# columns for k categories.
X = pd.get_dummies(X, columns=['Location'], drop_first=True)

# Split BEFORE scaling. Fitting the scaler on the full matrix would let the
# mean/variance of the held-out rows leak into the training features and make
# the test metrics optimistic. The same random_state and row count give the
# same train/test membership as splitting the scaled matrix would.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the scaling statistics from the training rows only, then apply that
# same transformation to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Kept so that any later code referring to the fully scaled matrix still
# works; it is produced with the training-set statistics.
X_scaled = scaler.transform(X)

# Fit an ordinary least-squares baseline on the training split
# (fit() returns the estimator itself, so the call can be chained).
model_lr = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows and score the predictions against the truth.
y_pred_lr = model_lr.predict(X_test)
lr_mae = mean_absolute_error(y_test, y_pred_lr)
lr_mse = mean_squared_error(y_test, y_pred_lr)
lr_r2 = r2_score(y_test, y_pred_lr)

# Report the three metrics.
print("Linear Regression Performance:")
print("Mean Absolute Error:", lr_mae)
print("Mean Squared Error:", lr_mse)
print("R-squared:", lr_r2)

# Fit a 100-tree random forest on the training split; the fixed seed makes
# the run repeatable.
model_rf = RandomForestRegressor(random_state=42, n_estimators=100)
model_rf.fit(X_train, y_train)

# Predict the held-out rows and score the predictions against the truth.
y_pred_rf = model_rf.predict(X_test)
rf_mae = mean_absolute_error(y_test, y_pred_rf)
rf_mse = mean_squared_error(y_test, y_pred_rf)
rf_r2 = r2_score(y_test, y_pred_rf)

# Report the three metrics (the leading newline separates this section from
# whatever was printed before it).
print("\nRandom Forest Performance:")
print("Mean Absolute Error:", rf_mae)
print("Mean Squared Error:", rf_mse)
print("R-squared:", rf_r2)


   Area  Bedrooms Location   Price
0  1500         3    Urban  400000
1  1800         4    Urban  500000
2  2000         3    Rural  350000
3  2500         5    Urban  600000
4  2200         4    Rural  450000
Linear Regression Performance:
Mean Absolute Error: 5034.883720930346
Mean Squared Error: 34419956.73337099
R-squared: 0.990438900907397

Random Forest Performance:
Mean Absolute Error: 73350.0
Mean Squared Error: 5425785000.0
R-squared: -0.5071625
