# Import The Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

In [37]:
df = pd.read_csv(r"C:\Users\itsso\Downloads\HouseData\data.csv")
df.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 00:00:00,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 00:00:00,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02 00:00:00,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02 00:00:00,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02 00:00:00,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA


# Data Preprocessing

In [38]:
# Drop any missing values
df.dropna(inplace=True)

# Convert categorical to numerical variables
cat_cols = df.select_dtypes(include=['object']).columns
df[cat_cols] = df[cat_cols].apply(pd.Categorical)
df[cat_cols] = df[cat_cols].apply(lambda x: x.cat.codes)
df['date'] = pd.to_datetime(df['date'])
df['date'] = df['date'].apply(lambda x: x.timestamp())
df.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,0.0,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,1522,36,62,0
1,0.0,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,3899,35,58,0
2,0.0,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,2291,18,26,0
3,0.0,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,4263,3,7,0
4,0.0,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,4352,31,31,0


In [39]:
X = df.drop(['price'], axis=1) 
y = df['price']

# Split the data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training The Models

In [43]:
# Linear Regression
lr= LinearRegression()
lr.fit(X_train, y_train)

# Random Forest Regression
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Decision Tree Regression
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)

# Gradient Boosting Regressor 
gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
gb.fit(X_train, y_train)

# Testing The Models

In [44]:
# Evaluate the models
lr_pred = lr.predict(X_test)
rf_pred = rf.predict(X_test)
dt_pred = dt.predict(X_test)
gb_pred = gb.predict(X_test)

model = ['LinearRegression','RandomForestRegression','DecisionTreeRegression','GradientBoostingRegression']
mse = [mean_squared_error(y_test, lr_pred), mean_squared_error(y_test, rf_pred), mean_squared_error(y_test, dt_pred), mean_squared_error(y_test, gb_pred)]

result = pd.DataFrame({
    "Model":model,
    "MSE":mse
})

result

Unnamed: 0,Model,MSE
0,LinearRegression,986290800000.0
1,RandomForestRegression,975000100000.0
2,DecisionTreeRegression,1022649000000.0
3,GradientBoostingRegression,967919800000.0
