In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
import pickle
from scipy import stats

In [2]:
# Load the dataset
# NOTE(review): this is an absolute path on one machine; the notebook will not
# run elsewhere until it is pointed at a local copy of data.csv.
csv_path = r"C:\Users\USER\Desktop\Final_Assignment - Copy\House Price Project\data.csv"
data = pd.read_csv(csv_path)

In [None]:
# Preview the first rows to sanity-check that the CSV parsed into the expected columns
data.head()

In [3]:
# Preview the last rows of the frame (end of the file, highest index values)
data.tail()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
4595,09-07-2014 00:00,308166.6667,3,1.75,1510,6360,1.0,0,0,4,1510,0,1954,1979,501 N 143rd St,Seattle,WA 98133,USA
4596,09-07-2014 00:00,534333.3333,3,2.5,1460,7573,2.0,0,0,3,1460,0,1983,2009,14855 SE 10th Pl,Bellevue,WA 98007,USA
4597,09-07-2014 00:00,416904.1667,3,2.5,3010,7014,2.0,0,0,3,3010,0,2009,0,759 Ilwaco Pl NE,Renton,WA 98059,USA
4598,10-07-2014 00:00,203400.0,4,2.0,2090,6630,1.0,0,0,3,1070,1020,1974,0,5148 S Creston St,Seattle,WA 98178,USA
4599,10-07-2014 00:00,220600.0,3,2.5,1490,8102,2.0,0,0,4,1490,0,1990,0,18717 SE 258th St,Covington,WA 98042,USA


In [4]:
# Column names, non-null counts and dtypes of the raw frame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           4600 non-null   object 
 1   price          4600 non-null   float64
 2   bedrooms       4600 non-null   int64  
 3   bathrooms      4600 non-null   float64
 4   sqft_living    4600 non-null   int64  
 5   sqft_lot       4600 non-null   int64  
 6   floors         4600 non-null   float64
 7   waterfront     4600 non-null   int64  
 8   view           4600 non-null   int64  
 9   condition      4600 non-null   int64  
 10  sqft_above     4600 non-null   int64  
 11  sqft_basement  4600 non-null   int64  
 12  yr_built       4600 non-null   int64  
 13  yr_renovated   4600 non-null   int64  
 14  street         4600 non-null   object 
 15  city           4600 non-null   object 
 16  statezip       4600 non-null   object 
 17  country        4600 non-null   object 
dtypes: float64(3), int64(10), object(5)

In [None]:
# Count missing values per column
data.isnull().sum()

In [None]:
# Summary statistics for the columns of the raw frame
data.describe()

In [None]:
# Data Cleaning
# No imputation step: every column was reported as fully non-null.
# Remove exact duplicate rows and keep working with the de-duplicated frame.
data = data.drop_duplicates()

In [None]:
# Outlier Removal
# Z-score filter on the target: keep only rows whose price is less than
# `threshold` standard deviations away from the mean price.
# (IQR or domain rules would be alternatives; Z-score is used here.)
threshold = 3
z_scores = np.abs(stats.zscore(data['price']))
keep_mask = z_scores < threshold
data_cleaned = data[keep_mask]

In [None]:
# Exploratory Data Analysis (EDA)
# Pairwise relationships between price and the main size-related features
pairplot_cols = ['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot']
sns.pairplot(data_cleaned[pairplot_cols])
plt.show()

In [None]:
# Correlation heatmap over every numeric column of the cleaned frame
numerical_cols = data_cleaned.select_dtypes(include=[np.number]).columns
corr_matrix = data_cleaned[numerical_cols].corr()

plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.show()

In [None]:
# Data Preprocessing
# Target variable and raw feature frame; date, street and country are
# excluded from the features.
y = data_cleaned['price']
features = data_cleaned.drop(columns=['price', 'date', 'street', 'country'])

# One-hot encode the categorical columns (first level of each dropped);
# every other column is passed through unchanged.
categorical_cols = ['city', 'statezip']
encoder = OneHotEncoder(drop='first')
preprocessor = ColumnTransformer(
    transformers=[('cat', encoder, categorical_cols)],
    remainder='passthrough',
)
# NOTE(review): the encoder is fitted on all rows, i.e. before the train/test
# split below, so its category vocabulary also includes the test rows.
X_encoded = preprocessor.fit_transform(features)

# Hold out 20% of the rows for testing; the split is seeded for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)

# Feature Scaling
# Scale by standard deviation only (with_mean=False, no centering); the scaler
# statistics come from the training split and are reused on the test split.
scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Model Training and Evaluation
# Report the training matrix shape before any model is fitted
print("Shape of X_train_scaled:", X_train_scaled.shape)

# Candidate regressors. The tree-based models are seeded so that their CV
# scores are repeatable across runs, matching the seeded train/test split.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Support Vector Machine': SVR()
}

for name, model in models.items():
    # 5-fold CV on the training split. cross_val_score fits clones of `model`,
    # so it does not need (or use) a previously fitted estimator.
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='neg_mean_squared_error')
    # Scores are negated MSE; flip the sign and take the root to get RMSE per fold
    rmse_scores = np.sqrt(-cv_scores)
    print(f'{name} CV RMSE: {rmse_scores}')
    print(f'{name} CV RMSE mean: {rmse_scores.mean():.2f} (std: {rmse_scores.std():.2f})')
    # Fit on the whole training split so `models` holds fitted estimators afterwards
    model.fit(X_train_scaled, y_train)

# Report the shape of the held-out test matrix
print("Shape of X_test_scaled:", X_test_scaled.shape)

In [None]:
# Model Deployment
# Choose the best performing model and save it
# Seeded so that the persisted model is the same on every run of the notebook.
best_model = RandomForestRegressor(random_state=42)  # Change to the best model from above
best_model.fit(X_train_scaled, y_train)

# Use context managers so the file is flushed and closed before it is read back.
# NOTE: only the estimator is saved. It was trained on the output of
# `preprocessor` followed by `scaler`, so any consumer of model.pkl must apply
# those same fitted transformers to raw rows before calling predict.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

# Later, you can load the model and use it for predictions
# (pickle.load executes arbitrary code: only load files you produced yourself)
with open('model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)
prediction = loaded_model.predict(X_test_scaled)

In [None]:
# Re-inspect `data`: this shows the frame after duplicate removal; the outlier
# filter wrote its result to `data_cleaned`, so it is not reflected here.
data.info()