In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
train_data = pd.read_csv("Data/train.csv")
train_data.head()

Unnamed: 0,Year,Date,Locality,Address,Estimated Value,Sale Price,Property,Residential,num_rooms,carpet_area,property_tax_rate
0,2009,2009-01-02,Greenwich,40 ETTL LN UT 24,711270.0,975000.0,Condo,Condominium,2,760,1.025953
1,2009,2009-01-02,East Hampton,18 BAUER RD,119970.0,189900.0,Single Family,Detached House,3,921,1.025953
2,2009,2009-01-02,Ridgefield,48 HIGH VALLEY RD.,494530.0,825000.0,Single Family,Detached House,3,982,1.025953
3,2009,2009-01-02,Old Lyme,56 MERIDEN RD,197600.0,450000.0,Single Family,Detached House,3,976,1.025953
4,2009,2009-01-02,Naugatuck,13 CELENTANO DR,105440.0,200000.0,Single Family,Detached House,3,947,1.025953


In [3]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 553952 entries, 0 to 553951
Data columns (total 11 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Year               553952 non-null  int64  
 1   Date               553952 non-null  object 
 2   Locality           553952 non-null  object 
 3   Address            553952 non-null  object 
 4   Estimated Value    553952 non-null  float64
 5   Sale Price         553952 non-null  float64
 6   Property           553952 non-null  object 
 7   Residential        553952 non-null  object 
 8   num_rooms          553952 non-null  int64  
 9   carpet_area        553952 non-null  int64  
 10  property_tax_rate  553952 non-null  float64
dtypes: float64(3), int64(3), object(5)
memory usage: 46.5+ MB


In [4]:
# checking null values
train_data.isnull().sum()

Year                 0
Date                 0
Locality             0
Address              0
Estimated Value      0
Sale Price           0
Property             0
Residential          0
num_rooms            0
carpet_area          0
property_tax_rate    0
dtype: int64

In [5]:
# converting Date column to Datetime object
train_data["Date"] = pd.to_datetime(train_data["Date"], format="%Y-%m-%d")

In [7]:
# spliting Date,month from Date column
train_data["Day"] =  train_data["Date"].dt.day
train_data["month"] =  train_data["Date"].dt.month
train_data["is_weekend"] = (train_data['Date'].dt.dayofweek >= 5).astype(int)
train_data['quarter'] = train_data['Date'].dt.quarter

In [8]:
train_data_new = train_data.drop("Date", axis = 1)

In [9]:
#converting categorical column to numerical column
from sklearn.preprocessing import LabelEncoder

In [10]:
label_encode = LabelEncoder()
train_data_new["Residential"] = label_encode.fit_transform(train_data_new["Residential"])

In [11]:
label_encode_1 = LabelEncoder()
train_data_new["Property"] = label_encode_1.fit_transform(train_data_new["Property"])

In [13]:
train_data_new.head()

Unnamed: 0,Year,Locality,Address,Estimated Value,Sale Price,Property,Residential,num_rooms,carpet_area,property_tax_rate,Day,month,is_weekend,quarter
0,2009,Greenwich,40 ETTL LN UT 24,711270.0,975000.0,0,0,2,760,1.025953,2,1,0,1
1,2009,East Hampton,18 BAUER RD,119970.0,189900.0,3,1,3,921,1.025953,2,1,0,1
2,2009,Ridgefield,48 HIGH VALLEY RD.,494530.0,825000.0,3,1,3,982,1.025953,2,1,0,1
3,2009,Old Lyme,56 MERIDEN RD,197600.0,450000.0,3,1,3,976,1.025953,2,1,0,1
4,2009,Naugatuck,13 CELENTANO DR,105440.0,200000.0,3,1,3,947,1.025953,2,1,0,1


In [23]:
train_data_new["Locality"].value_counts()

Locality
Bridgeport       17672
Waterbury        15537
Stamford         15341
Norwalk          12800
Fairfield        11118
                 ...  
Scotland           223
Eastford           222
Canaan             220
Union               99
***Unknown***        1
Name: count, Length: 170, dtype: int64

In [54]:
from xgboost import XGBRegressor

xgb_model = XGBRegressor()

In [55]:
X = train_data_new.drop(["Locality","Address","Sale Price"], axis = 1)
y = train_data_new["Sale Price"]

In [56]:
# split data to train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3, random_state=42)

In [57]:
X_train.shape,X_test.shape

((387766, 10), (166186, 10))

In [58]:
y_train.shape, y_test.shape

((387766,), (166186,))

In [59]:
xgb_model.fit(X_train, y_train)

In [60]:
# evaltuing performance
from sklearn.metrics import mean_squared_error,mean_absolute_error

mean_absolute_error(y_test, y_pred)

83401.70615491101

In [51]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

In [54]:
def train(X, y):
    """
    Trains different regression models, evaluates their performance on test data,
    and stores the resulting metrics in a pandas DataFrame.
    :param X: pandas DataFrame containing the features
    :param y: pandas Series containing the target variable
    :return: pandas DataFrame containing the model names and corresponding metrics
    """
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define the regression models to be trained
    models = [
        LinearRegression(),
        Ridge(),
        Lasso(),
        DecisionTreeRegressor(),
        RandomForestRegressor(),
        AdaBoostRegressor(),
        xgb.XGBRegressor(),
        lgb.LGBMRegressor(),
        cb.CatBoostRegressor(silent=True)
    ]

    # Train the regression models and evaluate their performance on test data
    results = []
    for model in models:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        results.append(pd.DataFrame({'Model': [type(model).__name__], 'RMSE': [rmse], 'MAE': [mae], 'R2': [r2]}))

    return pd.concat(results, ignore_index=True)

In [55]:
results = train(X,y)
results

# Creating Cluster

In [61]:
from sklearn.cluster import KMeans

X_cluster_data = train_data_new.drop(["Locality","Address"], axis = 1)

# Train the KMeans clustering model with 4 clusters
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(X_cluster_data)



In [62]:
# Assign the predicted cluster labels to each row in the dataframe
train_data_new['segment'] = kmeans.predict(X_cluster_data)

In [63]:
train_data_new["segment"].value_counts()

segment
0    533785
3     19286
2       880
1         1
Name: count, dtype: int64

# Test Data

In [64]:
test_data = pd.read_csv("Data/test.csv")
test_data.head()

Unnamed: 0,Year,Date,Locality,Address,Estimated Value,Sale Price,Property,Residential,num_rooms,carpet_area,property_tax_rate,Segment
0,2023,2023-01-01,Old Lyme,12 SWAN AVE,151400.0,0,Residential,Detached House,3,947.0,1.46,0
1,2023,2023-01-01,Ridgefield,59 LINCOLN LANE,686900.0,0,Residential,Detached House,3,1051.0,1.46,0
2,2023,2023-01-04,Cromwell,6 GROVE RD,152030.0,0,Residential,Detached House,3,925.0,1.46,0
3,2023,2023-01-04,New Haven,346 CONCORD ST,156130.0,0,Residential,Duplex,4,1210.0,1.46,0
4,2023,2023-01-04,Beacon Falls,14 LASKY ROAD,108970.0,0,Residential,Detached House,3,1089.0,1.46,0


In [65]:
#test_data["Residential"] = label_encode.transform(test_data["Residential"])
test_data["Property"] = label_encode_1.transform(test_data["Property"])

In [66]:
test_data["Residential"] = label_encode.transform(test_data["Residential"])

In [68]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43954 entries, 0 to 43953
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Year               43954 non-null  int64  
 1   Date               43954 non-null  object 
 2   Locality           43954 non-null  object 
 3   Address            43954 non-null  object 
 4   Estimated Value    43954 non-null  float64
 5   Sale Price         43954 non-null  int64  
 6   Property           43954 non-null  int32  
 7   Residential        43954 non-null  int32  
 8   num_rooms          43954 non-null  int64  
 9   carpet_area        43954 non-null  float64
 10  property_tax_rate  43954 non-null  float64
 11  Segment            43954 non-null  int64  
dtypes: float64(3), int32(2), int64(4), object(3)
memory usage: 3.7+ MB


In [69]:
# converting Date column to Datetime object
test_data["Date"] = pd.to_datetime(test_data["Date"], format="%Y-%m-%d")

# spliting Date,month from Date column
test_data["Day"] =  test_data["Date"].dt.day
test_data["month"] =  test_data["Date"].dt.month
test_data["is_weekend"] = (test_data['Date'].dt.dayofweek >= 5).astype(int)

test_data_new = test_data.drop("Date", axis = 1)

In [71]:
test_data_new =test_data_new.drop(["Locality","Address"], axis = 1)

In [76]:
test_data_new =test_data_new.drop(["Segment"], axis = 1)

In [79]:
test_data_new =test_data_new.drop(["Sale Price"], axis = 1)

In [80]:
y_pred = xgb_model.predict(test_data_new)

In [81]:
test_data_new["Sale Price"] = y_pred

In [82]:
test_data_new

Unnamed: 0,Year,Estimated Value,Property,Residential,num_rooms,carpet_area,property_tax_rate,Day,month,is_weekend,Sale Price
0,2023,151400.0,2,1,3,947.0,1.46,1,1,1,2.531179e+05
1,2023,686900.0,2,1,3,1051.0,1.46,1,1,1,1.021576e+06
2,2023,152030.0,2,1,3,925.0,1.46,4,1,0,2.382342e+05
3,2023,156130.0,2,2,4,1210.0,1.46,4,1,0,2.491857e+05
4,2023,108970.0,2,1,3,1089.0,1.46,4,1,0,2.363705e+05
...,...,...,...,...,...,...,...,...,...,...,...
43949,2023,137600.0,2,1,3,1047.0,1.46,30,9,1,2.556108e+05
43950,2023,114650.0,2,1,3,921.0,1.46,30,9,1,2.902830e+05
43951,2023,163400.0,2,1,3,957.0,1.46,30,9,1,2.905474e+05
43952,2023,282300.0,2,1,3,1081.0,1.46,30,9,1,4.776798e+05


In [83]:
train_data_new.to_csv("processed_train_data.csv", index = False)

In [84]:
test_data_new.to_csv("processed_test_data.csv", index = False)

In [90]:
from sklearn.mixture import GaussianMixture

In [91]:
n_components = 4
gmm = GaussianMixture(n_components=n_components, covariance_type='full', random_state=42)

In [92]:
# Fit the model and predict clusters
gmm.fit(X_cluster_data)

In [93]:
# predict the cluster labels
y_pred = gmm.predict(X_cluster_data)

In [94]:
# Assign the predicted cluster labels to each row in the dataframe
train_data_new['segment'] = y_pred

In [95]:
train_data_new.to_csv("processed_train_data_v1.csv", index = False)