In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
train_data = pd.read_csv("processed_train_data.csv")
train_data.head()

Unnamed: 0,Year,Latitude,Longitude,Estimated Value,Sale Price,Property,Residential,num_rooms,carpet_area,property_tax_rate,Day,month,is_weekend,quarter,segment
0,2009,41.0389,-73.6136,711270,975000.0,0,0,2,760,1.025953,2,1,0,1,2
1,2009,41.5667,-72.5,119970,189900.0,3,1,3,921,1.025953,2,1,0,1,2
2,2009,41.3053,-73.5014,494530,825000.0,3,1,3,982,1.025953,2,1,0,1,2
3,2009,41.3167,-72.3,197600,450000.0,3,1,3,976,1.025953,2,1,0,1,2
4,2009,41.4897,-73.0514,105440,200000.0,3,1,3,947,1.025953,2,1,0,1,2


In [3]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 328193 entries, 0 to 328192
Data columns (total 15 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Year               328193 non-null  int64  
 1   Latitude           328193 non-null  float64
 2   Longitude          328193 non-null  float64
 3   Estimated Value    328193 non-null  int64  
 4   Sale Price         328193 non-null  float64
 5   Property           328193 non-null  int64  
 6   Residential        328193 non-null  int64  
 7   num_rooms          328193 non-null  int64  
 8   carpet_area        328193 non-null  int64  
 9   property_tax_rate  328193 non-null  float64
 10  Day                328193 non-null  int64  
 11  month              328193 non-null  int64  
 12  is_weekend         328193 non-null  int64  
 13  quarter            328193 non-null  int64  
 14  segment            328193 non-null  int64  
dtypes: float64(4), int64(11)
memory usage: 37.6 MB


In [4]:
X = train_data.drop(["segment"], axis = 1)
y = train_data["segment"]

In [5]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier()

In [6]:
xgb_model.fit(X,y)

In [7]:
test_data = pd.read_csv("processed_test_data.csv")
test_data.shape

(43954, 14)

In [12]:
X_test = test_data[['Year', 'Latitude','Longitude','Estimated Value', 'Sale Price', 'Property', 'Residential', 'num_rooms', 'carpet_area', 'property_tax_rate', 'Day', 'month', 'is_weekend',"quarter"]]

In [13]:
X.shape

(328193, 14)

In [14]:
X_test.shape

(43954, 14)

In [15]:
y_pred  = xgb_model.predict(X_test)

In [18]:
submission_df = pd.DataFrame(y_pred, columns=["Segment"])
submission_df

Unnamed: 0,Segment
0,2
1,2
2,2
3,2
4,2
...,...
43949,2
43950,2
43951,2
43952,2


In [19]:
submission_df["Segment"].value_counts()

Segment
2    42183
0     1656
3      113
1        2
Name: count, dtype: int64

In [21]:
submission_df.to_csv("Submission_6.csv", index = False)

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import pandas as pd
import numpy as np


def train_classification(X, y):
    """
    Trains different classification models, evaluates their performance on test data,
    and stores the resulting metrics in a pandas DataFrame.
    :param X: pandas DataFrame containing the features
    :param y: pandas Series containing the target variable
    :return: pandas DataFrame containing the model names and corresponding metrics
    """
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define the classification models to be trained
    models = [
        DecisionTreeClassifier(),
        RandomForestClassifier(),
        GradientBoostingClassifier(),
        XGBClassifier(),
        LGBMClassifier(),
        CatBoostClassifier(verbose=False)
    ]

    # Train the classification models and evaluate their performance on test data
    # Train the classification models and evaluate their performance on test data
    results = pd.DataFrame(columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score'])
    for model in models:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')
        results = pd.concat([results, pd.DataFrame({'Model': [type(model).__name__], 'Accuracy': [accuracy], 
                                                     'Precision': [precision], 'Recall': [recall], 'F1-Score': [f1]})], 
                            ignore_index=True)

    return results

In [24]:
results = train_classification(X, y)

In [25]:
results

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
0,DecisionTreeClassifier,0.999528,0.999529,0.999528,0.999528
1,RandomForestClassifier,0.999208,0.99921,0.999208,0.999206
2,GradientBoostingClassifier,0.998172,0.998142,0.998172,0.998088
3,XGBClassifier,0.999695,0.999695,0.999695,0.999695
4,LGBMClassifier,0.99479,0.995504,0.99479,0.995089
5,CatBoostClassifier,0.997745,0.997745,0.997745,0.997745


In [26]:
cat_boost = CatBoostClassifier(verbose=False)

In [27]:
cat_boost.fit(X,y)

<catboost.core.CatBoostClassifier at 0x15554a810d0>

In [28]:
y_pred_v1  = cat_boost.predict(X_test)

In [29]:
submission_df_1 = pd.DataFrame(y_pred_v1, columns=["Segment"])
submission_df_1.to_csv("Submission_V7.csv", index = False)

In [30]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

# Define the parameter grid to search over
param_dist = {
    "n_estimators": [100, 200, 300, 400, 500],
    "learning_rate": [0.01, 0.05, 0.1, 0.2, 0.3],
    "max_depth": [3, 4, 5, 6, 7, 8],
    #"min_child_weight": [1, 3, 5, 7],
    #"gamma": [0, 0.1, 0.2, 0.3, 0.4],
    #"subsample": [0.5, 0.6, 0.7, 0.8, 0.9],
    #"colsample_bytree": [0.5, 0.6, 0.7, 0.8, 0.9],
}


In [35]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [32]:
# Create the RandomizedSearchCV object
xgb = XGBClassifier()
rs = RandomizedSearchCV(
    xgb, param_distributions=param_dist, n_iter=100, cv=5, verbose=3, n_jobs=-1
)

In [None]:
# Fit the RandomizedSearchCV object to the data
rs.fit(X, y)

Fitting 5 folds for each of 100 candidates, totalling 500 fits




In [41]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.9979059670911897

In [42]:
X_test = test_data[['Year', 'Estimated Value', 'Sale Price', 'Property', 'Residential', 'num_rooms', 'carpet_area', 'property_tax_rate', 'Day', 'month', 'is_weekend']]

In [43]:
y_pred_v3 = model.predict(X_test)

In [44]:
submission_df_3 = pd.DataFrame(y_pred_v3, columns=["Segment"])
submission_df_3.to_csv("Submission_V3.csv", index = False)

In [45]:
model.fit(X,y)

0:	learn: 1.2445605	total: 210ms	remaining: 1m 44s
1:	learn: 1.1284213	total: 390ms	remaining: 1m 37s
2:	learn: 1.0298754	total: 570ms	remaining: 1m 34s
3:	learn: 0.9455425	total: 733ms	remaining: 1m 30s
4:	learn: 0.8715335	total: 894ms	remaining: 1m 28s
5:	learn: 0.8064648	total: 1.04s	remaining: 1m 26s
6:	learn: 0.7483540	total: 1.22s	remaining: 1m 25s
7:	learn: 0.6960960	total: 1.35s	remaining: 1m 23s
8:	learn: 0.6490425	total: 1.49s	remaining: 1m 21s
9:	learn: 0.6066457	total: 1.63s	remaining: 1m 20s
10:	learn: 0.5677375	total: 1.8s	remaining: 1m 20s
11:	learn: 0.5324695	total: 1.96s	remaining: 1m 19s
12:	learn: 0.4998001	total: 2.12s	remaining: 1m 19s
13:	learn: 0.4697515	total: 2.25s	remaining: 1m 18s
14:	learn: 0.4417686	total: 2.4s	remaining: 1m 17s
15:	learn: 0.4160644	total: 2.54s	remaining: 1m 16s
16:	learn: 0.3920872	total: 2.72s	remaining: 1m 17s
17:	learn: 0.3699144	total: 2.91s	remaining: 1m 17s
18:	learn: 0.3491126	total: 3.08s	remaining: 1m 17s
19:	learn: 0.3298746	tot

160:	learn: 0.0055282	total: 25.3s	remaining: 53.3s
161:	learn: 0.0054958	total: 25.5s	remaining: 53.1s
162:	learn: 0.0054703	total: 25.6s	remaining: 53s
163:	learn: 0.0054597	total: 25.8s	remaining: 52.8s
164:	learn: 0.0054446	total: 25.9s	remaining: 52.6s
165:	learn: 0.0054104	total: 26.1s	remaining: 52.4s
166:	learn: 0.0053854	total: 26.2s	remaining: 52.3s
167:	learn: 0.0053752	total: 26.3s	remaining: 52s
168:	learn: 0.0053446	total: 26.5s	remaining: 51.9s
169:	learn: 0.0053196	total: 26.6s	remaining: 51.7s
170:	learn: 0.0053108	total: 26.8s	remaining: 51.5s
171:	learn: 0.0052868	total: 26.9s	remaining: 51.3s
172:	learn: 0.0052606	total: 27s	remaining: 51.1s
173:	learn: 0.0052495	total: 27.2s	remaining: 50.9s
174:	learn: 0.0052322	total: 27.3s	remaining: 50.7s
175:	learn: 0.0052065	total: 27.5s	remaining: 50.5s
176:	learn: 0.0052007	total: 27.6s	remaining: 50.4s
177:	learn: 0.0051939	total: 27.7s	remaining: 50.2s
178:	learn: 0.0051703	total: 27.9s	remaining: 50s
179:	learn: 0.005165

320:	learn: 0.0042779	total: 49.8s	remaining: 27.7s
321:	learn: 0.0042772	total: 49.9s	remaining: 27.6s
322:	learn: 0.0042765	total: 50.1s	remaining: 27.4s
323:	learn: 0.0042722	total: 50.2s	remaining: 27.3s
324:	learn: 0.0042687	total: 50.4s	remaining: 27.1s
325:	learn: 0.0042641	total: 50.5s	remaining: 27s
326:	learn: 0.0042624	total: 50.6s	remaining: 26.8s
327:	learn: 0.0042616	total: 50.8s	remaining: 26.6s
328:	learn: 0.0042604	total: 50.9s	remaining: 26.5s
329:	learn: 0.0042572	total: 51.1s	remaining: 26.3s
330:	learn: 0.0042549	total: 51.2s	remaining: 26.1s
331:	learn: 0.0042538	total: 51.4s	remaining: 26s
332:	learn: 0.0042526	total: 51.5s	remaining: 25.8s
333:	learn: 0.0042487	total: 51.6s	remaining: 25.7s
334:	learn: 0.0042477	total: 51.8s	remaining: 25.5s
335:	learn: 0.0042427	total: 51.9s	remaining: 25.3s
336:	learn: 0.0042416	total: 52s	remaining: 25.2s
337:	learn: 0.0042372	total: 52.2s	remaining: 25s
338:	learn: 0.0042357	total: 52.3s	remaining: 24.8s
339:	learn: 0.004232

479:	learn: 0.0039368	total: 1m 12s	remaining: 3.04s
480:	learn: 0.0039346	total: 1m 13s	remaining: 2.89s
481:	learn: 0.0039341	total: 1m 13s	remaining: 2.73s
482:	learn: 0.0039322	total: 1m 13s	remaining: 2.58s
483:	learn: 0.0039318	total: 1m 13s	remaining: 2.43s
484:	learn: 0.0039297	total: 1m 13s	remaining: 2.28s
485:	learn: 0.0039275	total: 1m 13s	remaining: 2.13s
486:	learn: 0.0039257	total: 1m 14s	remaining: 1.98s
487:	learn: 0.0039250	total: 1m 14s	remaining: 1.83s
488:	learn: 0.0039242	total: 1m 14s	remaining: 1.68s
489:	learn: 0.0039217	total: 1m 14s	remaining: 1.52s
490:	learn: 0.0039210	total: 1m 14s	remaining: 1.37s
491:	learn: 0.0039201	total: 1m 15s	remaining: 1.22s
492:	learn: 0.0039196	total: 1m 15s	remaining: 1.07s
493:	learn: 0.0039175	total: 1m 15s	remaining: 916ms
494:	learn: 0.0039158	total: 1m 15s	remaining: 764ms
495:	learn: 0.0039153	total: 1m 15s	remaining: 611ms
496:	learn: 0.0039143	total: 1m 15s	remaining: 459ms
497:	learn: 0.0039137	total: 1m 16s	remaining:

<catboost.core.CatBoostClassifier at 0x2e5ce807820>

In [46]:
y_pred_v4 = model.predict(X_test)

submission_df_4 = pd.DataFrame(y_pred_v4, columns=["Segment"])
submission_df_4.to_csv("Submission_V4.csv", index = False)

In [51]:
New_train_data = pd.read_csv("processed_train_data_v1.csv")
New_train_data.head()

Unnamed: 0,Year,Locality,Address,Estimated Value,Sale Price,Property,Residential,num_rooms,carpet_area,property_tax_rate,Day,month,is_weekend,segment
0,2009,Greenwich,40 ETTL LN UT 24,711270.0,975000.0,0,0,2,760,1.025953,2,1,0,3
1,2009,East Hampton,18 BAUER RD,119970.0,189900.0,3,1,3,921,1.025953,2,1,0,2
2,2009,Ridgefield,48 HIGH VALLEY RD.,494530.0,825000.0,3,1,3,982,1.025953,2,1,0,2
3,2009,Old Lyme,56 MERIDEN RD,197600.0,450000.0,3,1,3,976,1.025953,2,1,0,2
4,2009,Naugatuck,13 CELENTANO DR,105440.0,200000.0,3,1,3,947,1.025953,2,1,0,2


In [52]:
X = New_train_data.drop(["segment","Locality","Address"], axis = 1)
y = New_train_data["segment"]

In [54]:
y.value_counts()

segment
2    386093
0    152990
3     14868
1         1
Name: count, dtype: int64

In [55]:
model.fit(X,y)

0:	learn: 1.2472015	total: 120ms	remaining: 59.9s
1:	learn: 1.1319061	total: 239ms	remaining: 59.4s
2:	learn: 1.0347465	total: 369ms	remaining: 1m 1s
3:	learn: 0.9510267	total: 492ms	remaining: 1m 1s
4:	learn: 0.8779965	total: 614ms	remaining: 1m
5:	learn: 0.8134868	total: 731ms	remaining: 1m
6:	learn: 0.7559279	total: 859ms	remaining: 1m
7:	learn: 0.7046376	total: 986ms	remaining: 1m
8:	learn: 0.6581663	total: 1.11s	remaining: 1m
9:	learn: 0.6157342	total: 1.23s	remaining: 1m
10:	learn: 0.5773941	total: 1.35s	remaining: 1m
11:	learn: 0.5423454	total: 1.47s	remaining: 59.7s
12:	learn: 0.5100784	total: 1.58s	remaining: 59.1s
13:	learn: 0.4801422	total: 1.69s	remaining: 58.7s
14:	learn: 0.4526054	total: 1.8s	remaining: 58.3s
15:	learn: 0.4271885	total: 1.92s	remaining: 58s
16:	learn: 0.4036700	total: 2.03s	remaining: 57.7s
17:	learn: 0.3816265	total: 2.15s	remaining: 57.5s
18:	learn: 0.3611493	total: 2.26s	remaining: 57.2s
19:	learn: 0.3421302	total: 2.38s	remaining: 57.2s
20:	learn: 0.3

160:	learn: 0.0160427	total: 33s	remaining: 1m 9s
161:	learn: 0.0159202	total: 33.2s	remaining: 1m 9s
162:	learn: 0.0157601	total: 33.3s	remaining: 1m 8s
163:	learn: 0.0156539	total: 33.5s	remaining: 1m 8s
164:	learn: 0.0155416	total: 33.7s	remaining: 1m 8s
165:	learn: 0.0154449	total: 33.9s	remaining: 1m 8s
166:	learn: 0.0153391	total: 34.1s	remaining: 1m 7s
167:	learn: 0.0152862	total: 34.3s	remaining: 1m 7s
168:	learn: 0.0151747	total: 34.5s	remaining: 1m 7s
169:	learn: 0.0150523	total: 34.8s	remaining: 1m 7s
170:	learn: 0.0150134	total: 35s	remaining: 1m 7s
171:	learn: 0.0149391	total: 35.2s	remaining: 1m 7s
172:	learn: 0.0148505	total: 35.4s	remaining: 1m 6s
173:	learn: 0.0147655	total: 35.5s	remaining: 1m 6s
174:	learn: 0.0146915	total: 35.7s	remaining: 1m 6s
175:	learn: 0.0146147	total: 35.9s	remaining: 1m 6s
176:	learn: 0.0144870	total: 36s	remaining: 1m 5s
177:	learn: 0.0143905	total: 36.2s	remaining: 1m 5s
178:	learn: 0.0143060	total: 36.4s	remaining: 1m 5s
179:	learn: 0.0142

320:	learn: 0.0091016	total: 57.1s	remaining: 31.8s
321:	learn: 0.0090820	total: 57.2s	remaining: 31.6s
322:	learn: 0.0090755	total: 57.3s	remaining: 31.4s
323:	learn: 0.0090297	total: 57.5s	remaining: 31.2s
324:	learn: 0.0090160	total: 57.6s	remaining: 31s
325:	learn: 0.0089933	total: 57.7s	remaining: 30.8s
326:	learn: 0.0089811	total: 57.9s	remaining: 30.6s
327:	learn: 0.0089685	total: 58s	remaining: 30.4s
328:	learn: 0.0089567	total: 58.1s	remaining: 30.2s
329:	learn: 0.0089451	total: 58.3s	remaining: 30s
330:	learn: 0.0089134	total: 58.4s	remaining: 29.8s
331:	learn: 0.0088977	total: 58.5s	remaining: 29.6s
332:	learn: 0.0088740	total: 58.7s	remaining: 29.4s
333:	learn: 0.0088582	total: 58.8s	remaining: 29.2s
334:	learn: 0.0088337	total: 59s	remaining: 29s
335:	learn: 0.0088254	total: 59.1s	remaining: 28.9s
336:	learn: 0.0088173	total: 59.2s	remaining: 28.7s
337:	learn: 0.0088115	total: 59.4s	remaining: 28.5s
338:	learn: 0.0088032	total: 59.5s	remaining: 28.3s
339:	learn: 0.0087599	

478:	learn: 0.0071390	total: 1m 18s	remaining: 3.45s
479:	learn: 0.0071270	total: 1m 18s	remaining: 3.28s
480:	learn: 0.0071097	total: 1m 18s	remaining: 3.12s
481:	learn: 0.0071071	total: 1m 19s	remaining: 2.95s
482:	learn: 0.0071012	total: 1m 19s	remaining: 2.79s
483:	learn: 0.0070989	total: 1m 19s	remaining: 2.62s
484:	learn: 0.0070959	total: 1m 19s	remaining: 2.46s
485:	learn: 0.0070917	total: 1m 19s	remaining: 2.29s
486:	learn: 0.0070888	total: 1m 19s	remaining: 2.13s
487:	learn: 0.0070865	total: 1m 19s	remaining: 1.97s
488:	learn: 0.0070706	total: 1m 20s	remaining: 1.8s
489:	learn: 0.0070688	total: 1m 20s	remaining: 1.64s
490:	learn: 0.0070642	total: 1m 20s	remaining: 1.47s
491:	learn: 0.0070387	total: 1m 20s	remaining: 1.31s
492:	learn: 0.0070278	total: 1m 20s	remaining: 1.15s
493:	learn: 0.0070226	total: 1m 20s	remaining: 981ms
494:	learn: 0.0070022	total: 1m 20s	remaining: 817ms
495:	learn: 0.0069978	total: 1m 21s	remaining: 654ms
496:	learn: 0.0069927	total: 1m 21s	remaining: 

<catboost.core.CatBoostClassifier at 0x2e5ce807820>

In [56]:
y_pred_v5 = model.predict(X_test)

submission_df_5 = pd.DataFrame(y_pred_v5, columns=["Segment"])
submission_df_5.to_csv("Submission_V5.csv", index = False)