In [1]:
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
np.random.seed(23)

In [2]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=UserWarning, module='sklearn')

In [3]:
houses_df = pd.read_csv("HousePriceIndia.csv")

In [4]:
prices = houses_df["Price"]

q1 = np.percentile(prices, 25)
q3 = np.percentile(prices, 75)

iqr = q3 - q1
print(f"iqr: {iqr}")


lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
print(f"lower bound: {lower_bound}")
print(f"upper bound: {upper_bound}")
lower_bound = max(0, lower_bound)
threshold1 = lower_bound + (upper_bound - lower_bound) / 3
threshold2 = lower_bound + (upper_bound - lower_bound) / 3 * 2

print(f"threshold 1: {threshold1}")
print(f"threshold 2: {threshold2}")
labels = [0, 1, 2]
bins = [-float("inf"), threshold1, threshold2, float("inf")]
houses_df["price_category"] = pd.cut(houses_df["Price"], bins=bins, labels=labels, include_lowest=True)
houses_df=houses_df.drop(['Id','Date'],axis=1)

iqr: 325000.0
lower bound: -167500.0
upper bound: 1132500.0
threshold 1: 377500.0
threshold 2: 755000.0


KeyError: "['Id'] not found in axis"

In [13]:
from sklearn.model_selection import train_test_split

y = np.array(houses_df['price_category'])
X = houses_df.drop(['Price','price_category'],axis=1)
X_used, X_val, y_used, y_val = train_test_split(
    X, y, stratify=y, test_size=0.1, random_state=42
)

In [ ]:
from sklearn.model_selection import train_test_split

y = np.array(houses_df['price_category'])
X = houses_df.drop(['Price','price_category'],axis=1)

X_used, X_test, y_used, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

X_train, X_val, y_train, y_val = train_test_split(
    X_used, y_used, stratify=y_used, test_size=1/8, random_state=42
)

## Functions


In [11]:
from sklearn.model_selection import train_test_split

def split():
    X_train, X_test, y_train, y_test = train_test_split(X_used,y_used,
                                                        test_size = 0.25,
                                                        random_state = 42,
                                                        stratify=y_used)
 

    return X_train, X_test, y_train, y_test

In [7]:
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
def random_param_search(model,model_params):
    random_s_m = RandomizedSearchCV(estimator=model, param_distributions=model_params, cv=3, n_jobs=-1, scoring="recall", random_state=42)
    random_result = random_s_m.fit(X_train, y_train)
    print("Best: %f using %s" % (random_result.best_score_, random_result.best_params_))
def grid_param_search(model,model_params):
    grid_s_m = GridSearchCV(estimator=model, param_grid=model_params, cv=3, n_jobs=-1, scoring="recall")
    grids_result = grid_s_m.fit(X_train, y_train)
    print("Best: %f using %s" % (grids_result.best_score_, grids_result.best_params_))

In [16]:
lr=LogisticRegression(solver='saga')
lr_params=dict(max_iter=[200,500,600,700,800,900,1000],penalty=['l1', 'l2'])

In [17]:
random_param_search(lr,lr_params)

Best: nan using {'penalty': 'l2', 'max_iter': 800}


In [14]:
X_train, X_test, y_train, y_test=split()

In [ ]:
tree_params=dict(max_depth=[4,5,6,7,8,9,10,11,12,13,14,15,16],min_samples_leaf=[3,4,5,6,7,8,9,10,11,12],max_features=[3,4,5,6,7,8,9,10])
xgb_params=dict(max_depth=[4,5,6,7,8,9,10],eta=[0.2, 0.25, 0.3, 0.35],reg_lambda=[0.01, 0.1, 1, 10],scale_pos_weight=[1/2,1,2],booster=["gbtree", "dart"])



In [18]:
lr_best=LogisticRegression(max_iter=800,penalty='l2' ,solver='saga')