In [None]:
import pickle

import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Read the raw housing table from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer="/content/housing.csv")

In [None]:
# Work on an independent (deep) copy so the frame loaded into `df` is left untouched.
df_copy = df.copy(deep=True)

In [None]:
# Round both coordinate columns to the nearest whole number and store them as ints.
df_copy = df_copy.assign(
    longitude=lambda frame: frame['longitude'].round().astype(int),
    latitude=lambda frame: frame['latitude'].round().astype(int),
)

In [None]:
# Number of missing values in each column (before imputation).
df_copy.isna().sum()

Unnamed: 0,0
longitude,0
latitude,0
housing_median_age,0
total_rooms,0
total_bedrooms,207
population,0
households,0
median_income,0
median_house_value,0
ocean_proximity,0


In [None]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted).
print(df_copy.duplicated(keep='first').sum())

0


In [None]:
# Impute missing total_bedrooms values with the column mean.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `df_copy["total_bedrooms"]`: that chained form
# operates on an intermediate object and stops updating `df_copy` in pandas 3.0.
df_copy["total_bedrooms"] = df_copy["total_bedrooms"].fillna(df_copy["total_bedrooms"].mean())

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_copy["total_bedrooms"].fillna(df_copy["total_bedrooms"].mean(),inplace = True)


In [None]:
# Re-check the per-column missing-value counts after filling total_bedrooms.
df_copy.isna().sum()

Unnamed: 0,0
longitude,0
latitude,0
housing_median_age,0
total_rooms,0
total_bedrooms,0
population,0
households,0
median_income,0
median_house_value,0
ocean_proximity,0


In [None]:
# LabelEncoder (scikit-learn) maps each distinct category label to an integer code.
le = LabelEncoder()
# Learn the categories of ocean_proximity, then overwrite the column with the codes.
le.fit(df_copy['ocean_proximity'])
df_copy['ocean_proximity'] = le.transform(df_copy['ocean_proximity'])

In [None]:
# 3. Split the frame into the prediction target and the feature matrix.
#    Change the column name below if a different target is needed.
y = df_copy['median_house_value']
X = df_copy.drop(columns=['median_house_value'])

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Baseline: a decision tree with default hyper-parameters, fitted on the training split.
model_before = DecisionTreeRegressor(random_state=42)
model_before.fit(X=X_train, y=y_train)

In [None]:

# Score the baseline tree on the held-out test split.
y_pred_before = model_before.predict(X_test)
mse_before = mean_squared_error(y_true=y_test, y_pred=y_pred_before)
r2_before = r2_score(y_true=y_test, y_pred=y_pred_before)

In [None]:

# Report the baseline scores, one "label value" line per metric.
for metric_label, metric_value in (
    ("R-squared before tuning:", r2_before),
    ("MSE before tuning:", mse_before),
):
    print(metric_label, metric_value)

R-squared before tuning: 0.48919150997006866
MSE before tuning: 6693680314.968992


In [None]:
# Hyper-parameter values to try; every combination is one candidate tree.
param_grid = dict(
    max_depth=[5, 10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [None]:
# Search param_grid with 5-fold cross-validation on the training split,
# ranking candidates by negative mean squared error.
grid = GridSearchCV(
    estimator=DecisionTreeRegressor(random_state=42),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid.fit(X_train, y_train)

In [None]:
# 7. After tuning: score the grid search's predictions on the held-out test split.
y_pred_after = grid.predict(X_test)
mse_after = mean_squared_error(y_true=y_test, y_pred=y_pred_after)
r2_after = r2_score(y_true=y_test, y_pred=y_pred_after)

In [None]:
# Report the tuned scores and the winning hyper-parameters, one line each.
for metric_label, metric_value in (
    ("R-squared after tuning:", r2_after),
    ("MSE after tuning:", mse_after),
    ("Best Parameters:", grid.best_params_),
):
    print(metric_label, metric_value)

R-squared after tuning: 0.6211520686979668
MSE after tuning: 4964457305.66176
Best Parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}


In [None]:
# Inference: predict the target for one hand-written sample row.
# The nine values are given in the same order as the training columns. Wrapping
# them in a one-row DataFrame that reuses X_train's column names hands the model
# the feature names it was fitted with; a bare nested list carries no names,
# which makes scikit-learn warn that X does not have valid feature names.
model_before.predict(
    pd.DataFrame([[41, 245, 67, 8, 1234, 67, 7, 9, 0]], columns=X_train.columns)
)



array([112500.])

In [None]:
# Persist the tuned model rather than the untuned baseline.
# grid.best_estimator_ is the DecisionTreeRegressor that GridSearchCV refit on the
# training split with the best parameters it found; on the test split it reached
# R-squared 0.62 versus 0.49 for model_before (see the printed metrics above).
# `pickle` is imported in the notebook's top import cell.
with open('projecthouse2.pkl', 'wb') as file:
    pickle.dump(grid.best_estimator_, file)