# ***LIBRARIES***

In [None]:
import pandas as pd
import folium
from folium import plugins
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Dropout
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# ***DATA LOADING & PREPROCESSING***

In [None]:
# Read the training data from the working directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer='train.csv')
df.head()

In [None]:
# (number of rows, number of columns) of the raw frame.
df.shape

In [None]:
# Print the per-column dtype / non-null summary of the raw frame.
df.info()

In [None]:
# Summary statistics, transposed so that each described column is one row.
df.describe().transpose()

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# isnull() is an alias of isna(), so this yields the same per-column
# missing-value counts as df.isna().sum().
df.isnull().sum()

In [None]:
# Discard fully duplicated rows, keeping the first occurrence of each.
df_new = df.drop_duplicates(keep='first')

In [None]:
# Keep only rows whose coordinates fall inside the bounding box below.
# NOTE(review): the column named LONGITUDE is checked against the 8.4-37.6
# range and LATITUDE against 68.1-97.4; a later cell swaps the two column
# labels, so the names appear to be reversed in the CSV -- confirm.
in_bounding_box = (
    (df_new['LONGITUDE'] >= 8.4)
    & (df_new['LONGITUDE'] <= 37.6)
    & (df_new['LATITUDE'] >= 68.1)
    & (df_new['LATITUDE'] <= 97.4)
)

# Take an explicit copy: later cells modify filtered_df with inplace=True, and
# doing that on a frame obtained through boolean indexing makes pandas emit
# SettingWithCopyWarning.
filtered_df = df_new[in_bounding_box].copy()

filtered_df.shape

In [None]:
# Interactive map of every remaining listing, grouped into marker clusters.
# The column named LONGITUDE is used as the first element of each location
# pair and LATITUDE as the second.
map_center = [filtered_df[column].mean() for column in ('LONGITUDE', 'LATITUDE')]
my_map = folium.Map(location=map_center, zoom_start=10)

marker_cluster = plugins.MarkerCluster()
marker_cluster.add_to(my_map)

# One marker per row; the popup carries the row's index label so individual
# points can be traced back to the frame.
for row_label, first_coord, second_coord in zip(
    filtered_df.index, filtered_df['LONGITUDE'], filtered_df['LATITUDE']
):
    marker = folium.Marker(location=[first_coord, second_coord], popup=row_label)
    marker.add_to(marker_cluster)

display(my_map)

In [None]:
# Remove the rows with these index labels from the working frame.
# NOTE(review): presumably these are points that looked misplaced on the map
# (its popups show the index label) -- confirm.
rows_to_remove = [17235, 19975, 12001, 5093, 23479, 11633, 7142]
filtered_df.drop(index=rows_to_remove, inplace=True)

In [None]:
# Swap the labels of the two coordinate columns.
new_column_names = {
    'LATITUDE': 'LONGITUDE',
    'LONGITUDE': 'LATITUDE',
}
filtered_df.rename(new_column_names, axis='columns', inplace=True)

In [None]:
# ADDRESS is not referenced by any later step, so remove it and preview the result.
filtered_df.drop(columns=['ADDRESS'], inplace=True)
filtered_df.head()

# ***EXPLORATORY DATA ANALYSIS***

In [None]:
# Pie chart of how the listings are split across the POSTED_BY categories.
posted_by_counts = filtered_df['POSTED_BY'].value_counts()

plt.figure(figsize=(8, 8))
plt.pie(
    posted_by_counts,
    labels=posted_by_counts.index,
    autopct='%1.1f%%',
    startangle=90,
)
plt.title('Distribution of Property Lister')
plt.show()

# The majority of the properties have been listed by real-estate dealers.

In [None]:
# Pie chart of how the listings are split across the BHK_OR_RK categories.
bhk_or_rk_counts = filtered_df['BHK_OR_RK'].value_counts()

plt.figure(figsize=(8, 8))
plt.pie(
    bhk_or_rk_counts,
    labels=bhk_or_rk_counts.index,
    autopct='%1.1f%%',
    startangle=90,
)
plt.title('Distribution of Type of Property ')
plt.show()

# Almost all properties are equipped with a bedroom; extremely few are one-room-kitchen units.

In [None]:
# Annotated correlation heatmap of the numeric columns.
# numeric_only=True is needed because the frame still holds non-numeric
# columns (POSTED_BY and BHK_OR_RK are only one-hot encoded later); since
# pandas 2.0 DataFrame.corr() raises on them instead of silently skipping.
plt.figure(figsize=(12, 10))
sns.heatmap(filtered_df.corr(numeric_only=True), annot=True)

In [None]:
# Preview the two construction-status flags side by side; they carry the
# same information, so one of them is dropped afterwards.
filtered_df.loc[:, ['UNDER_CONSTRUCTION', 'READY_TO_MOVE']].head()

In [None]:
# Remove the UNDER_CONSTRUCTION flag and preview the remaining columns.
filtered_df.drop(columns=['UNDER_CONSTRUCTION'], inplace=True)
filtered_df.head()

In [None]:
# Skewness of every numeric column.
# numeric_only=True is needed because the frame still holds non-numeric
# columns (POSTED_BY and BHK_OR_RK are only one-hot encoded later); since
# pandas 2.0 DataFrame.skew() raises on them instead of silently skipping.
filtered_df.skew(numeric_only=True)

In [None]:
# Distribution of SQUARE_FT with a kernel-density overlay.
# sns.distplot is deprecated; histplot with stat='density' and kde=True is
# the replacement that keeps the density-scaled histogram, and cut=3 lets the
# KDE curve extend past the data limits as distplot did.
sns.histplot(
    filtered_df['SQUARE_FT'],
    kde=True,
    stat='density',
    kde_kws=dict(cut=3),
    color='g',
)
plt.show()

In [None]:
# Box plot of SQUARE_FT to inspect its spread and extreme values.
square_ft_axes = filtered_df.boxplot(column=['SQUARE_FT'])
square_ft_axes.set_title('Boxplot of Categories')
square_ft_axes.set_ylabel('Values')
plt.show()

In [None]:
# IQR-based outlier removal on SQUARE_FT: rows below Q1 - 1.5*IQR or above
# Q3 + 1.5*IQR are discarded.
square_ft = filtered_df['SQUARE_FT']
Q1, Q3 = (square_ft.quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
print(Q1, Q3, IQR)

lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# True for rows that fall outside the two fences.
outliers_mask = square_ft.lt(lower_bound) | square_ft.gt(upper_bound)

filtered_df_no_outliers = filtered_df.loc[~outliers_mask]

In [None]:
# Box plot of the target price after the SQUARE_FT outliers were removed.
price_axes = filtered_df_no_outliers.boxplot(column=['TARGET(PRICE_IN_LACS)'])
price_axes.set_title('Boxplot of Categories')
price_axes.set_ylabel('Values')
plt.show()

In [None]:
# IQR-based outlier removal on the target price: rows below Q1 - 1.5*IQR or
# above Q3 + 1.5*IQR are discarded.
price = filtered_df_no_outliers['TARGET(PRICE_IN_LACS)']
Q1, Q3 = (price.quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
print(Q1, Q3, IQR)

lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# True for rows that fall outside the two fences.
outliers_mask = price.lt(lower_bound) | price.gt(upper_bound)

filtered_df_no_price_outliers = filtered_df_no_outliers.loc[~outliers_mask]

In [None]:
# Pairwise scatter/histogram grid of the outlier-free data.
# pairplot() creates its own figure, so a preceding plt.figure(figsize=...)
# only produced an additional empty figure and never sized the grid; use the
# `height`/`aspect` arguments of pairplot if larger panels are wanted.
sns.pairplot(filtered_df_no_price_outliers)
plt.show()

In [None]:
# Numeric columns that feed the variance-inflation-factor computation.
vif_columns = [
    'RERA',
    'BHK_NO.',
    'SQUARE_FT',
    'READY_TO_MOVE',
    'RESALE',
    'LATITUDE',
    'LONGITUDE',
    'TARGET(PRICE_IN_LACS)',
]
filtered_df_catgeorical = filtered_df_no_price_outliers.loc[:, vif_columns]

In [None]:
# Variance inflation factor (VIF) of each predictor.
# The target column is left out because VIF measures collinearity among the
# explanatory variables only. An intercept column is appended so that every
# auxiliary regression is fitted with a constant; without it the VIFs are
# computed around zero instead of around the column means and are inflated.
vif_predictors = filtered_df_catgeorical.drop(
    columns=['TARGET(PRICE_IN_LACS)'], errors='ignore'
)
# assign() appends `const` as the last column, so positions 0..k-1 still
# address the predictors.
vif_design = vif_predictors.assign(const=1.0).astype(float)

vif_data = pd.DataFrame()
vif_data["feature"] = vif_predictors.columns

vif_data["VIF"] = [
    variance_inflation_factor(vif_design.values, position)
    for position in range(vif_predictors.shape[1])
]

print(vif_data)

# ***MODELLING***

### ***Separating the dataset into X & y***

In [None]:
# Separate the response (price) from the predictor columns.
y = filtered_df_no_price_outliers['TARGET(PRICE_IN_LACS)']
X = filtered_df_no_price_outliers.drop(columns=['TARGET(PRICE_IN_LACS)'])

### ***One Hot Encoding Categorical columns***

In [None]:
# One-hot encode the two categorical predictors, one column at a time, using
# the column name as the prefix of the generated indicator columns.
for categorical_column in ('POSTED_BY', 'BHK_OR_RK'):
    X = pd.get_dummies(X, columns=[categorical_column], prefix=categorical_column)
print(X.head())

### ***Scaling the predictors using StandardScaler***

In [None]:
# Standardise every predictor column: learn the statistics from X, then
# apply them to X.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

### ***Splitting the dataset***

In [None]:
# Hold out 20% of the rows for testing (fixed seed for reproducibility).
# The split is made on the unscaled predictors and the scaler is fitted on
# the training rows only: standardising with statistics computed from the
# full dataset (as X_scaled is) leaks information about the test rows into
# training. The row assignment depends only on the number of rows and
# random_state, so it is the same as when splitting X_scaled.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [None]:
# (training rows, number of features) after the split.
X_train.shape

### ***Polynomial Regression***

In [None]:
# Expand the features into all polynomial terms up to `degree`; the expansion
# is learned on the training matrix and then applied to both splits.
degree = 2
poly = PolynomialFeatures(degree=degree)
poly.fit(X_train)
X_train_poly, X_test_poly = (poly.transform(split) for split in (X_train, X_test))

In [None]:
# Ordinary least squares on the polynomial feature expansion.
poly_model = LinearRegression()
poly_model.fit(X=X_train_poly, y=y_train)

In [None]:
# Predict the held-out targets from the polynomial test features.
y_pred_poly = poly_model.predict(X_test_poly)

In [None]:
# Test-set error metrics for the polynomial regression model.
mse_poly, mae_poly, r2_poly = (
    metric(y_test, y_pred_poly)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f'Mean Squared Error (Polynomial Regression): {mse_poly}')
print(f'Mean Absolute Error (Polynomial Regression): {mae_poly}')
print(f'R-squared Score (Polynomial Regression): {r2_poly}')

In [None]:
# Predicted vs. actual target values for the polynomial regression model.
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred_poly, color='blue', marker='o', label='Actual vs. Predicted')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Scatter Plot of Actual vs. Predicted Values')
# A perfect prediction lies on y = x, so both coordinates of the reference
# line come from the actual values. Using the predictions' min/max for the
# y coordinates would draw a line over the predictions' own range instead.
actual_range = [min(y_test), max(y_test)]
plt.plot(actual_range, actual_range, linestyle='--', color='red', label='Perfect Prediction')
plt.legend()
plt.show()

### ***Ridge Regression***

In [None]:
# Cross-validated choice of the Ridge regularisation strength among `alphas`.
# store_cv_values=True was dropped: the stored per-sample CV values are never
# read in this file, and the argument is deprecated in scikit-learn 1.5
# (renamed store_cv_results) and rejected by later releases.
alphas = [0.1, 1.0, 10.0]
ridge_cv = RidgeCV(alphas=alphas)
ridge_cv.fit(X_train, y_train)

In [None]:
# Regularisation strength selected by the cross-validated Ridge fit.
best_alpha = ridge_cv.alpha_

best_alpha

In [None]:
# Fit a plain Ridge model with the previously selected regularisation strength.
best_ridge_model = Ridge(alpha=best_alpha)
best_ridge_model.fit(X=X_train, y=y_train)

In [None]:
# Predict the held-out targets with the Ridge model.
y_pred_best = best_ridge_model.predict(X_test)

In [None]:
# Test-set error metrics for the Ridge model.
mse_best, mae_best, r2_best = (
    metric(y_test, y_pred_best)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f'Best Alpha: {best_alpha}')
print(f'Mean Squared Error (Ridge Regression - Best Model): {mse_best}')
print(f'Mean Absolute Error (Ridge Regression - Best Model): {mae_best}')
print(f'R-squared Score (Ridge Regression - Best Model): {r2_best}')

In [None]:
# Predicted vs. actual target values for the Ridge model.
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred_best, color='blue', marker='o', label='Actual vs. Predicted')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Scatter Plot of Actual vs. Predicted Values')
# A perfect prediction lies on y = x, so both coordinates of the reference
# line come from the actual values. Using the predictions' min/max for the
# y coordinates would draw a line over the predictions' own range instead.
actual_range = [min(y_test), max(y_test)]
plt.plot(actual_range, actual_range, linestyle='--', color='red', label='Perfect Prediction')
plt.legend()
plt.show()

### ***Lasso Regression***

In [None]:
# Cross-validated choice of the Lasso regularisation strength among `alphas`.
alphas = [0.1, 1.0, 10.0]
lasso_cv = LassoCV(alphas=alphas)
lasso_cv.fit(X=X_train, y=y_train)

In [None]:
# Regularisation strength selected by the cross-validated Lasso fit.
best_alpha = lasso_cv.alpha_
best_alpha

In [None]:
# Fit a plain Lasso model with the previously selected regularisation strength.
best_lasso_model = Lasso(alpha=best_alpha)
best_lasso_model.fit(X=X_train, y=y_train)

In [None]:
# Predict the held-out targets with the Lasso model.
y_pred_best1 = best_lasso_model.predict(X_test)

In [None]:
# Test-set error metrics for the Lasso model.
mse_best, mae_best, r2_best = (
    metric(y_test, y_pred_best1)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f'Best Alpha: {best_alpha}')
print(f'Mean Squared Error (Lasso Regression - Best Model): {mse_best}')
print(f'Mean Absolute Error (Lasso Regression - Best Model): {mae_best}')
print(f'R-squared Score (Lasso Regression - Best Model): {r2_best}')

In [None]:
# Predicted vs. actual target values for the Lasso model.
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred_best1, color='blue', marker='o', label='Actual vs. Predicted')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Scatter Plot of Actual vs. Predicted Values')
# A perfect prediction lies on y = x, so both coordinates of the reference
# line come from the actual values. Using the predictions' min/max for the
# y coordinates would draw a line over the predictions' own range instead.
actual_range = [min(y_test), max(y_test)]
plt.plot(actual_range, actual_range, linestyle='--', color='red', label='Perfect Prediction')
plt.legend()
plt.show()

### ***Random Forest Regressor***

In [None]:
# Exhaustive 5-fold grid search over random-forest hyperparameters, ranked by
# negated mean squared error.
rf_model = RandomForestRegressor(random_state=42)
param_grid = dict(
    n_estimators=[10, 20, 50],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Best-scoring hyperparameter combination found by the search.
best_params = grid_search.best_params_

print(best_params)

In [None]:
# Train a random forest with the selected hyperparameters and score it on the
# held-out split.
best_rf_model = RandomForestRegressor(**best_params, random_state=42)
best_rf_model.fit(X_train, y_train)

y_pred_best3 = best_rf_model.predict(X_test)

mse_best, mae_best, r2_best = (
    metric(y_test, y_pred_best3)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f'Best Hyperparameters: {best_params}')
print(f'Mean Squared Error (Random Forest Regression - Best Model): {mse_best}')
print(f'Mean Absolute Error (Random Forest Regression - Best Model): {mae_best}')
print(f'R-squared Score (Random Forest Regression - Best Model): {r2_best}')

In [None]:
# Predicted vs. actual target values for the random-forest model.
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred_best3, color='blue', marker='o', label='Actual vs. Predicted')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Scatter Plot of Actual vs. Predicted Values')
# A perfect prediction lies on y = x, so both coordinates of the reference
# line come from the actual values. Using the predictions' min/max for the
# y coordinates would draw a line over the predictions' own range instead.
actual_range = [min(y_test), max(y_test)]
plt.plot(actual_range, actual_range, linestyle='--', color='red', label='Perfect Prediction')
plt.legend()
plt.show()

### ***XGB-REGRESSOR***

In [None]:
# Exhaustive 5-fold grid search over XGBoost hyperparameters, ranked by
# negated mean squared error.
xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)
param_grid = dict(
    n_estimators=[10, 20, 50],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 4, 5],
    subsample=[0.8, 0.9, 1.0],
    colsample_bytree=[0.8, 0.9, 1.0],
    gamma=[0, 1, 5],
)

grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Best-scoring hyperparameter combination found by the search.
best_params = grid_search.best_params_

print(best_params)

In [None]:
# Train an XGBoost regressor with the selected hyperparameters and score it
# on the held-out split.
best_xgb_model = XGBRegressor(**best_params, objective='reg:squarederror', random_state=42)
best_xgb_model.fit(X_train, y_train)

y_pred_best4 = best_xgb_model.predict(X_test)

mse_best, mae_best, r2_best = (
    metric(y_test, y_pred_best4)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f'Best Hyperparameters: {best_params}')
print(f'Mean Squared Error (XGBoost Regression - Best Model): {mse_best}')
print(f'Mean Absolute Error (XGBoost Regression - Best Model): {mae_best}')
print(f'R-squared Score (XGBoost Regression - Best Model): {r2_best}')

In [None]:
# Predicted vs. actual target values for the XGBoost model.
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred_best4, color='blue', marker='o', label='Actual vs. Predicted')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Scatter Plot of Actual vs. Predicted Values')
# A perfect prediction lies on y = x, so both coordinates of the reference
# line come from the actual values. Using the predictions' min/max for the
# y coordinates would draw a line over the predictions' own range instead.
actual_range = [min(y_test), max(y_test)]
plt.plot(actual_range, actual_range, linestyle='--', color='red', label='Perfect Prediction')
plt.legend()
plt.show()

### ***ANN regression***

In [None]:
# Fully connected regression network: seven ReLU hidden layers of halving
# width (256 down to 4), each followed by 20% dropout, and a single linear
# output unit.
model = keras.Sequential()

model.add(layers.Dense(256, activation='relu', input_shape=(X_train.shape[1],)))
model.add(layers.Dropout(0.2))
for hidden_units in (128, 64, 32, 16, 8, 4):
    model.add(layers.Dense(hidden_units, activation='relu'))
    model.add(layers.Dropout(0.2))
model.add(layers.Dense(1, activation='linear'))

# 'accuracy' is a classification metric and carries no meaning for a
# continuous target, so mean absolute error is tracked alongside the MSE
# loss instead.
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

# 20% of the training rows are held back by Keras for per-epoch validation.
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

In [None]:
# Score the network on the held-out split; the prediction output is
# flattened to one dimension before it is compared with y_test.
y_pred = model.predict(X_test).ravel()

mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f'Mean Squared Error (ANN Regression): {mse}')
print(f'Mean Absolute Error (ANN Regression): {mae}')
print(f'R-squared Score (ANN Regression): {r2}')