# In [None]:  (Jupyter cell marker left over from the notebook export)
#importing all essential libraries

from numpy import *
from pandas import *
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error,mean_absolute_error,root_mean_squared_error,r2_score

# Load the training and test CSV files into DataFrames.
train=read_csv("/content/Advtrain.csv")
test=read_csv("/content/Advtest.csv")


# Dividing data into numerical and categorical columns.
# The column lists are derived from the training frame only and reused for
# the test frame: the imputers are fitted on the training columns, so the
# test frame has to be transformed with exactly the same columns in the same
# order, even if pandas inferred a different dtype for a column in the test file.
train_num_cols = train.select_dtypes(include=["int64","float64"]).columns

# 'SalePrice' is the target (used as y later), so it is excluded from imputation.
if 'SalePrice' in train_num_cols:
    train_num_cols = train_num_cols.drop('SalePrice')
test_num_cols = train_num_cols

# Numeric columns: fill missing values with the column mean learned on train.
num_imputer=SimpleImputer(strategy="mean")
train[train_num_cols]=num_imputer.fit_transform(train[train_num_cols])
test[test_num_cols]=num_imputer.transform(test[test_num_cols])

# Categorical (object dtype) columns: fill missing values with the most
# frequent value learned on train.
train_cat_cols=train.select_dtypes(include=["object"]).columns
test_cat_cols=train_cat_cols
cat_imputer=SimpleImputer(strategy="most_frequent")
train[train_cat_cols]=cat_imputer.fit_transform(train[train_cat_cols])
test[test_cat_cols]=cat_imputer.transform(test[test_cat_cols])

# Plot the raw SalePrice distribution.
# Each plot gets its own figure so the two histograms never share axes.
plt.figure()
sns.histplot(train["SalePrice"],kde=True,color="blue")
plt.title("Distribution of SalePrice")
plt.xlabel("SalePrice")
plt.ylabel("Frequency")
# Save BEFORE show(): once show() returns the current figure may already be
# closed, and a later savefig() would then write an empty image.
plt.savefig("Distribution of SalePrice.png")
plt.show()

# Plot the distribution of log(1 + SalePrice) to visualise the effect of a
# log transform on the skew. Only the plotted copy is transformed here;
# train["SalePrice"] itself is left unchanged.
after_skew=log1p(train["SalePrice"])
plt.figure()
sns.histplot(after_skew,kde=True,color="blue")
plt.xlabel("SalePrice")
plt.ylabel("Frequency")
plt.title("Distribution of SalePrice after log transformation")
plt.savefig("Distribution of SalePrice after log transformation.png")
plt.show()


# New features, added to BOTH frames so train and test keep the same columns.
# TotalSF: basement + first floor + second floor area.
train["TotalSF"]=train["TotalBsmtSF"]+train["1stFlrSF"]+train["2ndFlrSF"]
test["TotalSF"]=test["TotalBsmtSF"]+test["1stFlrSF"]+test["2ndFlrSF"]
# TotalPorchSF: sum of the four porch area columns.
train["TotalPorchSF"]=train["OpenPorchSF"]+train["EnclosedPorch"]+train["3SsnPorch"]+train["ScreenPorch"]
test["TotalPorchSF"]=test["OpenPorchSF"]+test["EnclosedPorch"]+test["3SsnPorch"]+test["ScreenPorch"]


# Stack train (without the target) on top of test so every categorical column
# is encoded over the labels that occur in either frame.
all_data=concat([train.drop("SalePrice",axis=1),test])
#print(all_data)
le=LabelEncoder()


# Replace each categorical column with integer codes. The single encoder is
# refitted for every column, so its fitted state after the loop belongs to the
# last column only.
for col in train_cat_cols:
  all_data[col]=le.fit_transform(all_data[col])


# The training rows come first in all_data, so the first train.shape[0] rows
# are the training features; the target is taken from the train frame.
x=all_data.iloc[:train.shape[0],:]
y=train["SalePrice"]

# Hold out 20% of the rows BEFORE fitting any transformer, so the scaler and
# PCA learn their statistics from the training rows only and nothing about the
# hold-out rows leaks into preprocessing.
x_train_raw,x_test_raw,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

#Scaling: fit on the training rows, then apply the same scaling to the hold-out rows
scalar=StandardScaler()
x_train_scaled=scalar.fit_transform(x_train_raw)
x_test_scaled=scalar.transform(x_test_raw)

#PCA: components are learned from the scaled training rows only
pca=PCA(n_components=50)
x_train=pca.fit_transform(x_train_scaled)
x_test=pca.transform(x_test_scaled)

# Full feature matrix pushed through the fitted transformers; kept so the
# names x_scaled / x_pca remain available to any later code.
x_scaled=scalar.transform(x)
x_pca=pca.transform(x_scaled)

# Linear Regression: fit on the training split and report hold-out metrics.
print("Linear Regression:")
lr = LinearRegression().fit(x_train, y_train)

# Predictions for the hold-out rows (printed in full).
y_pred_lr = lr.predict(x_test)
print("predcition of linear regression:", y_pred_lr)

# Compute every metric first, then print them in a fixed order.
mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr = sqrt(mse_lr)
r2_lr = r2_score(y_test, y_pred_lr)
for _metric_name, _metric_value in (
    ("mean_squared_error of lr:", mse_lr),
    ("root_mean_squared of lr:", rmse_lr),
    ("r2_score of lr:", r2_lr),
):
    print(_metric_name, _metric_value)

# Two blank lines separate this section's output from the next one.
print("\n")

# Ridge Regression: a default-parameter baseline followed by a grid search.
print("Ridge Regression:")

# Baseline model with default hyperparameters.
rr = Ridge()
rr.fit(x_train, y_train)
y_pred_rr = rr.predict(x_test)
mse_rr = mean_squared_error(y_test, y_pred_rr)
rmse_rr = sqrt(mse_rr)
r2_rr = r2_score(y_test, y_pred_rr)
print("metrics before hyperparameter tuning:")
print("mean_squared_error of rr:", mse_rr)
print("root_mean_squared of rr:", rmse_rr)
print("r2_score of rr:", r2_rr)
print()

# 5-fold grid search over regularisation strength, solver and iteration cap,
# scored by negative MSE.
print("using hyperparameter tuning:")
param_grid_rr = dict(
    alpha=[0.01, 0.1, 1],
    solver=["auto", "saga"],
    max_iter=[500, 1000, 5000],
)
grid_rr = GridSearchCV(
    estimator=rr,
    param_grid=param_grid_rr,
    cv=5,
    scoring="neg_mean_squared_error",
)
grid_rr.fit(x_train, y_train)
print("best parameters:", grid_rr.best_params_)
print("best score:", grid_rr.best_score_)
print()

# Hold-out metrics from the search's predict() (the refitted best estimator).
print("metrics after hyperparameter tuning:")
y_tune_pred_rr = grid_rr.predict(x_test)
mse_tune_rr = mean_squared_error(y_test, y_tune_pred_rr)
rmse_tune_rr = sqrt(mse_tune_rr)
r2_tune_rr = r2_score(y_test, y_tune_pred_rr)
print("mean_squared_error:", mse_tune_rr)
print("root_mean_squared_error:", rmse_tune_rr)
print("r2_score:", r2_tune_rr)

# Two blank lines separate this section's output from the next one.
print("\n")

#Lasso Regression: a default-parameter baseline followed by a grid search.
print("Lasso Regression:")
# random_state seeds the coefficient order used when selection="random" is
# tried by the grid search below, so repeated runs give the same result.
lsr=Lasso(random_state=42)
lsr.fit(x_train,y_train)
y_pred_lsr=lsr.predict(x_test)
mse_lsr=mean_squared_error(y_test,y_pred_lsr)
# The labels name this model (lsr); they previously said "rr", copied from the
# Ridge section.
print("metrics before hyperparameter tuning:")
print("mean_squared_error of lsr:",mse_lsr)
print("root_mean_squared of lsr:",sqrt(mse_lsr))
print("r2_score of lsr:",r2_score(y_test,y_pred_lsr))
print()
print("using hyperparameter tuning:")
# 5-fold grid search over regularisation strength, coordinate selection rule
# and iteration cap, scored by negative MSE.
param_grid_lsr={"alpha":[0.01,0.1,1],
                "selection":["cyclic","random"],
                "max_iter":[500,1000,5000]}
grid_lsr=GridSearchCV(lsr,param_grid_lsr,cv=5,scoring="neg_mean_squared_error")
grid_lsr.fit(x_train,y_train)
print("best parameters:",grid_lsr.best_params_)
print("best score:",grid_lsr.best_score_)
print()
# Hold-out metrics from the search's predict() (the refitted best estimator).
print("metrics after hyperparameter tuning:")
y_tune_pred_lsr=grid_lsr.predict(x_test)
mse_tune_lsr=mean_squared_error(y_test,y_tune_pred_lsr)
print("mean_squared_error:",mse_tune_lsr)
print("root_mean_squared_error:",sqrt(mse_tune_lsr))
print("r2_score:",r2_score(y_test,y_tune_pred_lsr))
print()
print()

#Random forest Regression: a default-parameter baseline followed by a grid search.
print("Randomforest Regressor:")
# A fixed random_state makes the forest (and the grid search that clones it)
# reproducible between runs, matching the seeded train/test split.
rfr=RandomForestRegressor(random_state=42)
rfr.fit(x_train,y_train)
y_pred_rfr=rfr.predict(x_test)
mse_rfr=mean_squared_error(y_test,y_pred_rfr)
# The labels name this model (rfr); they previously said "rr", copied from the
# Ridge section.
print("metrics before hyperparameter tuning:")
print("mean_squared_error of rfr:",mse_rfr)
print("root_mean_squared of rfr:",sqrt(mse_rfr))
print("r2_score of rfr:",r2_score(y_test,y_pred_rfr))
print()
print("using hyperparameter tuning:")
# 5-fold grid search over forest size, tree depth and the number of features
# considered per split, scored by negative MSE.
param_grid_rfr={"n_estimators":[50,100],
                "max_depth":[3,5],
                "max_features":["sqrt","log2"]}
grid_rfr=GridSearchCV(rfr,param_grid_rfr,cv=5,scoring="neg_mean_squared_error")
grid_rfr.fit(x_train,y_train)
print("best parameters:",grid_rfr.best_params_)
print("best score:",grid_rfr.best_score_)
print()
# Hold-out metrics from the search's predict() (the refitted best estimator).
print("metrics after hyperparameter tuning:")
y_tune_pred_rfr=grid_rfr.predict(x_test)
mse_tune_rfr=mean_squared_error(y_test,y_tune_pred_rfr)
print("mean_squared_error:",mse_tune_rfr)
print("root_mean_squared_error:",sqrt(mse_tune_rfr))
print("r2_score:",r2_score(y_test,y_tune_pred_rfr))
print()
print()

#predicting the model with new data
new_data = DataFrame([{
    "OverallQual": 7,
    "GrLivArea": 1500,
    "OverallCond":5,
    "GarageCars": 2,
    "GarageArea": 450,
    "TotalBsmtSF": 800,
    "1stFlrSF": 900,
    "FullBath": 2,
    "TotRmsAbvGrd": 6,
    "YearBuilt": 2005,
    "YearRemodAdd": 2007,
    "MasVnrArea": 150,
    "ExterQual": 3,       # e.g., Po=0, Fa=1, TA=2, Gd=3, Ex=4
    "KitchenQual": 3,     # same mapping as above
    "BsmtFinSF1": 500,
    "LotFrontage": 70,
    "BedroomAbvGr": 3,
    "Fireplaces": 1,
    "GarageYrBlt": 2005,
    "MSSubClass": 60,
    "LotArea": 8500,
    "BsmtUnfSF": 300,
    "BsmtFullBath": 1,
    "WoodDeckSF": 120,
    "OpenPorchSF": 50,
    "HalfBath": 1,
    "EnclosedPorch": 0,
    "2ndFlrSF": 500,
    "ScreenPorch": 0,
    "BsmtFinSF2": 0,
    "PoolArea": 0,
    "MiscVal": 0,
    "YrSold": 2008,
    "MoSold": 5,
    "3SsnPorch": 0,
    "BsmtHalfBath": 0,
    "LowQualFinSF": 0,
    "BsmtFinType1": 4,    # e.g., GLQ=4, ALQ=3, BLQ=2, Rec=1, LwQ=0, Unf=5
    "BsmtExposure": 1,    # No=1, Mn=2, Av=3, Gd=4
    "BsmtQual": 3,        # same mapping as ExterQual
    "GarageCond": 2,      # Po=0, Fa=1, TA=2, Gd=3, Ex=4
    "GarageQual": 2,      # same mapping
    "HeatingQC": 4,       # Po=0, Fa=1, TA=2, Gd=3, Ex=4
    "CentralAir": 1,      # N=0, Y=1
    "KitchenAbvGr": 1,
    "Functional": 0,      # Typ=0, Min1=1, Min2=2, etc.
    "Exterior1st": 5,     # manually map each category to int
    "Exterior2nd": 5,
    "Neighborhood": 10,   # manually map neighborhoods
    "SaleCondition": 1 }])   # e.g., Normal=1, Abnorml=2, Partial=3

#use the above data for predicting new saleprice after doimg all preprocessing

# Scatter of hold-out SalePrice against the tuned Ridge predictions
# (y_tune_pred_rr comes from the Ridge grid search).
plt.figure(figsize=(10,6))
plt.scatter(y_test,y_tune_pred_rr,color="blue",label="predicted vs actual")
plt.xlabel("Actual SalePrice")
plt.ylabel("Predicted SalePrice")
plt.title("Actual vs Predicted SalePrice")
# The label passed to scatter() is only drawn once a legend is requested.
plt.legend()
# Save before show() so the written file contains the drawn figure.
plt.savefig("actual vs predicted.png")
plt.show()