In [None]:
# Data collection
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
# Path the script was originally written against; kept as the default so
# existing ``load_data()`` calls behave exactly as before.
_DEFAULT_CSV_PATH = r"C:\Users\dines\Downloads\House Price India\House Price India.csv"


def load_data(file_path=None):
    """Read the house-price CSV into a DataFrame.

    Args:
        file_path: Location of the CSV file. When omitted, the original
            hard-coded path is used; pass another path to run the pipeline
            on a different machine or dataset.

    Returns:
        The DataFrame produced by ``pandas.read_csv``.
    """
    if file_path is None:
        file_path = _DEFAULT_CSV_PATH
    df = pd.read_csv(file_path)
    print("Data collection completed", df.shape)
    return df

# Data preprocessing
def preprocess_data(df):
    """Clean the raw frame and drop the columns not used for modelling.

    Steps, in order:
      1. Fill missing values in numeric columns with that column's median.
      2. Remove duplicate rows.
      3. Remove every row that has a value outside ``[Q1 - 1.5*IQR,
         Q3 + 1.5*IQR]`` in any numeric column.
      4. Drop the fixed list of unused columns.

    The input frame is not modified; a new frame is returned.

    Args:
        df: Raw DataFrame. It must contain every column named in the drop
            list below, otherwise ``DataFrame.drop`` raises ``KeyError``.

    Returns:
        The cleaned DataFrame.
    """
    # One pass fills every numeric column. numeric_only=True keeps this
    # working when the frame also holds text columns.
    df = df.fillna(df.median(numeric_only=True))

    # Remove duplicates (returns a new frame instead of mutating the input).
    df = df.drop_duplicates()

    # Outlier handling: the IQR rule is only defined for numeric columns.
    numeric = df.select_dtypes(include="number")
    q1 = numeric.quantile(0.25)
    q3 = numeric.quantile(0.75)
    iqr = q3 - q1
    below = numeric < (q1 - 1.5 * iqr)
    above = numeric > (q3 + 1.5 * iqr)
    # NOTE: the filter runs before the unused columns are dropped, so those
    # columns also decide which rows survive. A column whose IQR is 0 flags
    # every row that differs from its quartile value.
    df = df[~(below | above).any(axis=1)]

    # Drop the columns that are not used as features.
    df = df.drop(columns=["id","Date","lot area","number of floors","waterfront present","number of views","condition of the house",
                        "Area of the basement","Built Year","Renovation Year","Postal Code","Longitude","lot_area_renov",
                         "Number of schools nearby","Distance from the airport","Area of the house(excluding basement)","living_area_renov"])
    return df

def Perform_EDA(df):
    """Print summary statistics and show and print the correlation matrix.

    The correlation matrix is computed once and reused for both the heatmap
    and the printed table.

    Args:
        df: DataFrame to explore.

    Returns:
        The same DataFrame, unchanged, so the call can sit in the pipeline.
    """
    print("Statistics", df.describe())

    # Identify the correlation between features.
    corr_matrix = df.corr()

    plt.figure(figsize=(20, 15))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
    plt.title("Feature Correlation")
    plt.show()

    print(corr_matrix)
    return df
    
def spilt_data(df):
    """Separate the ``Price`` target from the features and split 80/20.

    Args:
        df: DataFrame containing a ``Price`` column plus feature columns.

    Returns:
        ``(x_train, x_test, y_train, y_test)`` as produced by
        ``train_test_split`` with ``test_size=0.2`` and ``random_state=42``.
    """
    features = df.drop(columns=["Price"])
    target = df["Price"]
    parts = train_test_split(features, target, test_size=0.2, random_state=42)
    print("Feature selection completed.")
    features_train, features_test, target_train, target_test = parts
    return features_train, features_test, target_train, target_test
    
def model_train_linear(x_train,x_test,y_train):
    """Standardise the features and fit a ``LinearRegression`` model.

    The scaler is fitted on the training features only and then applied to
    the test features.

    Returns:
        ``(fitted model, scaled test features, fitted scaler)``.
    """
    scaler = StandardScaler()
    train_scaled, test_scaled = scaler.fit_transform(x_train), scaler.transform(x_test)

    # ``fit`` returns the estimator itself, so it can be chained.
    regressor = LinearRegression().fit(train_scaled, y_train)
    print("Model training completed.")
    return regressor, test_scaled, scaler
    
def model_train_ridge(x_train, x_test, y_train, alpha=10):
    """Standardise the features and fit a ``Ridge`` regression model.

    Args:
        x_train: Training features; the scaler is fitted on these.
        x_test: Test features; transformed with the fitted scaler.
        y_train: Training target.
        alpha: Regularisation strength passed to ``Ridge``.

    Returns:
        ``(fitted model, scaled test features, fitted scaler)``.
    """
    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(x_train)
    scaled_test = scaler.transform(x_test)

    regressor = Ridge(alpha=alpha).fit(scaled_train, y_train)
    print("Ridge Regression model training completed with alpha =", alpha)
    return regressor, scaled_test, scaler
def model_XGboost(x_train, x_test, y_train):
    """Standardise the features, fit an ``XGBRegressor`` and plot importances.

    After fitting, a feature-importance chart (``importance_type="weight"``,
    at most 10 features) is drawn and shown.

    Returns:
        ``(fitted model, scaled test features, fitted scaler)``.
    """
    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(x_train)
    scaled_test = scaler.transform(x_test)

    hyperparams = {
        "objective": "reg:squarederror",
        "n_estimators": 100,
        "learning_rate": 0.1,
        "max_depth": 6,
        "random_state": 42,
    }
    regressor = xgb.XGBRegressor(**hyperparams)
    regressor.fit(scaled_train, y_train)

    # NOTE(review): the model is fitted on the scaler's output rather than the
    # original frame, so the chart may not show the column names - confirm.
    xgb.plot_importance(regressor, importance_type="weight", max_num_features=10)
    plt.show()
    return regressor, scaled_test, scaler

def model_Randomforest(x_train, x_test, y_train):
    """Standardise the features and fit a ``RandomForestRegressor``.

    Returns:
        ``(fitted model, scaled test features, fitted scaler)``.
    """
    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(x_train)
    scaled_test = scaler.transform(x_test)

    forest_settings = {
        "n_estimators": 100,  # number of trees in the forest
        "max_depth": 10,      # maximum depth of each tree
        "random_state": 42,   # fixed seed for reproducibility
        "n_jobs": -1,         # use all available CPU cores
    }
    forest = RandomForestRegressor(**forest_settings)
    forest.fit(scaled_train, y_train)
    return forest, scaled_test, scaler
    
def model_evalution(model,X_test_scaled,y_test):
    """Predict on the scaled test features and print MSE and R² scores.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test_scaled: Test features, already scaled.
        y_test: True target values for the test features.
    """
    predicted = model.predict(X_test_scaled)
    mse, r2 = mean_squared_error(y_test, predicted), r2_score(y_test, predicted)
    print(f"Model Evaluation:\nMSE: {mse:.2f}\nR² Score: {r2:.2f}")
  
# Pipeline driver: load -> preprocess -> explore -> split, then train and
# evaluate each model on the same train/test split.
df=load_data()
df=preprocess_data(df)
df=Perform_EDA(df)
x_train, x_test, y_train, y_test=spilt_data(df)

# Train and evaluate the Linear Regression model.
# NOTE: `scaler` is rebound by the XGBoost and random-forest calls below, so
# after the script finishes it refers to the random-forest run's scaler.
linear_model,X_test_linear_scaled,scaler=model_train_linear(x_train,x_test,y_train)
print("Linear Regression Results:")
model_evalution(linear_model, X_test_linear_scaled, y_test)

# Train and evaluate the Ridge Regression model with alpha=100, overriding the
# function's default of 10 (change alpha as needed).
ridge_model, X_test_ridge_scaled, _ = model_train_ridge(x_train, x_test, y_train, alpha=100)
print("Ridge Regression Results:")
model_evalution(ridge_model, X_test_ridge_scaled, y_test)

# Train and evaluate the XGBoost regression model (hyperparameters are fixed
# inside model_XGboost; it also shows a feature-importance plot).
XGBoost_model, X_test_XGBoost_scaled,scaler = model_XGboost(x_train, x_test, y_train)
print("XGboost Regression Results:")
model_evalution(XGBoost_model, X_test_XGBoost_scaled, y_test)

# Train and evaluate the random-forest regression model (hyperparameters are
# fixed inside model_Randomforest).
Randomforest_model, X_test_REF_scaled,scaler = model_Randomforest(x_train, x_test, y_train)
print("Randomforest Regression Results:")
model_evalution(Randomforest_model, X_test_REF_scaled, y_test)



    