In [89]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
import joblib
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.model_selection import learning_curve
import numpy as np
from sklearn.impute import SimpleImputer

In [90]:
def Data_Preparation(dataset_path, k=10, random_state=0, scale_target=True):
    """Load a house-price CSV and build a model-ready feature matrix and target.

    Steps, in order:
      1. Read the CSV and drop the ``Id`` column when present.
      2. Fill missing ``SalePrice`` values with the column mean.
      3. Fill missing categorical (object-dtype) values with ``'Missing'`` and
         drop rows that still have a missing numeric value.
      4. One-hot encode the categorical columns.
      5. Standardize every numeric column (one-hot columns included).
      6. Drop rows whose target lies outside the 1.5 * IQR fences.
      7. Apply ``log1p`` to the target and drop rows where that yields NaN.
      8. Rank features with mutual information and print the top ``k``.

    Parameters
    ----------
    dataset_path : str or path-like
        Location of the CSV file; it must contain a ``SalePrice`` column.
    k : int, default 10
        Number of top-ranked features to report. Clamped to the number of
        available features so small datasets do not make SelectKBest raise.
    random_state : int or None, default 0
        Seed forwarded to ``mutual_info_regression`` so the printed ranking is
        reproducible between runs. Pass ``None`` for unseeded behaviour.
    scale_target : bool, default True
        When True (legacy behaviour) ``SalePrice`` is standardized together
        with the features *before* ``log1p``; standardized values below -1
        then become NaN and those rows are dropped. Pass False to leave the
        target on its original scale so ``log1p`` is well defined.
        NOTE(review): the default is kept for compatibility with models that
        were fitted on the legacy target; confirm how the persisted model was
        trained before switching.

    Returns
    -------
    X : pandas.DataFrame
        All standardized features (not only the top ``k``); the feature
        ranking is informational only and is printed, not applied.
    y : pandas.Series
        ``log1p`` of the (optionally standardized) ``SalePrice``, aligned
        with ``X`` by index.

    Raises
    ------
    KeyError
        If the CSV has no ``SalePrice`` column.
    ValueError
        If no rows remain after dropping missing numeric values.
    """
    import warnings  # local import: not part of the notebook's import cell

    # Load dataset
    dataset = pd.read_csv(dataset_path)
    if 'SalePrice' not in dataset.columns:
        raise KeyError("'SalePrice' column not found in dataset")

    # Drop unnecessary columns
    if 'Id' in dataset.columns:
        dataset = dataset.drop(columns=['Id'])

    # Handle missing values in SalePrice (Mean Imputation)
    dataset['SalePrice'] = dataset['SalePrice'].fillna(dataset['SalePrice'].mean())

    # List Categorical Features
    is_object = dataset.dtypes == 'object'
    object_cols = is_object[is_object].index.tolist()
    print("Categorical variables:")
    print(object_cols)
    print('No. of. categorical features: ', len(object_cols))

    # Impute missing values in categorical columns with 'Missing'
    if object_cols:
        dataset[object_cols] = dataset[object_cols].fillna('Missing')

    # Drop rows with missing values in numerical columns (if any)
    non_object_cols = [col for col in dataset.columns if col not in object_cols]
    new_dataset = dataset.dropna(subset=non_object_cols)
    if new_dataset.empty:
        raise ValueError("No rows left after dropping missing numeric values")

    # One-hot encoding (skipped when there is nothing to encode, because
    # OneHotEncoder refuses a frame with zero columns)
    df_final = new_dataset.drop(columns=object_cols)
    if object_cols:
        OH_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        OH_cols = pd.DataFrame(
            OH_encoder.fit_transform(new_dataset[object_cols]),
            index=new_dataset.index,
            columns=OH_encoder.get_feature_names_out(),
        )
        df_final = pd.concat([df_final, OH_cols], axis=1)

    # Standardization
    numeric_features = df_final.select_dtypes(include=['number']).columns
    if not scale_target:
        # Keep the target in its original units so log1p stays well defined.
        numeric_features = numeric_features.drop('SalePrice', errors='ignore')
    if len(numeric_features) > 0:
        scaler = StandardScaler()
        df_final[numeric_features] = scaler.fit_transform(df_final[numeric_features])

    # Define X (features) and y (target) before outlier handling
    X = df_final.drop(columns=['SalePrice'])
    y = df_final['SalePrice']

    # Handle outliers using IQR method (bounds are inclusive)
    Q1, Q3 = y.quantile(0.25), y.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    inlier_mask = (y >= lower_bound) & (y <= upper_bound)
    X = X[inlier_mask]
    y = y[inlier_mask]

    # Apply log transformation. log1p is undefined below -1, which happens for
    # a standardized target; silence numpy's generic warning and report the
    # number of affected rows explicitly instead.
    with np.errstate(invalid='ignore', divide='ignore'):
        y = np.log1p(y)
    n_invalid = int(y.isna().sum())
    if n_invalid:
        warnings.warn(
            f"np.log1p produced NaN for {n_invalid} target value(s) below -1; "
            "the corresponding rows are dropped.",
            RuntimeWarning,
            stacklevel=2,
        )
    y = y.dropna()
    X = X.loc[y.index]

    # Rank features by mutual information; seeded so the ranking is repeatable
    def _mi_score(features, target):
        return mutual_info_regression(features, target, random_state=random_state)

    n_select = min(k, X.shape[1])
    selector = SelectKBest(score_func=_mi_score, k=n_select)
    selector.fit(X, y)

    selected_features = X.columns[selector.get_support()]
    print(f"Top {n_select} Important Features:\n", selected_features.tolist())

    return X, y

In [96]:
# Example usage:
# Build the feature matrix and target from the training CSV stored on Drive.
dataset_path = "/content/drive/MyDrive/train.csv"
X, y = Data_Preparation(dataset_path)

# Load trained model outside the function
# NOTE(review): joblib.load unpickles arbitrary objects; only load files you trust.
model_path = "/content/drive/My Drive/lasso_model.pkl"
model = joblib.load(model_path)
print("Model loaded successfully!")

# Score every prepared row with the loaded model.
predictions = model.predict(X)

# Report evaluation metrics whenever a target is available.
if y is not None:
    from sklearn.metrics import mean_absolute_percentage_error, r2_score

    mape_value = mean_absolute_percentage_error(y, predictions)
    print(f"MAPE: {mape_value:.4f}")

    r2_value = r2_score(y, predictions)
    print(f"R² Score: {r2_value:.4f}")

# Show only a short preview instead of dumping the whole prediction array.
print("Predictions:", predictions[:5])  # Print first 5 predictions

Categorical variables:
['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
No. of. categorical features:  43


  result = getattr(ufunc, method)(*inputs, **kwargs)


Top 10 Important Features:
 ['OverallQual', 'YearBuilt', 'TotalBsmtSF', 'GrLivArea', 'FullBath', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'ExterQual_Gd', 'ExterQual_TA']
Model loaded successfully!
MAPE: 2.1415
R² Score: 0.0398
Predictions: [-0.06855849 -0.13861112 -0.06855849 -0.06855849  0.00149413]


