# Ans1-

In [19]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVR


In [2]:
# Read the Bengaluru housing CSV into a DataFrame and preview its first rows
data_path = "/content/Bengaluru_House_Data.csv"
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [9]:
# Drop the columns that are not used as model features.
# errors='ignore' keeps this cell idempotent: re-running it after the columns
# have already been removed is a no-op instead of raising KeyError.
# NOTE: this also means a misspelled column name is skipped silently, so keep
# the list below in sync with the CSV header.
columns_to_exclude = ['size', 'society', 'balcony']
df = df.drop(columns=columns_to_exclude, errors='ignore')


In [13]:
# Handle null values: numeric 'bath' gets the column median, categorical
# 'location' gets the most frequent value (first mode).
# The filled Series is assigned back to the column instead of calling
# fillna(..., inplace=True) on df[col]: that form mutates an intermediate
# object, emits a FutureWarning on recent pandas 2.x, and leaves df unchanged
# once Copy-on-Write is enabled.
df['bath'] = df['bath'].fillna(df['bath'].median())
df['location'] = df['location'].fillna(df['location'].mode()[0])

In [14]:
# The target is the 'price' column; the features are every remaining column
y = df['price']
X = df.drop(columns=['price'])

In [15]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [16]:
# Count the missing values remaining in each column
df.isna().sum()

area_type       0
availability    0
location        0
total_sqft      0
bath            0
price           0
dtype: int64

In [20]:
# Partition the feature columns by dtype: object columns are treated as
# categorical, everything else as numeric
categorical_columns = X.select_dtypes(include=['object']).columns
numeric_columns = X.select_dtypes(exclude=['object']).columns

# Numeric branch: fill gaps with the column mean
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its own branch
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_columns),
    ('cat', categorical_transformer, categorical_columns),
])

In [22]:
# Train a linear-kernel SVM regressor behind the preprocessing step and report
# MAE, MSE, RMSE and R-squared on the held-out test set.
# Requires numpy as `np` (imported with the other dependencies at the top of
# the notebook); without that import the RMSE line raises NameError on a fresh
# kernel.
svm_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', SVR(kernel='linear')),
])

# Fitting the pipeline fits the preprocessor on the training rows only
svm_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = svm_model.predict(X_test)

# Evaluate the model using regression metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is in the same units as the target (price)
r2 = r2_score(y_test, y_pred)

# Print the results
print(f'Mean Absolute Error (MAE): {mae}')
print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'R-squared (R2): {r2}')

Mean Absolute Error (MAE): 44.01809697352625
Mean Squared Error (MSE): 16303.82119031062
Root Mean Squared Error (RMSE): 127.68641740729755
R-squared (R2): 0.23422221144137056


# Ans2-

If your primary goal is to predict house prices with high accuracy, MSE would be a more suitable metric to guide model evaluation and improvement. Keep in mind that you can still consider other metrics, such as MAE (Mean Absolute Error), to get a more comprehensive view of your model's performance.

# Ans3-

When outliers are present, using MAE as a regression metric can provide a more robust assessment of the model's performance.

# Ans4-

If both MSE and RMSE are very close, the choice between them may not be critical. You might choose based on interpretability, sensitivity to outliers, or stakeholder preferences.

# Ans5-

In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import r2_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

# Reuses X and the X_train / X_test / y_train / y_test split created in the
# earlier cells.

# Preprocessing: numeric columns pass through unchanged, object columns are
# one-hot encoded (categories unseen during fit are ignored)
numeric_features = X.select_dtypes(exclude=['object']).columns
categorical_features = X.select_dtypes(include=['object']).columns
preprocessor = ColumnTransformer([
    ('num', 'passthrough', numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])

# Fit one SVR pipeline per kernel and report its R-squared on the test set
kernels = ['linear', 'poly', 'rbf']
for kernel in kernels:
    svm_model = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', SVR(kernel=kernel)),
    ])

    # fit() returns the fitted pipeline, so prediction can be chained onto it
    y_pred = svm_model.fit(X_train, y_train).predict(X_test)

    r2 = r2_score(y_test, y_pred)
    print(f'R-squared (R2) for {kernel} kernel: {r2}')


R-squared (R2) for linear kernel: 0.23422221144137056
R-squared (R2) for poly kernel: -0.017841537518802575
R-squared (R2) for rbf kernel: 0.20268274453743806


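Here, SVM regression models are trained with different kernels and their performance is evaluated using R-squared on the test set. The kernel with the highest R-squared value indicates the model that explains the variance in the target variable the best; in the run above that is the linear kernel (R2 ≈ 0.234), followed by rbf (≈ 0.203), while the poly kernel scores below zero.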