In [1]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score


# Load the dataset
file_path = '/content/Melbourne_housing_FULL.csv'
melbourne_data = pd.read_csv(file_path)

# Handle missing data by dropping rows with missing target values (Price)
melbourne_data_clean = melbourne_data.dropna(subset=['Price'])
X_full = melbourne_data_clean.drop(['Price'], axis=1)
y_full = melbourne_data_clean['Price']

# Select only numeric columns for simplicity
X_numeric = X_full.select_dtypes(exclude=['object'])

# Handle missing values in the numeric columns
imputer = SimpleImputer(strategy='mean')
X_numeric_imputed = imputer.fit_transform(X_numeric)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_numeric_imputed, y_full, test_size=0.2, random_state=42)

# Initialize the linear regression model
model = LinearRegression()

# Implement forward feature selection using SequentialFeatureSelector
sfs = SequentialFeatureSelector(model, direction='forward', n_features_to_select='auto', scoring='neg_mean_absolute_error', cv=5)

# Fit the selector on the training data
sfs.fit(X_train, y_train)

# Get the selected features
selected_features = sfs.get_support(indices=True)

# Train the model on the selected features
X_train_selected = X_train[:, selected_features]
X_test_selected = X_test[:, selected_features]
model.fit(X_train_selected, y_train)

# Make predictions and evaluate the model
y_pred = model.predict(X_test_selected)
mae = mean_absolute_error(y_test, y_pred)
r2=r2_score(y_test, y_pred)


# Output the results
print("Selected features:", selected_features)
print("Mean Absolute Error after feature selection:", mae)
print("R2 score:", r2)



Selected features: [ 0  1  2  8  9 10]
Mean Absolute Error after feature selection: 310442.66414210177
R2 score: 0.4398394043768179
