In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold, RFECV
from sklearn.metrics import accuracy_score
diabetes_df = pd.read_csv('/content/sample_data/diabetes.csv')


missing_values = diabetes_df.isnull().mean()
diabetes_filtered = diabetes_df.loc[:, missing_values < 0.3]


X = diabetes_filtered.drop(columns=['Outcome'])
y = diabetes_filtered['Outcome']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


logistic_model = LogisticRegression(max_iter=200)
logistic_model.fit(X_train, y_train)


y_pred = logistic_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy after Missing Value Filter (Diabetes): {accuracy:.4f}')


Accuracy after Missing Value Filter (Diabetes): 0.7359


In [6]:
corr_matrix = diabetes_filtered.corr()

high_corr_pairs = corr_matrix[corr_matrix.abs() > 0.8].stack().index.tolist()


cols_to_drop = set()


for col1, col2 in high_corr_pairs:
    if col1 != col2:
        cols_to_drop.add(col2)

diabetes_filtered_corr = diabetes_filtered.drop(columns=cols_to_drop)

X_corr = diabetes_filtered_corr.drop(columns=['Outcome'])
y_corr = diabetes_filtered_corr['Outcome']

X_train_corr, X_test_corr, y_train_corr, y_test_corr = train_test_split(X_corr, y_corr, test_size=0.3, random_state=42)

logistic_model_corr = LogisticRegression(max_iter=200)
logistic_model_corr.fit(X_train_corr, y_train_corr)

y_pred_corr = logistic_model_corr.predict(X_test_corr)

accuracy_corr = accuracy_score(y_test_corr, y_pred_corr)
print(f'Accuracy after High Correlation Filter (Diabetes): {accuracy_corr:.4f}')


Accuracy after High Correlation Filter (Diabetes): 0.7359


In [8]:
from sklearn.feature_selection import RFECV

rfecv_selector = RFECV(estimator=LogisticRegression(max_iter=200), step=1, cv=5)
rfecv_selector.fit(X, y)


X_train_selected, X_test_selected, y_train_selected, y_test_selected = train_test_split(X_selected, y, test_size=0.3, random_state=42)

logistic_model_selected = LogisticRegression(max_iter=200)
logistic_model_selected.fit(X_train_selected, y_train_selected)

y_pred_selected = logistic_model_selected.predict(X_test_selected)

accuracy_selected = accuracy_score(y_test_selected, y_pred_selected)
print(f'Accuracy after Forward Feature Selection (Diabetes): {accuracy_selected:.4f}')


Accuracy after Forward Feature Selection (Diabetes): 0.7359


In [10]:
from sklearn.tree import DecisionTreeClassifier
import numpy as np

decision_tree_model = DecisionTreeClassifier()

decision_tree_model.fit(X_train, y_train)

feature_importances = decision_tree_model.feature_importances_
sorted_indices = np.argsort(feature_importances)

remaining_features = list(X.columns)
print(f"Initial Number of Features: {len(remaining_features)}")

while len(remaining_features) > 5:

    least_important_feature_index = sorted_indices[0]
    feature_to_drop = remaining_features[least_important_feature_index]

    X = X.drop(columns=[feature_to_drop])
    remaining_features.remove(feature_to_drop)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    decision_tree_model.fit(X_train, y_train)

    feature_importances = decision_tree_model.feature_importances_
    sorted_indices = np.argsort(feature_importances)

print(f"Final Number of Features After Elimination: {len(remaining_features)}")


Initial Number of Features: 5
Final Number of Features After Elimination: 5


In [12]:

sorted_importance_indices = np.argsort(feature_importances)[::-1]

num_features_to_display = min(5, len(remaining_features))

print("Top Important Features:")

for i in range(num_features_to_display):
    if i < len(feature_importances):
        feature_name = remaining_features[sorted_importance_indices[i]]
        importance_value = feature_importances[sorted_importance_indices[i]]
        print(f"{i + 1}. Feature '{feature_name}': {importance_value:.4f}")


Top Important Features:
1. Feature 'Glucose': 0.3619
2. Feature 'BMI': 0.2167
3. Feature 'DiabetesPedigreeFunction': 0.1719
4. Feature 'Age': 0.1658
5. Feature 'BloodPressure': 0.0838


In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold, RFE, SelectFromModel
from sklearn.metrics import mean_squared_error

housing_data = pd.read_csv('/content/sample_data/melbourne_housing_raw.csv')

missing_pct = housing_data.isnull().mean() * 100
cols_to_remove = [col for col in missing_pct.index if missing_pct[col] > 20 and col != 'Price']
cleaned_data = housing_data.drop(columns=cols_to_remove)
cleaned_data = cleaned_data.dropna(subset=['Price'])

X = cleaned_data.drop(columns=['Price', 'Date', 'Suburb', 'Type', 'Method', 'SellerG', 'CouncilArea', 'Regionname'])
y = cleaned_data['Price']

X.fillna(X.mean(), inplace=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

corr_matrix = X_train.corr().abs()
high_corr_pairs = np.where(corr_matrix > 0.85)
high_corr_features = set([X_train.columns[i] for i in high_corr_pairs[0] if i != high_corr_pairs[1][i]])

X_train_corr_filtered = X_train.drop(columns=high_corr_features)
X_test_corr_filtered = X_test.drop(columns=high_corr_features)

variance_filter = VarianceThreshold(threshold=0.01)
X_train_low_var = variance_filter.fit_transform(X_train_corr_filtered)
X_test_low_var = variance_filter.transform(X_test_corr_filtered)

linear_reg_model = LinearRegression()
rfe_forward_selector = RFE(estimator=linear_reg_model, n_features_to_select=5, step=1)
rfe_forward_selector.fit(X_train_low_var, y_train)
X_train_forward_selected = rfe_forward_selector.transform(X_train_low_var)
X_test_forward_selected = rfe_forward_selector.transform(X_test_low_var)

rf_model_backward = RandomForestRegressor(random_state=42)
rfe_backward_selector = RFE(estimator=rf_model_backward, n_features_to_select=5, step=1)
rfe_backward_selector.fit(X_train_low_var, y_train)
X_train_backward_selected = rfe_backward_selector.transform(X_train_low_var)
X_test_backward_selected = rfe_backward_selector.transform(X_test_low_var)

rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)
important_feature_selector = SelectFromModel(rf_model, threshold="mean", prefit=True)
X_train_rf_selected = important_feature_selector.transform(X_train)
X_test_rf_selected = important_feature_selector.transform(X_test)

def evaluate_model(X_train, X_test, y_train, y_test):
    model = RandomForestRegressor(random_state=42)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    return mse

results = {
    "Baseline (No Feature Selection)": evaluate_model(X_train, X_test, y_train, y_test),
    "High Correlation Filter": evaluate_model(X_train_corr_filtered, X_test_corr_filtered, y_train, y_test),
    "Low Variance Filter": evaluate_model(X_train_low_var, X_test_low_var, y_train, y_test),
    "Forward Selection": evaluate_model(X_train_forward_selected, X_test_forward_selected, y_train, y_test),
    "Backward Elimination": evaluate_model(X_train_backward_selected, X_test_backward_selected, y_train, y_test),
    "Random Forest Selection": evaluate_model(X_train_rf_selected, X_test_rf_selected, y_train, y_test)
}

for method, mse_value in results.items():
    print(f"{method}: MSE = {mse_value:.4f}")




Baseline (No Feature Selection): MSE = 143875373039.6255
High Correlation Filter: MSE = 143875373039.6255
Low Variance Filter: MSE = 143875373039.6255
Forward Selection: MSE = 143875373039.6255
Backward Elimination: MSE = 143875373039.6255
Random Forest Selection: MSE = 147811757361.1377
