In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler # preprocess
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression 
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Read the prepared CSV and one-hot encode its categorical columns in one step.
# drop_first=True asks pandas to emit k-1 indicator columns per k-level category.
dataset = pd.get_dummies(pd.read_csv("prep.csv"), drop_first=True)

# Separate the dependent variable (target) from the independent variables.
# NOTE(review): 'classification_yes' is presumably the dummy column produced
# from a yes/no 'classification' field in prep.csv -- confirm against the data.
y = dataset['classification_yes']
X = dataset.drop(columns='classification_yes')

# Function to split and scale data
def split_and_scale(X, y, *, test_size=0.25, random_state=0):
    """Split ``X``/``y`` into train and test parts and standardize the features.

    The scaler is fitted on the training features only and then applied to
    the test features, so the test rows do not contribute to the scaling
    statistics.

    Args:
        X: Feature data, passed straight to ``train_test_split``.
        y: Target values aligned row-for-row with ``X``.
        test_size: Forwarded to ``train_test_split``. Defaults to 0.25, the
            value this function previously hard-coded.
        random_state: Forwarded to ``train_test_split`` to make the shuffle
            reproducible. Defaults to 0, the previously hard-coded value.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``. The two feature
        parts are the outputs of ``StandardScaler``; the two target parts
        are returned exactly as ``train_test_split`` produced them.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    scaler = StandardScaler()
    # Fit on the training part only; reuse those statistics for the test part.
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test, y_train, y_test

# Function to compute R2 score
def compute_r2(model, X_test, y_test):
    """Return the R2 score of ``model``'s predictions for ``X_test``.

    Args:
        model: An object exposing ``predict(X)``; it is only queried here,
            not fitted.
        X_test: Features to predict on.
        y_test: True target values to compare the predictions against.

    Returns:
        The value of ``r2_score(y_test, model.predict(X_test))``.
    """
    return r2_score(y_test, model.predict(X_test))

# Function to train models
def train_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and score it on the test data.

    ``model.fit`` is called on the object that was passed in, so the caller's
    estimator is the one that gets trained.

    Args:
        model: An object exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        y_train: Training targets.
        X_test: Held-out features used for scoring.
        y_test: Held-out targets used for scoring.

    Returns:
        The R2 score reported by ``compute_r2`` for the test data.
    """
    model.fit(X_train, y_train)
    test_r2 = compute_r2(model, X_test, y_test)
    return test_r2

# Apply RFE to select top features
def rfe_feature_selection(X, y, num_features):
    """Run recursive feature elimination with four fixed estimators.

    Args:
        X: Feature table; it must expose ``.columns`` because the selected
            column labels are looked up there.
        y: Target values aligned with ``X``.
        num_features: Number of features each RFE run should keep.

    Returns:
        A dict keyed by ``str(estimator)`` whose values are the entries of
        ``X.columns`` that the corresponding RFE run kept, in the order the
        estimators are listed below.
    """
    estimators = (
        LinearRegression(),
        SVR(kernel='linear'),
        DecisionTreeRegressor(random_state=0),
        RandomForestRegressor(n_estimators=10, random_state=0),
    )

    def _kept_columns(estimator):
        # fit() returns the fitted selector; support_ is its boolean mask of
        # retained features, used here to index the column labels.
        rfe = RFE(estimator, n_features_to_select=num_features).fit(X, y)
        return X.columns[rfe.support_]

    return {str(estimator): _kept_columns(estimator) for estimator in estimators}

# Pick the 3 features each estimator's RFE run keeps.
# NOTE(review): RFE is fitted on the full X/y here, i.e. before the train/test
# split performed below, so the held-out rows take part in choosing the
# features -- confirm whether that is intended.
selected_features = rfe_feature_selection(X, y, 3)

# Show which columns were kept for each estimator.
for model, features in selected_features.items():
    print(f"{model}: {list(features)}")

# Fresh estimators configured like the ones used inside rfe_feature_selection;
# their str() form is the key used to look up the matching feature subset.
models = [
    LinearRegression(),
    SVR(kernel='linear'),
    DecisionTreeRegressor(random_state=0),
    RandomForestRegressor(n_estimators=10, random_state=0),
]

# Fit every estimator on its own feature subset and record the test R2.
results = {}
for model in models:
    model_key = str(model)
    X_selected = X[selected_features[model_key]]
    X_train, X_test, y_train, y_test = split_and_scale(X_selected, y)
    r2 = train_model(model, X_train, y_train, X_test, y_test)
    results[model_key] = r2

# Print the scores, four decimals each.
print("\nModel Performance (R2 Scores):")
for model, score in results.items():
    print(f"{model}: {score:.4f}")


LinearRegression(): ['sg_c', 'sg_d', 'sg_e']
SVR(kernel='linear'): ['sg_c', 'sg_d', 'sg_e']
DecisionTreeRegressor(random_state=0): ['hrmo', 'sg_c', 'sg_d']
RandomForestRegressor(n_estimators=10, random_state=0): ['al', 'hrmo', 'sg_d']

Model Performance (R2 Scores):
LinearRegression(): 0.4420
SVR(kernel='linear'): 0.2622
DecisionTreeRegressor(random_state=0): 0.9660
RandomForestRegressor(n_estimators=10, random_state=0): 0.8873
