In [18]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [19]:
# Read the practice dataset into a DataFrame, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer='practice_linear_regression_dataset.csv')
df.head(n=5)

Unnamed: 0,Age,Experience,EducationYears,HoursPerWeek,Certifications,Projects,Salary
0,56,7,14,46,1,1,70017
1,46,23,19,46,0,0,68168
2,32,10,15,53,4,6,67490
3,60,16,16,40,4,7,76764
4,25,7,13,49,0,6,65783


In [None]:
# Correlation of every feature with Salary, strongest first.
# The target's own entry is dropped from the ranking.
corr = (
    df.corr()
    .loc[:, "Salary"]
    .drop(labels="Salary")
    .sort_values(ascending=False)
)
print(corr)

Experience        0.656428
Age               0.354878
EducationYears    0.333485
HoursPerWeek      0.255432
Projects          0.218445
Certifications    0.148275
Name: Salary, dtype: float64


In [None]:
# Search-Based Feature Selection using filter evaluation
# forward feature selection with redundancy penalty.

def subset_score(features, data, target_name="Salary"):
    """Score a feature subset as relevance minus redundancy.

    relevance  = mean |corr(feature, target)| over the subset
    redundancy = mean |corr(feature_i, feature_j)| over distinct feature pairs

    Parameters
    ----------
    features : sequence
        Column labels of ``data`` that make up the candidate subset.
    data : pandas.DataFrame
        Frame holding both the candidate features and the target column.
    target_name : str, default "Salary"
        Label of the target column in ``data``.

    Returns
    -------
    number
        ``0`` for an empty subset, otherwise ``relevance - redundancy``.
    """
    features = list(features)  # accept tuples / Index objects as well as lists
    if not features:  # empty subset
        return 0

    subset = data[features]
    # Relevance uses the magnitude of the correlation: a strongly negative
    # predictor is as informative as a strongly positive one, and this matches
    # the absolute values used for the redundancy term below.
    relevance = subset.corrwith(data[target_name]).abs().mean()

    n_features = len(features)
    if n_features > 1:
        corr_matrix = subset.corr().abs()
        # Each non-constant feature correlates 1.0 with itself, so subtracting
        # n_features removes the diagonal; n*(n-1) is the number of
        # off-diagonal (ordered) pairs that remain.
        redundancy = (corr_matrix.values.sum() - n_features) / (n_features * (n_features - 1))
    else:
        redundancy = 0
    return relevance - redundancy

# Candidate pool: every column except the target. Nothing is selected yet.
available = df.columns.drop("Salary").tolist()
selected = []

while available:
    # Score of the subset chosen so far; a candidate must beat this to be kept.
    baseline = subset_score(selected, df)

    # Find the first candidate whose addition yields the highest score.
    best_feature, best_score = None, -np.inf
    for candidate in available:
        candidate_score = subset_score(selected + [candidate], df)
        if candidate_score > best_score:
            best_feature, best_score = candidate, candidate_score

    # Stopping rule: quit as soon as the best candidate fails to improve.
    if not best_score > baseline:
        break
    selected.append(best_feature)
    available.remove(best_feature)

print("Selected features (Search + Filter):", selected)

Selected features (Search + Filter): ['Experience']


In [None]:
# Search-Based Feature Selection using filter evaluation
# Backward Elimination Version.

def subset_score(features, data, target_name="Salary"):
    """Score a feature subset as relevance minus redundancy.

    relevance  = mean |corr(feature, target)| over the subset
    redundancy = mean |corr(feature_i, feature_j)| over distinct feature pairs

    Parameters
    ----------
    features : sequence
        Column labels of ``data`` that make up the candidate subset.
    data : pandas.DataFrame
        Frame holding both the candidate features and the target column.
    target_name : str, default "Salary"
        Label of the target column in ``data``.

    Returns
    -------
    number
        ``0`` for an empty subset, otherwise ``relevance - redundancy``.
    """
    features = list(features)  # accept tuples / Index objects as well as lists
    if not features:  # empty subset
        return 0

    subset = data[features]
    # Relevance uses the magnitude of the correlation: a strongly negative
    # predictor is as informative as a strongly positive one, and this matches
    # the absolute values used for the redundancy term below.
    relevance = subset.corrwith(data[target_name]).abs().mean()

    n_features = len(features)
    if n_features > 1:
        corr_matrix = subset.corr().abs()
        # Each non-constant feature correlates 1.0 with itself, so subtracting
        # n_features removes the diagonal; n*(n-1) is the number of
        # off-diagonal (ordered) pairs that remain.
        redundancy = (corr_matrix.values.sum() - n_features) / (n_features * (n_features - 1))
    else:
        redundancy = 0
    return relevance - redundancy


# Start with all features selected
selected = df.drop(columns="Salary").columns.tolist()

# Greedy backward elimination: on each pass, drop the single feature whose
# removal raises the subset score the most; stop when no removal helps.
while True:
    worst_feature = None
    best_score_after_removal = subset_score(selected, df)  # current score to beat

    for f in selected:
        # Evaluate the score of the subset with this one feature left out
        score_without_f = subset_score([feat for feat in selected if feat != f], df)
        if score_without_f > best_score_after_removal:
            best_score_after_removal = score_without_f
            worst_feature = f

    # Compare against None explicitly: a falsy column label (e.g. 0 or "")
    # is still a valid feature and must be removable.
    if worst_feature is None:
        break  # no single removal improves the score
    selected.remove(worst_feature)

print("Selected features (Backward Elimination):", selected)


Selected features (Backward Elimination): ['Experience']


In [None]:
# Target vector: the Salary column.
y = df['Salary']
# Feature matrix: every column except Salary.
X = df.drop(columns=['Salary'])

In [None]:
# Train - Test Split

# Seeded generator so the shuffle (and therefore the split) is repeatable.
rng = np.random.default_rng(42)

# Positional row indices 0..n-1, shuffled in place.
row_positions = np.arange(len(X))
rng.shuffle(row_positions)

# First 80% of the shuffled positions go to train, the remainder to test.
split_point = int(0.8 * len(X))
X_train_idx = row_positions[:split_point]
X_test_idx = row_positions[split_point:]

# Apply the same positions to features and target so rows stay aligned.
X_train = X.iloc[X_train_idx]
y_train = y.iloc[X_train_idx]
X_test = X.iloc[X_test_idx]
y_test = y.iloc[X_test_idx]


In [None]:
# Check range of features

# Checking min/max of all columns before scaling
def check_feature_range(col):
    """Return ``(smallest, largest)`` for the values in ``col``.

    ``col`` may be any re-iterable collection of comparable values
    (for example a pandas Series or a list).
    """
    smallest = min(col)
    largest = max(col)
    return smallest, largest

# Print "<column>  (min, max)" for every training feature.
for feature_name in X_train.columns:
    feature_range = check_feature_range(X_train[feature_name])
    print(feature_name + "  " + str(feature_range))


Age  (18, 64)
Experience  (0, 39)
EducationYears  (10, 19)
HoursPerWeek  (20, 59)
Certifications  (0, 4)
Projects  (0, 9)


In [None]:
# Feature scaling using MinMaxScaler
minmax_scaler = MinMaxScaler()

# Fit the scaler on the training split only, and scale that split in one step.
X_train_scaled = minmax_scaler.fit_transform(X_train)

# Wrap an independent copy of the scaled array in a DataFrame that reuses the
# training column labels and row index, so a few rows can be inspected.
X_train_scaled_copy = pd.DataFrame(
    X_train_scaled.copy(),
    columns=X_train.columns,
    index=X_train.index,
)

# Show the first two scaled rows.
print(X_train_scaled_copy.head(2))

         Age  Experience  EducationYears  HoursPerWeek  Certifications  \
59  0.760870    0.692308        0.777778      0.333333            0.00   
21  0.804348    1.000000        0.222222      0.769231            0.25   

    Projects  
59  0.444444  
21  0.222222  


In [None]:
# Scale the test features with the scaler that was already fitted on the
# training split. Only transform() is called here (no second fit), so the
# test rows are rescaled with the training-derived parameters and do not
# influence them.
X_test_scaled = minmax_scaler.transform(X_test)