In [9]:
import os
from itertools import combinations

import joblib
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [3]:
def find_best_features(df, target_col, feature_cols=None):
    """Pick the best single feature and the best feature pair by hold-out MSE.

    A LinearRegression is fitted on an 80/20 train/test split (random_state=42)
    for every candidate feature and for every 2-feature combination; the
    candidates with the lowest test-set mean squared error win.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the candidate feature columns and ``target_col``.
    target_col : str
        Name of the column to predict.
    feature_cols : iterable of str, optional
        Candidate feature columns. Defaults to ``['x1', 'x2', 'x3', 'x4']``.

    Returns
    -------
    tuple
        ``(best_single_feature, best_two_features)`` where the first item is a
        column name and the second is a list of column names, ordered with the
        individually stronger (lower single-feature MSE) column first. When
        fewer than two candidates are supplied the list holds whatever
        candidates exist.

    Raises
    ------
    ValueError
        If ``feature_cols`` is empty.
    KeyError
        If a requested column is missing from ``df`` (raised by pandas).
    """
    if feature_cols is None:
        feature_cols = ['x1', 'x2', 'x3', 'x4']
    feature_cols = list(feature_cols)
    if not feature_cols:
        raise ValueError("feature_cols must contain at least one column")

    # Split once and reuse the same train/test rows for every candidate, so
    # all scores are computed on an identical partition.
    X_train, X_test, y_train, y_test = train_test_split(
        df[feature_cols], df[target_col], test_size=0.2, random_state=42
    )

    def holdout_mse(cols):
        """Fit on the training rows using only ``cols``; return test-set MSE."""
        model = LinearRegression()
        model.fit(X_train[cols], y_train)
        return mean_squared_error(y_test, model.predict(X_test[cols]))

    # Evaluate each feature individually
    mse_scores = {col: holdout_mse([col]) for col in feature_cols}
    best_single_feature = min(mse_scores, key=mse_scores.get)

    # Evaluate every pair jointly: the two best individual features are not
    # necessarily the best pair (e.g. when they carry redundant information).
    pair_scores = {
        pair: holdout_mse(list(pair)) for pair in combinations(feature_cols, 2)
    }
    if pair_scores:
        best_pair = min(pair_scores, key=pair_scores.get)
        best_two_features = sorted(best_pair, key=mse_scores.get)
    else:
        # Only one candidate: no pair can be formed.
        best_two_features = sorted(mse_scores, key=mse_scores.get)[:2]

    return best_single_feature, best_two_features

In [4]:
def train_model(features, target, model_name):
    """Train a linear regression model, report its hold-out MSE, and save it.

    The data is split 80/20 (random_state=42); the model is fitted on the
    training part and its mean squared error on the test part is printed.
    The fitted model is then written to ``models/<model_name>.pkl`` (the
    ``models`` directory is created if needed).

    Parameters
    ----------
    features : pandas.DataFrame
        Feature columns to train on.
    target : pandas.Series
        Target values aligned with ``features``.
    model_name : str
        Label used in the printed report and as the saved file's base name.

    Returns
    -------
    LinearRegression
        The fitted model, so callers can keep using it without reloading the
        file from disk.
    """
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
    model = LinearRegression()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    print(f"{model_name} - MSE: {mse:.4f}")

    # Save the model
    os.makedirs("models", exist_ok=True)
    joblib.dump(model, f"models/{model_name}.pkl")

    # Callers assign the result (model = train_model(...)), so hand the model back.
    return model

In [5]:
# Input CSV, resolved relative to the notebook's working directory.
data_path = "sampregdata.csv"
# Parse the file into the DataFrame that the following cells operate on.
df = pd.read_csv(filepath_or_buffer=data_path)

In [6]:
# Ask find_best_features for its single-feature and two-feature picks for target 'y'.
best_single_feature, best_two_features = find_best_features(df, target_col='y')

In [7]:
# Report both selections in one call; sep="\n" keeps each on its own line.
print(
    f"Best single feature: {best_single_feature}",
    f"Best two features: {best_two_features}",
    sep="\n",
)

Best single feature: x4
Best two features: ['x4', 'x2']


In [10]:
# Train and save the single-feature model. Kept active so that a fresh
# Restart & Run All creates models/linear_model_x1.pkl before it is loaded.
model1 = train_model(df[[best_single_feature]], df['y'], "linear_model_x1")

linear_model_x1 - MSE: 76.8527


In [11]:
# Train and save the two-feature model. Kept active so that a fresh
# Restart & Run All creates models/linear_model_x1_x2.pkl before it is loaded.
model2 = train_model(df[best_two_features], df['y'], "linear_model_x1_x2")

linear_model_x1_x2 - MSE: 53.6211


In [13]:
# Load the previously saved models from the models/ directory.
# Both files must already exist on disk, otherwise joblib.load fails.
# NOTE(review): joblib files are pickle-based, so loading one can execute
# arbitrary code — only load model files you created or trust.
model_x1 = joblib.load("models/linear_model_x1.pkl")
model_x1_x2 = joblib.load("models/linear_model_x1_x2.pkl")

In [14]:
# Make predictions and calculate MSE.
# y_true holds the target column for every row of df; the scoring cells
# below compare model predictions against it.
y_true = df["y"]

In [15]:
# Score the loaded single-feature model on every row of df. This is an MSE
# over the full frame, not over a held-out split.
single_feature_inputs = df[[best_single_feature]]
y_pred_x1 = model_x1.predict(single_feature_inputs)
mse_x1 = mean_squared_error(y_true=y_true, y_pred=y_pred_x1)
print(f"Model 1 (X1) - MSE: {mse_x1:.4f}")

Model 1 (X1) - MSE: 83.2400


In [16]:
# Score the loaded two-feature model on every row of df. This is an MSE
# over the full frame, not over a held-out split.
two_feature_inputs = df[best_two_features]
y_pred_x1_x2 = model_x1_x2.predict(two_feature_inputs)
mse_x1_x2 = mean_squared_error(y_true=y_true, y_pred=y_pred_x1_x2)
print(f"Model 2 (X1 & X2) - MSE: {mse_x1_x2:.4f}")

Model 2 (X1 & X2) - MSE: 55.8585


In [17]:
# Final status message. It is printed unconditionally and does not itself
# verify that the model files exist.
print("Models have been trained and saved successfully.")

Models have been trained and saved successfully.
