Implement initial model training with scikit-learn



In [1]:
#Implement initial model training with scikit-learn
try:
    import numpy as np
    import pandas as pd
    from sklearn.linear_model import LinearRegression

    print("Libraries imported successfully!")
except Exception as e:
    print(f"Error importing libraries: {e}")


Libraries imported successfully!


In [2]:
# Toy dataset on the line y = 2x: X is a (5, 1) feature column, y the 1-D targets.
try:
    X = np.arange(1, 6).reshape(-1, 1)   # 1..5 as a single-feature 2D array
    y = 2 * X.ravel()                    # 2, 4, 6, 8, 10

    print("Data created successfully!")
except Exception as exc:
    print(f"Error creating data: {exc}")


Data created successfully!


In [3]:
# Fit a scikit-learn LinearRegression on the toy data and report its parameters.
try:
    model = LinearRegression()
    model.fit(X, y)
    print("Model trained successfully!")
    # θ0 is the fitted intercept, θ1 the coefficient of the single feature.
    # (The θ glyph had been mis-encoded as "Î¸" in these messages.)
    print(f"θ0 (Intercept): {model.intercept_}")
    print(f"θ1 (Coefficient): {model.coef_[0]}")
except Exception as e:
    print(f"Error during training: {e}")


Model trained successfully!
θ0 (Intercept): 0.0
θ1 (Coefficient): 2.0


In [4]:
# Relies on `model` having been fitted in an earlier cell.

try:
    # Read one raw feature value from the user; float() rejects non-numeric text.
    x_value = float(input("Enter a value for X: "))

    # predict() is given a 2D (n_samples, n_features) input, so the scalar
    # is wrapped as a single row with a single column.
    x_array = [[x_value]]
    y_pred = model.predict(x_array)

    print(f"Prediction for X = {x_value} is Y = {y_pred[0]}")

except Exception as exc:
    print(f"Error during prediction: {exc}")


Prediction for X = 2.0 is Y = 4.0


Implement initial model training without scikit-learn

In [5]:
import numpy as np

# Step 1: Dataset (targets lie exactly on y = 2x)
X = np.array([1, 2, 3, 4, 5])
y = np.array([2, 4, 6, 8, 10])
m_samples = len(X)

# Step 2: Initialize parameters (floats from the start; they become floats
# after the first update anyway)
theta_0 = 0.0  # Intercept
theta_1 = 0.0  # Slope
alpha = 0.01   # Learning rate
iterations = 1000

# Step 3: Batch gradient descent on J(θ) = (1 / 2m) * Σ (h_θ(x) - y)²
for _ in range(iterations):
    # Hypothesis function h_θ(x) = θ0 + θ1 * x, evaluated for every sample
    y_pred = theta_0 + theta_1 * X

    # Errors
    error = y_pred - y

    # Gradients (derivatives of cost w.r.t parameters)
    d_theta0 = (1/m_samples) * np.sum(error)      # ∂/∂θ0 J(θ)
    d_theta1 = (1/m_samples) * np.sum(error * X)  # ∂/∂θ1 J(θ)

    # Simultaneous update: both gradients were computed before either
    # parameter is changed.
    theta_0 -= alpha * d_theta0
    theta_1 -= alpha * d_theta1

# Step 4: Final Parameters
print(f"Final θ0 (intercept): {theta_0}")
print(f"Final θ1 (slope): {theta_1}")

# Step 5: Prediction — evaluate the fitted line at a user-supplied X
x_value = float(input("Enter a value for X: "))
y_prediction = theta_1 * x_value + theta_0  # slope * x + intercept
print(f"Prediction for X = {x_value} is Y = {y_prediction}")

Final θ0 (intercept): 0.09475321533750963
Final θ1 (slope): 1.9737548787242036


Prediction for X = 4.0 is Y = 7.989772730234324


Enhance data cleaning and tune epochs for accurate predictions

In [1]:
# ====== 1. IMPORT LIBRARIES ======
import os
try:
    from model_storage import ModelStorage
    import numpy as np
    import pandas as pd
    from tkinter import filedialog
    import tkinter as tk
    print("Libraries imported successfully!")
except Exception as e:
    print(f"Error importing libraries: {e}")

# Utility: choose by number OR exact name
def choose_column(df: pd.DataFrame, prompt: str) -> str:
    """Ask the user to pick a column of ``df`` by 1-based position or exact name.

    Args:
        df: Frame whose ``columns`` are the valid choices.
        prompt: Text passed to ``input`` when asking for the choice.

    Returns:
        The selected column label, taken from ``df.columns``.

    Raises:
        ValueError: If the reply is a number outside ``1..len(df.columns)``
            that is not itself a column name, or a name that is not in
            ``df.columns``.
    """
    choice = input(prompt).strip()
    columns = df.columns

    # isdecimal() only accepts characters that int() can parse. isdigit()
    # also accepts e.g. superscript digits, which made int() fail with an
    # unrelated "invalid literal" message.
    if choice.isdecimal():
        idx = int(choice) - 1
        if 0 <= idx < len(columns):
            return columns[idx]
        # A header that is itself numeric (e.g. "2020") stays selectable by
        # name when the number is not a valid position.
        if choice in columns:
            return choice
        raise ValueError("Column number out of range.")

    if choice not in columns:
        raise ValueError(f"Column name '{choice}' not in CSV columns.")
    return choice

# ====== 2. LOAD CSV ======
root = None  # stays None if the Tk window is never created, so cleanup can skip it
try:
    # Hidden root window flagged always-on-top (presumably so the file
    # dialog is not buried behind the notebook window).
    root = tk.Tk()
    root.withdraw()
    root.attributes('-topmost', True)
    root.update()
    root.after(100)  # no callback given: just pauses ~100 ms

    # Open a file picker dialog to select the CSV file
    file_name = filedialog.askopenfilename(filetypes=[("CSV files", "*.csv")])
    if not file_name:
        raise FileNotFoundError("No file selected.")

    # Read the CSV file using pandas
    df = pd.read_csv(file_name)

    print("\nCSV loaded successfully!")
    print(f"Using DB at: {os.path.abspath('1models.db')}")
    print("\nAvailable columns:")
    for idx, col in enumerate(df.columns, 1):
        print(f"{idx}. {col}")

    # Ask user which columns to use (number OR name)
    x_col = choose_column(df, "\nEnter the number OR name for X (Input feature): ")
    y_col = choose_column(df, "Enter the number OR name for Y (Target variable): ")

    # Convert chosen columns to numeric, coercing invalid values to NaN
    df[x_col] = pd.to_numeric(df[x_col], errors='coerce')
    df[y_col] = pd.to_numeric(df[y_col], errors='coerce')

    # Remove rows where either selected column failed to parse
    df = df.dropna(subset=[x_col, y_col])
    if df.empty:
        raise ValueError("No valid numeric data in selected columns after cleaning.")

    print(f"\nSelected X: {x_col}, Y: {y_col}")
except Exception as e:
    print(f"Error loading CSV: {e}")
finally:
    # Best-effort teardown of the hidden window. A failure here is reported
    # rather than silently swallowed, and must not mask the load result.
    if root is not None:
        try:
            root.attributes('-topmost', False)
            root.destroy()
        except Exception as cleanup_err:
            print(f"Warning: could not close the file dialog window: {cleanup_err}")

# ====== 3. DATA CLEANING (drop duplicates + mild outlier removal) ======
try:
    df = df.drop_duplicates()

    # Tukey fences (1.5 * IQR), applied one column at a time: the fences for
    # the second column are computed on rows that survived the first filter.
    for col in (x_col, y_col):
        first_quartile = df[col].quantile(0.25)
        third_quartile = df[col].quantile(0.75)
        spread = third_quartile - first_quartile
        low_fence = first_quartile - 1.5 * spread
        high_fence = third_quartile + 1.5 * spread
        # between() is inclusive on both ends by default
        df = df[df[col].between(low_fence, high_fence)]

    df = df.reset_index(drop=True)
    if df.empty:
        raise ValueError("All rows removed by cleaning; relax outlier filter.")
    print("\nData cleaned successfully!")
except Exception as exc:
    print(f"Error cleaning data: {exc}")

# ====== 4. NORMALIZE (store original stats) ======
try:
    # Keep the raw-scale statistics: predictions are mapped back with them later.
    x_mean_original = df[x_col].mean()
    x_std_original  = df[x_col].std()
    y_mean_original = df[y_col].mean()
    y_std_original  = df[y_col].std()

    # Series.std() defaults to ddof=1, so a single remaining row (or an empty
    # frame) gives NaN rather than 0. NaN passes the isclose() check below and
    # would silently turn every normalized value into NaN.
    if not (np.isfinite(x_std_original) and np.isfinite(y_std_original)):
        raise ValueError("Std of X or Y is undefined; need at least two rows to normalize.")

    if np.isclose(x_std_original, 0.0) or np.isclose(y_std_original, 0.0):
        raise ValueError("Std of X or Y is zero; cannot normalize.")

    # Z-score both columns into new columns; the original columns are left untouched.
    df['X_norm'] = (df[x_col] - x_mean_original) / x_std_original
    df['Y_norm'] = (df[y_col] - y_mean_original) / y_std_original

    print("\nNormalization complete!")
except Exception as e:
    print(f"Error in normalization: {e}")

# ====== 5. SHUFFLE & SPLIT ======
try:
    # Fixed-seed shuffle so the split is the same on every run
    shuffled = df.sample(frac=1, random_state=42)
    df = shuffled.reset_index(drop=True)

    train_frac = 1.0  # set to 0.8 if you want a test set
    train_size = int(train_frac * len(df))
    train_data, test_data = df.iloc[:train_size].copy(), df.iloc[train_size:].copy()

    # Pull the normalized feature/target columns out as plain arrays
    X_train, y_train = (train_data[name].to_numpy() for name in ('X_norm', 'Y_norm'))
    X_test, y_test = (test_data[name].to_numpy() for name in ('X_norm', 'Y_norm'))

    print(f"\nData split: Train={len(train_data)}, Test={len(test_data)}")
except Exception as exc:
    print(f"Error splitting data: {exc}")

# ====== 6. GRADIENT DESCENT ======
# Parameters live outside the try so later cells still find them (as zeros)
# when training fails.
theta_0 = 0.0
theta_1 = 0.0
try:
    alpha = 0.01              # learning rate
    m = len(X_train)
    epochs = 0
    max_epochs = 5000         # hard cap on the number of updates
    tolerance = 0             # stop once the cost changes by no more than this
    prev_cost = float('inf')

    # Fail with a clear message instead of a bare ZeroDivisionError from 1 / (2 * m).
    if m == 0:
        raise ValueError("Training set is empty; nothing to fit.")

    while epochs < max_epochs:
        y_pred = theta_0 + theta_1 * X_train
        error = y_pred - y_train
        cost = (1 / (2 * m)) * np.sum(error ** 2)   # half mean squared error

        # "<=" rather than "<": abs() is never negative, so with tolerance = 0
        # the strict comparison could never be true and this branch was dead.
        # Now tolerance = 0 means "stop when the cost stops changing".
        if abs(prev_cost - cost) <= tolerance:
            print(f"\nConverged at epoch {epochs}, cost={cost}")
            break
        prev_cost = cost

        # Gradients of the cost w.r.t. each parameter
        d_theta0 = (1/m) * np.sum(error)
        d_theta1 = (1/m) * np.sum(error * X_train)

        # Both gradients are computed before either parameter is updated
        theta_0 -= alpha * d_theta0
        theta_1 -= alpha * d_theta1
        epochs += 1

        if epochs % 500 == 0:
            print(f"Epoch {epochs}, Cost={cost}")

    print(f"\nTraining complete in {epochs} epochs")
    print(f"θ0 (intercept): {theta_0}")
    print(f"θ1 (slope): {theta_1}")
except Exception as e:
    print(f"Error during training: {e}")

# ====== 6b. STORE MODEL SAFELY ======
model_id = None
storage = None  # stays None if ModelStorage() itself fails, so cleanup can skip close()
try:
    storage = ModelStorage()  # default: 1models.db in current directory
    model_id = storage.add_model(
        user_id="alice14423",      # TODO: replace with real logged-in user id
        file_path=file_name,     # actual selected file
        x_col=x_col,
        y_col=y_col,
        theta0=theta_0,
        theta1=theta_1,
        epochs=epochs,
        tolerance=float(tolerance)
    )
    print(f"\nModel stored with ID: {model_id}")
    # Optional: fetch back to verify
    rec = storage.get_model(model_id)
    print(f"🔎 Stored row: {rec}")
except Exception as db_err:
    print(f"\nFailed to store model in DB: {db_err}")
finally:
    # Close only a storage object that was actually created, and report a
    # failed close instead of hiding it behind a bare except.
    if storage is not None:
        try:
            storage.close()
        except Exception as close_err:
            print(f"Warning: could not close model storage: {close_err}")

# ====== 7. TESTING ======
try:
    n_test = len(X_test)
    if n_test == 0:
        # train_frac = 1.0 leaves nothing held out
        print("\nNo test set (all data used for training).")
    else:
        # Mean squared error of the fitted line on the held-out, normalized data
        y_test_pred = theta_0 + theta_1 * X_test
        squared_residuals = (y_test_pred - y_test) ** 2
        mse_test = (1 / n_test) * np.sum(squared_residuals)
        print(f"\nTest MSE (normalized space): {mse_test}")
except Exception as exc:
    print(f"Error during testing: {exc}")

# ====== 8. PREDICTION FUNCTION ======
def predict_single(raw_x: float) -> float:
    """Predict Y on the original scale for one raw X value.

    The input is z-scored with the stored X mean/std, passed through the
    fitted line ``theta_0 + theta_1 * x`` in normalized space, and the result
    is scaled back with the stored Y mean/std. All of these statistics and
    parameters are read from module-level variables.

    Args:
        raw_x: Feature value in the original (un-normalized) units.

    Returns:
        The predicted target as a plain Python float.
    """
    z_score = (raw_x - x_mean_original) / x_std_original
    return float((theta_0 + theta_1 * z_score) * y_std_original + y_mean_original)

# Example predictions
# Print predictions for a few sample raw X values.
try:
    test_values = [10, 20, 50]
    print("\nPredictions:")
    for val in test_values:
        # The arrow had been mis-encoded as "â†’" in this message.
        print(f"X={val} → Predicted Y={predict_single(val)}")
except Exception as e:
    print(f"Error during prediction: {e}")


Libraries imported successfully!
CSV loaded successfully!
Data cleaned successfully!
Z-score normalization done (original values preserved)!
Data shuffled and split into training/testing sets!
Training complete in 80 epochs
θ0 (intercept): -8.802595336650447e-05
θ1 (slope): 0.9997589485124853
Test MSE (normalized space): 7.226075964522702e-08
Predicted Y (original scale): 2.040406366369183


Final Predictions using scikit-learn

In [None]:
# ====== 1. IMPORT LIBRARIES ======
try:
    import numpy as np
    import pandas as pd
    from tkinter import filedialog
    import tkinter as tk
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_squared_error
    print("Libraries imported successfully!")
except Exception as e:
    print(f"Error importing libraries: {e}")

# ====== 2. LOAD CSV ======
root = None  # stays None if the Tk window is never created, so cleanup can skip it
try:
    # Hidden root window flagged always-on-top (presumably so the file
    # dialog is not buried behind the notebook window).
    root = tk.Tk()
    root.withdraw()
    root.attributes('-topmost', True)
    root.update()
    root.after(100)  # no callback given: just pauses ~100 ms

    # Open a file picker dialog to select the CSV file
    file_name = filedialog.askopenfilename(filetypes=[("CSV files", "*.csv")])
    if not file_name:
        raise FileNotFoundError("No file selected.")

    # Read the CSV file using pandas
    df = pd.read_csv(file_name)
    print("CSV loaded successfully!")
except FileNotFoundError:
    print("CSV file not found. Please select a valid CSV file.")
except Exception as e:
    print(f"Error reading CSV: {e}")
finally:
    # Previously this cleanup ran unguarded: if tk.Tk() itself failed, `root`
    # was unbound and the NameError raised here replaced the reported error.
    if root is not None:
        try:
            root.attributes('-topmost', False)
            root.destroy()
        except Exception as cleanup_err:
            print(f"Warning: could not close the file dialog window: {cleanup_err}")

# ====== 3. DATA CLEANING ======
try:
    # Exact duplicate rows and rows with missing cells go first
    df = df.drop_duplicates()
    df = df.dropna()

    # Coerce every column to numbers; cells that cannot be parsed become NaN,
    # and any row containing one is then dropped.
    numeric_df = pd.DataFrame(
        {col: pd.to_numeric(df[col], errors='coerce') for col in df.columns}
    )
    df = numeric_df.dropna()

    print("Data cleaned successfully! Only numeric rows remain.")
except Exception as exc:
    print(f"Error cleaning data: {exc}")

# ====== 4. SPLIT DATA INTO TRAIN / TEST ======
try:
    # NOTE(review): the column names 'X' and 'Y' are hard-coded here, so the
    # CSV must contain columns with exactly these headers.
    X = df[['X']].to_numpy()   # double brackets keep the features 2D: (n_samples, 1)
    y = df['Y'].to_numpy()

    # 80/20 split with a fixed seed for reproducibility
    split = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train, X_test, y_train, y_test = split
    print("Data split into training/testing sets!")
except Exception as exc:
    print(f"Error splitting data: {exc}")

# ====== 5. FEATURE SCALING (Z-score normalization) ======
try:
    # Fit both scalers on the training split only; the test split is merely
    # transformed with those statistics.
    scaler_X = StandardScaler().fit(X_train)
    scaler_y = StandardScaler().fit(y_train.reshape(-1, 1))  # the scaler is given a 2D column

    X_train_scaled = scaler_X.transform(X_train)
    y_train_scaled = scaler_y.transform(y_train.reshape(-1, 1)).ravel()  # back to 1D

    X_test_scaled = scaler_X.transform(X_test)
    y_test_scaled = scaler_y.transform(y_test.reshape(-1, 1)).ravel()

    print("Z-score normalization done (original values preserved)!")
except Exception as exc:
    print(f"Error during normalization: {exc}")

# ====== 6. TRAIN LINEAR REGRESSION ======
# Fit a LinearRegression on the scaled training split and report its parameters.
try:
    model = LinearRegression()
    model.fit(X_train_scaled, y_train_scaled)

    # Plain string: the original f-string had no placeholders.
    print("Training complete!")
    # The θ glyph had been mis-encoded as "Î¸" in these messages.
    print(f"Intercept (θ0): {model.intercept_}")
    print(f"Slope (θ1): {model.coef_[0]}")
except Exception as e:
    print(f"Error during training: {e}")

# ====== 7. TESTING ======
try:
    # Score the held-out split in normalized space (both arrays are scaled)
    y_test_pred_scaled = model.predict(X_test_scaled)
    mse_test = mean_squared_error(y_true=y_test_scaled, y_pred=y_test_pred_scaled)
    print(f"Test MSE (normalized space): {mse_test}")
except Exception as exc:
    print(f"Error during testing: {exc}")

# ====== 8. PREDICT FOR A SINGLE USER-INPUT VALUE ======
try:
    raw_x = float(input("Enter the value of X (original/raw value): "))

    # Scale the raw input with the X scaler, then predict in normalized space
    norm_x = scaler_X.transform([[raw_x]])
    norm_y_pred = model.predict(norm_x)

    # Undo the Y scaling: reshape to a 2D column, invert, take the single value
    as_column = norm_y_pred.reshape(-1, 1)
    raw_y_pred = scaler_y.inverse_transform(as_column)[0, 0]

    print(f"Predicted Y (original scale): {raw_y_pred}")
except Exception as exc:
    print(f"Error during prediction: {exc}")

Libraries imported successfully!
CSV loaded successfully!
Data cleaned successfully! Only numeric rows remain.
Data split into training/testing sets!
Z-score normalization done (original values preserved)!
Training complete!
Intercept (θ0): 0.0
Slope (θ1): 1.0
Test MSE (normalized space): 0.0
Predicted Y (normalized): -1.8380814494296098
Predicted Y (original scale): 8.0
