## 1. Data Preprocessing

In [None]:
import pandas as pd

def load_data(file_path):
    """Read the dataset at ``file_path`` into a pandas DataFrame.

    The file is parsed as tab-separated text using pandas' Python parsing
    engine, and the resulting DataFrame is returned untouched.
    """
    return pd.read_csv(file_path, engine='python', sep="\t")

def encode_categorical_variables(df):
    """One-hot encode the 'qualification' and 'skills' columns of ``df``.

    The first level of each encoded column is dropped (``drop_first=True``).
    All other columns are passed through, and the encoded DataFrame is
    returned; the input DataFrame is not modified.
    """
    categorical_columns = ['qualification', 'skills']
    encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)
    return encoded

def preprocess_data(file_path, output_path='/content/preprocessed_candidates.csv'):
    """Load, clean, encode and persist the dataset.

    Steps, in order:
      1. Load the raw file via ``load_data``.
      2. Drop the 'name' column when it is present.
      3. Fill missing numeric values with the column mean and missing
         text (object) values with the literal 'Unknown'.
      4. One-hot encode categorical columns via
         ``encode_categorical_variables``.
      5. Cast every numeric and boolean column to float.
      6. Write the result to ``output_path`` as CSV without the index.

    Args:
        file_path: Location of the raw dataset.
        output_path: Destination of the preprocessed CSV. Defaults to the
            Colab path the function previously hard-coded.

    Returns:
        The preprocessed DataFrame.
    """
    df = load_data(file_path)

    # 'name' is not needed for model training; tolerate files without it.
    df = df.drop(columns=['name'], errors='ignore')

    # Fill missing values for numerical columns with the column mean.
    for column in df.select_dtypes(include='number').columns:
        df[column] = df[column].fillna(df[column].mean())

    # Fill missing values for categorical (object) columns.
    for column in df.select_dtypes(include=['object']).columns:
        df[column] = df[column].fillna('Unknown')

    # Convert categorical variables into numerical format.
    df = encode_categorical_variables(df)

    # Cast every numeric/boolean column to float. Selecting 'number' rather
    # than only 'int64' also covers other integer widths, e.g. the uint8
    # indicator columns get_dummies produces on pandas < 2.0.
    for column in df.select_dtypes(include=['number', 'bool']).columns:
        df[column] = df[column].astype(float)

    # Persist the preprocessed DataFrame for the later training steps.
    df.to_csv(output_path, index=False)
    print(f"Preprocessed data saved to {output_path}")

    return df

# Raw dataset location on the Colab filesystem
DATASET_PATH = '/content/candidates.csv'  # Update with your path
df = preprocess_data(DATASET_PATH)

# Show the header line followed by the first rows of the result
print("Preprocessed DataFrame:", df.head(), sep="\n")


## 2. Model Training

In [None]:
import pandas as pd
import numpy as np

def elastic_net(X, y, alpha=1.0, l1_ratio=0.5, num_iterations=1000, learning_rate=0.001):
    """Fit a linear model with an Elastic Net penalty by batch gradient descent.

    Weights start as a zero vector and the bias as 0. Every iteration takes
    one step of size ``learning_rate`` along the gradient of the squared-error
    data term plus ``alpha * (l1_ratio * sign(w) + (1 - l1_ratio) * w)``.
    The bias is not penalised. The current state is printed every 100
    iterations.

    Returns:
        A ``(weights, bias)`` tuple.
    """
    num_samples, num_features = X.shape
    inv_n = 1 / num_samples
    l2_ratio = 1 - l1_ratio
    weights, bias = np.zeros(num_features), 0

    for i in range(num_iterations):
        # Residuals of the current linear fit
        residuals = (np.dot(X, weights) + bias) - y

        # Data-term gradient plus the (sub)gradient of the penalty
        dw = inv_n * np.dot(X.T, residuals) + alpha * (
            l1_ratio * np.sign(weights) + l2_ratio * weights)
        db = inv_n * np.sum(residuals)

        # Gradient step
        weights -= learning_rate * dw
        bias -= learning_rate * db

        # Progress report on iterations 0, 100, 200, ...
        if i % 100 == 0:
            print(f"Iteration {i}: Weights: {weights}, Bias: {bias}, dw: {dw}, db: {db}")

    return weights, bias

def main():
    """Train on the whole preprocessed dataset and persist the results.

    Reads the preprocessed CSV, separates the 'target' column from the
    features, fits ``elastic_net`` on every row, then saves the weights, the
    bias and the true target values as ``.npy`` files under ``/content``.
    """
    DATASET_PATH = '/content/preprocessed_candidates.csv'  # Update with Colab path
    frame = pd.read_csv(DATASET_PATH)

    # Target vector and feature table
    target_column = 'target'
    y = frame[target_column].values
    X = frame.drop(columns=[target_column])

    # Debug output: shapes and column dtypes
    print(f"X shape: {X.shape}, y shape: {y.shape}")
    print(f"X data types:\n{X.dtypes}")

    # Fit the Elastic Net model on all samples
    weights, bias = elastic_net(X.values, y)

    # Persist parameters together with the ground-truth targets
    outputs = (
        ('/content/weights.npy', weights),
        ('/content/bias.npy', bias),
        ('/content/y_true.npy', y),
    )
    for path, values in outputs:
        np.save(path, values)

    print("Weights:", weights)
    print("Bias:", bias)


# Entry point when executed directly
if __name__ == "__main__":
    main()


## 3. Train and Save

In [None]:
import pandas as pd
import numpy as np

def elastic_net(X, y, alpha=1.0, l1_ratio=0.5, num_iterations=1000, learning_rate=0.001):
    """Run batch gradient descent for a linear model with an Elastic Net penalty.

    Starting from zero weights and a bias of 0, each of the
    ``num_iterations`` steps moves the parameters against the gradient of the
    squared-error term; the weights additionally receive the penalty gradient
    ``alpha * (l1_ratio * sign(w) + (1 - l1_ratio) * w)``. Diagnostic output
    is printed on every 100th iteration.

    Returns:
        The tuple ``(weights, bias)`` after the final step.
    """
    num_samples, num_features = X.shape
    weights = np.zeros(num_features)
    bias = 0
    scale = 1 / num_samples

    for i in range(num_iterations):
        # Prediction error under the current parameters
        error = np.dot(X, weights) + bias - y

        # Separate L1 and L2 contributions of the penalty gradient
        l1_term = l1_ratio * np.sign(weights)
        l2_term = (1 - l1_ratio) * weights

        dw = scale * np.dot(X.T, error) + alpha * (l1_term + l2_term)
        db = scale * np.sum(error)

        weights -= learning_rate * dw
        bias -= learning_rate * db

        if i % 100 == 0:
            print(f"Iteration {i}: Weights: {weights}, Bias: {bias}, dw: {dw}, db: {db}")

    return weights, bias

def main():
    """Train on the first 80% of the preprocessed rows and save the parameters.

    Reads the preprocessed CSV, separates the 'target' column, splits the
    rows positionally (no shuffling) into a leading 80% training part and a
    trailing test part, fits ``elastic_net`` on the training part, and writes
    the weights and bias to ``.npy`` files under ``/content``.
    """
    DATASET_PATH = '/content/preprocessed_candidates.csv'  # Update with Colab path
    frame = pd.read_csv(DATASET_PATH)

    # Target vector and feature table
    target_column = 'target'
    y = frame[target_column].values
    X = frame.drop(columns=[target_column])

    # Debug output: shapes and column dtypes
    print(f"X shape: {X.shape}, y shape: {y.shape}")
    print(f"X data types:\n{X.dtypes}")

    # Positional 80/20 split done with plain slicing (no sklearn).
    train_size = int(0.8 * X.shape[0])
    X_train, y_train = X.values[:train_size], y[:train_size]
    # The held-out part is sliced off here but not used further in this function.
    X_test, y_test = X.values[train_size:], y[train_size:]

    # Fit on the training part only
    weights, bias = elastic_net(X_train, y_train)

    # Persist the learned parameters
    for path, values in (('/content/weights.npy', weights), ('/content/bias.npy', bias)):
        np.save(path, values)

    print("Weights:", weights)
    print("Bias:", bias)


# Entry point when executed directly
if __name__ == "__main__":
    main()


## Elastic Net

In [None]:
import numpy as np
import pandas as pd

class ElasticNet:
    """Linear regression with a combined L1/L2 penalty, fit by gradient descent.

    The model has no intercept term: predictions are ``X.dot(theta)``.

    Attributes set by ``fit``:
        theta: learned coefficient vector, one entry per feature.
        cost_history_: penalised cost evaluated before each update, one
            entry per iteration (useful for checking convergence).
    """

    def __init__(self, alpha=1.0, l1_ratio=0.5, lr=0.01, iterations=1000):
        self.alpha = alpha            # Regularization strength (lambda)
        self.l1_ratio = l1_ratio      # Mix between L1 and L2 regularization
        self.lr = lr                  # Learning rate
        self.iterations = iterations   # Number of iterations (epochs)

    def _compute_cost(self, X, y, y_pred, theta):
        """Return the penalised cost for predictions ``y_pred`` and ``theta``.

        cost = 1/(2m) * sum((y_pred - y)^2)
               + alpha * (l1_ratio * ||theta||_1
                          + 0.5 * (1 - l1_ratio) * ||theta||_2^2)

        ``X`` is accepted for signature symmetry with ``_compute_gradient``
        but is not needed because ``y_pred`` is supplied.
        """
        m = len(y)
        mse = (1 / (2 * m)) * np.sum((y_pred - y) ** 2)
        l1_penalty = self.l1_ratio * np.sum(np.abs(theta))
        # The 0.5 factor makes the derivative of this term equal to
        # (1 - l1_ratio) * theta, which is what _compute_gradient applies.
        l2_penalty = 0.5 * (1 - self.l1_ratio) * np.sum(theta ** 2)
        return mse + self.alpha * (l1_penalty + l2_penalty)

    def _compute_gradient(self, X, y, y_pred, theta):
        """Return the (sub)gradient of the penalised cost with respect to ``theta``."""
        m = len(y)
        gradient = (1 / m) * X.T.dot(y_pred - y) + \
                   self.alpha * (self.l1_ratio * np.sign(theta) + (1 - self.l1_ratio) * theta)
        return gradient

    def fit(self, X, y):
        """Fit ``theta`` on ``(X, y)`` with ``self.iterations`` gradient steps.

        Args:
            X: 2-D array-like of features (rows are samples).
            y: 1-D array-like of targets, one per row of ``X``.

        Returns:
            ``self``, so calls can be chained.
        """
        # Coerce once so lists and integer arrays work as well as float arrays.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)

        _, n = X.shape
        self.theta = np.zeros(n)  # Initialize theta to zeros
        self.cost_history_ = []

        # Gradient descent loop
        for _ in range(self.iterations):
            y_pred = X.dot(self.theta)
            self.cost_history_.append(self._compute_cost(X, y, y_pred, self.theta))
            gradient = self._compute_gradient(X, y, y_pred, self.theta)
            self.theta -= self.lr * gradient

        return self

    def predict(self, X):
        """Return ``X.dot(theta)``; ``fit`` must have been called first."""
        return X.dot(self.theta)

def main():
    """Fit the ``ElasticNet`` class on an 80% split and print test predictions.

    Reads the preprocessed CSV, separates the 'target' column, splits the
    rows positionally into a leading 80% training part and a trailing test
    part, fits the model on the training part, prints its predictions for the
    test part, and saves the learned ``theta`` to a ``.npy`` file.
    """
    DATASET_PATH = '/content/preprocessed_candidates.csv'  # Update with Colab path
    frame = pd.read_csv(DATASET_PATH)

    # Target vector and feature matrix as NumPy arrays
    target_column = 'target'
    y = frame[target_column].values
    X = frame.drop(columns=[target_column]).values

    # Positional 80/20 split
    train_size = int(0.8 * X.shape[0])
    X_train, y_train = X[:train_size], y[:train_size]
    X_test, y_test = X[train_size:], y[train_size:]

    # Build and fit the model
    model = ElasticNet(alpha=1.0, l1_ratio=0.5, lr=0.01, iterations=1000)
    model.fit(X_train, y_train)

    # Report predictions for the held-out rows
    print("Predictions on the test set:", model.predict(X_test))

    # Persist the learned coefficients
    np.save('/content/elastic_net_theta.npy', model.theta)


# Entry point when executed directly
if __name__ == "__main__":
    main()


## Evaluate

In [None]:
import pandas as pd
import numpy as np

def elastic_net(X, y, weights, bias, alpha=1.0, l1_ratio=0.5, num_iterations=1000, learning_rate=0.001):
    """Return predictions for ``X`` while refining ``weights``/``bias`` on ``(X, y)``.

    NOTE(review): although this is used for evaluation, it keeps running
    Elastic Net gradient-descent updates against the data it is given, so the
    reported predictions benefit from seeing ``y``. Pass ``num_iterations=0``
    to score the supplied parameters exactly as they are.

    The caller's ``weights`` and ``bias`` objects are never modified; the
    updates are applied to private copies.

    Args:
        X: 2-D feature array (rows are samples).
        y: 1-D target array, one entry per row of ``X``.
        weights: starting coefficient vector, one entry per column of ``X``.
        bias: starting intercept (scalar or 0-d array).
        alpha: overall penalty strength.
        l1_ratio: share of the penalty given to the L1 term.
        num_iterations: number of gradient steps to run.
        learning_rate: step size of each update.

    Returns:
        The predictions computed at the start of the final iteration (i.e.
        before the last update is applied), or the predictions of the
        supplied parameters when ``num_iterations`` is 0.
    """
    num_samples = X.shape[0]

    # Work on a float copy: in-place updates would otherwise leak into the
    # caller's array and fail outright for integer-typed weights.
    weights = np.array(weights, dtype=float)

    model_predictions = None
    for _ in range(num_iterations):
        # Calculate model predictions
        model_predictions = np.dot(X, weights) + bias

        # Calculate gradients
        dw = (1 / num_samples) * np.dot(X.T, (model_predictions - y)) + alpha * (
            l1_ratio * np.sign(weights) + (1 - l1_ratio) * weights)
        db = (1 / num_samples) * np.sum(model_predictions - y)

        # Update weights and bias; bias is rebound rather than updated in
        # place so a 0-d array passed by the caller stays untouched.
        weights -= learning_rate * dw
        bias = bias - learning_rate * db

    if model_predictions is None:
        # No iterations were run: score the parameters as given.
        model_predictions = np.dot(X, weights) + bias

    return model_predictions

def main():
    """Produce predictions for the preprocessed dataset and report their MSE.

    Reads the preprocessed CSV, separates the 'target' column, loads the
    saved weights and bias, obtains predictions from ``elastic_net``, prints
    the mean squared error against the targets, and saves the predictions to
    a ``.npy`` file.
    """
    DATASET_PATH = '/content/preprocessed_candidates.csv'  # Update with Colab path
    frame = pd.read_csv(DATASET_PATH)

    # Target vector and feature matrix as NumPy arrays
    target_column = 'target'
    y = frame[target_column].values
    X = frame.drop(columns=[target_column]).values

    # Previously saved model parameters
    weights = np.load('/content/weights.npy')  # Update with Colab path
    bias = np.load('/content/bias.npy')  # Update with Colab path

    # Predictions for every row
    y_pred = elastic_net(X, y, weights, bias)

    # Mean squared error of the predictions
    squared_error = (y_pred - y) ** 2
    mse = np.mean(squared_error)
    print("Mean Squared Error:", mse)

    # Persist predictions for the plotting cells
    np.save('/content/y_pred.npy', y_pred)  # Update with Colab path


# Entry point when executed directly
if __name__ == "__main__":
    main()


## Train and Evaluate

In [None]:
import pandas as pd
import numpy as np

# Load preprocessed data
data_path = "/content/preprocessed_candidates.csv"  # Update to Colab path
data = pd.read_csv(data_path)

# Prepare features (X) and target (y)
X = data.drop(columns=['target']).values
y = data['target'].values

# Feature scaling (standardization): subtract the column mean and divide by
# the column standard deviation.
mean = np.mean(X, axis=0)
std = np.std(X, axis=0)
# A constant column has std == 0; dividing by it would turn the whole column
# into NaN (0 / 0) and propagate NaN through every later gradient step.
# Dividing by 1 instead leaves such a column at all zeros after centering.
std[std == 0] = 1.0
X = (X - mean) / std

# Define Elastic Net
def elastic_net(X, y, learning_rate=0.0001, n_iterations=1000, alpha=0.0, l1_ratio=0.5):
    """Fit a linear model by batch gradient descent with an optional Elastic Net penalty.

    With the default ``alpha=0.0`` no penalty is applied and this is plain
    least-squares gradient descent. A positive ``alpha`` adds the penalty
    gradient ``alpha * (l1_ratio * sign(w) + (1 - l1_ratio) * w)`` to the
    weight update; the bias is never penalised.

    Args:
        X: 2-D feature array (rows are samples).
        y: 1-D target array, one entry per row of ``X``.
        learning_rate: step size of each update.
        n_iterations: number of gradient steps to run.
        alpha: overall penalty strength; 0 disables the penalty.
        l1_ratio: share of the penalty given to the L1 term.

    Returns:
        A ``(weights, bias)`` tuple.
    """
    n_samples, n_features = X.shape
    weights = np.zeros(n_features)
    bias = 0.0

    for i in range(n_iterations):
        model_predictions = np.dot(X, weights) + bias

        # Compute gradients of the squared-error term
        dw = (1 / n_samples) * np.dot(X.T, (model_predictions - y))
        db = (1 / n_samples) * np.sum(model_predictions - y)

        # Add the penalty (sub)gradient only when a penalty is requested, so
        # the unpenalised path stays numerically identical to before.
        if alpha:
            dw = dw + alpha * (l1_ratio * np.sign(weights) + (1 - l1_ratio) * weights)

        # Update weights and bias
        weights -= learning_rate * dw
        bias -= learning_rate * db

        if i % 100 == 0:  # Print every 100 iterations
            print(f"Iteration {i}: Weights: {weights}, Bias: {bias}, dw: {dw}, db: {db}")

    return weights, bias

# Fit the model on the prepared features
weights, bias = elastic_net(X, y)

# Predictions of the fitted linear model on the same rows
y_pred = X @ weights + bias

# Mean squared error between predictions and targets
mse = ((y_pred - y) ** 2).mean()
print(f"Mean Squared Error: {mse}")


## Visualization

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def main():
    """Scatter-plot predicted against actual values.

    Loads the saved true and predicted arrays from ``/content`` and draws a
    scatter plot with a dashed red diagonal spanning the range of the true
    values, marking where perfect predictions would fall.
    """
    y_true = np.load('/content/y_true.npy')  # Update to Colab path
    y_pred = np.load('/content/y_pred.npy')  # Update to Colab path

    # End points of the perfect-prediction reference line
    lowest, highest = y_true.min(), y_true.max()
    diagonal = [lowest, highest]

    plt.figure(figsize=(10, 6))
    plt.scatter(y_true, y_pred, alpha=0.7)
    plt.plot(diagonal, diagonal, 'r--')
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.title('Actual vs Predicted Values')
    plt.grid()
    plt.show()


# Entry point when executed directly
if __name__ == "__main__":
    main()


In [None]:
import numpy as np
import matplotlib.pyplot as plt

def main():
    """Plot actual and predicted values as two lines over the sample index.

    Loads the saved true and predicted arrays from ``/content`` and draws
    both series on one labelled figure with a legend and grid.
    """
    y_true = np.load('/content/y_true.npy')  # Update to Colab path
    y_pred = np.load('/content/y_pred.npy')  # Update to Colab path

    plt.figure(figsize=(10, 6))

    # One line per series, drawn in the same order as they appear in the legend
    for series, label in ((y_true, 'Actual'), (y_pred, 'Predicted')):
        plt.plot(series, label=label, alpha=0.7)

    plt.xlabel('Sample Index')
    plt.ylabel('Values')
    plt.title('Actual vs Predicted Values Over Samples')
    plt.legend()
    plt.grid()
    plt.show()


# Entry point when executed directly
if __name__ == "__main__":
    main()
