# In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def train_and_get_weights():
    """Train a logistic-regression classifier and print its feature weights.

    Prompts on stdin for the paths to a features CSV and a labels CSV, joins
    them on their shared ``filename`` column, fits a ``LogisticRegression`` on
    every column other than ``filename`` and ``label``, and prints the learned
    coefficients sorted by absolute magnitude.

    Problems with the input data (unreadable file, missing column, too few
    rows, not exactly two label classes, missing or non-numeric values) are
    reported with a printed message followed by an early return instead of a
    traceback.

    When the dataset is too small for a stratified 80/20 hold-out split, the
    model is trained on all rows and the accuracy check is skipped.

    Returns:
        None. All results are written to stdout.
    """
    print("--- ML Model Training Script ---")

    # 1. Get user input for paths
    # Pasted or drag-and-dropped paths often carry stray whitespace or quotes.
    features_csv_path = input("➡️ Enter the full path to your features.csv file: ").strip().strip("\"'")
    labels_csv_path = input("➡️ Enter the full path to your labels.csv file: ").strip().strip("\"'")

    # 2. Load the datasets
    try:
        features_df = pd.read_csv(features_csv_path)
        labels_df = pd.read_csv(labels_csv_path)
    except FileNotFoundError as e:
        print(f"❌ Error: Could not find a file. Please check your paths. Details: {e}")
        return
    except (OSError, ValueError) as e:
        # OSError: permissions / path is a directory. ValueError: pandas'
        # EmptyDataError and ParserError (both subclasses) and decode errors.
        print(f"❌ Error: Could not read a CSV file. Details: {e}")
        return

    # The merge key must exist on both sides, otherwise pd.merge raises KeyError.
    for csv_name, frame in (("features", features_df), ("labels", labels_df)):
        if 'filename' not in frame.columns:
            print(f"❌ Error: The {csv_name} CSV has no 'filename' column.")
            return

    # 3. Merge features and labels into a single DataFrame
    # This aligns the data correctly based on the 'filename' column
    data_df = pd.merge(features_df, labels_df, on='filename')

    if data_df.empty:
        print("❌ Error: The merged data is empty. Make sure the 'filename' columns match in both CSV files.")
        return

    # Checked after the merge: a 'label' column present in both files would be
    # renamed to label_x / label_y by pandas and no longer be usable here.
    if 'label' not in data_df.columns:
        print("❌ Error: No 'label' column found after merging. It must appear in exactly one of the CSV files.")
        return

    # 4. Define features (X) and the target (y)
    # X is our set of parameters/heuristics
    # y is the final verdict (phishing or not)
    feature_columns = [col for col in data_df.columns if col not in ['filename', 'label']]
    if not feature_columns:
        print("❌ Error: No feature columns found besides 'filename' and 'label'.")
        return
    X = data_df[feature_columns]
    y = data_df['label']

    if len(X) < 2:
        print("❌ Error: Not enough data to train. Please provide at least two labeled examples.")
        return

    if y.isna().any():
        print("❌ Error: Some rows have a missing label. Please fill in or remove them.")
        return

    # The single weight vector read from coef_[0] below is only meaningful for
    # a two-class problem, and fitting needs both classes to be present.
    if y.nunique() != 2:
        print(f"❌ Error: Expected exactly two label classes, found {y.nunique()}: {sorted(y.unique().tolist(), key=str)}")
        return

    columns_with_gaps = [col for col in feature_columns if X[col].isna().any()]
    if columns_with_gaps:
        print(f"❌ Error: These feature columns contain missing values: {columns_with_gaps}")
        return

    # Split data for a quick accuracy check (optional but good practice).
    # A stratified split keeps both classes in the training part; sklearn
    # raises ValueError when the data is too small for that (e.g. a class with
    # a single row), in which case we train on everything instead.
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        has_holdout = True
    except ValueError:
        X_train, y_train = X, y
        X_test = y_test = None
        has_holdout = False
        print("\nℹ️ Dataset too small for a hold-out split; training on all rows and skipping the accuracy check.")

    # 5. Initialize and train the Logistic Regression model
    print("\n⚙️ Training the model...")
    # C=0.1 and class_weight='balanced' help prevent overfitting on small, imbalanced datasets
    model = LogisticRegression(C=0.1, class_weight='balanced', random_state=42)
    try:
        model.fit(X_train, y_train)
    except ValueError as e:
        # Typically a feature column holding text that cannot be cast to float.
        print(f"❌ Error: The model could not be trained on this data. Details: {e}")
        return
    print("✅ Model training complete.")

    # Check model accuracy on the test set
    if has_holdout:
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        print(f"📊 Model accuracy on test data: {accuracy:.2%}")

    # 6. Extract the coefficients (the weights)
    # These weights are the core output you need for your local heuristic model
    weights = model.coef_[0]

    # 7. Display the weights with their corresponding feature names
    print("\n--- Personalized Parameter Weights ---")
    print("These are the weights for your local heuristic model. Higher positive values indicate a stronger sign of phishing.")
    # For a binary problem coef_[0] is oriented toward classes_[1] (the label
    # that sorts last), so state it explicitly instead of assuming the encoding.
    print(f"ℹ️ Positive weights push the prediction toward label {model.classes_[1]!r}; "
          f"if that is not your phishing label, read the signs below the other way round.")

    weights_df = pd.DataFrame({
        'Parameter': feature_columns,
        'Weight': weights
    })

    # Sort by the absolute value of the weight to see the most impactful parameters first
    weights_df['Impact'] = weights_df['Weight'].abs()
    weights_df = weights_df.sort_values(by='Impact', ascending=False).drop(columns=['Impact'])

    print(weights_df.to_string(index=False))

    print("\n💡 Interpretation:")
    print("- Positive Weight: This parameter's presence increases the phishing score.")
    print("- Negative Weight: This parameter's presence indicates a safe email (decreases score).")
    print("- Weight near Zero: This parameter was not a strong indicator for this dataset.")


# --- RUN THE SCRIPT ---
# To execute, just run this cell in your Jupyter Notebook.
# The guard keeps a plain `import` of this file from triggering the input()
# prompts; in a notebook cell or under `python <file>` __name__ is "__main__",
# so the call still runs there.
if __name__ == "__main__":
    train_and_get_weights()
