<a href="https://colab.research.google.com/github/Richik06/Invoice-categorizer/blob/main/Standalone_Machine_Learning_Model_Script.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
import joblib
import os

# Map the CLASS-xxxx codes used as labels in the dataset to human-readable
# category names.
# NOTE: predict_category() resolves codes with mapping.get(code, 'Unknown'), so
# any predicted code that is not listed here is displayed as 'Unknown'.
# NOTE(review): the sample output at the end of this file shows two line items
# resolving to 'Unknown', which suggests the training labels contain codes
# that are missing from this table -- TODO confirm against Train.csv.
class_mapping = {
    'CLASS-1963': 'Creative/Marketing',
    'CLASS-1250': 'Auto Services/Leasing',
    'CLASS-1274': 'Real Estate/Lease',
    'CLASS-1522': 'Construction/Maintenance',
    'CLASS-1376': 'Contingent Labor/HR',
    'CLASS-1567': 'Logistics/Freight',
    'CLASS-1758': 'Travel/Transportation',
    'CLASS-2141': 'Events/Trade Shows',
    'CLASS-1828': 'Agency Fees',
    'CLASS-1249': 'Auto Maintenance',
    'CLASS-2003': 'Employee Benefits',
    'CLASS-1294': 'Insurance',
    'CLASS-1477': 'Digital/Email Marketing',
    'CLASS-1870': 'Shipping/Logistics',
    'CLASS-1805': 'Traditional Media/Advertising',
    'CLASS-1309': 'Print/Catalogue Media',
    'CLASS-1652': 'IT Services',
    'CLASS-1983': 'Real Estate Services',
    'CLASS-2241': 'Workmen\'s Comp/Insurance',
    'CLASS-1964': 'Public Relations',
    'CLASS-2112': 'Store Fixtures/Construction',
    'CLASS-2038': 'Search Marketing/SEO',
    'CLASS-1867': 'Social Media Marketing',
    'CLASS-2146': 'Taxes',
    'CLASS-2015': 'Royalties/Misc',
}

# --- Section 1: Model Training and Persistence ---
# This function encapsulates the entire training process.

def train_and_save_model(data_file='Train.csv', model_path='model.pkl', vectorizer_path='vectorizer.pkl'):
    """
    Train a TF-IDF + LinearSVC text classifier and persist its two components.

    The CSV at ``data_file`` must contain the columns 'Item_Description'
    (input text) and 'Product_Category' (target label). Rows missing either
    value are dropped, the pipeline is fitted on every remaining row (no
    hold-out split), and the fitted vectorizer and classifier are written to
    separate joblib files.

    Input problems (missing file, empty file, missing columns, no usable rows,
    fewer than two distinct labels) are reported with a printed message and the
    function returns early without writing any file.

    Args:
        data_file (str): The path to the training data CSV file.
        model_path (str): The filename for the saved classifier model.
        vectorizer_path (str): The filename for the saved TfidfVectorizer.

    Returns:
        None
    """
    required_columns = ['Item_Description', 'Product_Category']

    print("🚀 Starting model training process...")

    # Load the training data into a pandas DataFrame.
    try:
        df = pd.read_csv(data_file)
    except FileNotFoundError:
        print(f"❌ Error: The file '{data_file}' was not found. Please ensure it is in the correct directory.")
        return
    except pd.errors.EmptyDataError:
        print(f"❌ Error: The file '{data_file}' is empty.")
        return
    else:
        print(f"✅ Successfully loaded training data from '{data_file}'.")

    # Validate the schema up front so a wrong CSV yields a clear message
    # instead of a KeyError from dropna/indexing.
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        print(f"❌ Error: '{data_file}' is missing required column(s): {', '.join(missing_columns)}.")
        return

    # Drop any rows where 'Item_Description' or 'Product_Category' are missing.
    df = df.dropna(subset=required_columns)
    if df.empty:
        print(f"❌ Error: '{data_file}' has no rows with both 'Item_Description' and 'Product_Category'.")
        return

    # Define the features (X) and the target labels (y).
    # astype(str) guards against descriptions that pandas parsed as numbers;
    # TfidfVectorizer lowercases its input and would fail on non-string values.
    X = df['Item_Description'].astype(str)
    y = df['Product_Category']

    # LinearSVC cannot be fitted on a single class; fail with a clear message.
    if y.nunique() < 2:
        print("❌ Error: Training requires at least two distinct 'Product_Category' values.")
        return

    # Vectorization and classification are chained in a Pipeline so both are
    # fitted together on the same data.
    # - TfidfVectorizer: converts raw text to TF-IDF features, lowercasing and
    #   removing English stop words.
    # - LinearSVC: linear classifier fitted on those features.
    model_pipeline = Pipeline([
        ('vectorizer', TfidfVectorizer(stop_words='english', lowercase=True)),
        ('classifier', LinearSVC())
    ])

    # Train the entire pipeline on the full dataset.
    model_pipeline.fit(X, y)
    print("✅ Model training complete.")

    # Save the fitted steps separately; the prediction side loads them as two
    # files and applies them in order (vectorizer.transform, then model.predict).
    joblib.dump(model_pipeline.named_steps['vectorizer'], vectorizer_path)
    joblib.dump(model_pipeline.named_steps['classifier'], model_path)
    print(f"✅ Model components saved as '{model_path}' and '{vectorizer_path}'.")

# Train with the default arguments ('Train.csv' -> 'model.pkl' / 'vectorizer.pkl')
# so the .pkl files exist before the prediction demo below tries to load them.
# NOTE: this call is not under the __main__ guard, so it also runs on import.
train_and_save_model()


# --- Section 2: Model Prediction and Categorization ---
# This function demonstrates how to use the saved model files.

def predict_category(line_items, model, vectorizer, mapping):
    """
    Predict a human-readable category for each line item.

    The already-loaded ``vectorizer`` and ``model`` are applied in order
    (``vectorizer.transform`` then ``model.predict``), and every predicted
    class code is translated through ``mapping``. Codes absent from
    ``mapping`` are reported as 'Unknown'.

    Args:
        line_items: The line-item strings to classify. Any iterable of strings
            is accepted (list, tuple, pandas Series, generator, ...). A single
            string is treated as one line item. ``None`` and empty input give
            an empty result.
        model: The trained classifier; must provide ``predict``.
        vectorizer: The fitted text vectorizer; must provide ``transform``.
        mapping (dict): A dictionary to map class codes to descriptive names.

    Returns:
        pd.DataFrame: Columns 'Line Item' (the input text) and 'Category'
        (the mapped category name), one row per line item.
    """
    # Normalise to a list exactly once. This avoids the ambiguous-truth-value
    # error that `not line_items` raises for a Series/ndarray, and keeps a
    # generator from being exhausted by transform() before the result is built.
    if line_items is None:
        items = []
    elif isinstance(line_items, str):
        # A bare string is one document, not an iterable of characters.
        items = [line_items] if line_items else []
    else:
        items = list(line_items)

    if not items:
        return pd.DataFrame({'Line Item': [], 'Category': []})

    # Transform the new text data using the pre-trained vectorizer.
    features = vectorizer.transform(items)

    # Get the model's predictions, which are the raw class codes.
    predicted_codes = model.predict(features)

    # Map the predicted class codes to their human-readable names.
    predicted_categories = [mapping.get(code, 'Unknown') for code in predicted_codes]

    return pd.DataFrame({
        'Line Item': items,
        'Category': predicted_categories
    })

def run_prediction_demo(model_path='model.pkl', vectorizer_path='vectorizer.pkl'):
    """
    Load the saved classifier and vectorizer and categorize sample line items.

    Prints progress messages and the resulting DataFrame. If either file is
    missing or cannot be loaded, an error is printed and the function returns
    without predicting.

    Args:
        model_path (str): Path of the saved classifier. Defaults to the file
            written by train_and_save_model() with its default arguments.
        vectorizer_path (str): Path of the saved TfidfVectorizer. Same default
            convention as ``model_path``.

    Returns:
        None
    """
    print("\n📦 Running model prediction demo...")

    # Ensure model files exist before trying to load them.
    if not os.path.exists(model_path) or not os.path.exists(vectorizer_path):
        print(f"❌ Error: Model files ('{model_path}' and '{vectorizer_path}') not found. Please run the training function first.")
        return

    # SECURITY NOTE: joblib files are pickle-based; loading one can execute
    # arbitrary code. Only load artifacts produced by a trusted training run.
    try:
        model = joblib.load(model_path)
        vectorizer = joblib.load(vectorizer_path)
    except Exception as e:
        # Deliberately broad: a corrupt or incompatible artifact can raise many
        # different exception types, and the demo should report the problem
        # rather than crash.
        print(f"❌ Error loading model files: {e}")
        return
    print("✅ Model and vectorizer loaded successfully.")

    # Sample text representing extracted line items from a new invoice.
    new_invoice_text = [
        'Store Construction General Requirements Final Site Clean Up',
        'Auto Leasing Corporate Services Corning Inc /Ny 2013-Mar',
        'Base Rent Store Management Lease/Rent Real Estate',
        'Magazines Media Buy - Traditional SMAP National Advertising',
        'Ground Transportation Travel and Entertainment Miscellaneous Company Car (Field Only)',
        'Digital Display Digital/Social May-2007',
        'Benefits Retirement and Pension Funds Essex Group Inc Corporate Services',
        'Insurance Building and Property Insurance',
    ]

    # Predict the categories and translate class codes via class_mapping.
    results_df = predict_category(new_invoice_text, model, vectorizer, class_mapping)

    print("\n📋 Categorization Results:")
    print(results_df)

# Run the prediction demo only when this file is executed directly (script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    run_prediction_demo()

🚀 Starting model training process...
✅ Successfully loaded training data from 'Train.csv'.
✅ Model training complete.
✅ Model components saved as 'model.pkl' and 'vectorizer.pkl'.

📦 Running model prediction demo...
✅ Model and vectorizer loaded successfully.

📋 Categorization Results:
                                           Line Item                  Category
0  Store Construction General Requirements Final ...  Construction/Maintenance
1  Auto Leasing Corporate Services Corning Inc /N...     Auto Services/Leasing
2  Base Rent Store Management Lease/Rent Real Estate         Real Estate/Lease
3  Magazines Media Buy - Traditional SMAP Nationa...                   Unknown
4  Ground Transportation Travel and Entertainment...     Travel/Transportation
5            Digital Display Digital/Social May-2007                   Unknown
6  Benefits Retirement and Pension Funds Essex Gr...         Employee Benefits
7          Insurance Building and Property Insurance                 Insurance
