In [1]:
!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
     -------------------------------------- 981.5/981.5 kB 6.9 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Installing collected packages: langdetect
  Running setup.py install for langdetect: started
  Running setup.py install for langdetect: finished with status 'done'
Successfully installed langdetect-1.0.9


  DEPRECATION: langdetect is being installed using the legacy 'setup.py install' method, because it does not have a 'pyproject.toml' and the 'wheel' package is not installed. pip 23.1 will enforce this behaviour change. A possible replacement is to enable the '--use-pep517' option. Discussion can be found at https://github.com/pypa/pip/issues/8559

[notice] A new release of pip available: 22.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from langdetect import detect, LangDetectException
import re

In [3]:
data = pd.read_csv('D:\\ticket_system\\ML-training\\datasets\\merged_dataset.csv')

In [4]:
# Function to clean text
def clean_text(text):
    if isinstance(text, str):
        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
        text = text.lower()  # Convert to lowercase
        return text
    return ''

# Function to detect English language
def is_english(text):
    try:
        return detect(text) == 'en'
    except LangDetectException:
        return False


In [5]:
# Combine subject and body, clean text
data['text'] = data['subject'].fillna('') + ' ' + data['body'].fillna('')
data['cleaned_text'] = data['text'].apply(clean_text)

# Filter for English-only texts
data['is_english'] = data['cleaned_text'].apply(lambda x: is_english(x) if x.strip() else False)
english_data = data[data['is_english']].copy()


In [6]:
if english_data.empty:
    print("No English data found. Exiting.")
else:
    print(f"Found {len(english_data)} English entries.")

    # Encode the target variable (priority)
    label_encoder = LabelEncoder()
    english_data['priority_encoded'] = label_encoder.fit_transform(english_data['priority'])

    # Split features and target
    X = english_data['cleaned_text']
    y = english_data['priority_encoded']

    # Split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Apply TF-IDF vectorization
    tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
    X_train_tfidf = tfidf.fit_transform(X_train)
    X_test_tfidf = tfidf.transform(X_test)

    # Initialize and train XGBoost
    xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
    xgb.fit(X_train_tfidf, y_train)

    # Predict on test set
    y_pred = xgb.predict(X_test_tfidf)


Found 34909 English entries.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [7]:
# Evaluate the model
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")


Classification Report:
              precision    recall  f1-score   support

        high       0.67      0.66      0.66      2718
         low       0.79      0.27      0.40      1411
      medium       0.57      0.77      0.65      2853

    accuracy                           0.62      6982
   macro avg       0.68      0.56      0.57      6982
weighted avg       0.65      0.62      0.61      6982

Accuracy: 0.6223


In [8]:
  # Save the model and vectorizer (optional)
import joblib
joblib.dump(xgb, 'xgboost_model.pkl')
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')

['label_encoder.pkl']

In [9]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
import re
import joblib

# Load the saved models and vectorizer
xgb_model = joblib.load('xgboost_model.pkl')
tfidf_vectorizer = joblib.load('tfidf_vectorizer.pkl')
label_encoder = joblib.load('label_encoder.pkl')

# Load the dataset
data = pd.read_csv('merged_dataset.csv')

# Function to clean text
def clean_text(text):
    if isinstance(text, str):
        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
        text = text.lower()  # Convert to lowercase
        return text
    return ''

# Combine subject and body, clean text
data['text'] = data['subject'].fillna('') + ' ' + data['body'].fillna('')
data['cleaned_text'] = data['text'].apply(clean_text)

# Check if data is empty
if data.empty:
    print("No data found. Exiting.")
else:
    print(f"Found {len(data)} entries.")

    # Encode the target variable (priority)
    data['priority_encoded'] = label_encoder.transform(data['priority'])

    # Prepare features and target
    X = data['cleaned_text']
    y = data['priority_encoded']

    # Split into train and test sets (use test set for evaluation)
    _, X_test, _, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Transform test data using the loaded TF-IDF vectorizer
    X_test_tfidf = tfidf_vectorizer.transform(X_test)

    # Predict on test set
    y_pred = xgb_model.predict(X_test_tfidf)

    # Evaluate the model
    print("\nModel Evaluation on Test Set:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

    # Function to predict priority for new tickets
    def predict_ticket_priority(subject, body):
        # Combine and clean input
        text = f"{subject} {body}"
        cleaned_text = clean_text(text)

        # Transform text using TF-IDF
        text_tfidf = tfidf_vectorizer.transform([cleaned_text])

        # Predict priority
        prediction = xgb_model.predict(text_tfidf)
        predicted_priority = label_encoder.inverse_transform(prediction)[0]

        return predicted_priority

    # Example: Test with a new ticket (English)
    new_subject = "App not opening"
    new_body = "Uable to login"
    predicted_priority = predict_ticket_priority(new_subject, new_body)
    print(f"\nNew Ticket Prediction (English):")
    print(f"Subject: {new_subject}")
    print(f"Body: {new_body}")
    print(f"Predicted Priority: {predicted_priority}")

    # Example: Test with a new ticket (Non-English, e.g., Spanish)
    non_english_subject = "Problema Urgente"
    non_english_body = "El servidor principal ha fallado, causando interrupciones en los servicios."
    predicted_priority = predict_ticket_priority(non_english_subject, non_english_body)
    print(f"\nNew Ticket Prediction (Spanish):")
    print(f"Subject: {non_english_subject}")
    print(f"Body: {non_english_body}")
    print(f"Predicted Priority: {predicted_priority}")


FileNotFoundError: [Errno 2] No such file or directory: 'merged_dataset.csv'