In [None]:
# Basic data handling and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Machine learning libraries
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (classification_report, confusion_matrix, 
                           accuracy_score, precision_recall_fscore_support)

# Text processing
from sklearn.feature_extraction.text import TfidfVectorizer
import re

# Class imbalance handling
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import NearestNeighbors

# Utilities
import time
import joblib
from datetime import datetime

# Startup banner: confirms the import cell finished and timestamps this run.
print("IT Support Ticket Classification System")
print("All libraries imported successfully")
print("Starting analysis at:", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))


In [None]:
def load_and_explore_data(filepath="customer_support_tickets.csv"):
    """
    Read the ticket CSV and print a quick profile of it.

    The profile covers row/column counts, memory footprint, column names,
    the first three rows, per-column missing values and duplicate rows.

    Parameters
    ----------
    filepath : str
        Path of the UTF-8 encoded CSV file to read.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame, or None when the file is missing or any other
        error occurs while loading/profiling (the error is printed).
    """
    print("\nSTEP 1: Loading and exploring the data")
    print("-" * 50)

    try:
        tickets = pd.read_csv(filepath, encoding='utf-8')
        n_rows, n_cols = tickets.shape
        print(f"Successfully loaded dataset with {len(tickets):,} tickets")

        # Size of the dataset on disk-independent terms
        print(f"Dataset shape: {n_rows} rows x {n_cols} columns")
        size_mb = tickets.memory_usage(deep=True).sum() / 1024**2
        print(f"Memory usage: {size_mb:.1f} MB")

        # Schema and a small sample
        print(f"\nColumns in dataset: {list(tickets.columns)}")
        print("\nFirst 3 rows of data:")
        print(tickets.head(3))

        # Missing values: only columns that actually have gaps are listed
        print("\nMissing values per column:")
        null_counts = tickets.isnull().sum()
        for column, n_missing in null_counts[null_counts > 0].items():
            print(f"  {column}: {n_missing} missing ({n_missing/len(tickets)*100:.1f}%)")
        if null_counts.sum() == 0:
            print("  No missing values found - excellent data quality")

        # Fully identical rows
        n_dupes = tickets.duplicated().sum()
        print(f"\nDuplicate records: {n_dupes} ({n_dupes/len(tickets)*100:.1f}%)")

        return tickets

    except FileNotFoundError:
        print(f"Error: Could not find file '{filepath}'")
        print("Please make sure the CSV file is in the same directory as this notebook")
        return None
    except Exception as err:
        # Any other failure is reported and signalled with None rather than raised
        print(f"Error loading data: {err!s}")
        return None

# Load the data from the default CSV path.
# Note: df is None when loading fails, so later cells that index df will raise.
df = load_and_explore_data()

In [None]:
def clean_text(text):
    """
    Normalise a raw ticket description for vectorisation.

    Steps, in order: lowercase, drop the product placeholder, drop URLs and
    e-mail addresses, drop every character that is not a letter or
    whitespace, then collapse runs of whitespace.

    Parameters
    ----------
    text : object
        Raw ticket text. Missing values (None / NaN) are accepted.

    Returns
    -------
    str
        The cleaned text, or "" when the input is missing.
    """
    if pd.isna(text):
        return ""

    # Convert to lowercase (so "HELP" and "help" are treated the same)
    text = str(text).lower()

    # Remove the template placeholder FIRST. It must happen before the
    # special-character strip below, because that strip deletes '_', '{' and
    # '}' and would turn the placeholder into the junk token
    # "productpurchased", which a later replace could never match.
    text = re.sub(r'\{?product_purchased?\}?', ' ', text)

    # Remove website links and emails (they're not useful for classification).
    # 'http\S+' already covers https URLs, so no separate alternative is needed.
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)

    # Remove special characters but keep letters and spaces
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Collapse extra whitespace (including the gap left by the placeholder)
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def prepare_data(df):
    """
    Add a cleaned-text column and drop tickets whose cleaned text is empty.

    The input frame is NOT modified; a new frame is returned, so re-running
    the calling cell (or keeping a reference to the raw frame) is safe.

    Parameters
    ----------
    df : pandas.DataFrame
        Ticket data containing a 'Ticket Description' column.

    Returns
    -------
    pandas.DataFrame
        Copy of the rows with non-empty cleaned text, with a new
        'cleaned_text' column and a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If df is None (e.g. the load step failed and returned None).
    KeyError
        If the 'Ticket Description' column is missing.
    """
    if df is None:
        raise ValueError(
            "prepare_data received None instead of a DataFrame - "
            "the dataset did not load; fix the loading step first"
        )

    print("\n STEP 2: Cleaning and preparing the data...")

    # Clean the ticket descriptions; assign() returns a new frame so the
    # caller's DataFrame is left untouched
    df = df.assign(cleaned_text=df['Ticket Description'].apply(clean_text))

    # Remove empty tickets (ones with no useful text)
    df = df[df['cleaned_text'].str.len() > 0].reset_index(drop=True)

    print(f" Cleaned {len(df):,} tickets")

    # Show some basic statistics about our text
    avg_length = df['cleaned_text'].str.len().mean()
    print(f" Average ticket length: {avg_length:.0f} characters")

    return df

# Clean our data: df is rebound to the filtered frame that carries the 'cleaned_text' column
df = prepare_data(df)

In [None]:
def create_features(df):
    """
    Turn the cleaned ticket text into a dense TF-IDF feature matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'cleaned_text' column.

    Returns
    -------
    tuple
        (features, vectorizer): the TF-IDF matrix converted to a dense
        array, and the fitted TfidfVectorizer needed to transform new text.
    """
    print("\n STEP 3: Converting text to numbers...")

    # NOTE(review): the vectorizer is fitted on every row passed in; when this
    # runs before the train/test split, test-set vocabulary and IDF statistics
    # influence the features - consider fitting on the training rows only.
    tfidf_settings = {
        'max_features': 1000,      # keep the 1000 highest-ranked terms
        'stop_words': 'english',   # skip common words like "the", "and"
        'ngram_range': (1, 2),     # single words and word pairs
        'min_df': 2,               # a term must occur in at least 2 tickets
    }
    vectorizer = TfidfVectorizer(**tfidf_settings)

    # Learn the vocabulary and encode every ticket in one pass
    tfidf_matrix = vectorizer.fit_transform(df['cleaned_text'])
    n_terms = tfidf_matrix.shape[1]

    print(f" Created {n_terms} features from text")
    print(f" Feature matrix shape: {tfidf_matrix.shape}")

    # Densify the sparse result before handing it back
    dense_features = tfidf_matrix.toarray()
    return dense_features, vectorizer

# Create features from our text; keep the fitted vectorizer for transforming new tickets later
X, vectorizer = create_features(df)

In [None]:
def prepare_labels(df):
    """
    Encode the two prediction targets (ticket type and priority) as integers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Ticket Type' and 'Ticket Priority' columns.

    Returns
    -------
    tuple
        (y, le_type, le_priority): y is a 2-column array whose column 0 is
        the encoded ticket type and column 1 the encoded priority; the two
        fitted LabelEncoders map the integers back to the original labels.
    """
    print("\n STEP 4: Preparing labels...")

    # One encoder per target so each can be inverted independently
    le_type, le_priority = LabelEncoder(), LabelEncoder()
    type_codes = le_type.fit_transform(df['Ticket Type'])
    priority_codes = le_priority.fit_transform(df['Ticket Priority'])

    # Column 0 = type, column 1 = priority
    targets = np.column_stack((type_codes, priority_codes))

    # Report the label vocabularies
    print(f" Ticket Types: {list(le_type.classes_)}")
    print(f" Priority Levels: {list(le_priority.classes_)}")

    # Report how many tickets fall into each type
    print("\n How many tickets of each type:")
    for code, label in enumerate(le_type.classes_):
        n_tickets = (type_codes == code).sum()
        print(f"   {label}: {n_tickets}")

    return targets, le_type, le_priority

# Prepare our labels: y has two columns (type, priority); the encoders map codes back to label names
y, le_type, le_priority = prepare_labels(df) 

In [None]:
def split_and_balance_data(X, y):
    """
    Split into train/test sets, then oversample the training set with SMOTE.

    SMOTE balances the ticket-type target (column 0 of y). Priority labels
    (column 1) are carried over as follows: original training rows keep
    their true priority, and each synthetic row takes the priority of its
    nearest original training row.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    y : numpy.ndarray of shape (n_samples, 2)
        Column 0 = encoded ticket type, column 1 = encoded priority.

    Returns
    -------
    tuple
        (X_train_balanced, X_test, y_train_balanced, y_test). Only the
        training portion is resampled; the test split is returned as-is.
    """
    print("\n STEP 5: Splitting and balancing data...")

    # Split the data (80% for training, 20% for testing)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    print(f" Training data: {X_train.shape[0]} tickets")
    print(f" Testing data: {X_test.shape[0]} tickets")

    # Handle imbalanced classes using SMOTE
    # SMOTE creates synthetic examples of minority classes
    print("\n Balancing classes...")

    smote = SMOTE(random_state=42)

    # Balance the ticket types
    X_train_balanced, y_train_type_balanced = smote.fit_resample(X_train, y_train[:, 0])

    priority = y_train[:, 1]
    n_original = X_train.shape[0]

    # Index of the nearest original training row, used to borrow a priority
    nn = NearestNeighbors(n_neighbors=1)
    nn.fit(X_train)

    # The resampled set is expected to start with the untouched original rows,
    # followed by the synthetic ones. Verify that at runtime instead of
    # trusting it, and only then reuse the true priorities for those rows.
    # Looking originals up through nearest-neighbour search would be wrong
    # whenever two tickets share the same feature vector: a row could be
    # matched to its duplicate and silently inherit that ticket's priority.
    originals_first = bool(
        isinstance(X_train, np.ndarray)
        and isinstance(X_train_balanced, np.ndarray)
        and X_train_balanced.shape[0] >= n_original
        and (X_train_balanced[:n_original] == X_train).all()
    )

    if originals_first:
        synthetic_rows = X_train_balanced[n_original:]
        if synthetic_rows.shape[0] > 0:
            _, indices = nn.kneighbors(synthetic_rows)
            synthetic_priority = priority[indices.ravel()]
        else:
            # Classes were already balanced: nothing synthetic to label
            synthetic_priority = priority[:0]
        y_train_priority_balanced = np.concatenate([priority, synthetic_priority])
    else:
        # Fallback: row order is unknown, so label every row by its nearest
        # original training sample
        _, indices = nn.kneighbors(X_train_balanced)
        y_train_priority_balanced = priority[indices.ravel()]

    # Combine balanced labels
    y_train_balanced = np.column_stack((y_train_type_balanced, y_train_priority_balanced))

    print(f" Balanced training data: {X_train_balanced.shape[0]} tickets")

    return X_train_balanced, X_test, y_train_balanced, y_test

# Split and balance our data
# X_train / y_train are the SMOTE-balanced training set; X_test / y_test are the untouched hold-out split
X_train, X_test, y_train, y_test = split_and_balance_data(X, y)



def train_model(X_train, y_train):
    """
    Fit one logistic-regression classifier per target column.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Training features.
    y_train : array-like of shape (n_samples, n_outputs)
        Training targets, one column per output to predict.

    Returns
    -------
    MultiOutputClassifier
        The fitted multi-output model.
    """
    print("\n STEP 6: Training the machine learning model...")

    # MultiOutputClassifier clones this estimator for each target column
    base_estimator = LogisticRegression(max_iter=1000, random_state=42)
    model = MultiOutputClassifier(base_estimator)

    # Fit and report the wall-clock time the fit took
    started_at = time.time()
    model.fit(X_train, y_train)
    elapsed = time.time() - started_at

    print(f" Model trained successfully in {elapsed:.2f} seconds")

    return model

# Train our model on the balanced training set
model = train_model(X_train, y_train)

In [None]:
def evaluate_model(model, X_test, y_test, le_type, le_priority):
    """
    Score the model on the test set and print/plot the results.

    Prints accuracy and a classification report for each target, then draws
    the confusion matrices via plot_confusion_matrices.

    Parameters
    ----------
    model : fitted estimator
        Model whose predict() returns one column per target.
    X_test : array-like
        Test features.
    y_test : numpy.ndarray of shape (n_samples, 2)
        True labels; column 0 = ticket type, column 1 = priority.
    le_type, le_priority : LabelEncoder
        Fitted encoders; their classes_ supply the report row names.

    Returns
    -------
    numpy.ndarray
        Predictions for X_test, same layout as y_test.
    """
    print("\n STEP 7: Evaluating model performance...")

    # Make predictions on test data
    y_pred = model.predict(X_test)

    # Calculate accuracy for each task
    type_accuracy = accuracy_score(y_test[:, 0], y_pred[:, 0])
    priority_accuracy = accuracy_score(y_test[:, 1], y_pred[:, 1])

    print(f" Ticket Type Accuracy: {type_accuracy:.3f} ({type_accuracy*100:.1f}%)")
    print(f" Priority Accuracy: {priority_accuracy:.3f} ({priority_accuracy*100:.1f}%)")

    # Pin the label set to every encoded class (0..n_classes-1). Without it,
    # classification_report infers the labels from the data and raises when a
    # class is missing from both y_test and y_pred, because the number of
    # inferred labels no longer matches len(target_names).
    type_labels = list(range(len(le_type.classes_)))
    priority_labels = list(range(len(le_priority.classes_)))

    # Show detailed results for ticket types
    print("\n Detailed Results for Ticket Types:")
    print(classification_report(y_test[:, 0], y_pred[:, 0],
                                labels=type_labels,
                                target_names=le_type.classes_, zero_division=0))

    # Show detailed results for priorities
    print("\n Detailed Results for Priorities:")
    print(classification_report(y_test[:, 1], y_pred[:, 1],
                                labels=priority_labels,
                                target_names=le_priority.classes_, zero_division=0))

    # Create confusion matrices (visual representation of results)
    plot_confusion_matrices(y_test, y_pred, le_type, le_priority)

    return y_pred

def plot_confusion_matrices(y_test, y_pred, le_type, le_priority):
    """
    Draw side-by-side confusion-matrix heatmaps for ticket type and priority.

    Parameters
    ----------
    y_test : numpy.ndarray of shape (n_samples, 2)
        True labels; column 0 = ticket type, column 1 = priority.
    y_pred : numpy.ndarray of shape (n_samples, 2)
        Predicted labels, same layout as y_test.
    le_type, le_priority : LabelEncoder
        Fitted encoders; their classes_ are used as axis tick labels.

    Returns
    -------
    None
        The figure is shown with plt.show().
    """
    print("\n Creating confusion matrices...")

    # Fix the row/column order to every encoded class (0..n_classes-1).
    # Otherwise confusion_matrix only includes labels present in the data, and
    # a class missing from the test split would shrink the matrix and shift
    # the tick labels onto the wrong rows/columns.
    type_labels = list(range(len(le_type.classes_)))
    priority_labels = list(range(len(le_priority.classes_)))

    # Create side-by-side plots
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # Ticket Type confusion matrix
    cm_type = confusion_matrix(y_test[:, 0], y_pred[:, 0], labels=type_labels)
    sns.heatmap(cm_type, annot=True, fmt='d', ax=ax1, cmap='Blues',
                xticklabels=le_type.classes_, yticklabels=le_type.classes_)
    ax1.set_title('Ticket Type Predictions')
    ax1.set_xlabel('Predicted')
    ax1.set_ylabel('Actual')

    # Priority confusion matrix
    cm_priority = confusion_matrix(y_test[:, 1], y_pred[:, 1], labels=priority_labels)
    sns.heatmap(cm_priority, annot=True, fmt='d', ax=ax2, cmap='Oranges',
                xticklabels=le_priority.classes_, yticklabels=le_priority.classes_)
    ax2.set_title('Priority Level Predictions')
    ax2.set_xlabel('Predicted')
    ax2.set_ylabel('Actual')

    plt.tight_layout()
    plt.show()

# Evaluate our model on the held-out test split; y_pred holds the test-set predictions
y_pred = evaluate_model(model, X_test, y_test, le_type, le_priority)



In [None]:
def predict_new_ticket(text, model, vectorizer, le_type, le_priority):
    """
    Classify a single raw ticket text.

    Parameters
    ----------
    text : str
        Raw ticket description.
    model : fitted estimator
        Model exposing predict() and predict_proba() with two outputs
        (index 0 = ticket type, index 1 = priority).
    vectorizer : fitted vectorizer
        Used to transform the cleaned text into features.
    le_type, le_priority : LabelEncoder
        Fitted encoders used to turn predicted codes back into labels.

    Returns
    -------
    dict
        Keys 'type', 'priority', 'type_confidence', 'priority_confidence';
        each confidence is the largest predicted probability for that output.
    """
    # Same cleaning as the training text, then vectorise as a one-item batch
    features = vectorizer.transform([clean_text(text)])

    # Predicted codes for the single row, and per-output probability arrays
    predicted_row = model.predict(features)[0]
    per_output_proba = model.predict_proba(features)

    # Decode the integer codes back to their label names
    type_label = le_type.inverse_transform([predicted_row[0]])[0]
    priority_label = le_priority.inverse_transform([predicted_row[1]])[0]

    result = {}
    result['type'] = type_label
    result['priority'] = priority_label
    result['type_confidence'] = per_output_proba[0].max()
    result['priority_confidence'] = per_output_proba[1].max()
    return result


In [None]:
def test_examples():
    """
    Run a handful of hand-written tickets through the model and print the results.

    Relies on the notebook-level `model`, `vectorizer`, `le_type` and
    `le_priority` variables being defined by earlier cells.
    """
    print("\n STEP 8: Testing with example tickets...")

    # Hand-written sample tickets
    samples = [
        "My email is not working and I cannot send any messages",
        "I need to cancel my subscription and get a refund",
        "The software keeps crashing when I try to save files",
        "I forgot my password and need help logging in",
        "The printer is not connecting to my computer",
    ]

    print("\n Predictions for example tickets:")
    print("=" * 60)

    number = 0
    for sample in samples:
        number += 1
        outcome = predict_new_ticket(sample, model, vectorizer, le_type, le_priority)
        type_conf = outcome['type_confidence']
        priority_conf = outcome['priority_confidence']

        print(f"\n Example {number}:")
        print(f"Ticket: '{sample}'")
        print(f"  Type: {outcome['type']} (confidence: {type_conf:.2f})")
        print(f" Priority: {outcome['priority']} (confidence: {priority_conf:.2f})")

# Test with examples (uses the model, vectorizer and encoders defined in earlier cells)
test_examples()

In [None]:
def save_model(filepath='ticket_classifier_model.pkl'):
    """
    Persist the trained model and everything needed to use it again.

    Bundles the notebook-level `model`, `vectorizer`, `le_type` and
    `le_priority` variables into one dict and writes it with joblib.

    Parameters
    ----------
    filepath : str, optional
        Destination file. Defaults to 'ticket_classifier_model.pkl' in the
        current working directory.

    Returns
    -------
    None
    """
    print("\n STEP 9: Saving the model...")

    # Save all the components we need; prediction requires the vectorizer and
    # both label encoders in addition to the model itself
    model_components = {
        'model': model,
        'vectorizer': vectorizer,
        'le_type': le_type,
        'le_priority': le_priority
    }

    joblib.dump(model_components, filepath)
    print(f" Model saved as '{filepath}'")

# Save our model bundle (model, vectorizer and both label encoders) to disk
save_model()