In [None]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Basic machine learning imports
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

# For handling imbalanced data
from imblearn.over_sampling import SMOTE

# For text processing
import re
import time


In [None]:
def load_and_explore_data(filepath="customer_support_tickets.csv"):
    """
    Load the support ticket CSV and print a short exploratory summary.

    The summary covers the row/column counts, memory usage, column names,
    the first three rows, per-column missing-value counts and the number of
    rows that are exact duplicates of an earlier row.

    Parameters
    ----------
    filepath : str, default "customer_support_tickets.csv"
        Path of the CSV file; it is read as UTF-8.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame, or None when the file does not exist or any other
        error occurs while loading or summarising. Errors are printed, not
        raised, so callers must check for None before using the result.
    """
    print("\nSTEP 1: Loading and exploring the data")
    print("-" * 50)

    def _percent(count, total):
        # A header-only CSV loads as zero rows; without this guard the
        # percentage is computed as 0/0 and the summary shows "nan%".
        return count / total * 100 if total else 0.0

    try:
        # Load the CSV file
        df = pd.read_csv(filepath, encoding='utf-8')
        n_rows = len(df)
        print(f"Successfully loaded dataset with {n_rows:,} tickets")

        # Show basic information about the dataset
        print(f"Dataset shape: {df.shape[0]} rows x {df.shape[1]} columns")
        print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

        # Display column names
        print(f"\nColumns in dataset: {list(df.columns)}")

        # Show first few rows to understand the data structure
        print("\nFirst 3 rows of data:")
        print(df.head(3))

        # Report only the columns that actually contain missing values
        print("\nMissing values per column:")
        missing_data = df.isnull().sum()
        for col, missing in missing_data.items():
            if missing > 0:
                print(f"  {col}: {missing} missing ({_percent(missing, n_rows):.1f}%)")

        if missing_data.sum() == 0:
            print("  No missing values found - excellent data quality")

        # Count rows that are exact copies of an earlier row
        duplicates = df.duplicated().sum()
        print(f"\nDuplicate records: {duplicates} ({_percent(duplicates, n_rows):.1f}%)")

        return df

    except FileNotFoundError:
        print(f"Error: Could not find file '{filepath}'")
        print("Please make sure the CSV file is in the same directory as this notebook")
        return None
    except Exception as e:
        # Deliberately broad: any other failure (parse error, bad encoding,
        # empty file, ...) is reported and signalled to the caller as None.
        print(f"Error loading data: {str(e)}")
        return None

# Load the data; `df` is None if the file was missing or could not be read
df = load_and_explore_data()

In [None]:
def clean_text(text):
    """
    Normalise one ticket description for text classification.

    Steps, in order: lowercase, drop URLs and e-mail addresses, drop the
    unfilled ``{product_purchased}`` template placeholder, delete every
    character that is not a letter or whitespace, then collapse runs of
    whitespace and trim.

    Parameters
    ----------
    text : object
        The raw ticket text. Non-string values are converted with ``str``.

    Returns
    -------
    str
        The cleaned text, or "" when `text` is missing (None / NaN).
    """
    if pd.isna(text):
        return ""

    # Convert to lowercase (so "HELP" and "help" are treated the same)
    text = str(text).lower()

    # Remove website links and emails (they're not useful for classification)
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)

    # Remove the template placeholder BEFORE stripping punctuation: once the
    # underscore and braces are deleted the token becomes "productpurchased"
    # and can no longer be recognised. Replace it with a space so the
    # neighbouring words stay separated.
    text = re.sub(r'\{?product_purchased?\}?', ' ', text)

    # Delete special characters, keeping letters and whitespace only
    # (punctuation is removed, not replaced, so "can't" becomes "cant")
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Collapse extra spaces last, so gaps left by the removals above vanish
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def prepare_data(df):
    """
    Add a cleaned-text column and drop tickets that have no usable text.

    The input frame is left untouched; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Ticket data containing a 'Ticket Description' column.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` with an extra 'cleaned_text' column (the result of
        ``clean_text`` on each description), restricted to rows whose
        cleaned text is non-empty, with a fresh 0..n-1 index.

    Raises
    ------
    TypeError
        If `df` is None, which is what the loader returns when it fails.
    KeyError
        If the 'Ticket Description' column is absent.
    """
    print("\n STEP 2: Cleaning and preparing the data...")

    if df is None:
        # Fail with an actionable message instead of the opaque
        # "'NoneType' object is not subscriptable".
        raise TypeError(
            "prepare_data() received None instead of a DataFrame; "
            "the data file was probably not loaded successfully"
        )

    # Clean the ticket descriptions
    cleaned = df['Ticket Description'].apply(clean_text)

    # assign() builds a new frame, so the caller's DataFrame is not mutated
    # and re-running the cell always starts from the same input.
    has_text = cleaned.str.len() > 0
    prepared = df.assign(cleaned_text=cleaned)[has_text].reset_index(drop=True)

    print(f" Cleaned {len(prepared):,} tickets")

    # Show some basic statistics about our text
    avg_length = prepared['cleaned_text'].str.len().mean()
    print(f" Average ticket length: {avg_length:.0f} characters")

    return prepared

# Clean our data: rebind `df` to the filtered frame that carries the 'cleaned_text' column
df = prepare_data(df)

In [None]:
def create_features(df):
    """
    Turn the cleaned ticket text into a numeric TF-IDF feature matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the 'cleaned_text' column.

    Returns
    -------
    tuple
        ``(features, vectorizer)`` where `features` is the dense array
        obtained from the fitted TF-IDF matrix and `vectorizer` is the
        fitted ``TfidfVectorizer`` (needed later to transform new text).
    """
    print("\n STEP 3: Converting text to numbers...")

    tfidf_settings = {
        # Cap the vocabulary at the 1000 terms with the highest corpus frequency
        'max_features': 1000,
        # Drop common English words such as "the" and "and"
        'stop_words': 'english',
        # Use single words as well as two-word phrases
        'ngram_range': (1, 2),
        # Ignore terms that occur in fewer than 2 tickets
        'min_df': 2,
    }
    vectorizer = TfidfVectorizer(**tfidf_settings)

    # Learn the vocabulary and weight every ticket in one pass
    tfidf_matrix = vectorizer.fit_transform(df['cleaned_text'])

    n_features = tfidf_matrix.shape[1]
    print(f" Created {n_features} features from text")
    print(f" Feature matrix shape: {tfidf_matrix.shape}")

    # The sparse matrix is expanded into a regular (dense) array for the caller
    dense_features = tfidf_matrix.toarray()
    return dense_features, vectorizer

# Create features from our text: X is the dense TF-IDF array, vectorizer the fitted TfidfVectorizer
X, vectorizer = create_features(df)

In [None]:
def prepare_labels(df):
    """
    Encode the two prediction targets (ticket type and priority) as integers.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the 'Ticket Type' and 'Ticket Priority' columns.

    Returns
    -------
    tuple
        ``(y, le_type, le_priority)`` where `y` has one row per ticket and
        two columns (type code, priority code), and the two fitted
        ``LabelEncoder`` objects map the codes back to the original labels.
    """
    print("\n STEP 4: Preparing labels...")

    # One independent encoder per target so each keeps its own class list
    le_type, le_priority = LabelEncoder(), LabelEncoder()
    type_codes = le_type.fit_transform(df['Ticket Type'])
    priority_codes = le_priority.fit_transform(df['Ticket Priority'])

    # Place the two code vectors side by side: column 0 = type, column 1 = priority
    y = np.column_stack((type_codes, priority_codes))

    # Report the label vocabularies
    print(f" Ticket Types: {list(le_type.classes_)}")
    print(f" Priority Levels: {list(le_priority.classes_)}")

    # Report the class balance; a label's code is its position in classes_
    print("\n How many tickets of each type:")
    for code, label in enumerate(le_type.classes_):
        n_tickets = np.sum(type_codes == code)
        print(f"   {label}: {n_tickets}")

    return y, le_type, le_priority

# Prepare our labels: y holds one [type code, priority code] row per ticket,
# le_type / le_priority are the fitted encoders for decoding predictions
y, le_type, le_priority = prepare_labels(df)