# AI TICKET CLASSIFIER - COMPLETE ASSESSMENT SOLUTION

This script implements a complete ticket classification system with:
1. Data setup and analysis
2. NLP pipeline with text processing
3. Machine Learning model training and evaluation
4. Named Entity Recognition (NER)

# Importing Everything we need

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from nltk.tree import Tree
import re

# Download required NLTK data

In [3]:
# Each entry pairs the resource path probed with nltk.data.find() with the
# package name passed to nltk.download() when that probe raises LookupError.
nltk_resources = (
  ('tokenizers/punkt', 'punkt'),
  ('tokenizers/punkt_tab', 'punkt_tab'),
  ('corpora/stopwords', 'stopwords'),
  ('corpora/wordnet', 'wordnet'),
  ('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger'),
  ('chunkers/maxent_ne_chunker', 'maxent_ne_chunker'),
  ('corpora/words', 'words'),
)

# Probe the resources in order and download only the ones that are missing.
for resource_path, package_name in nltk_resources:
  try:
    nltk.data.find(resource_path)
  except LookupError:
    nltk.download(package_name)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


# Creating Ticket Classifier Class

In [9]:
class TicketClassifier:
  """Holds the text-processing/model objects and the loaded ticket data.

  Attributes assigned by ``load_and_analyze_data``:
    df: every ticket that was loaded (empty when no CSV could be read).
    labeled_df: copy of the rows whose ``category`` is not the empty string.
    unlabeled_df: copy of the rows whose ``category`` is the empty string.
  """

  # Columns read by load_and_analyze_data; also the schema of the empty
  # fallback frame used when no CSV can be read.
  _TICKET_COLUMNS = ('ticket_text', 'category')

  def __init__(self):
    """Create the TF-IDF vectorizer, Naive Bayes model, lemmatizer and stop-word set."""
    # `self` is Python's counterpart of `this` in JavaScript.
    self.vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
    self.classifier = MultinomialNB()
    self.lemmatizer = WordNetLemmatizer()
    self.stop_words = set(stopwords.words('english'))

  def load_and_analyze_data(self, csv_file=None):
    """TASK 1: Load and analyze the ticket data.

    Reads ``csv_file`` into ``self.df``, prints a per-category summary and
    splits the rows into ``self.labeled_df`` and ``self.unlabeled_df``.

    Args:
      csv_file: Path of a CSV file with ``ticket_text`` and ``category``
        columns. When it is omitted/empty or the file does not exist, a
        message is printed and an empty data set is used, so the summary
        below reports zero tickets instead of failing on a missing
        ``self.df``.

    Raises:
      KeyError: If the CSV lacks a ``ticket_text`` or ``category`` column.
    """
    print("=== TASK 1: AUTOMATE TICKET CLASSIFICATION ===")
    print("Objective: Set up the dataset \n")

    self.df = self._read_tickets(csv_file)

    print("\n Data Collection Complete:")
    print(f" Total ticket loaded: {len(self.df)}")

    print("\n 2. Quick check - Category Distribution:")
    category_counts = self.df['category'].value_counts()
    for label in ('Technical', 'Billing', 'General'):
      print(f" {label}: {category_counts.get(label, 0)} tickets")

    # A ticket counts as unlabeled when its category is the empty string
    # (missing categories were replaced by '' while loading).
    is_unlabeled = self.df['category'] == ''
    self.labeled_df = self.df[~is_unlabeled].copy()
    self.unlabeled_df = self.df[is_unlabeled].copy()

    print(f"\n Unlabeled tickets: {len(self.unlabeled_df)}")
    print(f" Labeled tickets: {len(self.labeled_df)}")

    print("\n Data Split:")
    print(f" Labeled tickets: {len(self.labeled_df)}")
    print(f" Unlabeled tickets: {len(self.unlabeled_df)}\n")

  def _read_tickets(self, csv_file):
    """Return the cleaned tickets from ``csv_file``, or an empty frame if unreadable."""
    if not csv_file:
      print("Could not find csv file")
      return self._empty_tickets()

    try:
      tickets = pd.read_csv(csv_file)
    except FileNotFoundError:
      print("Could not find csv file exception")
      return self._empty_tickets()

    print(f"Loaded {len(tickets)} tickets from {csv_file}")

    # Missing descriptions get a placeholder so text processing never sees NaN.
    if tickets['ticket_text'].isna().any():
      print("Found missing ticket_text values, filling with default text")
      tickets['ticket_text'] = tickets['ticket_text'].fillna('No description provided')

    # Ensure all ticket_text entries are strings.
    tickets['ticket_text'] = tickets['ticket_text'].astype(str)

    # A missing category becomes '' so it is treated as "unlabeled".
    tickets['category'] = tickets['category'].fillna('')
    return tickets

  def _empty_tickets(self):
    """Return a zero-row frame that still has the expected ticket columns."""
    return pd.DataFrame(columns=list(self._TICKET_COLUMNS))





# Invoking our class

In [10]:
# Build the classifier, then run Task 1 (load + summarize) on the tickets CSV.
tickets_csv = '/content/tickets.csv'
classifier = TicketClassifier()
classifier.load_and_analyze_data(csv_file=tickets_csv)

=== TASK 1: AUTOMATE TICKET CLASSIFICATION ===
Objective: Set up the dataset 

Loaded 20 tickets from /content/tickets.csv

 Data Collection Complete:
 Total ticket loaded: 20

 2. Quick check - Category Distribution:
 Technical: 5 tickets
 Billing: 5 tickets
 General: 5 tickets

 Unlabeled tickets: 5
 Labeled tickets: 15

 Data Split:
 Labeled tickets: 15
 Unlabeled tickets: 5

