# Imports and Setup

In [38]:
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [39]:
import pandas as pd
import numpy as np
import re
import string

# NLP libraries
import contractions
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet


# One-time NLTK resource downloads (each call is a no-op when the package is
# already up to date).
# 'punkt_tab' is the tokenizer data that nltk.word_tokenize loads on recent
# NLTK releases (the same generation that introduced the
# 'averaged_perceptron_tagger_eng' resource used later in this notebook);
# without it a fresh environment fails with a LookupError at tokenization.
# 'punkt' is kept so older NLTK releases still find their tokenizer models.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Show up to 200 characters per cell so ticket text is not truncated in previews
pd.set_option('display.max_colwidth', 200)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Scott\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Scott\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Scott\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Load and Inspect the Data

In [40]:
# Cell 2: Load and inspect the data

# Read the ticket export; the first row holds the column names and the
# ticket_id column becomes the row index
df = pd.read_csv(
    "../data/tickets_medium.csv",
    header=0,
    index_col="ticket_id",
)

# Report the frame's dimensions, then display the first ten tickets
print("Shape:", df.shape)
df.head(n=10)


Shape: (100, 2)


Unnamed: 0_level_0,department,description
ticket_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,IT,VPN disconnects randomly during video calls. It started after the last system update.
2,IT,My laptop battery dies within 30 minutes even after a full charge.
3,IT,Outlook keeps freezing when I try to search for emails.
4,IT,Wi-Fi drops every time I join a Teams meeting.
5,IT,Laptop won’t connect to the company network. Tried restarting and updating drivers.
6,IT,Teams notifications don’t show up on my desktop until hours later.
7,IT,Unable to install Zoom due to admin restrictions.
8,IT,My external keyboard stopped working after plugging into the docking station.
9,IT,Outlook calendar events are missing after syncing with mobile.
10,IT,System is extremely slow after the last antivirus update.


# Clean Data

In [41]:
# Cell 3: Basic text cleaning

# Typographic quotes mapped to their ASCII equivalents so that they are
# treated exactly like ' and " by the steps in clean_text.
_QUOTE_NORMALIZATION = str.maketrans({
    "\u2018": "'",   # left single quotation mark
    "\u2019": "'",   # right single quotation mark / curly apostrophe
    "\u201c": '"',   # left double quotation mark
    "\u201d": '"',   # right double quotation mark
})

# ASCII punctuation is deleted (so "Wi-Fi" still becomes "wifi"); dashes and
# the ellipsis character, which string.punctuation does not contain, are
# replaced by a space because they normally separate words.
_PUNCTUATION_TABLE = str.maketrans({
    **{ch: None for ch in string.punctuation},
    "\u2013": " ",   # en dash
    "\u2014": " ",   # em dash
    "\u2026": " ",   # horizontal ellipsis
})


def clean_text(text):
    """Normalize one ticket description into lowercase, punctuation-free text.

    Steps, in order: normalize typographic quotes to ASCII, expand
    contractions (e.g., "can't" -> "cannot"), lowercase, strip punctuation,
    remove digit runs, and collapse whitespace.

    Parameters
    ----------
    text : str
        Raw description. Any non-string value (e.g. the float NaN that pandas
        uses for a missing cell, or None) is treated as empty.

    Returns
    -------
    str
        The cleaned text; "" when the input is not a string.
    """
    # Missing values would otherwise raise inside the string operations below
    # and abort the whole .apply() call.
    if not isinstance(text, str):
        return ""
    # Make curly quotes/apostrophes behave like their ASCII counterparts
    text = text.translate(_QUOTE_NORMALIZATION)
    # Expand contractions (e.g., "can't" → "cannot")
    text = contractions.fix(text)
    # Lowercase
    text = text.lower()
    # Remove punctuation (ASCII and the common typographic marks above)
    text = text.translate(_PUNCTUATION_TABLE)
    # Remove digits
    text = re.sub(r"\d+", "", text)
    # Remove extra whitespace
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Run every raw description through clean_text and store the result next to it
df["clean_text"] = df["description"].apply(clean_text)

# Side-by-side preview of the raw and cleaned text for the first ten tickets
df.loc[:, ["description", "clean_text"]].head(n=10)


Unnamed: 0_level_0,description,clean_text
ticket_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,VPN disconnects randomly during video calls. It started after the last system update.,vpn disconnects randomly during video calls it started after the last system update
2,My laptop battery dies within 30 minutes even after a full charge.,my laptop battery dies within minutes even after a full charge
3,Outlook keeps freezing when I try to search for emails.,outlook keeps freezing when i try to search for emails
4,Wi-Fi drops every time I join a Teams meeting.,wifi drops every time i join a teams meeting
5,Laptop won’t connect to the company network. Tried restarting and updating drivers.,laptop will not connect to the company network tried restarting and updating drivers
6,Teams notifications don’t show up on my desktop until hours later.,teams notifications do not show up on my desktop until hours later
7,Unable to install Zoom due to admin restrictions.,unable to install zoom due to admin restrictions
8,My external keyboard stopped working after plugging into the docking station.,my external keyboard stopped working after plugging into the docking station
9,Outlook calendar events are missing after syncing with mobile.,outlook calendar events are missing after syncing with mobile
10,System is extremely slow after the last antivirus update.,system is extremely slow after the last antivirus update


# Tokenization and Stopword Removal

In [42]:
# NLTK's English stopword list, held as a set for fast membership tests
stop_words = set(stopwords.words('english'))

# Filler words typical of support tickets (add/remove based on your data)
domain_stopwords = {
    "hello", "hi",
    "please", "help",
    "thank", "thanks",
    "issue", "problem",
    "anyone", "someone", "team",
}

# Everything that tokenize_and_remove_stopwords should discard
all_stopwords = stop_words | domain_stopwords

def tokenize_and_remove_stopwords(text):
    """Tokenize ``text`` and drop every token that appears in ``all_stopwords``.

    Tokenization is delegated to ``nltk.word_tokenize``. The stopword check is
    an exact membership test against the module-level ``all_stopwords`` set.

    Returns the surviving tokens as a list, in their original order.
    """
    return [
        token
        for token in nltk.word_tokenize(text)
        if token not in all_stopwords
    ]

# Tokenize each cleaned description and filter out stopwords
df["tokens"] = df["clean_text"].apply(tokenize_and_remove_stopwords)

# Preview the cleaned text next to its token list
df.loc[:, ["clean_text", "tokens"]].head(n=10)


Unnamed: 0_level_0,clean_text,tokens
ticket_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,vpn disconnects randomly during video calls it started after the last system update,"[vpn, disconnects, randomly, video, calls, started, last, system, update]"
2,my laptop battery dies within minutes even after a full charge,"[laptop, battery, dies, within, minutes, even, full, charge]"
3,outlook keeps freezing when i try to search for emails,"[outlook, keeps, freezing, try, search, emails]"
4,wifi drops every time i join a teams meeting,"[wifi, drops, every, time, join, teams, meeting]"
5,laptop will not connect to the company network tried restarting and updating drivers,"[laptop, connect, company, network, tried, restarting, updating, drivers]"
6,teams notifications do not show up on my desktop until hours later,"[teams, notifications, show, desktop, hours, later]"
7,unable to install zoom due to admin restrictions,"[unable, install, zoom, due, admin, restrictions]"
8,my external keyboard stopped working after plugging into the docking station,"[external, keyboard, stopped, working, plugging, docking, station]"
9,outlook calendar events are missing after syncing with mobile,"[outlook, calendar, events, missing, syncing, mobile]"
10,system is extremely slow after the last antivirus update,"[system, extremely, slow, last, antivirus, update]"


# Lemmatization

In [43]:
# Shared WordNet lemmatizer instance used by lemmatize_with_pos
lemmatizer = WordNetLemmatizer()

# Fetch the POS-tagger resource (reported as already up to date when present)
nltk.download('averaged_perceptron_tagger_eng')

# Map NLTK POS tags to WordNet POS tags
def get_wordnet_pos(treebank_tag):
    """Translate an NLTK POS tag into the matching WordNet POS constant.

    Only the first character of ``treebank_tag`` is inspected:
    'J' -> adjective, 'V' -> verb, 'N' -> noun, 'R' -> adverb.
    Any other tag (including an empty string) falls back to noun.
    """
    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # Slicing (rather than indexing) keeps an empty tag on the fallback path
    return first_letter_to_pos.get(treebank_tag[:1], wordnet.NOUN)

def lemmatize_with_pos(tokens):
    """Lemmatize a list of tokens, using each token's POS tag as a hint.

    The tokens are tagged with ``pos_tag``; each tag is converted by
    ``get_wordnet_pos`` and passed to the shared ``lemmatizer``.

    Returns a list of lemmas with the same length and order as ``tokens``.

    NOTE(review): tagging runs on whatever list is passed in. In this notebook
    that is the stopword-filtered token list, so the tagger sees less context
    than the original sentence had. This is presumably why "dies" comes out
    as "dy" in the preview. Confirm, and consider tagging before stopword
    removal.
    """
    lemmas = []
    for word, tag in pos_tag(tokens):
        lemmas.append(lemmatizer.lemmatize(word, get_wordnet_pos(tag)))
    return lemmas

# Lemmatize each ticket's token list
df["lemmatized"] = df["tokens"].apply(lemmatize_with_pos)

# Preview the tokens next to their lemmas
df.loc[:, ["tokens", "lemmatized"]].head(n=10)


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Scott\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


Unnamed: 0_level_0,tokens,lemmatized
ticket_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"[vpn, disconnects, randomly, video, calls, started, last, system, update]","[vpn, disconnect, randomly, video, call, start, last, system, update]"
2,"[laptop, battery, dies, within, minutes, even, full, charge]","[laptop, battery, dy, within, minute, even, full, charge]"
3,"[outlook, keeps, freezing, try, search, emails]","[outlook, keep, freeze, try, search, email]"
4,"[wifi, drops, every, time, join, teams, meeting]","[wifi, drop, every, time, join, team, meeting]"
5,"[laptop, connect, company, network, tried, restarting, updating, drivers]","[laptop, connect, company, network, try, restart, update, driver]"
6,"[teams, notifications, show, desktop, hours, later]","[team, notification, show, desktop, hour, later]"
7,"[unable, install, zoom, due, admin, restrictions]","[unable, install, zoom, due, admin, restriction]"
8,"[external, keyboard, stopped, working, plugging, docking, station]","[external, keyboard, stop, work, plug, dock, station]"
9,"[outlook, calendar, events, missing, syncing, mobile]","[outlook, calendar, event, miss, sync, mobile]"
10,"[system, extremely, slow, last, antivirus, update]","[system, extremely, slow, last, antivirus, update]"


# Save the Cleaned Data

In [44]:
# Collapse each list of lemmas into one space-separated string
df["processed_text"] = df["lemmatized"].apply(" ".join)

# Write the whole frame to disk, with the ticket_id index as the first column.
# NOTE(review): "tokens" and "lemmatized" hold Python lists, so the CSV stores
# them as text. A later read_csv will return strings, not lists.
df.to_csv("../data/tickets_preprocessed.csv", index=True)

# Confirm output: original text next to the final processed text
df.loc[:, ["department", "description", "processed_text"]].head(n=10)

Unnamed: 0_level_0,department,description,processed_text
ticket_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,IT,VPN disconnects randomly during video calls. It started after the last system update.,vpn disconnect randomly video call start last system update
2,IT,My laptop battery dies within 30 minutes even after a full charge.,laptop battery dy within minute even full charge
3,IT,Outlook keeps freezing when I try to search for emails.,outlook keep freeze try search email
4,IT,Wi-Fi drops every time I join a Teams meeting.,wifi drop every time join team meeting
5,IT,Laptop won’t connect to the company network. Tried restarting and updating drivers.,laptop connect company network try restart update driver
6,IT,Teams notifications don’t show up on my desktop until hours later.,team notification show desktop hour later
7,IT,Unable to install Zoom due to admin restrictions.,unable install zoom due admin restriction
8,IT,My external keyboard stopped working after plugging into the docking station.,external keyboard stop work plug dock station
9,IT,Outlook calendar events are missing after syncing with mobile.,outlook calendar event miss sync mobile
10,IT,System is extremely slow after the last antivirus update.,system extremely slow last antivirus update
