# Imports and Setup

In [30]:
# Enable IPython's autoreload extension so edits to imported modules are
# picked up live without restarting the kernel.
# NOTE(review): no project-local modules are imported in the visible cells, so
# this only matters if such imports are added later.
# Re-running this cell in a live kernel prints an "already loaded" notice;
# `%reload_ext autoreload` is the form that notice itself recommends.
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [31]:
import pandas as pd
import numpy as np
import re
import string

# NLP libraries
import contractions
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet


# One-time NLTK resource downloads (each call only reports "already
# up-to-date" when the package is present).
nltk.download('punkt')
# The lemmatization cell uses the '*_eng' tagger resource name, which belongs
# to recent NLTK releases. In those releases nltk.word_tokenize loads its
# sentence splitter from 'punkt_tab' rather than 'punkt', so without this
# download a fresh environment fails with LookupError at the tokenization cell.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Show up to 200 characters per cell so ticket descriptions are not truncated
pd.set_option('display.max_colwidth', 200)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Scott\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Scott\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Scott\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Load and Inspect the Data

In [32]:
# Load the raw support tickets and preview them.

# Read the CSV, using its ticket_id column as the row index
df = pd.read_csv("../data/tickets_small.csv", index_col="ticket_id", header=0)

# Fail fast if a column that later cells rely on is missing, instead of
# surfacing a KeyError several cells away from the actual cause.
missing_cols = {"department", "description"} - set(df.columns)
assert not missing_cols, f"tickets CSV is missing columns: {sorted(missing_cols)}"

# Preview the dataset
print("Shape:", df.shape)
df.head(10)


Shape: (60, 2)


Unnamed: 0_level_0,department,description
ticket_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,IT,Unable to connect to VPN from home.
2,IT,Outlook crashes every time I open an email with an attachment.
3,IT,My laptop battery only lasts 20 minutes after a full charge.
4,IT,"Teams isn't picking up my microphone, even though it's working in other apps."
5,IT,Wi-Fi disconnects randomly throughout the day. Restarting doesn't help.
6,IT,Blue screen appeared during a presentation and the system rebooted.
7,IT,Keyboard keys are sticking and occasionally not registering.
8,IT,"After installing the latest Windows update, my mouse is lagging badly."
9,IT,Can't print from my laptop to the office printer.
10,IT,System is extremely slow when running multiple browser tabs.


# Clean Data

In [33]:
# Cell 3: Basic text cleaning

def clean_text(text):
    """Normalize one ticket description ahead of tokenization.

    Steps, in order: expand contractions, lowercase, delete ASCII punctuation,
    delete digit runs, collapse whitespace.

    Args:
        text: Raw description. Non-string values (e.g. None, or the float NaN
            that stands in for an empty CSV field) are treated as empty text.

    Returns:
        The cleaned string; "" when ``text`` is not a string.
    """
    # The string methods below raise on non-string values, so one missing
    # description would otherwise abort the whole column-wide apply.
    if not isinstance(text, str):
        return ""
    # Expand contractions (e.g., "can't" → "cannot")
    text = contractions.fix(text)
    # Lowercase
    text = text.lower()
    # Delete punctuation outright (no space inserted, so "wi-fi" becomes "wifi")
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Remove digits
    text = re.sub(r"\d+", "", text)
    # Collapse whitespace runs left behind by the removals and trim the ends
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Derive the cleaned column from the raw descriptions, one value at a time
df["clean_text"] = df["description"].map(clean_text)

# Side-by-side preview of raw vs. cleaned text
df[["description", "clean_text"]].head(10)


Unnamed: 0_level_0,description,clean_text
ticket_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Unable to connect to VPN from home.,unable to connect to vpn from home
2,Outlook crashes every time I open an email with an attachment.,outlook crashes every time i open an email with an attachment
3,My laptop battery only lasts 20 minutes after a full charge.,my laptop battery only lasts minutes after a full charge
4,"Teams isn't picking up my microphone, even though it's working in other apps.",teams is not picking up my microphone even though it is working in other apps
5,Wi-Fi disconnects randomly throughout the day. Restarting doesn't help.,wifi disconnects randomly throughout the day restarting does not help
6,Blue screen appeared during a presentation and the system rebooted.,blue screen appeared during a presentation and the system rebooted
7,Keyboard keys are sticking and occasionally not registering.,keyboard keys are sticking and occasionally not registering
8,"After installing the latest Windows update, my mouse is lagging badly.",after installing the latest windows update my mouse is lagging badly
9,Can't print from my laptop to the office printer.,cannot print from my laptop to the office printer
10,System is extremely slow when running multiple browser tabs.,system is extremely slow when running multiple browser tabs


# Tokenization and Stopword Removal

In [34]:
# Baseline English stopword list shipped with NLTK
stop_words = set(stopwords.words('english'))

# Domain-specific stopwords (add/remove based on your data)
# NOTE(review): these are matched against tokens before lemmatization, so an
# inflected form such as "teams" is not caught by the entry "team".
domain_stopwords = {
    "please",
    "help",
    "thanks",
    "thank",
    "issue",
    "problem",
    "anyone",
    "someone",
    "team",
    "hi",
    "hello",
}

# Everything the tokenizer step should drop
all_stopwords = stop_words | domain_stopwords

def tokenize_and_remove_stopwords(text):
    """Tokenize ``text`` and drop every token that appears in ``all_stopwords``.

    Membership is an exact string comparison against the module-level
    ``all_stopwords`` set; surviving tokens keep their original order.

    Args:
        text: The string to tokenize with ``nltk.word_tokenize``.

    Returns:
        A list of the tokens that are not stopwords.
    """
    kept = []
    for token in nltk.word_tokenize(text):
        if token in all_stopwords:
            continue
        kept.append(token)
    return kept

# Tokenize each cleaned description and strip its stopwords
df["tokens"] = df["clean_text"].map(tokenize_and_remove_stopwords)

# Compare the cleaned text with the tokens that survived
df[["clean_text", "tokens"]].head(10)


Unnamed: 0_level_0,clean_text,tokens
ticket_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,unable to connect to vpn from home,"[unable, connect, vpn, home]"
2,outlook crashes every time i open an email with an attachment,"[outlook, crashes, every, time, open, email, attachment]"
3,my laptop battery only lasts minutes after a full charge,"[laptop, battery, lasts, minutes, full, charge]"
4,teams is not picking up my microphone even though it is working in other apps,"[teams, picking, microphone, even, though, working, apps]"
5,wifi disconnects randomly throughout the day restarting does not help,"[wifi, disconnects, randomly, throughout, day, restarting]"
6,blue screen appeared during a presentation and the system rebooted,"[blue, screen, appeared, presentation, system, rebooted]"
7,keyboard keys are sticking and occasionally not registering,"[keyboard, keys, sticking, occasionally, registering]"
8,after installing the latest windows update my mouse is lagging badly,"[installing, latest, windows, update, mouse, lagging, badly]"
9,cannot print from my laptop to the office printer,"[print, laptop, office, printer]"
10,system is extremely slow when running multiple browser tabs,"[system, extremely, slow, running, multiple, browser, tabs]"


# Lemmatization

In [35]:
# Fetch the English averaged-perceptron tagger resource (presumably the model
# that pos_tag() loads below); when it is already present the call only
# reports "already up-to-date".
nltk.download('averaged_perceptron_tagger_eng')

# Single module-level lemmatizer instance, reused by lemmatize_with_pos()
lemmatizer = WordNetLemmatizer()

# Map NLTK POS tags to WordNet POS tags
def get_wordnet_pos(treebank_tag):
    """Translate a POS tag into the matching WordNet POS constant.

    Only the tag's leading letter matters: J → adjective, V → verb, N → noun,
    R → adverb. Any other tag (including an empty one) falls back to noun.

    Args:
        treebank_tag: POS tag string as produced by ``pos_tag``.

    Returns:
        One of ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN``,
        ``wordnet.ADV``.
    """
    # Built per call so the lookup of `wordnet` happens at call time, exactly
    # as in a plain if/elif chain.
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return pos
    return wordnet.NOUN  # fallback to noun

def lemmatize_with_pos(tokens):
    """Lemmatize a token list, using each token's POS tag to pick the lemma.

    The tokens are tagged together with ``pos_tag``; each (word, tag) pair is
    then lemmatized with the WordNet POS returned by ``get_wordnet_pos``.

    Args:
        tokens: List of token strings.

    Returns:
        List of lemmas, one per input token, in the same order.
    """
    lemmas = []
    for word, tag in pos_tag(tokens):
        wn_pos = get_wordnet_pos(tag)
        lemmas.append(lemmatizer.lemmatize(word, wn_pos))
    return lemmas

# Lemmatize every ticket's token list
df["lemmatized"] = df["tokens"].map(lemmatize_with_pos)

# Compare tokens before and after lemmatization
df[["tokens", "lemmatized"]].head(10)


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Scott\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


Unnamed: 0_level_0,tokens,lemmatized
ticket_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,"[unable, connect, vpn, home]","[unable, connect, vpn, home]"
2,"[outlook, crashes, every, time, open, email, attachment]","[outlook, crash, every, time, open, email, attachment]"
3,"[laptop, battery, lasts, minutes, full, charge]","[laptop, battery, last, minute, full, charge]"
4,"[teams, picking, microphone, even, though, working, apps]","[team, pick, microphone, even, though, work, apps]"
5,"[wifi, disconnects, randomly, throughout, day, restarting]","[wifi, disconnect, randomly, throughout, day, restart]"
6,"[blue, screen, appeared, presentation, system, rebooted]","[blue, screen, appear, presentation, system, reboot]"
7,"[keyboard, keys, sticking, occasionally, registering]","[keyboard, key, stick, occasionally, register]"
8,"[installing, latest, windows, update, mouse, lagging, badly]","[instal, late, window, update, mouse, lag, badly]"
9,"[print, laptop, office, printer]","[print, laptop, office, printer]"
10,"[system, extremely, slow, running, multiple, browser, tabs]","[system, extremely, slow, run, multiple, browser, tab]"


# Save the Cleaned Data

In [37]:
# Rebuild one space-separated string per ticket from its lemmas
df["processed_text"] = df["lemmatized"].map(" ".join)

# Write every column, with the ticket_id index, to the preprocessed CSV
# NOTE(review): the list-valued columns ("tokens", "lemmatized") will likely be
# written as their string representation and read back as strings, not lists;
# confirm that consumers only rely on "processed_text".
df.to_csv("../data/tickets_preprocessed.csv", index=True)

# Confirm output
df[["department", "description", "processed_text"]].head(10)

Unnamed: 0_level_0,department,description,processed_text
ticket_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,IT,Unable to connect to VPN from home.,unable connect vpn home
2,IT,Outlook crashes every time I open an email with an attachment.,outlook crash every time open email attachment
3,IT,My laptop battery only lasts 20 minutes after a full charge.,laptop battery last minute full charge
4,IT,"Teams isn't picking up my microphone, even though it's working in other apps.",team pick microphone even though work apps
5,IT,Wi-Fi disconnects randomly throughout the day. Restarting doesn't help.,wifi disconnect randomly throughout day restart
6,IT,Blue screen appeared during a presentation and the system rebooted.,blue screen appear presentation system reboot
7,IT,Keyboard keys are sticking and occasionally not registering.,keyboard key stick occasionally register
8,IT,"After installing the latest Windows update, my mouse is lagging badly.",instal late window update mouse lag badly
9,IT,Can't print from my laptop to the office printer.,print laptop office printer
10,IT,System is extremely slow when running multiple browser tabs.,system extremely slow run multiple browser tab
