In [20]:
%run config.py

In [2]:
import numpy as np
import pandas as pd
from copy import deepcopy

#visualization packages
import seaborn as sns
import matplotlib.pyplot as plt

# NLP modules we will use for text normalization
import re #regex 
import nltk # the natural language toolkit
from nltk.tokenize import word_tokenize
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import pos_tag

# feature construction
from sklearn.feature_extraction.text import TfidfVectorizer #use this to create BoW matrix

In [3]:
df = pd.read_csv('Data/customer_support_tickets.csv')

In [4]:
import pyLDAvis.lda_model


#modeling and dimensionality reduction for visuaization
from sklearn.decomposition import NMF
from sklearn.manifold import TSNE

In [5]:
df['Ticket Description'] = df.apply(lambda x: x['Ticket Description'].replace("{product_purchased}", x['Product Purchased']), axis=1)


In [6]:
df['Ticket Description'] = df['Ticket Description'].str.lower()

In [7]:
cleaned_df = df[['Ticket ID','Product Purchased','Ticket Type','Ticket Description']]

In [8]:
cleaned_df

Unnamed: 0,Ticket ID,Product Purchased,Ticket Type,Ticket Description
0,1,GoPro Hero,Technical issue,i'm having an issue with the gopro hero. pleas...
1,2,LG Smart TV,Technical issue,i'm having an issue with the lg smart tv. plea...
2,3,Dell XPS,Technical issue,i'm facing a problem with my dell xps. the del...
3,4,Microsoft Office,Billing inquiry,i'm having an issue with the microsoft office....
4,5,Autodesk AutoCAD,Billing inquiry,i'm having an issue with the autodesk autocad....
...,...,...,...,...
8464,8465,LG OLED,Product inquiry,my lg oled is making strange noises and not fu...
8465,8466,Bose SoundLink Speaker,Technical issue,i'm having an issue with the bose soundlink sp...
8466,8467,GoPro Action Camera,Technical issue,i'm having an issue with the gopro action came...
8467,8468,PlayStation,Product inquiry,i'm having an issue with the playstation. plea...


In [9]:
def process_ticket(ticket_text, min_length):
    
    # get common stop words that we'll remove during tokenization/text normalization
    stop_words = stopwords.words('english')

    #initialize lemmatizer
    wnl = WordNetLemmatizer()

    # helper function to change nltk's part of speech tagging to a wordnet format.
    def pos_tagger(nltk_tag):
        if nltk_tag.startswith('J'):
            return wordnet.ADJ
        elif nltk_tag.startswith('V'):
            return wordnet.VERB
        elif nltk_tag.startswith('N'):
            return wordnet.NOUN
        elif nltk_tag.startswith('R'):
            return wordnet.ADV
        else:         
            return None
   

    # lower case everything
    ticket_lower = ticket_text.lower()

    ticket_lower = re.sub(r"@[a-z0-9_]+|#[a-z0-9_]+|http\S+", "", ticket_lower).strip().replace("\r", "").replace("\n", "").replace("\t", "")

    # remove stop words and punctuations 
    
    extra_stop_words = ['issue','help','problem','review','name','seem','experience','please']
    
    ticket_norm = [x for x in word_tokenize(ticket_lower) if ((x.isalpha()) & (x not in stop_words)) ]
    ticket_norm = [x for x in word_tokenize(ticket_lower) if ((x.isalpha()) & (x not in extra_stop_words)) ]

    #  POS detection on the result will be important in telling Wordnet's lemmatizer how to lemmatize
    
    # creates list of tuples with tokens and POS tags in wordnet format
    wordnet_tagged = list(map(lambda x: (x[0], pos_tagger(x[1])), pos_tag(ticket_norm))) 

    # now we are going to have a cutoff here. any tokenized cocument with length < min length will be removed from corpus
    if len(wordnet_tagged) <= min_length:
        return ''
    else:
         # rejoins lemmatized sentence 
        ticket_norm = " ".join([wnl.lemmatize(x[0], x[1]) for x in wordnet_tagged if x[1] is not None])
        return ticket_norm

In [10]:
cleaned_df['Ticket Description'] = cleaned_df['Ticket Description'].apply(process_ticket, args = [10])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['Ticket Description'] = cleaned_df['Ticket Description'].apply(process_ticket, args = [10])


In [11]:
canon_df = cleaned_df[cleaned_df['Product Purchased']=='Canon EOS']

In [12]:
for type in canon_df['Ticket Type'].unique():
    print(type)

Refund request
Product inquiry
Technical issue
Billing inquiry
Cancellation request


In [13]:
for ticket_type in canon_df['Ticket Type'].unique():
    corpus = canon_df[canon_df['Ticket Type']==ticket_type]['Ticket Description']
    print(ticket_type)
    print('-------')
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(corpus)
    X_train

    topic_model = NMF(n_components = 5)
    topic_model.fit(X_train)

    # to get H
    H = topic_model.transform(X_train) # transform document into topic vector representation

    # to get W 
    W = topic_model.components_ # word component weights for each topic

    for index,topic in enumerate(W):
        print(f'THE TOP 10 WORDS FOR TOPIC #{index}')
        print([vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-25:]])
        print('\n')

Refund request
-------
THE TOP 10 WORDS FOR TOPIC #0
['message', 'error', 'mean', 'screen', 'up', 'join', 'resolve', 'notice', 'troubleshoot', 'assist', 'be', 'order', 'connect', 'network', 'do', 'have', 'use', 'try', 'peripheral', 'cable', 'different', 'adapter', 'eos', 'persists', 'canon']


THE TOP 10 WORDS FOR TOPIC #1
['page', 'be', 'tell', 'type', 'need', 'do', 'store', 'back', 'want', 'something', 'see', 'search', 'get', 'have', 'assist', 'eos', 'canon', 'step', 'option', 'find', 'perform', 'guide', 'unable', 'action', 'desired']


THE TOP 10 WORDS FOR TOPIC #2
['webapp', 'hand', 'xbox', 'team', 'official', 'live', 'information', 'widget', 'label', 'title', 'show', 'account', 'log', 'have', 'use', 'issue', 'be', 'work', 'need', 'assistance', 'affect', 'productivity', 'possible', 'as', 'soon']


THE TOP 10 WORDS FOR TOPIC #3
['there', 'shop', 'id', 'check', 'cost', 'account', 'purchase', 'make', 'occur', 'way', 'recover', 'software', 'lose', 'have', 'like', 'update', 'security', 

In [14]:
# Check if 'Assigned Topic' and 'Topic Words' columns exist, if not, create them
if 'Assigned Topic' not in cleaned_df.columns:
    cleaned_df['Assigned Topic'] = None
if 'Topic Words' not in cleaned_df.columns:
    cleaned_df['Topic Words'] = None

# Initialize a dictionary to store top words for each topic of each product and ticket type
topic_words_dict = {}

# Iterate over each product
for product in cleaned_df['Product Purchased'].unique():
    # Filter for tickets related to the current product
    product_df = cleaned_df[cleaned_df['Product Purchased'] == product]
    
    for ticket_type in product_df['Ticket Type'].unique():
        # Selecting the subset of the DataFrame for the current ticket type
        subset_df = product_df[product_df['Ticket Type'] == ticket_type]
        corpus = subset_df['Ticket Description']
        ticket_ids = subset_df.index  # Using DataFrame index as a proxy for ticket ID if 'Ticket ID' column does not exist

        # Initialize TF-IDF Vectorizer and NMF Model
        vectorizer = TfidfVectorizer()
        X_train = vectorizer.fit_transform(corpus)
        topic_model = NMF(n_components=5, random_state=42)  # Added random_state for reproducibility
        topic_model.fit(X_train)

        # Transform document into topic vector representation
        H = topic_model.transform(X_train)  # Document-topic matrix
        W = topic_model.components_  # Topic-term matrix

        # For each topic, store the top words
        for topic_idx, topic in enumerate(W):
            top_words = [vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-20:]]  # Get top 10 words
            topic_key = f"{product}_{ticket_type}_Topic{topic_idx}"
            topic_words_dict[topic_key] = ', '.join(top_words)  # Store as a comma-separated string

        # Assign tickets to topics based on the highest topic weight
        for ticket_index, topic_vector in zip(ticket_ids, H):
            # Find the topic with the highest weight for this ticket
            assigned_topic = topic_vector.argmax()
            topic_key = f"{product}_{ticket_type}_Topic{assigned_topic}"
            # Update the 'Assigned Topic' in cleaned_df directly
            cleaned_df.at[ticket_index, 'Assigned Topic'] = topic_key

# Map the 'Assigned Topic' to its corresponding top words
cleaned_df['Topic Words'] = cleaned_df['Assigned Topic'].map(topic_words_dict)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['Assigned Topic'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['Topic Words'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['Topic Words'] = cleaned_df['Assigned Topic'].map(topic_words_dict)


In [15]:
cleaned_df

Unnamed: 0,Ticket ID,Product Purchased,Ticket Type,Ticket Description,Assigned Topic,Topic Words
0,1,GoPro Hero,Technical issue,i have gopro hero billing zip code be apprecia...,GoPro Hero_Technical issue_Topic0,"pop, mean, notice, setting, configuration, run..."
1,2,LG Smart TV,Technical issue,i have lg smart tv need change exist have lg s...,LG Smart TV_Technical issue_Topic0,"customer, make, do, tvlg, product, sure, devic..."
2,3,Dell XPS,Technical issue,i face dell dell xps be not turn be work fine ...,Dell XPS_Technical issue_Topic1,"support, review, official, other, time, dell, ..."
3,4,Microsoft Office,Billing inquiry,i have microsoft office have interested i love...,Microsoft Office_Billing inquiry_Topic2,"firmware, started, relate, recently, happen, o..."
4,5,Autodesk AutoCAD,Billing inquiry,i have autodesk autocad seller be not responsi...,Autodesk AutoCAD_Billing inquiry_Topic1,"time, now, add, fine, not, autocad, product, m..."
...,...,...,...,...,...,...
8464,8465,LG OLED,Product inquiry,lg oled be make strange noise not function pro...,LG OLED_Product inquiry_Topic1,"lg, oled, properly, function, hardware, time, ..."
8465,8466,Bose SoundLink Speaker,Technical issue,i have bose soundlink speaker i also buy i get...,Bose SoundLink Speaker_Technical issue_Topic1,"frequently, product, have, price, note, list, ..."
8466,8467,GoPro Action Camera,Technical issue,i have gopro action camera assist be use diffe...,GoPro Action Camera_Technical issue_Topic1,"unable, product, find, br, one, browser, diffe..."
8467,8468,PlayStation,Product inquiry,i have playstation assist i do think product b...,PlayStation_Product inquiry_Topic0,"use, specific, update, do, notice, same, consi..."


In [16]:
cleaned_df[cleaned_df['Product Purchased']=='Canon EOS']['Topic Words']

19      account, purchase, make, occur, way, recover, ...
36      join, resolve, notice, troubleshoot, assist, b...
85      be, use, troubleshoot, password, review, eos, ...
144     do, store, back, want, something, see, search,...
199     payment, uipybar, software, run, already, late...
                              ...                        
8222    support, step, eos, canon, have, website, revi...
8229    be, purchase, product, item, use, like, ensure...
8324    intermittent, respond, face, eos, canon, yeste...
8341    support, step, eos, canon, have, website, revi...
8349    report, be, do, account, see, assist, have, th...
Name: Topic Words, Length: 240, dtype: object

In [17]:
cleaned_df

Unnamed: 0,Ticket ID,Product Purchased,Ticket Type,Ticket Description,Assigned Topic,Topic Words
0,1,GoPro Hero,Technical issue,i have gopro hero billing zip code be apprecia...,GoPro Hero_Technical issue_Topic0,"pop, mean, notice, setting, configuration, run..."
1,2,LG Smart TV,Technical issue,i have lg smart tv need change exist have lg s...,LG Smart TV_Technical issue_Topic0,"customer, make, do, tvlg, product, sure, devic..."
2,3,Dell XPS,Technical issue,i face dell dell xps be not turn be work fine ...,Dell XPS_Technical issue_Topic1,"support, review, official, other, time, dell, ..."
3,4,Microsoft Office,Billing inquiry,i have microsoft office have interested i love...,Microsoft Office_Billing inquiry_Topic2,"firmware, started, relate, recently, happen, o..."
4,5,Autodesk AutoCAD,Billing inquiry,i have autodesk autocad seller be not responsi...,Autodesk AutoCAD_Billing inquiry_Topic1,"time, now, add, fine, not, autocad, product, m..."
...,...,...,...,...,...,...
8464,8465,LG OLED,Product inquiry,lg oled be make strange noise not function pro...,LG OLED_Product inquiry_Topic1,"lg, oled, properly, function, hardware, time, ..."
8465,8466,Bose SoundLink Speaker,Technical issue,i have bose soundlink speaker i also buy i get...,Bose SoundLink Speaker_Technical issue_Topic1,"frequently, product, have, price, note, list, ..."
8466,8467,GoPro Action Camera,Technical issue,i have gopro action camera assist be use diffe...,GoPro Action Camera_Technical issue_Topic1,"unable, product, find, br, one, browser, diffe..."
8467,8468,PlayStation,Product inquiry,i have playstation assist i do think product b...,PlayStation_Product inquiry_Topic0,"use, specific, update, do, notice, same, consi..."


In [18]:
import random
index = random.randint(0, cleaned_df.shape[0])

# cleaned_df.iloc[[index]]

product = cleaned_df.iloc[index]['Product Purchased']
issue_type = cleaned_df.iloc[index]['Ticket Type']
topic_words = cleaned_df.iloc[index]['Topic Words']
description = cleaned_df.iloc[index]['Ticket Description']

print(product)
print(issue_type)
print(topic_words)
print()
print(description)


Nikon D
Billing inquiry
be, resolve, do, perform, nikon, hop, reset, factory, use, software, make, thank, seem, provide, there, difficult, frequently, glitch, solution, freeze

there seem be glitch nikon d software freeze frequently make difficult use provide solution unfortunately do have bug system be i try clear cache data nikon d app persists


In [21]:
load_dotenv()

api_key = os.getenv('OPENAI_API_KEY')

client = OpenAI(api_key = api_key)

response = client.chat.completions.create(
  model="gpt-3.5-turbo",
  messages=[
    {"role": "system", "content": "You are a customer support bot. Your job is to give helpful advice when a customer writes in with a given issue. We have used a machine learning model to assign a topic to their specific request based on the top 20 words associated with that topic. The response should always contain a resolution, there will be no opportunity for followup from the user."},
    {"role": "user", "content": f"Product is {product}, Issue type is {issue_type}, Topic words are {topic_words}"}
  ]
)

print(response.choices[0].message.content)

If you are experiencing billing issues with your Nikon D product, first ensure that you have been charged correctly for your purchase. If there appears to be an error, you may want to check with your bank or contact Nikon's customer support for further assistance. Make sure to have all relevant billing information on hand when discussing the issue.
