### Read in data

In [14]:
import pandas as pd
import plotly.express as px
from ydata_profiling import ProfileReport

# Show every column when a DataFrame is displayed in the notebook
# (max_columns=None removes the column cap, so wide frames are not elided).
pd.set_option('display.max_columns', None)

def get_nonnull_cols(in_df):
    """Return the labels of the columns that hold at least one non-null value.

    Parameters
    ----------
    in_df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    list
        Column labels, in their original order, for which at least one row is
        non-null. Columns that are entirely null are left out; a frame with
        zero rows therefore yields an empty list.
    """
    # One vectorised pass over the frame, instead of materialising a filtered
    # copy of the whole DataFrame once per column just to count its rows.
    has_value = in_df.notna().any(axis=0)

    return [col for col, keep in has_value.items() if keep]

def read_inc_data(infile):
    """Load the incident CSV and apply the notebook's standard clean-up.

    Steps, in order:
      1. read ``infile`` with ``pd.read_csv``;
      2. drop columns that contain no non-null value;
      3. drop rows whose ``u_cause_code`` is 'Training issue' or 'User error';
      4. drop rows whose ``short_description_NER`` is null;
      5. reset the index.

    Parameters
    ----------
    infile : str, path-like or file-like
        Anything accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The filtered incidents with a fresh 0..n-1 index. Because
        ``reset_index()`` is called without ``drop=True``, the pre-filter row
        labels are kept in an extra ``index`` column.

    Raises
    ------
    KeyError
        If ``u_cause_code`` is missing from the file, or if
        ``short_description_NER`` is missing from the file or entirely null
        (an all-null column is removed in step 2).
    """
    raw_df = pd.read_csv(infile)

    # Build the cause-code exclusion mask on the raw frame. If u_cause_code is
    # entirely null it is removed together with the other all-null columns
    # below; looking it up afterwards would raise KeyError even though there is
    # simply nothing to exclude.
    excluded_cause = raw_df['u_cause_code'].isin(['Training issue', 'User error'])

    # Remove columns that are 100% null
    inc_df = raw_df[get_nonnull_cols(raw_df)]

    # Remove training issue, user error
    inc_df = inc_df[~excluded_cause]

    # Drop incidents with a missing short description
    inc_df = inc_df[inc_df['short_description_NER'].notnull()]

    # Renumber rows 0..n-1 so later index-based joins line up; the old labels
    # are retained in an 'index' column (no drop=True).
    inc_df = inc_df.reset_index()

    return inc_df

# Load and pre-filter the incident export. The path is relative to the kernel's
# working directory; if assets/ServiceNow_Incident.csv is not reachable from
# there, pd.read_csv raises FileNotFoundError.
inc_df = read_inc_data('assets/ServiceNow_Incident.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'assets/ServiceNow_Incident.csv'

### Use ydata profiling for exploratory data analysis

In [2]:
# Build the ydata-profiling report for the filtered incident frame (inc_df)
profile = ProfileReport(inc_df, title="ServiceNow Incident Profiling Report")

# Write the report to an HTML file under assets/ (path relative to the working directory)
profile.to_file('assets/ServiceNow_Incident_ProfilingReport.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

### Vectorize incident descriptions with TF-IDF and reduce the document-term matrix using LSI (TruncatedSVD)

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

def get_doc_term_matrix(corpus):
    """Build a TF-IDF document-term matrix from whitespace-tokenised text.

    Parameters
    ----------
    corpus : iterable of str
        One document per element.

    Returns
    -------
    sparse matrix
        The result of ``TfidfVectorizer.fit_transform`` on ``corpus``: one row
        per document, one column per vocabulary term.
    """
    # Tokens are produced by str.split (whitespace only). token_pattern=None
    # states explicitly that the default regex pattern is unused, which avoids
    # the UserWarning scikit-learn emits when a custom tokenizer is supplied
    # alongside the default pattern.
    vectorizer = TfidfVectorizer(
        tokenizer=str.split,
        token_pattern=None,
        stop_words='english',
    )

    return vectorizer.fit_transform(corpus)

def run_dimreduction(doc_term_matrix, num_components, random_state=42):
    """Reduce a document-term matrix with Latent Semantic Indexing (TruncatedSVD).

    Parameters
    ----------
    doc_term_matrix : array-like or sparse matrix
        Document-term matrix, one row per document.
    num_components : int
        Number of SVD components to keep.
    random_state : int, RandomState instance or None, default 42
        Seed forwarded to ``TruncatedSVD``. Its default solver is randomized,
        so a fixed seed makes the projection reproducible across runs. Pass
        ``None`` to get the previous unseeded behaviour.

    Returns
    -------
    numpy.ndarray
        The result of ``TruncatedSVD.fit_transform``: one row per document,
        ``num_components`` columns.
    """
    lsi = TruncatedSVD(n_components=num_components, random_state=random_state)

    return lsi.fit_transform(doc_term_matrix)

# Vectorize the short descriptions, then project them onto 2 components so the
# result maps directly onto the Component_1 / Component_2 axes used below.
description_term_mat = get_doc_term_matrix(inc_df['short_description_NER'])
out_vecs = run_dimreduction(description_term_mat, 2)



### Join SVD results with INC dataframe

In [4]:
# Wrap the SVD output in a DataFrame; it gets pandas' default 0..n-1 index
reduced_df = pd.DataFrame(out_vecs, columns=['Component_1', 'Component_2'])

# Index-on-index merge: inc_df's index was reset in read_inc_data, so row i of
# inc_df is paired with row i of reduced_df (the vector built from that row).
inc_df2 = pd.merge(inc_df, reduced_df, left_index=True, right_index=True)

### Visualize dimensionality reduction results with `short_description_NER` and `category` tooltips

In [5]:
def plot_labelled_scatter(in_dat, in_title = ""):
    """Scatter-plot the two reduced components with incident tooltips.

    Parameters
    ----------
    in_dat : pandas.DataFrame
        Must contain the columns ``Component_1``, ``Component_2``,
        ``category`` and ``short_description_NER``. It is not modified.
    in_title : str, default ""
        Figure title.

    Returns
    -------
    plotly figure
        The figure returned by ``px.scatter``, with marker styling applied.
    """
    # Cast category to str on a derived frame rather than assigning back into
    # in_dat, so the caller's DataFrame keeps its original column dtype.
    plot_dat = in_dat.assign(category=in_dat['category'].astype(str))

    out_fig = px.scatter(
        plot_dat,
        x='Component_1',
        y='Component_2',
        hover_name='category',
        hover_data=['short_description_NER'],
        opacity=0.60,
        template='simple_white',
        width=800,
        height=600,
        title=in_title,
    )

    # Small markers with a thin black outline; applied to marker-mode traces only
    out_fig.update_traces(
        marker=dict(size=6, line=dict(width=.5, color='black')),
        selector=dict(mode='markers'),
    )

    return out_fig

# Show 2-dimensional LSI results; the bare `chart` expression on the last line
# is what makes the notebook render the figure as the cell output.
chart = plot_labelled_scatter(inc_df2, "LSI results on short_description using tfidf word vector")
chart