# Tekstanalyse og enkel maskinlæring
I denne notebooken skal vi gå gjennom hvordan man kan utforske tekstdata, og sette opp en enkel klassifisering av dataene.
Den første oppgaven er å generere et datasett. Dette kan du enkelt gjøre ved å gi ChatGPT et par eksempler, og be den generere flere i samme stil.

In [None]:
!pip install wordcloud
!pip install plotly-express
!pip install umap-learn
!pip install scikit-learn
!pip install nltk #natural language toolkit
!pip install sentence-transformers

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import nltk
from sentence_transformers import SentenceTransformer
import plotly.express as px
import umap.umap_ as umap
from collections import Counter

# Fetch NLTK's stopword corpus so that stopwords.words(...) can be used below
nltk.download('stopwords')

from nltk.corpus import stopwords

# Do not truncate long cell contents when pandas prints a column
pd.set_option('display.max_colwidth', None)

In [None]:
# Read the dataset and keep the complaint texts as their own Series
df = pd.read_csv('data/dataset.csv')
df_data = df.complaint
print(df_data)

In [None]:
# NLTK's English stopword list, de-duplicated into a set
stops = {*stopwords.words('english')}

print(stops)

In [None]:
def remove_stopwords(values, stop_words=None):
    """Return a copy of ``values`` with stopwords removed from each text.

    Matching ignores case and any punctuation attached to a word, so both
    "The" and "it," are recognised as stopwords. Words that are kept are
    returned exactly as written.

    Args:
        values: Iterable of texts. Entries that are not strings (for example
            NaN from an empty CSV cell) become empty strings, so the result
            stays aligned with the input.
        stop_words: Optional iterable of words to drop. Defaults to NLTK's
            English stopword list.

    Returns:
        list of str, one per input entry, with the remaining words joined by
        single spaces.
    """
    import string

    if stop_words is None:
        stop_words = stopwords.words('english')
    # Normalise once so the per-word lookup is a cheap, case-insensitive set test
    stop_words = {word.lower() for word in stop_words}

    cleaned_values = []
    for value in values:
        if not isinstance(value, str):
            cleaned_values.append('')
            continue
        kept_words = [
            word
            for word in value.split()
            # Strip surrounding punctuation only for the comparison
            if word.lower().strip(string.punctuation) not in stop_words
        ]
        cleaned_values.append(' '.join(kept_words))
    return cleaned_values

# Strip stopwords from every complaint
column_values = df_data.tolist()
cleaned_values = remove_stopwords(column_values)

# Build a word cloud over all cleaned complaints joined into one text
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
).generate(' '.join(cleaned_values))

# Show the rendered cloud without axes
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
def generate_wordcloud_from_column(df, column="complaint"):
    """Plot a word cloud of adjacent word pairs taken from a text column.

    Stopwords are removed with ``remove_stopwords`` first, so a pair joins two
    words that are neighbours after that filtering. When no pair can be
    formed, a message is printed and nothing is plotted.

    Args:
        df: DataFrame holding the text to visualise.
        column: Name of the text column; defaults to "complaint".
    """
    # Missing cells carry no text to pair up, so they are dropped
    cleaned_values = remove_stopwords(df[column].dropna().tolist())

    # Count every pair of neighbouring words across all texts
    word_pair_freq = Counter()
    for value in cleaned_values:
        words = value.split()
        word_pair_freq.update(
            f"{first} {second}" for first, second in zip(words, words[1:])
        )

    if not word_pair_freq:
        print("No word pairs to plot.")
        return

    # Use a dedicated WordCloud so the function neither depends on nor
    # overwrites a module-level instance created in another cell
    pair_cloud = WordCloud(width=800, height=400, background_color='white')
    pair_cloud.generate_from_frequencies(dict(word_pair_freq))

    # Display the word cloud using matplotlib
    plt.figure(figsize=(10, 5))
    plt.imshow(pair_cloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()


generate_wordcloud_from_column(df)

In [None]:
# Restrict the word-pair cloud to complaints that mention a given phrase
filter_word = "login form"

# regex=False matches the phrase literally; na=False counts missing complaints
# as non-matches so the boolean mask never contains NaN
df_filtered = df.loc[df.complaint.str.contains(filter_word, regex=False, na=False)]
generate_wordcloud_from_column(df_filtered)

In [None]:
# Show the first five matching rows as raw values
print(df_filtered.values[:5])

In [None]:
# Load the all-MiniLM-L6-v2 sentence-transformer model used to embed the complaints
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [None]:
# encode() is documented for a string or a list of strings, so pass a plain
# list instead of a pandas Series (whose integer indexing is label-based)
embeddings = model.encode(df_data.tolist())
print(embeddings)

In [None]:
def plot_word_embeddings(df, embeddings):
    """Reduce the embeddings to two dimensions with UMAP and show a scatter plot.

    Points are coloured by ``df.category`` and carry ``df.complaint`` as hover
    data.
    """
    # Fit a 2-D UMAP projection with cosine distance and a fixed random_state
    reducer = umap.UMAP(n_components=2, metric='cosine', min_dist=0.0, random_state=10)
    projection = reducer.fit(embeddings).embedding_

    # Interactive Plotly scatter of the projected points
    scatter = px.scatter(
        x=projection[:, 0],
        y=projection[:, 1],
        color=df.category,
        hover_data=[df.complaint],
        color_discrete_sequence=px.colors.qualitative.Alphabet,
        width=1000,
        height=800,
    )
    scatter.show()


plot_word_embeddings(df, embeddings)

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 20% of the embedded complaints for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    embeddings,
    df.category,
    test_size=0.2,
    random_state=42,
)

# Fit a support vector classifier with default settings
svc = SVC().fit(X_train, y_train)

# Predict the held-out categories and report the share predicted correctly
y_pred = svc.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


In [None]:
# Classify one unseen complaint; encoding a one-item list yields the
# (1, n_features) array that predict() expects
text_to_predict = "When i signed into the page, i got an error message, and the page wouldn't load."
cmp_pred = svc.predict(model.encode([text_to_predict]))
print(cmp_pred[0])

In [None]:
# Classify a second unseen complaint the same way
text_to_predict = "The text is too small"
cmp_pred = svc.predict(model.encode([text_to_predict]))
print(cmp_pred[0])

Det ser i utgangspunktet ut som at modellen er veldig bra, men ettersom teksten er generert av ChatGPT, risikerer vi at dataene ikke gjenspeiler ekte data, eller at de respektive kategoriene er veldig homogene, noe som gjør dem enkle å klassifisere. Her bør man aktivt gå gjennom dataene man har generert og forsikre seg om at kvaliteten er god.

In [None]:
def find_duplicates(filename, column_name):
    """Return every row of a CSV file whose ``column_name`` value is repeated.

    All occurrences of a repeated value are returned, including the first.
    """
    frame = pd.read_csv(filename)
    is_repeated = frame.duplicated(subset=column_name, keep=False)
    return frame.loc[is_repeated]
# Number of rows whose complaint text occurs more than once in the dataset
dupes = find_duplicates("data/dataset.csv", "complaint")
len(dupes)

In [None]:
# Number of complaints per category
df['category'].value_counts()

Se her ja! Det er en skjev fordeling i antall klager per kategori. La oss se hva som skjer når vi jevner det ut.

In [None]:
# Repeat the experiment on the second dataset (presumably the one with evened-out
# category counts, going by its file name)
df_equal = pd.read_csv("data/dataset_equal.csv")

# encode() is documented for a string or a list of strings, so pass a plain
# list instead of a pandas Series
embeddings = model.encode(df_equal.complaint.tolist())

# Hold out 20% of the embedded complaints for evaluation
X_train, X_test, y_train, y_test = train_test_split(embeddings, df_equal.category, test_size=0.2, random_state=42)

svc = SVC()
svc.fit(X_train, y_train)

# Make predictions on the test data
y_pred = svc.predict(X_test)

# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Da har vi i hvert fall fått jevnet ut kategoriene, men scoren er fremdeles mistenkelig høy. La oss se på hva de forskjellige kategoriene inneholder.

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity(df):
    """Measure TF-IDF cosine similarity within and between complaint categories.

    Complaints from the 'UX', 'page error' and 'UI' categories are vectorized
    with one shared TF-IDF vocabulary, so words that occur in only one
    category still take part in the comparison.

    Args:
        df: DataFrame with a 'category' column and a 'complaint' text column.

    Returns:
        dict mapping a label to a mean cosine similarity. The first three
        entries ("<category> similarity") average over distinct pairs of
        complaints inside one category; the last three
        ("<a> vs. <b> similarity") average over all cross-category pairs.
        A value is NaN when a category has too few complaints to form a pair.
    """
    # (category value in the data, label used in the result keys)
    categories = [("UX", "UX"), ("page error", "Page error"), ("UI", "UI")]

    complaints = {
        value: df.loc[df['category'] == value, 'complaint'].tolist()
        for value, _ in categories
    }

    # Fit once on every complaint: fitting on a single category would drop
    # the words only the other categories use and skew the IDF weights
    vectorizer = TfidfVectorizer()
    vectorizer.fit([text for texts in complaints.values() for text in texts])
    matrices = {
        value: vectorizer.transform(texts) if texts else None
        for value, texts in complaints.items()
    }

    def mean_within(matrix):
        # Average over distinct pairs only; the diagonal compares each
        # complaint with itself and would inflate the score
        if matrix is None or matrix.shape[0] < 2:
            return float("nan")
        similarities = cosine_similarity(matrix)
        count = similarities.shape[0]
        return (similarities.sum() - np.trace(similarities)) / (count * (count - 1))

    def mean_between(left, right):
        if left is None or right is None:
            return float("nan")
        return np.mean(cosine_similarity(left, right))

    results = {}
    for value, label in categories:
        results[f"{label} similarity"] = mean_within(matrices[value])
    for position, (value_a, label_a) in enumerate(categories):
        for value_b, label_b in categories[position + 1:]:
            results[f"{label_a} vs. {label_b} similarity"] = mean_between(
                matrices[value_a], matrices[value_b]
            )
    return results


In [None]:
# Compute the within-category and between-category similarity scores
similarity_results = calculate_similarity(df_equal)

# Report every score on its own line as "label: value"
for key, value in similarity_results.items():
    print(f"{key}:", value)

Her ser vi at det er likheter innad i kategoriene, mens likheten generelt er lav mellom kategoriene. Om dette er noe som bør fikses eller ei, er noe man må utforske. Dette tilfellet er ekstra ekstremt med vilje: vi ga ikke noen informasjon om hvordan klagene skulle se ut, slik at vi kunne utforske denne problemstillingen. I den ekte verden finnes det jo tilfeller der kategoriene rett og slett er veldig adskilte. Food for thought ;)