### Import libraries

In [1]:
import pandas as pd
import numpy as np
import re
from nltk.tokenize import word_tokenize
import unicodedata
import nltk

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sawitt\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Sawitt\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Download dataset

In [2]:
df = pd.read_csv("temp.csv")

df.head()

Unnamed: 0,index,acts,prompt,response,Word count,Grammar,Fluency,Clarity,Engagement,Overall Score,AI Similarity,Sources Found,Overall Plagiarism Score
0,151,Chief Executive Officer,I want you to act as a Chief Executive Officer...,I’m structuring a crisis-response execution pl...,578,86,46,73,74,69,88%,2,8%
1,152,Chief Executive Officer,I want you to act as a Chief Executive Officer...,Team assembled. Moving to in‑progress risk ass...,223,90,40,66,78,67,81%,0,0%
2,153,Chief Executive Officer,I want you to act as a Chief Executive Officer...,I’ll update the recall plan to reflect progres...,114,98,68,74,76,80,83%,0,0%
3,154,Chief Executive Officer,I want you to act as a Chief Executive Officer...,Distribution halt completed; moving through re...,398,86,44,71,74,68,90%,0,0%
4,155,Chief Executive Officer,I want you to act as a Chief Executive Officer...,Progress:\n- Regulatory notifications complete...,115,90,45,77,77,71,96%,0,0%


### Check dataframe information 

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   index                     50 non-null     int64 
 1   acts                      50 non-null     object
 2   prompt                    50 non-null     object
 3   response                  50 non-null     object
 4   Word count                50 non-null     int64 
 5   Grammar                   50 non-null     int64 
 6   Fluency                   50 non-null     int64 
 7   Clarity                   50 non-null     int64 
 8   Engagement                50 non-null     int64 
 9   Overall Score             50 non-null     int64 
 10  AI Similarity             50 non-null     object
 11  Sources Found             50 non-null     int64 
 12  Overall Plagiarism Score  50 non-null     object
dtypes: int64(8), object(5)
memory usage: 5.2+ KB


In [4]:
non_stopword_df = df.copy()

non_stopword_df.head()

Unnamed: 0,index,acts,prompt,response,Word count,Grammar,Fluency,Clarity,Engagement,Overall Score,AI Similarity,Sources Found,Overall Plagiarism Score
0,151,Chief Executive Officer,I want you to act as a Chief Executive Officer...,I’m structuring a crisis-response execution pl...,578,86,46,73,74,69,88%,2,8%
1,152,Chief Executive Officer,I want you to act as a Chief Executive Officer...,Team assembled. Moving to in‑progress risk ass...,223,90,40,66,78,67,81%,0,0%
2,153,Chief Executive Officer,I want you to act as a Chief Executive Officer...,I’ll update the recall plan to reflect progres...,114,98,68,74,76,80,83%,0,0%
3,154,Chief Executive Officer,I want you to act as a Chief Executive Officer...,Distribution halt completed; moving through re...,398,86,44,71,74,68,90%,0,0%
4,155,Chief Executive Officer,I want you to act as a Chief Executive Officer...,Progress:\n- Regulatory notifications complete...,115,90,45,77,77,71,96%,0,0%


### Data Cleaning

In [5]:
def clean_text(text):
    # Convert to lowercase
    if isinstance(text, str):
        text = text.lower()
        return text
    return text

def clean_punctuation(text):
    # Remove punctuation characters
    if isinstance(text, str):
        text = re.sub(r'’', '', text)
        text = re.sub(r'[^\s\w]', ' ', text)
        text = re.sub(r'\n', ' ', text)
        text = re.sub(r'[^\D]', ' ', text)
        text = re.sub(r'_+', '', text)
        text = re.sub(r'\b[a-zA-Z]\b', '', text)
        return text
    return text

def normalize_characters(text):
    if isinstance(text, str):
        normalized_text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('utf-8')
        return normalized_text
    return text

def remove_stopword(text):
    stopwords = set(nltk.corpus.stopwords.words('english'))
    filtered_words = [word for word in text.split() if word not in stopwords]
    return ' '.join(filtered_words)

In [6]:
df['response'] = df['response'].apply(clean_text).apply(clean_punctuation).apply(normalize_characters)

non_stopword_df['response'] = non_stopword_df['response'].apply(clean_text).apply(clean_punctuation).apply(normalize_characters).apply(remove_stopword)

In [7]:
df['response'].head()

0    im structuring  crisis response execution plan...
1    team assembled  moving to in progress risk ass...
2    ill update the recall plan to reflect progress...
3    distribution halt completed  moving through re...
4    progress    regulatory notifications completed...
Name: response, dtype: object

In [8]:
non_stopword_df['response'].head()

0    im structuring crisis response execution plan ...
1    team assembled moving progress risk assessment...
2    ill update recall plan reflect progress comple...
3    distribution halt completed moving regulatory ...
4    progress regulatory notifications completed sc...
Name: response, dtype: object

### Data Preprocessing

In [9]:
df['response'] = df['response'].apply(word_tokenize)

In [10]:
non_stopword_df['response'] = non_stopword_df['response'].apply(
    lambda x: word_tokenize(x) if isinstance(x, str) else x
)

### Word Frequencies

#### With stopwords

In [16]:
freq_df = df[['acts', 'response']]
topics = freq_df['acts'].unique()

for topic in topics:
    all_words = []
    responses = freq_df[freq_df['acts'] == topic]['response']
    for tokens in responses:
        all_words.extend(tokens)

    word_counts = {}
    for word in all_words:
        if word in word_counts:
            word_counts[word] += 1
        else:
            word_counts[word] = 1
        
    print(f"\n================================= Topic: {topic} =================================")
    for word in sorted(word_counts, key=word_counts.get, reverse=True)[:10]:
        if word_counts[word] >= 5:
            print(f"{word}: {word_counts[word]}")
        else:
            continue
    if word_counts:
        avg_repetition = sum(word_counts.values()) / len(word_counts)
        print(f"\nAverage of duplicated words (≥5 reps): {avg_repetition:.2f}")
    else:
        print("\nNo words repeated ≥ 5 times.")


and: 31
to: 24
customer: 20
if: 17
regulatory: 15
risk: 14
for: 13
scope: 12
recall: 11
vs: 11

Average of duplicated words (≥5 reps): 2.07

course: 73
min: 65
deep: 36
day: 35
week: 31
and: 29
weekly: 29
blocks: 27
retrieval: 24
to: 23

Average of duplicated words (≥5 reps): 4.74

target: 20
role: 19
to: 16
summary: 13
bullets: 12
and: 12
impact: 11
tech: 10
experience: 10
time: 10

Average of duplicated words (≥5 reps): 2.68

stress: 32
after: 30
sleep: 25
to: 24
min: 18
caffeine: 15
walk: 15
work: 15
wake: 14
top: 13

Average of duplicated words (≥5 reps): 3.72

pizza: 35
to: 30
th: 29
the: 26
italian: 24
bite: 18
in: 17
latin: 16
flatbread: 16
southern: 15

Average of duplicated words (≥5 reps): 3.72

and: 54
heat: 38
clean: 24
to: 22
is: 16
with: 16
climate: 15
the: 15
power: 15
carbon: 15

Average of duplicated words (≥5 reps): 3.36

watch: 38
the: 36
and: 24
vanish: 22
in: 15
silk: 14
reveal: 13
time: 13
cloth: 13
your: 12

Average of duplicated words (≥5 reps): 3.20

and: 28
c

#### Without stopword

In [17]:
non_stopword_freq_df = non_stopword_df[['acts', 'response']]
topics = non_stopword_freq_df['acts'].unique()

for topic in topics:
    all_words = []
    responses = non_stopword_freq_df[non_stopword_freq_df['acts'] == topic]['response']
    for tokens in responses:
        all_words.extend(tokens)

    word_counts = {}
    for word in all_words:
        if word in word_counts:
            word_counts[word] += 1
        else:
            word_counts[word] = 1
        
    print(f"\n================================= Topic: {topic} =================================")
    for word in sorted(word_counts, key=word_counts.get, reverse=True)[:10]:
        if word_counts[word] >= 5:
            print(f"{word}: {word_counts[word]}")
        else:
            continue
    if word_counts:
        avg_repetition = sum(word_counts.values()) / len(word_counts)
        print(f"\nAverage of duplicated words (≥5 reps): {avg_repetition:.2f}")
    else:
        print("\nNo words repeated ≥ 5 times.")


customer: 20
regulatory: 15
risk: 14
scope: 12
recall: 11
vs: 11
next: 11
progress: 10
product: 9
severity: 8

Average of duplicated words (≥5 reps): 1.90

course: 73
min: 65
deep: 36
day: 35
week: 31
weekly: 29
blocks: 27
retrieval: 24
project: 22
work: 20

Average of duplicated words (≥5 reps): 4.60

target: 20
role: 19
summary: 13
bullets: 12
impact: 11
tech: 10
experience: 10
time: 10
core: 9
skills: 9

Average of duplicated words (≥5 reps): 2.61

stress: 32
sleep: 25
min: 18
caffeine: 15
walk: 15
work: 15
wake: 14
top: 13
time: 12
habit: 12

Average of duplicated words (≥5 reps): 3.53

pizza: 35
th: 29
italian: 24
bite: 18
latin: 16
flatbread: 16
southern: 15
baked: 15
pitta: 15
germanic: 15

Average of duplicated words (≥5 reps): 3.41

heat: 38
clean: 24
climate: 15
power: 15
carbon: 15
management: 13
energy: 12
methane: 12
flood: 11
plans: 11

Average of duplicated words (≥5 reps): 3.09

watch: 38
vanish: 22
silk: 14
reveal: 13
time: 13
cloth: 13
envelope: 11
audience: 9
wrap: 