### Import libraries

In [19]:
import pandas as pd
import numpy as np
import re
from nltk.tokenize import word_tokenize
import unicodedata
import nltk

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sawitt\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Sawitt\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Download dataset

In [20]:
df = pd.read_csv("dataset.csv")

df.head()

Unnamed: 0,acts,prompt,response,Word count,Grammar,Fluency,Clarity,Engagement,Overall Score,AI Similarity,Sources Found,Overall Plagiarism Score
0,Tech Writer,I want you to act as a tech writer. You will a...,"Quick guide — Download, install and open the a...",575,82.0,61.0,72.0,72.0,72.0,48%,2.0,5%
1,Tech Writer,I want you to act as a tech writer. You will a...,"Quick guide — Download, install, and open the ...",448,81.0,53.0,69.0,73.0,68.0,48%,0.0,0%
2,Tech Writer,I want you to act as a tech writer. You will a...,"How to download, install, and open the app — a...",426,83.0,40.0,71.0,72.0,65.0,86%,0.0,0%
3,Tech Writer,I want you to act as a tech writer. You will a...,"Quick guide — Download, install, and open the ...",505,82.0,48.0,70.0,73.0,67.0,30%,0.0,0%
4,Tech Writer,I want you to act as a tech writer. You will a...,"How to download, install, and open the app — a...",561,82.0,47.0,69.0,73.0,67.0,30%,0.0,0%


### Check dataframe information 

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   acts                      250 non-null    object 
 1   prompt                    250 non-null    object 
 2   response                  250 non-null    object 
 3   Word count                250 non-null    int64  
 4   Grammar                   248 non-null    float64
 5   Fluency                   248 non-null    float64
 6   Clarity                   248 non-null    float64
 7   Engagement                248 non-null    float64
 8   Overall Score             248 non-null    float64
 9   AI Similarity             247 non-null    object 
 10  Sources Found             248 non-null    float64
 11  Overall Plagiarism Score  248 non-null    object 
dtypes: float64(6), int64(1), object(5)
memory usage: 23.6+ KB


In [22]:
non_stopword_df = df.copy()

non_stopword_df.head()

Unnamed: 0,acts,prompt,response,Word count,Grammar,Fluency,Clarity,Engagement,Overall Score,AI Similarity,Sources Found,Overall Plagiarism Score
0,Tech Writer,I want you to act as a tech writer. You will a...,"Quick guide — Download, install and open the a...",575,82.0,61.0,72.0,72.0,72.0,48%,2.0,5%
1,Tech Writer,I want you to act as a tech writer. You will a...,"Quick guide — Download, install, and open the ...",448,81.0,53.0,69.0,73.0,68.0,48%,0.0,0%
2,Tech Writer,I want you to act as a tech writer. You will a...,"How to download, install, and open the app — a...",426,83.0,40.0,71.0,72.0,65.0,86%,0.0,0%
3,Tech Writer,I want you to act as a tech writer. You will a...,"Quick guide — Download, install, and open the ...",505,82.0,48.0,70.0,73.0,67.0,30%,0.0,0%
4,Tech Writer,I want you to act as a tech writer. You will a...,"How to download, install, and open the app — a...",561,82.0,47.0,69.0,73.0,67.0,30%,0.0,0%


### Data Cleaning

In [23]:
def clean_text(text):
    # Convert to lowercase
    if isinstance(text, str):
        text = text.lower()
        return text
    return text

def clean_punctuation(text):
    # Remove punctuation characters
    if isinstance(text, str):
        text = re.sub(r'’', '', text)
        text = re.sub(r'[^\s\w]', ' ', text)
        text = re.sub(r'\n', ' ', text)
        text = re.sub(r'[^\D]', ' ', text)
        text = re.sub(r'_+', '', text)
        text = re.sub(r'\b[a-zA-Z]\b', '', text)
        return text
    return text

def normalize_characters(text):
    if isinstance(text, str):
        normalized_text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('utf-8')
        return normalized_text
    return text

def remove_stopword(text):
    stopwords = set(nltk.corpus.stopwords.words('english'))
    filtered_words = [word for word in text.split() if word not in stopwords]
    return ' '.join(filtered_words)

In [24]:
df['response'] = df['response'].apply(clean_text).apply(clean_punctuation).apply(normalize_characters)

non_stopword_df['response'] = non_stopword_df['response'].apply(clean_text).apply(clean_punctuation).apply(normalize_characters).apply(remove_stopword)

In [25]:
df['response'].head()

0    quick guide   download  install and open the a...
1    quick guide   download  install  and open the ...
2    how to download  install  and open the app    ...
3    quick guide   download  install  and open the ...
4    how to download  install  and open the app    ...
Name: response, dtype: object

In [26]:
non_stopword_df['response'].head()

0    quick guide download install open app cross pl...
1    quick guide download install open app cross pl...
2    download install open app quick cross platform...
3    quick guide download install open app cross pl...
4    download install open app clean cross platform...
Name: response, dtype: object

### Data Preprocessing

In [27]:
df['response'] = df['response'].apply(word_tokenize)

In [28]:
non_stopword_df['response'] = non_stopword_df['response'].apply(
    lambda x: word_tokenize(x) if isinstance(x, str) else x
)

### Word Frequencies

#### With stopwords

In [29]:
freq_df = df[['acts', 'response']]
topics = freq_df['acts'].unique()

for topic in topics:
    all_words = []
    responses = freq_df[freq_df['acts'] == topic]['response']
    for tokens in responses:
        all_words.extend(tokens)

    word_counts = {}
    for word in all_words:
        if word in word_counts:
            word_counts[word] += 1
        else:
            word_counts[word] = 1
        
    print(f"\n================================= Topic: {topic} =================================")
    for word in sorted(word_counts, key=word_counts.get, reverse=True)[:10]:
        if word_counts[word] >= 5:
            print(f"{word}: {word_counts[word]}")
        else:
            continue
    if word_counts:
        avg_repetition = sum(word_counts.values()) / len(word_counts)
        print(f"\nAverage of duplicated words (≥5 reps): {avg_repetition:.2f}")
    else:
        print("\nNo words repeated ≥ 5 times.")


the: 142
or: 86
app: 74
download: 58
screenshot: 57
and: 55
install: 49
run: 45
if: 44
to: 43

Average of duplicated words (≥5 reps): 6.59

and: 74
the: 30
action: 19
that: 17
film: 13
with: 12
matrix: 10
reality: 10
effects: 10
concept: 9

Average of duplicated words (≥5 reps): 3.25

and: 49
tbsp: 29
to: 28
salt: 21
quinoa: 18
with: 18
olive: 17
oil: 17
tahini: 17
pepper: 16

Average of duplicated words (≥5 reps): 5.26

to: 22
and: 21
for: 17
or: 16
bedtime: 15
with: 14
snack: 13
simple: 13
calm: 13
quiet: 13

Average of duplicated words (≥5 reps): 3.76

and: 48
for: 44
cream: 35
with: 28
to: 26
apply: 23
on: 23
soft: 20
hydrating: 18
skin: 15

Average of duplicated words (≥5 reps): 4.47

and: 31
for: 28
with: 27
to: 27
of: 22
opening: 21
use: 20
script: 17
run: 17
stage: 15

Average of duplicated words (≥5 reps): 3.93

and: 64
for: 54
room: 27
or: 26
curator: 21
works: 19
text: 19
with: 18
audio: 18
mvp: 18

Average of duplicated words (≥5 reps): 3.88

and: 91
the: 27
cities: 25
of:

#### Without stopword

In [30]:
non_stopword_freq_df = non_stopword_df[['acts', 'response']]
topics = non_stopword_freq_df['acts'].unique()

for topic in topics:
    all_words = []
    responses = non_stopword_freq_df[non_stopword_freq_df['acts'] == topic]['response']
    for tokens in responses:
        all_words.extend(tokens)

    word_counts = {}
    for word in all_words:
        if word in word_counts:
            word_counts[word] += 1
        else:
            word_counts[word] = 1
        
    print(f"\n================================= Topic: {topic} =================================")
    for word in sorted(word_counts, key=word_counts.get, reverse=True)[:10]:
        if word_counts[word] >= 5:
            print(f"{word}: {word_counts[word]}")
        else:
            continue
    if word_counts:
        avg_repetition = sum(word_counts.values()) / len(word_counts)
        print(f"\nAverage of duplicated words (≥5 reps): {avg_repetition:.2f}")
    else:
        print("\nNo words repeated ≥ 5 times.")


app: 74
download: 58
screenshot: 57
install: 49
run: 45
click: 39
linux: 38
open: 35
windows: 34
installer: 34

Average of duplicated words (≥5 reps): 5.71

action: 19
film: 13
matrix: 10
reality: 10
effects: 10
concept: 9
neo: 9
exposition: 9
spectacle: 9
influential: 8

Average of duplicated words (≥5 reps): 2.82

tbsp: 29
salt: 21
quinoa: 18
olive: 17
oil: 17
tahini: 17
pepper: 16
tsp: 16
cooked: 14
sweet: 14

Average of duplicated words (≥5 reps): 5.02

bedtime: 15
snack: 13
simple: 13
calm: 13
quiet: 13
rules: 12
dinner: 12
use: 11
authorized: 11
parents: 10

Average of duplicated words (≥5 reps): 3.53

cream: 35
apply: 23
soft: 20
hydrating: 18
skin: 15
satin: 14
lashes: 14
lift: 13
brown: 13
liner: 13

Average of duplicated words (≥5 reps): 4.01

opening: 21
use: 20
script: 17
run: 17
stage: 15
one: 15
rehearsal: 15
slides: 14
core: 13
short: 13

Average of duplicated words (≥5 reps): 3.61

room: 27
curator: 21
works: 19
text: 19
audio: 18
mvp: 18
material: 17
artist: 17
event: