In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [31]:
# Load the training CSV, then discard rows whose `text` is repeated or missing.
data = (
    pd.read_csv('train_v3_drcat_02.csv')
    .drop_duplicates(subset=['text'])
    .dropna(subset=['text'])
)

In [32]:
# Number of rows per value of the `label` column.
data['label'].value_counts()

label
1    37897
0    27370
Name: count, dtype: int64

In [33]:
# Number of rows per value of the `prompt_name` column.
data['prompt_name'].value_counts()

prompt_name
Car-free cities                          7567
Does the electoral college work?         7262
Facial action coding system              5923
Distance learning                        5639
Seeking multiple opinions                5235
Driverless cars                          5107
Exploring Venus                          5017
"A Cowboy Who Rode the Waves"            4732
The Face on Mars                         4731
Mandatory extracurricular activities     3144
Summer projects                          2768
Cell phones at school                    2182
Grades for extracurricular activities    2178
Community service                        2151
Phones and driving                       1631
Name: count, dtype: int64

In [34]:
import re

import nltk
from nltk.corpus import stopwords

# Fetch the NLTK resources this notebook downloads: the stop-word lists and WordNet.
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# Kept as a set so the per-word membership test in clean_text is a hash lookup.
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalise a raw text into a space-separated string of content words.

    Steps, in order:
      1. delete every character that is neither a word character nor whitespace
         (punctuation is removed, not replaced by a space, so "don't" -> "dont");
      2. split on whitespace and keep only purely alphabetic tokens, lowercased;
      3. drop tokens found in the module-level ``stop_words`` set.

    Args:
        text: The raw text. Anything that is not a ``str`` (e.g. ``None`` or a
            NaN float from a missing cell) is treated as empty, matching how
            ``lemmatize_text`` handles such values.

    Returns:
        The cleaned tokens joined by single spaces; ``""`` when nothing
        survives or when ``text`` is not a string.
    """
    # Guard first: re.sub would raise TypeError on non-string input.
    if not isinstance(text, str):
        return ""
    no_punct = re.sub(r'[^\w\s]', '', text)  # Remove punctuations
    # Lowercase only after the isalpha() filter, then test against the stop words.
    lowered = (token.lower() for token in no_punct.split() if token.isalpha())
    return ' '.join(token for token in lowered if token not in stop_words)

[nltk_data] Downloading package stopwords to /Users/a1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/a1/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [35]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WhitespaceTokenizer

# Module-level instances reused by lemmatize_text for every call.
w_tokenizer = WhitespaceTokenizer()  # splits a string on whitespace
lemmatizer = WordNetLemmatizer()  # WordNet-based lemmatizer
def lemmatize_text(text):
    """Lemmatize every whitespace-separated token of ``text``.

    Tokens come from the module-level ``w_tokenizer`` and each one is passed
    through the module-level ``lemmatizer``.

    Args:
        text: The string to process. Non-string values (e.g. ``None`` or NaN)
            are treated as empty.

    Returns:
        The lemmatized tokens joined by single spaces, or ``""`` when ``text``
        is not a string or contains no tokens.
    """
    if not isinstance(text, str):
        return ""
    # str.join builds the result in one pass instead of re-concatenating per token.
    return " ".join(lemmatizer.lemmatize(token) for token in w_tokenizer.tokenize(text))


In [36]:
# Two-stage preprocessing per row: clean_text first, then lemmatize_text on its output.
data['cleaned_text'] = data['text'].apply(clean_text).apply(lemmatize_text)

In [39]:
# data.to_csv('Preprocessed_data.csv', index=False)
# Write only the first 1000 processed rows to disk as a small sample file.
data2 = data.head(1000)
data2.to_csv('processed_data2.csv', index=False)

In [10]:
# TF-IDF over 1- to 3-grams of the cleaned text, with sublinear term-frequency scaling.
vectorizer = TfidfVectorizer(sublinear_tf=True, ngram_range=(1, 3))
# NOTE(review): X is fitted on every row of `data`, so its vocabulary and IDF
# weights are learned from all rows, including any later held out for validation.
X = vectorizer.fit_transform(data['cleaned_text'])

In [11]:
Y = data['label']

# Split the cleaned texts rather than the pre-fitted matrix X: X was fitted on
# the full corpus, so slicing it would leak validation-set vocabulary and IDF
# statistics into the training features. The same test_size/random_state over
# the same number of rows selects the same rows as before.
text_train, text_val, Y_train, Y_val = train_test_split(
    data['cleaned_text'], Y, test_size=0.2, random_state=42
)

# Learn the vocabulary and IDF weights from the training rows only, then reuse
# that fitted state to project the validation rows.
X_train = vectorizer.fit_transform(text_train)
X_val = vectorizer.transform(text_val)

# Re-project the full corpus so X shares the feature space of the refitted
# vectorizer (and of X_train / X_val) instead of the stale full-corpus fit.
X = vectorizer.transform(data['cleaned_text'])

In [12]:
# Preview the first five rows, including the derived `cleaned_text` column.
data.head(n=5)

Unnamed: 0,text,label,prompt_name,source,RDizzl3_seven,model,cleaned_text
0,Phones\n\nModern humans today are always on th...,0,Phones and driving,persuade_corpus,False,human,phone modern human today always phone always p...
1,This essay will explain if drivers should or s...,0,Phones and driving,persuade_corpus,False,human,essay explain driver able use electronic devic...
2,Driving while the use of cellular devices\n\nT...,0,Phones and driving,persuade_corpus,False,human,driving use cellular device today society thou...
3,Phones & Driving\n\nDrivers should not be able...,0,Phones and driving,persuade_corpus,False,human,phone driving driver able use phone operating ...
4,Cell Phone Operation While Driving\n\nThe abil...,0,Phones and driving,persuade_corpus,False,human,cell phone operation driving ability stay conn...


In [21]:
# Distinct values of the `model` column, in order of first appearance.
name = data.loc[:, 'model'].unique()

In [29]:
# Tally how many rows each `model` value has; keys keep first-appearance order.
cnt = {}
for model_name in data['model']:
    cnt[model_name] = cnt.get(model_name, 0) + 1
        

{'human': 27373,
 'mistral': 13439,
 'llama': 7490,
 'gpt': 4161,
 'claude': 2000,
 'falcon': 4536,
 'palm': 1733,
 'cohere': 350,
 'ada': 692,
 'babbage': 698,
 'curie': 696,
 'davinci': 2099}

In [13]:
# Distinct prompt names present after de-duplication.
data.loc[:, 'prompt_name'].unique()

array(['Phones and driving', 'Car-free cities', 'Summer projects',
       '"A Cowboy Who Rode the Waves"',
       'Mandatory extracurricular activities', 'Exploring Venus',
       'Facial action coding system', 'The Face on Mars',
       'Community service', 'Grades for extracurricular activities',
       'Driverless cars', 'Does the electoral college work?',
       'Cell phones at school', 'Distance learning',
       'Seeking multiple opinions'], dtype=object)

In [14]:
# more_data = pd.read_csv('data.csv')

In [18]:
# more_data.head()
# more_data['source'].unique()

Unnamed: 0,text,source,prompt_id,text_length,word_count
0,"Federal law supersedes state law, and cannabis...",Bloom-7B,0,967,157
1,Miles feels restless after working all day. He...,Bloom-7B,0,5068,778
2,So first of I am danish. That means that I fol...,Bloom-7B,0,1602,267
3,In this paper we present a novel rule-based ap...,Bloom-7B,0,5469,848
4,"Most social progressives, love democracy, and ...",Bloom-7B,0,2379,380
