In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
import string
from nltk import pos_tag, ne_chunk
from gensim import corpora
from gensim.models import LdaModel
import textstat
from language_tool_python import LanguageTool
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
# Load the table that the feature cells below extend; they read its 'text' and 'cleaned_text' columns.
# NOTE(review): the last cell of this notebook writes the enriched frame back to this same path,
# so a second run would re-read its own output — confirm that is intended.
df = pd.read_csv('DATA_SET/final_test_v2.csv')

### Basic NLP Features

In [78]:
# Character length of each cleaned text.
df['char_count'] = df['cleaned_text'].map(len)

# Number of NLTK tokens in each cleaned text.
df['word_count'] = df['cleaned_text'].map(lambda cleaned: len(word_tokenize(cleaned)))

# Tokens per character (word_count divided by char_count).
df['word_density'] = df['word_count'] / df['char_count']

def punctuation_count(text):
    """Return how many characters of ``text`` appear in ``string.punctuation``."""
    total = 0
    for ch in text:
        if ch in string.punctuation:
            total += 1
    return total

# Punctuation is counted on the raw 'text' column rather than on 'cleaned_text'.
df['punctuation_count'] = df['text'].map(punctuation_count)

def upper_case_count(text):
    """Return the number of upper-case characters in ``text``."""
    uppercase_total = 0
    for ch in text:
        if ch.isupper():
            uppercase_total += 1
    return uppercase_total

# Upper-case characters are counted on the raw 'text' column rather than on 'cleaned_text'.
df['upper_case_count'] = df['text'].map(upper_case_count)

def title_word_count(text):
    """Return how many whitespace-separated words of ``text`` are title-cased (``str.istitle``)."""
    title_words = [token for token in text.split() if token.istitle()]
    return len(title_words)

# Title-cased words are counted on the raw 'text' column rather than on 'cleaned_text'.
df['title_word_count'] = df['text'].map(title_word_count)

def parts_of_speech(text):
    """Count tokens of ``text`` per part-of-speech group.

    The text is tokenised with ``word_tokenize`` and tagged with ``pos_tag``.
    Returns a ``pd.Series`` indexed by 'noun_count', 'adv_count', 'verb_count',
    'adj_count' and 'pro_count', in that order.
    """
    # Output column -> tags that contribute to it.
    tag_groups = {
        'noun_count': ('NN', 'NNS', 'NNP', 'NNPS'),
        'adv_count': ('RB', 'RBR', 'RBS'),
        'verb_count': ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'),
        'adj_count': ('JJ', 'JJR', 'JJS'),
        'pro_count': ('PRP', 'PRP$', 'WP', 'WP$'),
    }

    tagged_tokens = pos_tag(word_tokenize(text))

    labels = []
    totals = []
    for label, tags in tag_groups.items():
        labels.append(label)
        totals.append(sum(1 for _, tag in tagged_tokens if tag in tags))
    return pd.Series(totals, index=labels)

# parts_of_speech returns a Series per row, so apply() yields one column per POS group.
pos_columns = ['noun_count', 'adv_count', 'verb_count', 'adj_count', 'pro_count']
df[pos_columns] = df['cleaned_text'].apply(parts_of_speech)

### Topic Modeling

In [80]:
# Whitespace-tokenise every cleaned text; gensim expects one token list per document.
corpus = [text.split() for text in df['cleaned_text']]

dictionary = corpora.Dictionary(corpus)

corpus_bow = [dictionary.doc2bow(text) for text in corpus]

# Training
num_topics = 20
# Fixed seed so topic ids and scores are reproducible across kernel restarts.
lda_random_state = 42
lda_model = LdaModel(
    corpus_bow,
    num_topics=num_topics,
    id2word=dictionary,
    passes=15,
    random_state=lda_random_state,
)

# get_document_topics() on a whole corpus returns a lazy wrapper: each iteration over it
# re-runs inference for every document. Materialise it once so the per-topic loop below
# does not repeat that work num_topics times and every column comes from the same pass.
topic_distribution = list(lda_model.get_document_topics(corpus_bow))
topic_lookup = [dict(topic_dist) for topic_dist in topic_distribution]

# Topics missing from a document's distribution (filtered out by gensim) score 0.
for topic in range(num_topics):
    df[f'topic_{topic + 1}_score'] = [doc_topics.get(topic, 0) for doc_topics in topic_lookup]

In [81]:
# Every row except the last one (what head(-1) selects).
df.iloc[:-1]

Unnamed: 0,text,cleaned_text,char_count,word_count,word_density,punctuation_count,upper_case_count,title_word_count,noun_count,adv_count,...,topic_11_score,topic_12_score,topic_13_score,topic_14_score,topic_15_score,topic_16_score,topic_17_score,topic_18_score,topic_19_score,topic_20_score


### Readability Scores

In [82]:
# Output column -> textstat metric, listed in the order the columns are added.
# NOTE(review): the scores displayed further down look implausibly extreme (e.g. Flesch-Kincaid
# grades above 100). 'cleaned_text' appears to have its punctuation removed, which these
# sentence-based formulas depend on — confirm whether 'text' was the intended input.
readability_metrics = {
    'flesch_kincaid_score': textstat.flesch_kincaid_grade,
    'flesch_score': textstat.flesch_reading_ease,
    'gunning_fog_score': textstat.gunning_fog,
    'coleman_liau_score': textstat.coleman_liau_index,
    'dale_chall_score': textstat.dale_chall_readability_score,
    'ari_score': textstat.automated_readability_index,
    'linsear_write_score': textstat.linsear_write_formula,
    'spache_score': textstat.spache_readability,
}

for score_column, metric in readability_metrics.items():
    df[score_column] = df['cleaned_text'].apply(metric)

In [83]:
# Every row except the last one (what head(-1) selects).
df.iloc[:-1]

Unnamed: 0,text,cleaned_text,char_count,word_count,word_density,punctuation_count,upper_case_count,title_word_count,noun_count,adv_count,...,topic_19_score,topic_20_score,flesch_kincaid_score,flesch_score,gunning_fog_score,coleman_liau_score,dale_chall_score,ari_score,linsear_write_score,spache_score


### Named Entity Recognition

In [84]:
def ner_count(text):
    """Return the number of labelled chunks ``ne_chunk`` produces for ``text``.

    The text is tokenised, POS-tagged and chunked; only top-level children of the
    resulting tree that expose a ``label`` attribute are counted.
    """
    chunk_tree = ne_chunk(pos_tag(word_tokenize(text)))
    labelled_chunks = [node for node in chunk_tree if hasattr(node, 'label')]
    return len(labelled_chunks)

# Entities are counted on the raw 'text' column rather than on 'cleaned_text'.
df['ner_count'] = df['text'].map(ner_count)

In [2]:
import language_tool_python
# NOTE(review): `correcter` is not referenced again in the visible notebook; the
# "Text error length" cell builds its own `tool` instance — confirm whether this one is needed.
correcter = language_tool_python.LanguageTool('en-US')

In [3]:
# Replaces the in-memory `df` with a CSV read from disk.
# NOTE(review): absolute, machine-specific path — this cell will fail on any other machine.
# No cell in the visible notebook writes 'feature_Named_Entity.csv'; confirm where it comes from.
df = pd.read_csv('C:\\Users\\User\\OneDrive\\desktop\\HireMeModel\\feature_Named_Entity.csv')

### Text error length

In [4]:
# Module-level LanguageTool instance ('en-US') that error_length() below calls into.
tool = LanguageTool('en-US')

def error_length(text):
    """Return how many matches the module-level ``tool`` reports when checking ``text``."""
    return len(tool.check(text))

# The check runs on the raw 'text' column rather than on 'cleaned_text'.
df['error_length'] = df['text'].map(error_length)

In [5]:
# Every row except the last one (what head(-1) selects).
df.iloc[:-1]

Unnamed: 0,text,label,prompt_name,source,RDizzl3_seven,model,cleaned_text,char_count,word_count,word_density,...,flesch_kincaid_score,flesch_score,gunning_fog_score,coleman_liau_score,dale_chall_score,ari_score,linsear_write_score,spache_score,ner_count,error_length
0,Phones Modern humans today are always on their...,0,Phones and driving,persuade_corpus,False,human,phones modern humans today are always on their...,1972,378,0.191684,...,147.2,-286.82,151.94,8.68,24.77,187.4,53.0,54.82,5,55
1,This essay will explain if drivers should or s...,0,Phones and driving,persuade_corpus,False,human,this essay will explain if drivers should or s...,2018,366,0.181368,...,143.7,-283.10,148.70,10.42,24.34,182.9,57.0,53.46,0,55
2,Driving while the use of cellular devices Toda...,0,Phones and driving,persuade_corpus,False,human,driving while the use of cellular devices toda...,1024,178,0.173828,...,71.5,-100.74,73.45,11.51,15.39,90.0,61.0,27.14,0,16
3,Phones & Driving Drivers should not be able to...,0,Phones and driving,persuade_corpus,False,human,phones driving drivers should not be able to u...,1183,207,0.174979,...,82.8,-130.18,85.70,11.58,16.57,104.3,64.0,31.06,4,21
4,Cell Phone Operation While Driving The ability...,0,Phones and driving,persuade_corpus,False,human,cell phone operation while driving the ability...,1872,332,0.177350,...,131.6,-257.05,136.66,11.11,23.72,166.4,60.0,49.31,3,29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73567,im in middle school and I think that the princ...,1,Grades for extracurricular activities,persuade_finetuned_llamas,False,llama,im in middle school and i think that the princ...,411,73,0.177616,...,29.4,14.30,31.39,10.82,12.88,36.9,40.5,13.02,3,21
73568,I am writing you today to disagree with your t...,1,Grades for extracurricular activities,persuade_finetuned_llamas,False,llama,i am writing you today to disagree with your t...,1634,213,0.130355,...,89.9,-170.11,95.15,22.94,24.58,116.5,72.0,34.79,9,105
73569,"Dear Principal , In conclusion , I would obser...",1,Grades for extracurricular activities,persuade_finetuned_llamas,False,llama,dear principal in conclusion i would observe t...,2569,381,0.148307,...,153.1,-323.71,157.96,17.55,31.78,196.2,59.0,57.79,14,101
73570,"Dear Mrs . Principal , in these kinds of consi...",1,Grades for extracurricular activities,persuade_finetuned_llamas,False,llama,dear mrs principal in these kinds of considera...,2357,346,0.146797,...,139.4,-288.18,144.64,17.96,29.93,179.0,67.0,52.78,21,142


In [None]:
# Checkpoint the current frame to the working directory; index=False leaves the row index out of the file.
df.to_csv('feature_text_error.csv', index=False)

### Count Vectorizer

In [None]:
count_vect = df['cleaned_text'].tolist()

# Raw term counts, limited to the 5000 features CountVectorizer keeps under max_features.
count_vectorizer = CountVectorizer(max_features=5000)

count_matrix = count_vectorizer.fit_transform(count_vect)

# Build the feature frame on df's own index: concat(axis=1) aligns rows by index label, so a
# fresh 0..n-1 index would misalign (and introduce NaN rows) whenever df's index is not 0..n-1.
# NOTE(review): feature names are plain tokens, so a token such as 'text' or 'label' would
# duplicate an existing column name after the concat — confirm this cannot happen downstream.
count_df = pd.DataFrame(
    count_matrix.toarray(),
    columns=count_vectorizer.get_feature_names_out(),
    index=df.index,
)

df = pd.concat([df, count_df], axis=1)

In [None]:
# Persist the fitted count vectorizer; assumes the 'tools/' directory already exists — TODO confirm.
joblib.dump(count_vectorizer, 'tools/count_vectorizer_50k.pkl')

### N-Grams

In [None]:
# Bigram Vectorizer

bigram_vect = df['cleaned_text'].tolist()

# TF-IDF restricted to 2-grams (ngram_range=(2, 2)), limited to 5000 features.
bigram_vectorizer = TfidfVectorizer(ngram_range=(2, 2), max_features=5000)

bigram_matrix = bigram_vectorizer.fit_transform(bigram_vect)

# Build the feature frame on df's own index: concat(axis=1) aligns rows by index label, so a
# fresh 0..n-1 index would misalign whenever df's index is not 0..n-1.
bigram_df = pd.DataFrame(
    bigram_matrix.toarray(),
    columns=bigram_vectorizer.get_feature_names_out(),
    index=df.index,
)

df = pd.concat([df, bigram_df], axis=1)

In [None]:
# Persist the fitted bigram vectorizer; assumes the 'tools/' directory already exists — TODO confirm.
joblib.dump(bigram_vectorizer, 'tools/bigram_vectorizer_50k.pkl')

In [None]:
# Trigram Vectorizer

trigram_vect = df['cleaned_text'].tolist()

# TF-IDF restricted to 3-grams (ngram_range=(3, 3)), limited to 5000 features.
trigram_vectorizer = TfidfVectorizer(ngram_range=(3, 3), max_features=5000)

trigram_matrix = trigram_vectorizer.fit_transform(trigram_vect)

# Build the feature frame on df's own index: concat(axis=1) aligns rows by index label, so a
# fresh 0..n-1 index would misalign whenever df's index is not 0..n-1.
trigram_df = pd.DataFrame(
    trigram_matrix.toarray(),
    columns=trigram_vectorizer.get_feature_names_out(),
    index=df.index,
)

df = pd.concat([df, trigram_df], axis=1)

In [None]:
# Persist the fitted trigram vectorizer; assumes the 'tools/' directory already exists — TODO confirm.
joblib.dump(trigram_vectorizer, 'tools/trigram_vectorizer_50k.pkl')

In [None]:
# Bi-Trigram Vectorizer (character-level 2- and 3-grams)

bitri_vect = df['cleaned_text'].str.strip().tolist()

# analyzer='char' makes the n-grams character sequences rather than word sequences.
bitri_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 3), max_features=5000)

bichar_matrix = bitri_vectorizer.fit_transform(bitri_vect)

# Build the feature frame on df's own index: concat(axis=1) aligns rows by index label, so a
# fresh 0..n-1 index would misalign whenever df's index is not 0..n-1.
bichar_df = pd.DataFrame(
    bichar_matrix.toarray(),
    columns=bitri_vectorizer.get_feature_names_out(),
    index=df.index,
)

# Stripping whitespace from the n-gram names makes some of them identical
# (e.g. ' a' and 'a '); only the first column of each resulting name is kept.
bichar_df.columns = bichar_df.columns.str.strip()

bichar_df = bichar_df.loc[:, ~bichar_df.columns.duplicated()]

# NOTE(review): the de-duplication above is only within bichar_df; stripped names may still
# coincide with columns already present in df — confirm this is acceptable downstream.
df = pd.concat([df, bichar_df], axis=1)

In [None]:
# Persist the fitted character n-gram vectorizer; assumes the 'tools/' directory already exists — TODO confirm.
joblib.dump(bitri_vectorizer, 'tools/bitri_vectorizer_v2.pkl')

In [None]:
# Lexical Diversity

def _type_token_ratio(text):
    """Return unique tokens divided by total tokens for ``text``.

    The text is lower-cased and tokenised once with ``word_tokenize``.
    Returns 0.0 when the text yields no tokens, instead of dividing by zero.
    """
    tokens = word_tokenize(text.lower())
    if not tokens:
        return 0.0
    return len(set(tokens)) / len(tokens)

ttr_list = [_type_token_ratio(text) for text in df['cleaned_text']]

# Carry df's index so the column assignment below lines up row-for-row
# even when df's index is not 0..n-1 (Series assignment aligns by index label).
ttr_df = pd.DataFrame({'lexical_diversity': ttr_list}, index=df.index)

df['lexical_diversity'] = ttr_df['lexical_diversity']

### Save dataset

In [98]:
# Write the enriched frame without the row index.
# NOTE(review): this is the same path the first load cell reads, so the original input file is
# overwritten — confirm that is intended, or write to a new filename.
df.to_csv('DATA_SET/final_test_v2.csv', index=False)