In [8]:
!pip install convokit
!pip install textstat
!pip install spacy



In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from convokit import Corpus, download


In [3]:
corpus = Corpus(download("subreddit-Cornell"))

No configuration file found at /root/.convokit/config.yml; writing with contents: 
# Default Backend Parameters
db_host: localhost:27017
data_directory: ~/.convokit/saved-corpora
model_directory: ~/.convokit/saved-models
default_backend: mem
Downloading subreddit-Cornell to /root/.convokit/saved-corpora/subreddit-Cornell
Downloading subreddit-Cornell from http://zissou.infosci.cornell.edu/convokit/datasets/subreddit-corpus/corpus-zipped/CookingScrewups~-~CrappyDesign/Cornell.corpus.zip (11.2MB)... Done


In [27]:
N_SAMPLES = 2000

In [28]:
utt_df = corpus.get_utterances_dataframe()
conv_df = corpus.get_conversations_dataframe()
ai_df = pd.read_csv("./llama3_2_ai_responses_2k.csv")
merged_list = []
i = 0
for row in conv_df[:N_SAMPLES].itertuples():
    # Prompt is meta.title
    prompt = row[2]
    # utterances with converation_id = current conversation's id are the reddit comments
    joined_df = utt_df[utt_df.conversation_id == row.Index]["text"]
    human_resp = " ".join(joined_df.values.flatten().tolist())
    # ai_df has column "ai_responses" for those conversations
    merged_list.append({"id": row.Index, "text": prompt, "ai_resp": ai_df["ai_response"].iloc[i], "human_resp": human_resp})
    i +=1

merged_df = pd.DataFrame(merged_list)
merged_df = merged_df.set_index("id")

In [29]:
merged_df.head()

Unnamed: 0_level_0,text,ai_resp,human_resp
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
nyx4d,So I was away this past semester - What's goin...,"Occupied Cornell! So, you know how it is when ...",I was just reading about the Princeton Mic-Che...
o0145,course schedule planner for Cornell University,**title:** New Course Schedule Planner for Cor...,I have added support for Cornell to courseoff....
o1gca,should we advertise ourselves on facebook?,The eternal conundrum: do we risk infecting ou...,"i don't have a facebook, so we'd need a volunt..."
o0ss4,oh look.. a picture!,**throwaway_2023**\n\n omg yes!!! I've been me...,"so, i'm starting to mess with some of the css ..."
o31u0,"Cornell Scientists create ""hole"" in time where...",**Title:** Just saw this on the NYU feed and t...,wooo! aep! it's always nice to see Cornell in...


In [22]:
import re
import numpy as np
import convokit
import spacy
from collections import Counter, defaultdict
from textstat import textstat

# Load spaCy model
nlp = spacy.load("en_core_web_sm")
def extract_sentence_features_spacy(doc):
    sentences = list(doc.sents)
    tokens = [token for token in doc if not token.is_space]

    if len(sentences) == 0 or len(tokens) == 0:
        return None

    # Sentence length metrics
    sentence_lengths = [len(list(sent)) for sent in sentences]

    # POS tag distribution
    pos_counts = Counter([token.pos_ for token in doc])
    total_tokens = len(tokens)

    # Dependency depth (syntactic complexity)
    def get_tree_depth(token):
        depth = 0
        while token.head != token:
            depth += 1
            token = token.head
        return depth

    dependency_depths = [get_tree_depth(token) for token in doc if not token.is_punct]

    # Entity density
    entities = list(doc.ents)

    features = {
        # Sentence metrics
        'num_sentences': len(sentences),
        'avg_sentence_length': np.mean(sentence_lengths),
        'std_sentence_length': np.std(sentence_lengths),
        'max_sentence_length': np.max(sentence_lengths),
        'min_sentence_length': np.min(sentence_lengths),

        # Token metrics
        'num_tokens': len(tokens),
        'avg_word_length': np.mean([len(token.text) for token in tokens]),
        'std_word_length': np.std([len(token.text) for token in tokens]),

        # Lexical diversity
        'type_token_ratio': len(set([t.text.lower() for t in tokens])) / len(tokens),
        'lemma_diversity': len(set([t.lemma_ for t in tokens])) / len(tokens),

        # POS distribution
        'noun_ratio': pos_counts['NOUN'] / total_tokens,
        'verb_ratio': pos_counts['VERB'] / total_tokens,
        'adj_ratio': pos_counts['ADJ'] / total_tokens,
        'adv_ratio': pos_counts['ADV'] / total_tokens,
        'pron_ratio': pos_counts['PRON'] / total_tokens,

        # Syntactic complexity
        'avg_dependency_depth': np.mean(dependency_depths) if dependency_depths else 0,
        'max_dependency_depth': np.max(dependency_depths) if dependency_depths else 0,

        # Entity metrics
        'num_entities': len(entities),
        'entity_density': len(entities) / len(sentences) if len(sentences) > 0 else 0,

        # Readability (using original text)
        'flesch_reading_ease': textstat.flesch_reading_ease(doc.text),
        'flesch_kincaid_grade': textstat.flesch_kincaid_grade(doc.text),

        # Punctuation
        'punctuation_density': sum(1 for t in doc if t.is_punct) / len(tokens),
        'comma_density': sum(1 for t in doc if t.text == ',') / len(tokens),
    }

    return features



### Human response features

In [None]:
# Extract human resp features for corpus
sentence_features = []

for utterance in corpus.iter_utterances():
    text = utterance.text
    if len(text) > 0:
        doc = nlp(text)
        features = extract_sentence_features_spacy(doc)
        if features:
            features['utterance_id'] = utterance.id
            sentence_features.append(features)


human_feat_df = pd.DataFrame(sentence_features)
human_feat_df = human_feat_df.set_index("utterance_id")
human_feat_df.to_csv("./cornell_utt_human_features.csv")

### AI response features

In [34]:
# Extract human resp features for corpus
sentence_features = []

for row in merged_df.itertuples():
    ai_text = row.ai_resp
    if len(ai_text) > 0:
        doc = nlp(ai_text)
        features = extract_sentence_features_spacy(doc)
        if features:
            features['utterance_id'] = row.Index
            sentence_features.append(features)


ai_feat_df = pd.DataFrame(sentence_features)
ai_feat_df = ai_feat_df.set_index("utterance_id")
ai_feat_df.to_csv("./cornell_utt_ai_features.csv")

In [38]:
features_df

Unnamed: 0_level_0,num_sentences,avg_sentence_length,std_sentence_length,max_sentence_length,min_sentence_length,num_tokens,avg_word_length,std_word_length,type_token_ratio,lemma_diversity,...,adv_ratio,pron_ratio,avg_dependency_depth,max_dependency_depth,num_entities,entity_density,flesch_reading_ease,flesch_kincaid_grade,punctuation_density,comma_density
utterance_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
nyx4d,4,17.250000,17.049560,46,4,68,4.308824,4.466402,0.735294,0.720588,...,0.014706,0.088235,2.959184,7,1,0.250000,29.261250,11.227500,0.294118,0.000000
o0145,9,12.222222,3.735053,18,6,108,4.092593,3.515935,0.685185,0.675926,...,0.046296,0.166667,2.347368,9,3,0.333333,72.229173,6.072782,0.138889,0.037037
o1gca,3,17.666667,5.185450,25,14,53,3.830189,2.560482,0.735849,0.716981,...,0.094340,0.113208,2.382979,6,0,0.000000,79.260000,4.440000,0.113208,0.018868
o0ss4,4,9.000000,5.244044,17,3,36,3.750000,2.542691,0.944444,0.944444,...,0.027778,0.138889,2.100000,6,1,0.250000,61.397381,7.014286,0.166667,0.027778
o4ipd,3,30.333333,29.691001,72,5,89,4.247191,5.062044,0.719101,0.719101,...,0.022472,0.089888,3.393939,8,3,1.000000,14.908750,14.098883,0.280899,0.011236
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
e8tjh94,1,2.000000,0.000000,2,2,2,1.500000,0.500000,1.000000,1.000000,...,0.000000,0.000000,1.000000,1,1,1.000000,121.220000,-3.400000,0.500000,0.000000
e8tjyg1,1,10.000000,0.000000,10,10,9,11.444444,19.528390,0.666667,0.666667,...,0.000000,0.000000,1.833333,3,2,2.000000,49.480000,7.600000,0.444444,0.000000
e8tkb66,1,1.000000,0.000000,1,1,1,6.000000,0.000000,1.000000,1.000000,...,0.000000,0.000000,0.000000,0,0,0.000000,36.620000,8.400000,0.000000,0.000000
e8tkctl,1,9.000000,0.000000,9,9,9,3.777778,1.474055,1.000000,1.000000,...,0.222222,0.222222,1.333333,3,0,0.000000,103.700000,1.031111,0.000000,0.000000
