In [1]:
!pip install convokit

Collecting convokit
  Downloading convokit-3.3.0.tar.gz (206 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/206.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m206.2/206.2 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting msgpack-numpy>=0.4.3.2 (from convokit)
  Downloading msgpack_numpy-0.4.8-py2.py3-none-any.whl.metadata (5.0 kB)
Collecting clean-text>=0.6.0 (from convokit)
  Downloading clean_text-0.6.0-py3-none-any.whl.metadata (6.6 kB)
Collecting unidecode>=1.1.1 (from convokit)
  Downloading Unidecode-1.4.0-py3-none-any.whl.metadata (13 kB)
Collecting pymongo>=4.0 (from convokit)
  Downloading pymongo-4.13.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
Collecting dnspython>=1.16.0 (from 

In [2]:
from convokit import Corpus, download, Coordination

Error from Unsloth: NotImplementedError: Unsloth currently only works on NVIDIA GPUs and Intel GPUs.



Please restructure your imports with 'import unsloth' at the top of your file.
  import unsloth


In [None]:
# Fetch the "winning-args-corpus" dataset with ConvoKit's download helper and load it as a Corpus.
# NOTE(review): presumably download() returns a local path that Corpus reads from — confirm.
corpus = Corpus(filename=download("winning-args-corpus"))

In [63]:
from convokit import Corpus

class CMVDataFilter:
    """
    Drops whole conversations from a CMV corpus when any of their utterances
    is unusable (empty/placeholder text, missing speaker, or a broken reply
    target), so that coordination analysis does not fail on them.

    Attributes:
        corpus: the corpus being filtered (never modified by this class).
        valid_utterance_ids: set of utterance ids present at construction
            time; kept for callers, not read by the methods below.
        removed_conversations: ids of conversations dropped by the most
            recent call to remove_conversations_with_bad_utterances().
        removal_reasons: reason codes, parallel to removed_conversations.
    """

    # Texts treated as "no content" once surrounding whitespace is stripped.
    _PLACEHOLDER_TEXTS = frozenset({'[removed]', '[deleted]'})

    def __init__(self, corpus):
        self.corpus = corpus
        self.valid_utterance_ids = set(corpus.get_utterance_ids())
        self.removed_conversations = []
        self.removal_reasons = []

    def is_utterance_valid(self, utterance):
        """
        Check a single utterance.

        Returns:
            (True, "valid") when the utterance passes every check, otherwise
            (False, reason) where reason is one of "empty_or_removed_text",
            "null_speaker", "reply_target_null_speaker",
            "reply_target_empty_text", "reply_target_missing", or
            "exception_<ExceptionClassName>" if a check itself raised.
        """
        try:
            # Test 1: real text content. The placeholder comparison is done on
            # the stripped text so that e.g. "[removed]\n" is rejected too.
            text = utterance.text
            stripped = text.strip() if text else ""
            if not stripped or stripped in self._PLACEHOLDER_TEXTS:
                return False, "empty_or_removed_text"

            # Test 2: a speaker must be attached
            if utterance.speaker is None:
                return False, "null_speaker"

            # Test 3: a reply must point at an existing utterance that has a
            # speaker and non-blank text
            if utterance.reply_to is not None:
                try:
                    reply_utterance = self.corpus.get_utterance(utterance.reply_to)
                except KeyError:
                    return False, "reply_target_missing"
                if reply_utterance.speaker is None:
                    return False, "reply_target_null_speaker"
                if not reply_utterance.text or not reply_utterance.text.strip():
                    return False, "reply_target_empty_text"

            return True, "valid"

        except Exception as e:
            # Deliberate best-effort: an utterance that cannot even be
            # inspected is reported as invalid rather than aborting the scan.
            return False, f"exception_{type(e).__name__}"

    def _first_failure_reason(self, convo):
        """Return the reason code of the first invalid utterance, or None if all pass."""
        for utterance in convo.iter_utterances():
            is_valid, reason = self.is_utterance_valid(utterance)
            if not is_valid:
                return reason
        return None

    def remove_conversations_with_bad_utterances(self):
        """
        Build a new corpus containing only conversations whose utterances
        all pass is_utterance_valid(), printing a summary along the way.

        The bookkeeping lists are reset at the start of every call, so
        get_filtering_report() always describes the latest run only.

        Returns:
            A new Corpus built from the utterances of the surviving
            conversations. If no conversation survives, a warning is printed
            and the original, unfiltered corpus is returned instead.
        """
        # Reset per-run state; otherwise a second call would append the same
        # conversations again and the report would double-count them.
        self.removed_conversations = []
        self.removal_reasons = []

        valid_conversation_ids = []
        removal_stats = {}

        # Count lazily instead of materialising every conversation in a list.
        print(f"Starting with {sum(1 for _ in self.corpus.iter_conversations())} conversations")

        for convo in self.corpus.iter_conversations():
            try:
                removal_reason = self._first_failure_reason(convo)
            except Exception as e:
                # If any error occurs while walking the conversation, drop it
                removal_reason = f"processing_error_{type(e).__name__}"

            if removal_reason is None:
                valid_conversation_ids.append(convo.id)
                continue

            # Single bookkeeping path for every kind of removal
            self.removed_conversations.append(convo.id)
            self.removal_reasons.append(removal_reason)
            removal_stats[removal_reason] = removal_stats.get(removal_reason, 0) + 1

        print(f"\nFiltering Results:")
        print(f"Valid conversations: {len(valid_conversation_ids)}")
        print(f"Removed conversations: {len(self.removed_conversations)}")
        print(f"\nRemoval reasons:")
        for reason, count in removal_stats.items():
            print(f"  {reason}: {count}")

        if not valid_conversation_ids:
            print("WARNING: No valid conversations found!")
            return self.corpus

        # Rebuild a corpus from the utterances of the surviving conversations.
        # NOTE(review): constructing from utterances alone may not carry over
        # conversation-level or corpus-level metadata — confirm if needed.
        valid_utterances = [
            utterance
            for convo_id in valid_conversation_ids
            for utterance in self.corpus.get_conversation(convo_id).iter_utterances()
        ]
        filtered_corpus = Corpus(utterances=valid_utterances)
        print(f"\nFiltered corpus contains {len(list(filtered_corpus.iter_conversations()))} conversations")
        return filtered_corpus

    def get_filtering_report(self):
        """Return a text listing of each removed conversation id and its reason code."""
        if not self.removed_conversations:
            return "No conversations were removed."

        header = f"Removed {len(self.removed_conversations)} conversations:\n"
        lines = (
            f"  {conv_id}: {reason}\n"
            for conv_id, reason in zip(self.removed_conversations, self.removal_reasons)
        )
        return header + "".join(lines)


# Example usage and conversation counting
def count_conversations(corpus):
    """Return how many conversations the corpus yields when iterated."""
    total = 0
    for _conversation in corpus.iter_conversations():
        total += 1
    return total

def count_conversations_fast(corpus):
    """Count conversations from the corpus's conversation-id listing rather than by iterating conversations."""
    conversation_ids = corpus.get_conversation_ids()
    return len(conversation_ids)

# Usage example:
if __name__ == "__main__":
    # Everything below is a commented-out walkthrough; the guard's only live
    # statement is the trailing `pass`, so running this cell does nothing.

    # 1. Load the corpus (same call the notebook uses to load it)
    # corpus = Corpus(filename=download("winning-args-corpus"))

    # 2. Count conversations before filtering
    # print(f"Original corpus has {count_conversations_fast(corpus)} conversations")

    # 3. Apply filtering
    # filter_tool = CMVDataFilter(corpus)
    # clean_corpus = filter_tool.remove_conversations_with_bad_utterances()

    # 4. Count after filtering
    # print(f"Filtered corpus has {count_conversations_fast(clean_corpus)} conversations")

    # 5. Print the per-conversation removal report
    # print(filter_tool.get_filtering_report())

    pass

In [61]:
# Build a smaller corpus from the first `max` conversations (3000 by default)
def create_subset(corpus, max=3000):
    """Return a new Corpus built from the utterances of the first `max` conversations.

    Conversation ids are taken in the order `corpus.conversations` lists them
    and sliced with `[:max]`; every utterance of each selected conversation
    is then passed to the Corpus constructor.
    """
    selected_ids = list(corpus.conversations.keys())[:max]

    # Flatten the selected conversations into one utterance list
    subset_utterances = [
        utterance
        for conv_id in selected_ids
        for utterance in corpus.get_conversation(conv_id).iter_utterances()
    ]

    return Corpus(utterances=subset_utterances)

In [46]:
# Rebind `corpus` to the subset built by create_subset (default cap: 3000 conversations).
# NOTE(review): this overwrites the full corpus, so re-running the cell subsets the
# subset; reload the corpus first if the original is needed again.
corpus = create_subset(corpus)

In [64]:
# Drop every conversation that contains an utterance CMVDataFilter considers invalid,
# and rebind `corpus` to the result.
# NOTE(review): the name `filter` shadows Python's built-in filter() for the rest of the session.
filter = CMVDataFilter(corpus)
corpus = filter.remove_conversations_with_bad_utterances()

Starting with 2751 conversations

Filtering Results:
Valid conversations: 1219
Removed conversations: 1532

Removal reasons:
  empty_or_removed_text: 1532

Filtered corpus contains 1219 conversations


In [65]:
# Coordination transformer constructed with no arguments, i.e. with its default configuration.
coord = Coordination()

In [66]:
# Fit the Coordination transformer on the current (filtered) corpus.
coord.fit(corpus)

In [67]:
# Apply the fitted transformer and rebind `corpus` to what it returns.
# NOTE(review): presumably this is the step that attaches the 'coord' speaker metadata — confirm.
corpus = coord.transform(corpus)

In [68]:
# Number of conversations in the corpus after the Coordination transform
print(sum(1 for _convo in corpus.iter_conversations()))

1219


In [74]:
# Peek at the metadata keys carried by the first 3 speakers
for i, speaker in zip(range(3), corpus.iter_speakers()):
    print(f"Speaker {speaker.id} metadata keys: {list(speaker.meta.keys())}")

# Same peek for the first 3 utterances
for i, utterance in zip(range(3), corpus.iter_utterances()):
    print(f"Utterance {utterance.id} metadata keys: {list(utterance.meta.keys())}")

Speaker GoldenTaint metadata keys: ['coord']
Speaker Legomystrudel metadata keys: []
Speaker Bradm77 metadata keys: ['coord']
Utterance t3_2ro0ti metadata keys: ['pair_ids', 'success', 'approved_by', 'author_flair_css_class', 'author_flair_text', 'banned_by', 'controversiality', 'distinguished', 'downs', 'edited', 'gilded', 'likes', 'mod_reports', 'num_reports', 'replies', 'report_reasons', 'saved', 'score', 'score_hidden', 'subreddit', 'subreddit_id', 'ups', 'user_reports', 'liwc-categories']
Utterance t1_cnhpddf metadata keys: ['pair_ids', 'success', 'approved_by', 'author_flair_css_class', 'author_flair_text', 'banned_by', 'controversiality', 'distinguished', 'downs', 'edited', 'gilded', 'likes', 'mod_reports', 'num_reports', 'replies', 'report_reasons', 'saved', 'score', 'score_hidden', 'subreddit', 'subreddit_id', 'ups', 'user_reports', 'liwc-categories']
Utterance t1_cnhpqan metadata keys: ['pair_ids', 'success', 'approved_by', 'author_flair_css_class', 'author_flair_text', 'bann