In [None]:
import csv
import os

import nltk
import pandas as pd

In [None]:
def df_to_disk(df, file_name, mode="w", header=True):
    """Serialize a pandas object to disk as tab-separated UTF-8 text.

    Args:
        df: Object exposing ``to_csv`` (a DataFrame or a Series).
        file_name: Destination path.
        mode: File mode forwarded to ``to_csv``; "w" overwrites.
        header: Whether to emit the column header line.

    The index is never written. Quoting is disabled entirely
    (``csv.QUOTE_NONE``); a backslash is configured as the escape character.
    A confirmation message is printed only for mode "w".
    """
    csv_options = dict(
        sep='\t',
        mode=mode,
        header=header,
        encoding='utf-8',
        index=False,
        quoting=csv.QUOTE_NONE,
        quotechar="",
        escapechar="\\",
    )
    df.to_csv(file_name, **csv_options)

    if mode != "w":
        return
    print(f"Results saved to {file_name}")

In [None]:
# Build the working frame in a single pipeline: load, reduce to one label per
# sentence, then keep only the four target emotions.
df = (
    pd.read_csv('enISEAR_validation.tsv', sep='\t', encoding='utf-8')
    .loc[:, ['Sentence', 'Annotation']]
    .sort_values('Sentence')
    # The raw data repeats each sentence once per annotator; collapse the
    # repeats to the modal label.
    .groupby('Sentence')
    .agg(pd.Series.mode)
    .reset_index()
    # Keep only rows whose aggregated label is a single string, i.e. drop
    # sentences that ended up with multiple labels.
    .loc[lambda frame: frame.Annotation.apply(isinstance, args=(str,))]
    # Use only sentences labelled as joy, sadness, guilt, or shame.
    .loc[lambda frame: frame.Annotation.isin(['joy', 'sadness', 'guilt', 'shame'])]
)

def clean(row):
    """Return a cleaned copy of one row holding ``Sentence`` and ``Annotation``.

    Every '...' or '…' placeholder in the sentence is replaced by the row's
    emotion label, and the text is then cut down to the first sentence found
    by ``nltk.sent_tokenize``.

    The row handed in is left untouched: pandas does not support functions
    that mutate the object ``apply`` passes them, so the work is done on a
    copy and that copy is returned.
    """
    cleaned = row.copy()
    sentence = cleaned['Sentence']

    # Interpolate the emotion label into both spellings of the placeholder.
    for placeholder in ('...', '…'):
        sentence = sentence.replace(placeholder, cleaned['Annotation'])

    # Only use the first sentence. If the tokenizer finds none (e.g. empty
    # text), keep the text as-is instead of raising IndexError.
    sentences = nltk.sent_tokenize(sentence)
    cleaned['Sentence'] = sentences[0] if sentences else sentence
    return cleaned

# Interpolate emotions back into the sentences and keep only the first sentence.
# The result must be assigned back: apply() returns a new frame, and discarding
# it leaves df dependent on unsupported in-place mutation of the row objects.
df = df.apply(clean, axis=1)

df.Annotation.value_counts()

guilt      179
sadness    165
joy        148
shame       79
Name: Annotation, dtype: int64

In [None]:
# Write seed topics files from the first 50 sentences (positional slice)
seed_df = df.iloc[:50]

# to_csv does not create missing parent directories, so make sure the output
# directory exists before writing into it.
os.makedirs("seed_topics", exist_ok=True)

# Sorted so files are written (and logged) in the same order on every run;
# the iteration order of a set of strings is not stable across sessions.
for label in sorted(set(seed_df.Annotation)):
    label_df = seed_df[seed_df.Annotation == label]
    file_name = f"seed_topics/{label}.txt"
    # One sentence per line, no header.
    df_to_disk(label_df['Sentence'], file_name, header=False)

Results saved to seed_topics/guilt.txt
Results saved to seed_topics/shame.txt
Results saved to seed_topics/joy.txt
Results saved to seed_topics/sadness.txt


In [None]:
# Create 10 documents with 20 sentences each
FIRST_DOC_ROW = 100      # documents are cut from df starting at this row position
N_DOCS = 10
SENTENCES_PER_DOC = 20

# open() does not create missing parent directories.
os.makedirs("corpus", exist_ok=True)

# Per-document gold frames are collected here and concatenated once after the
# loop, rather than re-concatenating a growing frame on every iteration.
gold_parts = []
for i in range(N_DOCS):
    start = FIRST_DOC_ROW + i * SENTENCES_PER_DOC
    end = start + SENTENCES_PER_DOC
    doc_df = df.iloc[start:end]

    # Combine sentences and write to a document
    corpus_file = f"corpus/doc{i}.txt"
    with open(corpus_file, 'w', encoding="utf-8") as f:
        f.write(' '.join(doc_df.Sentence))

    # Gold rows for this document: the id encodes the document number and the
    # sentence's 0-based position within the document.
    gold_parts.append(pd.DataFrame({
        'id': [f"doc.{i}.sent.{j}" for j in range(len(doc_df))],
        'Annotation': doc_df.Annotation.to_numpy(),
    }))

gold_df = pd.concat(gold_parts, ignore_index=True)

# Write gold standard file
gold_file = "gold.txt"
df_to_disk(gold_df, gold_file, header=False)

Results saved to gold.txt
