In [5]:
!pip install --upgrade pip

Collecting pip
  Downloading pip-25.0.1-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-25.0.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.0
    Uninstalling pip-24.0:
      Successfully uninstalled pip-24.0
Successfully installed pip-25.0.1


## Load the data and drop individual #2

In [2]:
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import os

# Load the WASSA 2023 dataset
wassa_file_path = "/home/ajha/AP2/data/WASSA23_conv_level_with_labels_train.tsv"
wassa_df = pd.read_csv(wassa_file_path, sep="\t")

# Load the articles dataset
articles_file_path = "/home/ajha/AP2/data/articles_adobe_AMT.csv"
articles_df = pd.read_csv(articles_file_path)

# Separate out the turns spoken by the first individual (speaker_number == 1) in each conversation
first_individual_turns = wassa_df[wassa_df["speaker_number"] == 1]

# Sort the first individual's turns within each conversation by turn_id in ascending order
first_individual_turns_sorted = first_individual_turns.sort_values(by=["conversation_id", "turn_id"])

# Rename the text column in the articles dataset for clarity
articles_df.rename(columns={"text": "article"}, inplace=True)

# Merge the first individual's turns dataset with the articles dataset using article_id
first_individual_turns_sorted = first_individual_turns_sorted.merge(articles_df, on="article_id", how="left")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
first_individual_turns_sorted[:50]

Unnamed: 0,conversation_id,turn_id,text,EmotionalPolarity,Emotion,Empathy,speaker_number,article_id,speaker_id,essay_id,article
0,2,0,I feel very sad for the people.,2.0,3.0,3.3333,1,35.0,30.0,1.0,"A month after Hurricane Matthew, 800,000 Haiti..."
1,2,2,"Yeah, the whole situation is horrible.",2.0,3.6667,3.6667,1,35.0,30.0,1.0,"A month after Hurricane Matthew, 800,000 Haiti..."
2,2,4,"I know we can donate, but it's hard to know ex...",1.6667,3.0,2.3333,1,35.0,30.0,1.0,"A month after Hurricane Matthew, 800,000 Haiti..."
3,2,6,"No, and You?",1.3333,1.0,0.3333,1,35.0,30.0,1.0,"A month after Hurricane Matthew, 800,000 Haiti..."
4,2,8,"Sorry to hear that, I'm glad that you are okay!",0.3333,3.0,0.3333,1,35.0,30.0,1.0,"A month after Hurricane Matthew, 800,000 Haiti..."
5,2,10,I felt really sorry for the sister that now ha...,2.0,3.6667,2.6667,1,35.0,30.0,1.0,"A month after Hurricane Matthew, 800,000 Haiti..."
6,2,12,"Yeah, we never know what we can do unless we a...",0.3333,2.3333,1.3333,1,35.0,30.0,1.0,"A month after Hurricane Matthew, 800,000 Haiti..."
7,2,14,Tornado when I was young. They suck too.,2.0,2.0,1.6667,1,35.0,30.0,1.0,"A month after Hurricane Matthew, 800,000 Haiti..."
8,2,16,The power of nature is truly scary.,1.6667,3.3333,1.3333,1,35.0,30.0,1.0,"A month after Hurricane Matthew, 800,000 Haiti..."
9,2,18,"Me too, I wish there was a direct way to aid t...",1.0,2.6667,3.6667,1,35.0,30.0,1.0,"A month after Hurricane Matthew, 800,000 Haiti..."


## Load the model for inference

In [5]:
# Load LLaMA 3.1 8B model and tokenizer
model_name = "/chronos_data/pretrained_models/llama3.1-8b-Instruct-hf/"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="cuda:0")

Loading checkpoint shards: 100%|██████████| 4/4 [00:07<00:00,  1.76s/it]


In [6]:
def query_model(article, conversation_history, latest_dialogue):
    prompt = f"""
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are participating in a two-person conversation about the provided article.  
Respond only to your human partner's latest dialogue as if you are having a conversation. 
Keep your response natural and concise.

Article:
{article}

Conversation so far:
{conversation_history if conversation_history else 'START'}

<|start_header_id|>partner<|end_header_id|>
{latest_dialogue}

<|start_header_id|>you<|end_header_id|>
"""

    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=100, temperature=0.8, pad_token_id=tokenizer.eos_token_id)
    
    return tokenizer.decode(output[0][inputs.input_ids.shape[-1]:], skip_special_tokens=True).strip()


def query_model(article, conversation_history, latest_dialogue):
    prompt = f"""
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a thoughtful and emotionally intelligent individual participating in a conversation about the article below.
Respond to your partner in a way that reflects your unique personality, opinions, and emotional reactions.
Try to show your values, empathy, and worldview through what you say.
Keep your response concise—roughly matching your partner’s message length.

Article:
{article}

Conversation so far:
{conversation_history if conversation_history else 'START'}

<|start_header_id|>partner<|end_header_id|>
{latest_dialogue}

<|start_header_id|>you<|end_header_id|>
"""

    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=100, temperature=0.8, pad_token_id=tokenizer.eos_token_id)
    
    return tokenizer.decode(output[0][inputs.input_ids.shape[-1]:], skip_special_tokens=True).strip()


In [7]:
# Select a single conversation to test
iloc = 493
sample_conversation_id = 492
sample_group = first_individual_turns_sorted[first_individual_turns_sorted["conversation_id"] == sample_conversation_id]

conversation_history = ""
article = sample_group["article"].iloc[0]

print(f"Running conversation ID: {sample_conversation_id}\n")

for _, row in sample_group.iterrows():
    latest_dialogue = row["text"]
    print(f"Individual 1: {latest_dialogue}\n awaiting response")
    model_response = query_model(article, conversation_history, latest_dialogue)
    print(f"Model: {model_response}\n")
    
    # Update conversation history
    conversation_history += f"Individual 1: {latest_dialogue}\nModel: {model_response}\n"


Running conversation ID: 492

Individual 1: This article is so concerning.
 awaiting response
Model: I know, it's terrifying. The fact that 18 million people are exposed to lead in their drinking water is just unacceptable. And to think that so few water suppliers are being held accountable for these violations is even more disturbing. It's like, what's the point of having regulations if they're not being enforced?

Individual 1: Why do all the work to see if drinking water is safe, and then assess no fines or penalties?
 awaiting response


KeyboardInterrupt: 

## Generating normal responses to individual#1 dialogues

In [9]:
# Initialize storage for the generated conversations
conversation_results = []
save_interval = 10

# Process each conversation
for i, (conversation_id, group) in enumerate(first_individual_turns_sorted.groupby("conversation_id")):
    print(f'currently processing {i}th conversation')
    conversation_history = ""
    article = group["article"].iloc[0]
    
    for _, row in group.iterrows():
        latest_dialogue = row["text"]
        model_response = query_model(article, conversation_history, latest_dialogue)
        
        # Update conversation history
        conversation_history += f"Individual 1: {latest_dialogue}\nModel: {model_response}\n"
        
        # Store the conversation turn
        conversation_results.append({
            "conversation_id": conversation_id,
            "turn_id": row["turn_id"],
            "individual_1": latest_dialogue,
            "model_response": model_response,
            "article": article
        })
    
    # Save every 10 conversations
    if (i + 1) % save_interval == 0:
        temp_output_file = f"/home/ajha/AP2/data/interim_data/generated_expressive_conversations_llama_part_{i // save_interval}.csv"
        pd.DataFrame(conversation_results).to_csv(temp_output_file, index=False)
        print(f"Intermediate results saved to {temp_output_file}")
        conversation_results = []  # Clear results after saving

# Save any remaining results
if conversation_results:
    final_output_file = "/home/ajha/AP2/data/generated_expressive_conversations_llama_final.csv"
    pd.DataFrame(conversation_results).to_csv(final_output_file, index=False)
    print(f"Final results saved to {final_output_file}")


currently processing 380th conversation
currently processing 381th conversation
currently processing 382th conversation
currently processing 383th conversation
currently processing 384th conversation
currently processing 385th conversation
Final results saved to /home/ajha/AP2/data/generated_conversations_llama_final.csv


## Generate Expressive Conversations

In [8]:
import os

conversation_results = []
save_interval = 10
output_file = "/home/ajha/AP2/data/generated_expressive_conversations_llama_full.csv"

file_exists = os.path.exists(output_file)

# Process each conversation
for i, (conversation_id, group) in enumerate(first_individual_turns_sorted.groupby("conversation_id")):
    print(f'currently processing {i}th conversation')
    conversation_history = ""
    article = group["article"].iloc[0]
    
    for _, row in group.iterrows():
        latest_dialogue = row["text"]
        model_response = query_model(article, conversation_history, latest_dialogue)
        
        # Update conversation history
        conversation_history += f"Individual 1: {latest_dialogue}\nModel: {model_response}\n"
        
        # Store the conversation turn
        conversation_results.append({
            "conversation_id": conversation_id,
            "turn_id": row["turn_id"],
            "individual_1": latest_dialogue,
            "model_response": model_response,
            "article": article
        })
    
    # Append every 10 conversations
    if (i + 1) % save_interval == 0:
        pd.DataFrame(conversation_results).to_csv(
            output_file,
            mode='a',
            index=False,
            header=not file_exists  # Write header only once
        )
        file_exists = True
        print(f"Appended results to {output_file}")
        conversation_results = []  # Clear results after saving

# Save any remaining results
if conversation_results:
    pd.DataFrame(conversation_results).to_csv(
        output_file,
        mode='a',
        index=False,
        header=not file_exists
    )
    print(f"Final results appended to {output_file}")


currently processing 0th conversation
currently processing 1th conversation
currently processing 2th conversation
currently processing 3th conversation
currently processing 4th conversation
currently processing 5th conversation
currently processing 6th conversation
currently processing 7th conversation
currently processing 8th conversation
currently processing 9th conversation
Appended results to /home/ajha/AP2/data/generated_expressive_conversations_llama_full.csv
currently processing 10th conversation
currently processing 11th conversation
currently processing 12th conversation
currently processing 13th conversation
currently processing 14th conversation
currently processing 15th conversation
currently processing 16th conversation
currently processing 17th conversation
currently processing 18th conversation
currently processing 19th conversation
Appended results to /home/ajha/AP2/data/generated_expressive_conversations_llama_full.csv
currently processing 20th conversation
currently p

### combining

In [12]:
import pandas as pd
import glob
import os

# Path to directory
base_path = "/home/ajha/AP2/data/interim_data"

# Get all part_*.csv files
part_files = sorted(glob.glob(os.path.join(base_path, "generated_conversations_llama_part_*.csv")))

# Add the final file
part_files.append(os.path.join(base_path, "generated_conversations_llama_final.csv"))

# Read and concatenate
df_all = pd.concat((pd.read_csv(f) for f in part_files), ignore_index=True)

# Save combined file
df_all.to_csv(os.path.join(base_path, "generated_conversations_llama_combined.csv"), index=False)


In [15]:
combined_df = pd.read_csv('/home/ajha/AP2/data/interim_data/generated_conversations_llama_combined.csv')
actual_df = pd.read_csv('/home/ajha/AP2/data/WASSA23_conv_level_with_labels_train.tsv', delimiter='\t')

In [14]:
combined_df

Unnamed: 0,conversation_id,turn_id,individual_1,model_response,article
0,2,0,I feel very sad for the people.,"Me too, it's heartbreaking to see families str...","A month after Hurricane Matthew, 800,000 Haiti..."
1,2,2,"Yeah, the whole situation is horrible.",And the fact that there's a lack of coordinati...,"A month after Hurricane Matthew, 800,000 Haiti..."
2,2,4,"I know we can donate, but it's hard to know ex...",I completely understand. It's frustrating when...,"A month after Hurricane Matthew, 800,000 Haiti..."
3,2,6,"No, and You?","I completely agree, it's frustrating when we d...","A month after Hurricane Matthew, 800,000 Haiti..."
4,2,8,"Sorry to hear that, I'm glad that you are okay!","Yeah, I'm good, just trying to process what's ...","A month after Hurricane Matthew, 800,000 Haiti..."
...,...,...,...,...,...
4450,500,12,I have never fell while drinking have you?,"No, fortunately not! But I've heard it's a pos...",Death of former Putin aide at D.C. hotel is ru...
4451,500,14,Yeah it could have been murder,"That's definitely a possibility, especially gi...",Death of former Putin aide at D.C. hotel is ru...
4452,500,16,Do you think maybe he was force to kill himself,"It's possible, the article mentions that he wa...",Death of former Putin aide at D.C. hotel is ru...
4453,500,18,Or he could have slipped on something. Either ...,"I know, it's sad. He was a pretty influential ...",Death of former Putin aide at D.C. hotel is ru...


In [18]:
# Compute average word count for each column
avg_individual = combined_df['individual_1'].apply(lambda x: len(str(x).split())).mean()
avg_model = combined_df['model_response'].apply(lambda x: len(str(x).split())).mean()

# Final averaged result
avg_words_per_turn = (avg_individual + avg_model) / 2

print(f"Average number of words per turn: {avg_words_per_turn:.2f}")


Average number of words per turn: 26.73


In [16]:
actual_df

Unnamed: 0,conversation_id,turn_id,text,EmotionalPolarity,Emotion,Empathy,speaker_number,article_id,speaker_id,essay_id
0,2,0,I feel very sad for the people.,2.0000,3.0000,3.3333,1,35.0,30.0,1.0
1,2,1,It's terrible. Not only the people but the ani...,2.0000,4.0000,3.3333,2,35.0,17.0,501.0
2,2,10,I felt really sorry for the sister that now ha...,2.0000,3.6667,2.6667,1,35.0,30.0,1.0
3,2,11,"Yeah, it's going to be tough but i am sure she...",0.6667,3.0000,2.0000,2,35.0,17.0,501.0
4,2,12,"Yeah, we never know what we can do unless we a...",0.3333,2.3333,1.3333,1,35.0,30.0,1.0
...,...,...,...,...,...,...,...,...,...,...
8771,58,5,I'm sure that would go a long way. It's at lea...,2.0000,3.0000,3.0000,2,72.0,46.0,557.0
8772,58,6,"Oh exactly, it always comes down to the money....",0.0000,2.0000,2.0000,1,72.0,44.0,57.0
8773,58,7,And when it's the officials who are getting pa...,2.0000,2.0000,2.0000,2,72.0,46.0,557.0
8774,58,8,Yes corruption I'm sure is very huge. It has t...,2.0000,3.0000,3.0000,1,72.0,44.0,57.0


In [17]:
# Compute average number of words per turn
avg_words = actual_df['text'].apply(lambda x: len(str(x).split())).mean()

print(f"Average number of words per turn: {avg_words:.2f}")


Average number of words per turn: 17.98


## combining turns to avoid sparsity

In [2]:
import pandas as pd

In [3]:
expressive_hl_df = pd.read_csv('/home/ajha/AP2/data/generated_expressive_conversations_llama_full.csv')

In [4]:
expressive_hl_df

Unnamed: 0,conversation_id,turn_id,individual_1,model_response,article
0,2,0,I feel very sad for the people.,"I know, it's heartbreaking. The images of the ...","A month after Hurricane Matthew, 800,000 Haiti..."
1,2,2,"Yeah, the whole situation is horrible.",It's absolutely tragic. The way the storm hit ...,"A month after Hurricane Matthew, 800,000 Haiti..."
2,2,4,"I know we can donate, but it's hard to know ex...",I completely understand your concern. It's nat...,"A month after Hurricane Matthew, 800,000 Haiti..."
3,2,6,"No, and You?","I know, it's frustrating to not know exactly h...","A month after Hurricane Matthew, 800,000 Haiti..."
4,2,8,"Sorry to hear that, I'm glad that you are okay!",I'm still trying to process the scale of the d...,"A month after Hurricane Matthew, 800,000 Haiti..."
...,...,...,...,...,...
4450,500,12,I have never fell while drinking have you?,"No, I'm a bit clumsy sometimes, but I've never...",Death of former Putin aide at D.C. hotel is ru...
4451,500,14,Yeah it could have been murder,"I know, it's easy to get caught up in conspira...",Death of former Putin aide at D.C. hotel is ru...
4452,500,16,Do you think maybe he was force to kill himself,"I understand why you'd think that, given the p...",Death of former Putin aide at D.C. hotel is ru...
4453,500,18,Or he could have slipped on something. Either ...,"I know, it's a shame that his life ended in su...",Death of former Putin aide at D.C. hotel is ru...


In [5]:
import pandas as pd

# Step 1: Melt the wide columns into long format
df_long = pd.melt(
    expressive_hl_df,
    id_vars=['conversation_id', 'turn_id', 'article'],
    value_vars=['individual_1', 'model_response'],
    var_name='speaker',
    value_name='utterance'
)


speaker_order = {'individual_1': 0, 'model_response': 1}
df_long['speaker_order'] = df_long['speaker'].map(speaker_order)
df_long = df_long.sort_values(by=['conversation_id', 'turn_id', 'speaker_order'])


df_long['speaker'] = df_long['speaker'].replace({'model_response': 'model'})


df_long = df_long.drop(columns='speaker_order').reset_index(drop=True)

df_long.head()


Unnamed: 0,conversation_id,turn_id,article,speaker,utterance
0,2,0,"A month after Hurricane Matthew, 800,000 Haiti...",individual_1,I feel very sad for the people.
1,2,0,"A month after Hurricane Matthew, 800,000 Haiti...",model,"I know, it's heartbreaking. The images of the ..."
2,2,2,"A month after Hurricane Matthew, 800,000 Haiti...",individual_1,"Yeah, the whole situation is horrible."
3,2,2,"A month after Hurricane Matthew, 800,000 Haiti...",model,It's absolutely tragic. The way the storm hit ...
4,2,4,"A month after Hurricane Matthew, 800,000 Haiti...",individual_1,"I know we can donate, but it's hard to know ex..."


In [8]:
df_long['turn_id'] = df_long.groupby('conversation_id').cumcount()

df_long.head()

Unnamed: 0,conversation_id,turn_id,article,speaker,utterance
0,2,0,"A month after Hurricane Matthew, 800,000 Haiti...",individual_1,I feel very sad for the people.
1,2,1,"A month after Hurricane Matthew, 800,000 Haiti...",model,"I know, it's heartbreaking. The images of the ..."
2,2,2,"A month after Hurricane Matthew, 800,000 Haiti...",individual_1,"Yeah, the whole situation is horrible."
3,2,3,"A month after Hurricane Matthew, 800,000 Haiti...",model,It's absolutely tragic. The way the storm hit ...
4,2,4,"A month after Hurricane Matthew, 800,000 Haiti...",individual_1,"I know we can donate, but it's hard to know ex..."


In [17]:
df_long

Unnamed: 0,conversation_id,turn_id,article,speaker,utterance
0,2,0,"A month after Hurricane Matthew, 800,000 Haiti...",individual_1,I feel very sad for the people.
1,2,1,"A month after Hurricane Matthew, 800,000 Haiti...",model,"I know, it's heartbreaking. The images of the ..."
2,2,2,"A month after Hurricane Matthew, 800,000 Haiti...",individual_1,"Yeah, the whole situation is horrible."
3,2,3,"A month after Hurricane Matthew, 800,000 Haiti...",model,It's absolutely tragic. The way the storm hit ...
4,2,4,"A month after Hurricane Matthew, 800,000 Haiti...",individual_1,"I know we can donate, but it's hard to know ex..."
...,...,...,...,...,...
8905,500,17,Death of former Putin aide at D.C. hotel is ru...,model,"I understand why you'd think that, given the p..."
8906,500,18,Death of former Putin aide at D.C. hotel is ru...,individual_1,Or he could have slipped on something. Either ...
8907,500,19,Death of former Putin aide at D.C. hotel is ru...,model,"I know, it's a shame that his life ended in su..."
8908,500,20,Death of former Putin aide at D.C. hotel is ru...,individual_1,ok thank you. bye!


In [2]:
hh_df = pd.read_csv('/home/ajha/AP2/data/human-human-ocean.csv')

In [3]:
hh_df

Unnamed: 0,conversation_id,turn_id,text,Extroversion,Neuroticism,Agreeableness,Conscientiousness,Openness
0,2,0,I feel very sad for the people.,-0.317092,0.186396,-0.138068,-0.609187,-0.174868
1,2,1,It's terrible. Not only the people but the ani...,0.125101,0.284468,-0.213354,-0.817233,0.181743
2,2,2,"Yeah, the whole situation is horrible.",-0.142737,0.286374,-0.277611,-0.814114,-0.079439
3,2,3,I really wish there was something i could do.,-0.320769,0.238013,-0.138089,-0.382868,-0.183656
4,2,4,"I know we can donate, but it's hard to know ex...",-0.532262,0.060398,0.366289,0.625097,-0.191630
...,...,...,...,...,...,...,...,...
8771,500,16,Do you think maybe he was force to kill himself,-0.345693,0.122637,-0.187016,-0.777191,-0.108613
8772,500,17,I think somebody killed him,-0.394094,0.135951,-0.205856,-0.821231,-0.125072
8773,500,18,Or he could have slipped on something. Either ...,0.013900,0.252739,-0.116522,-0.589204,0.052598
8774,500,19,I don't know if he was nice or not. It was nic...,0.151929,0.284722,-0.258086,-0.807358,0.118684


In [9]:
import pandas as pd


def combine_turns(df, group_size=4):
    combined_rows = []

    for conversation_id, group in df.groupby('conversation_id'):
        group = group.sort_values('turn_id').reset_index(drop=True)
        num_turns = len(group)

        i = 0
        while i < num_turns:
            # Check if remaining turns are less than group_size
            if (num_turns - i) <= group_size and i != 0:
                # Merge remaining turns with previous group
                prev = combined_rows.pop()

                combined_text = prev['text'] + " " + " ".join(group.iloc[i:]['utterance'])
                combined_turns = prev['combined_turns'] + "," + ",".join(group.iloc[i:]['turn_id'].astype(str))
                
                combined_rows.append({
                    'conversation_id': conversation_id,
                    'combined_turns': combined_turns,
                    'text': combined_text
                })
                break
            else:
                combined_text = " ".join(group.iloc[i:i + group_size]['utterance'])
                combined_turns = ",".join(group.iloc[i:i + group_size]['turn_id'].astype(str))
                
                combined_rows.append({
                    'conversation_id': conversation_id,
                    'combined_turns': combined_turns,
                    'text': combined_text
                })
                i += group_size

    return pd.DataFrame(combined_rows)

combined_expressive_hl_df = combine_turns(df_long)

combined_expressive_hl_df.head()

Unnamed: 0,conversation_id,combined_turns,text
0,2,123,"I feel very sad for the people. I know, it's h..."
1,2,4567,"I know we can donate, but it's hard to know ex..."
2,2,891011,"Sorry to hear that, I'm glad that you are okay..."
3,2,12131415,"Yeah, we never know what we can do unless we a..."
4,2,161718192021,The power of nature is truly scary. It's a sob...


In [10]:
combined_expressive_hl_df.to_csv('/home/ajha/AP2/data/combined_turns_expressive_hl_df.csv', index=False)

In [19]:

combined_hh_df[combined_hh_df['conversation_id']==3]

Unnamed: 0,conversation_id,combined_turns,text
5,3,123,what did you think about the article It was su...
6,3,4567,yeah some countries are so corrupt though true...
7,3,891011,yeah wonder what was going through is head pro...
8,3,12131415,makes sense. also find better ways rather than...
9,3,16171819,thats a great idea hopefully these people get ...
10,3,2021222324,were lucky to live here i agree makes my troub...


In [6]:
combined_hh_df.to_csv('/home/ajha/AP2/data/combined_turns_hh-df.csv', index=False)

In [8]:
hl_df

Unnamed: 0,conversation_id,turn_id,speaker,utterance,article,Extroversion,Neuroticism,Agreeableness,Conscientiousness,Openness
0,2,0,individual_1,I feel very sad for the people.,"A month after Hurricane Matthew, 800,000 Haiti...",-0.317092,0.186396,-0.138068,-0.609187,-0.174868
1,2,1,model,"Me too, it's heartbreaking to see families str...","A month after Hurricane Matthew, 800,000 Haiti...",-0.036390,0.121401,0.225625,0.073282,0.148655
2,2,4,individual_1,"Yeah, the whole situation is horrible.","A month after Hurricane Matthew, 800,000 Haiti...",-0.142737,0.286374,-0.277611,-0.814114,-0.079439
3,2,5,model,And the fact that there's a lack of coordinati...,"A month after Hurricane Matthew, 800,000 Haiti...",-0.025742,0.030563,-0.047665,-0.430450,0.059493
4,2,8,individual_1,"I know we can donate, but it's hard to know ex...","A month after Hurricane Matthew, 800,000 Haiti...",-0.532262,0.060398,0.366289,0.625097,-0.191630
...,...,...,...,...,...,...,...,...,...,...
8905,500,33,model,"It's possible, the article mentions that he wa...",Death of former Putin aide at D.C. hotel is ru...,-0.006357,0.034805,0.017900,-0.243165,0.068544
8906,500,36,individual_1,Or he could have slipped on something. Either ...,Death of former Putin aide at D.C. hotel is ru...,0.013900,0.252739,-0.116522,-0.589204,0.052598
8907,500,37,model,"I know, it's sad. He was a pretty influential ...",Death of former Putin aide at D.C. hotel is ru...,-0.166833,0.092569,0.069657,-0.238407,0.055997
8908,500,40,individual_1,ok thank you. bye!,Death of former Putin aide at D.C. hotel is ru...,0.188732,0.279920,-0.293802,-0.831882,0.121357


In [16]:
hl_df = pd.read_csv('/home/ajha/AP2/data/human-llm-ocean.csv')

combined_hl_df = combine_turns(hl_df)
combined_hl_df.head()

Unnamed: 0,conversation_id,combined_turns,text
0,2,145,"I feel very sad for the people. Me too, it's h..."
1,2,891213,"I know we can donate, but it's hard to know ex..."
2,2,16172021,"Sorry to hear that, I'm glad that you are okay..."
3,2,24252829,"Yeah, we never know what we can do unless we a..."
4,2,323336374041,"The power of nature is truly scary. I know, ri..."


In [18]:
combined_hl_df[combined_hl_df['conversation_id']==3]

Unnamed: 0,conversation_id,combined_turns,text
5,3,145,what did you think about the article I was sho...
6,3,891213,yeah some countries are so corrupt though It's...
7,3,16172021,yeah wonder what was going through is head I k...
8,3,24252829,makes sense. also find better ways rather than...
9,3,32333637,thats a great idea I think it would be a good ...
10,3,404144454849,"were lucky to live here I know, we have so man..."


In [20]:
combined_hl_df.to_csv('/home/ajha/AP2/data/combined_turns_hl-df.csv', index=False)

## checking the average turn length

In [12]:
conv_ids = set(hh_df.conversation_id.values)
turn_length = 0
for conv_id in conv_ids:
    turn_length += len(hh_df[hh_df['conversation_id']==3])

In [13]:
turn_length/len(conv_ids)

25.0

## Adding inferred dataset in the form of tables to DB for running trait inference using DLATK

In [19]:
import pandas as pd
import mysql.connector
import pymysql
from sqlalchemy import create_engine

# Load CSVs
# hh_df = pd.read_csv('/home/ajha/AP2/data/combined_turns_hh-df.csv')
# hl_df = pd.read_csv('/home/ajha/AP2/data/combined_turns_hl-df.csv')




# Step 1: Create the 'conversations_combined_turns' database
conn = pymysql.connect(user='root', unix_socket='/home/ajha/mysql.sock', port=3307)
cursor = conn.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS expressive_conversations;")
conn.commit()
cursor.close()
conn.close()

# Step 2: Connect to 'conversations_combined_turns' DB
engine = create_engine("mysql+pymysql://root@localhost:3307/expressive_conversations?unix_socket=/home/ajha/mysql.sock")

# Step 3: Insert DataFrames as tables
combined_expressive_hl_df.to_sql(name='human_llm', con=engine, index=False, if_exists='replace')
# hl_df.to_sql(name='human_llm', con=engine, index=False, if_exists='replace')


1940