# Package and Format

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk import ngrams, FreqDist
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import re
from collections import Counter
from nltk.tokenize import RegexpTokenizer
import unicodedata



# Data

In [2]:
df = pd.read_csv('/Users/ningyuhan/Desktop/combined_dataframe.csv')

In [3]:
def standardize_text(text):
    if pd.isna(text):
        return text  
    # Remove non-ASCII characters directly
    text = ''.join([char for char in text if ord(char) < 128])
    text = text.strip().lower()  
    # Remove all non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text

df['Standardized_Sentence'] = df['Sentence'].apply(standardize_text)

def standardize_tag(tag):
    if pd.isna(tag):
        return tag
    # Remove all spaces and extra characters
    tag = tag.replace(' ', '')
    # Ensure consistency in formatting
    tag = ','.join(tag.split(','))
    return tag

df['Tag'] = df['Tag'].apply(standardize_tag)

df['Tag'] = df['Tag'].apply(
    lambda x: x.replace('4h,30]', '4h,03]') if '4h,30]' in x else x
)

def correct_typo(tag):
    parts = tag.strip('[]').split(',')
    if len(parts) > 3 and len(parts[3]) == 1:
        parts[3] = '0' + parts[3]  # Add a leading zero if it's a single digit
    return '[' + ','.join(parts) + ']'

df['Tag'] = df['Tag'].apply(correct_typo)

In [4]:
contain_4h = df[df['Tag'].str.contains(',4h,')]
contain_4h

Unnamed: 0,Tag,Sentence,Standardized_Sentence
12,"[c:13,13,4h,01]",Dodson got mad at Ralph,dodson got mad at ralph
49,"[c:50,02,4h,09]","Everybody talked about the ""armed camp"" atmosp...",everybody talked about the armed camp atmosphe...
52,"[c:03,01,4h,01]","Swisher argues that if one wants ain't it, the...",swisher argues that if one wants aint it then ...
68,"[c:19,04,4h,05]",John Lambert thinks this is an unlikely problem.,john lambert thinks this is an unlikely problem
70,"[c:21,04,4h,09]",The essence of John's remarks are that there a...,the essence of johns remarks are that there ar...
...,...,...,...
4828,"[c:80,25,4h,06]",Bob Fulton and Lyle fight over this,bob fulton and lyle fight over this
4829,"[c:81,25,4h,06]",and Lyle is challenging the workers to give hi...,and lyle is challenging the workers to give hi...
4833,"[c:85,25,4h,06]",Lyle gets mad at Tove for trying to stop him f...,lyle gets mad at tove for trying to stop him f...
4835,"[c:87,12,4h,06]",Tove gets angry at Lyle and turns on him and s...,tove gets angry at lyle and turns on him and s...


In [5]:
# Function to extract the third component from the tag
def extract_third_component(tag):
    components = tag.strip('[]').split(',')
    return components[2] if len(components) > 2 else None

# Extracting the third component for each row
df['Third_Component'] = df['Tag'].apply(extract_third_component)

# Identifying rows with '4h'
df['Is_4h'] = df['Third_Component'] == '4h'

# Counting consecutive '4h'
count_consecutive_4h = (df['Is_4h'] != df['Is_4h'].shift()).cumsum()
df['Consecutive_4h'] = df['Is_4h'].groupby(count_consecutive_4h).cumsum()

# Counting instances of single, double, triple, quadruple and more consecutive '4h'
counts = df['Consecutive_4h'].value_counts().sort_index()
count_single = counts.get(1, 0)
count_double = counts.get(2, 0)
count_triple = counts.get(3, 0)
count_quadruple = counts.get(4, 0)
count_more = sum([count for k, count in counts.items() if k > 4])

count_single, count_double, count_triple, count_quadruple, count_more

(270, 42, 11, 3, 2)

# Comparisons between 4h and other verbal contribution categories

In [6]:
verbal_contribution_codes = {
    '1': 'Initiation activity',
    '1a': 'Motion making',
    '1b': 'Information giving',
    '1c': 'Information seeking',
    '1d': 'Making specific suggestions for action',
    '2': 'Support behavior',
    '2a': 'Motion seconding',
    '2b': 'Making statements in support of another person\'s argument',
    '3': 'System maintenance',
    '3a': 'Tension management',
    '3b': 'Direction of traffic',
    '3c': 'Collective spirit and solidarity moves',
    '4': 'Board discussion, debate, argumentation',
    '4a': 'Personal defensiveness',
    '4b': 'Personal gains',
    '4c': 'Agreeing reluctantly',
    '4d': 'Sensible,nonpersonal arguments',
    '4e1': 'Management',
    '4e2': 'Union',
    '4e3': 'People (workers) as distinct from union or the union leadership',
    '4f': 'Attempts to propose new board topics',
    '4g': 'Corporate interests',
    '4h': 'Disagreements, conflicts, attacks',
    '4i': 'Stonewalling',
    '5': 'Unclassified verbal behaviors',
}

In [7]:
df['Tag_Component'] = df['Tag'].apply(lambda x: x.split(',')[2] if len(x.split(',')) > 2 else None)

# Set the focus category here
focus_category = '4h'
df['Is_Focus_Category'] = df['Tag_Component'] == focus_category

def get_relevant_rows(df):
    relevant_rows = []
    skip_flag = False  

    for i in range(len(df)):
        if df.iloc[i]['Is_Focus_Category']:
            if skip_flag:
                continue
            skip_flag = True  

            for j in range(1, 4):
                if i-j < 0:
                    break  
                if not df.iloc[i-j]['Is_Focus_Category']:
                    relevant_rows.append(df.iloc[i-j])

            relevant_rows.append(df.iloc[i])

        else:
            skip_flag = False  

    return pd.DataFrame(relevant_rows).sort_index()

df_final = get_relevant_rows(df)

# Analyze sequences leading to the focus category
sequences_leading_to_focus = []

for index in range(len(df_final)):
    if df_final.iloc[index]['Is_Focus_Category']:
        sequence = []
        for j in range(1, 4):
            if index-j < 0:
                break  
            if not df_final.iloc[index-j]['Is_Focus_Category']:
                sequence.append(df_final.iloc[index-j]['Tag_Component'])
        sequences_leading_to_focus.append(tuple(reversed(sequence)))

# Count the frequency of each sequence
sequence_counts = Counter(sequences_leading_to_focus)
top10_sequences = sequence_counts.most_common(10)

translated_top10_sequences = []
for sequence, count in top10_sequences:
    translated_sequence = [verbal_contribution_codes.get(code, "Unknown") for code in sequence]
    translated_top10_sequences.append((translated_sequence, count))

formatted_results = []
for sequence, count in translated_top10_sequences:
    sequence_str = ' -> '.join(sequence)
    sentence = f"[{sequence_str}] leads to {focus_category} {count} times."
    formatted_results.append(sentence)

formatted_results

['[Information giving -> Information giving -> Sensible,nonpersonal arguments] leads to 4h 5 times.',
 "[Making statements in support of another person's argument -> Sensible,nonpersonal arguments -> Sensible,nonpersonal arguments] leads to 4h 4 times.",
 '[Sensible,nonpersonal arguments -> Information seeking -> Sensible,nonpersonal arguments] leads to 4h 4 times.',
 '[Sensible,nonpersonal arguments -> Sensible,nonpersonal arguments] leads to 4h 4 times.',
 '[Personal defensiveness -> Personal defensiveness] leads to 4h 4 times.',
 '[Sensible,nonpersonal arguments -> Sensible,nonpersonal arguments -> Sensible,nonpersonal arguments] leads to 4h 4 times.',
 '[Sensible,nonpersonal arguments -> Information seeking -> Information giving] leads to 4h 4 times.',
 '[Information seeking -> Information giving -> Information giving] leads to 4h 4 times.',
 '[Information giving -> Sensible,nonpersonal arguments] leads to 4h 3 times.',
 '[Information giving -> Sensible,nonpersonal arguments -> Inf

In [8]:
count_4h = df_final['Tag_Component'].value_counts().get(focus_category, 0)
count_4h

270

# Comparison between topics

In [9]:
topic_codes = {
    '01': 'Routine Board Functions',
    '02': 'Scope of Board Issues',
    '03': 'Financial Status of Firm',
    '04': 'Management Personnel Issues',
    '05': 'Marketing and Sales',
    '06': 'Employee Stock Ownership Plan (ESOP) Financial',
    '07': 'ESOP Participation',
    '08': 'Employee Benefits',
    '09': 'Plant Production and Manufacturing Process Issues',
    '10': 'The Union and Contract Issues',
    '11': 'Supervision',
    '12': 'hog procurement',
    '13': 'Political Relations with the Community'
}

In [10]:
# Extract the fourth component (Topic) from the 'Tag' field
df['Topic_Component'] = df['Tag'].apply(
    lambda x: x.split(',')[3].strip(']').strip() if len(x.split(',')) > 3 else None
)

# Apply the get_relevant_rows function to get rows relevant to the '4h' focus category
df_final_topic = get_relevant_rows(df)

# Filter the DataFrame for rows where 'Is_Focus_Category' is True
df_focus_true = df_final_topic[df_final_topic['Is_Focus_Category']]

# Count the occurrences of each unique 'Topic_Component' in these rows
topic_component_counts = df_focus_true['Topic_Component'].value_counts()

# Translate the topic components using the topic_codes dictionary
translated_topic_counts = {topic_codes.get(key, "Unknown"): value for key, value in topic_component_counts.items()}

# Display the translated counts
translated_topic_counts

# Calculate the total count for percentage calculation
total_count = topic_component_counts.sum()

# Translate the topic components and calculate the percentage
translated_topic_counts = {
  topic_codes.get(key, "Unknown"): {
    'Count': value, 
    'Percentage': (value / total_count) * 100
  }
  for key, value in topic_component_counts.items() 
}

for topic, data in translated_topic_counts.items():
  percentage = round(data['Percentage'], 2)
  data['Percentage'] = f"{percentage}%"

for topic, data in translated_topic_counts.items():
  print(f"{topic}: {data['Count']} times ({data['Percentage']})")

Management Personnel Issues: 55 times (20.37%)
Financial Status of Firm: 49 times (18.15%)
The Union and Contract Issues: 40 times (14.81%)
Marketing and Sales: 21 times (7.78%)
Routine Board Functions: 20 times (7.41%)
Plant Production and Manufacturing Process Issues: 20 times (7.41%)
ESOP Participation: 20 times (7.41%)
Employee Benefits: 16 times (5.93%)
Employee Stock Ownership Plan (ESOP) Financial: 14 times (5.19%)
Supervision: 8 times (2.96%)
hog procurement: 4 times (1.48%)
Political Relations with the Community: 2 times (0.74%)
Scope of Board Issues: 1 times (0.37%)


In [11]:
sequences_leading_to_focus_topic = []

for index in range(len(df_final_topic)):
    if df_final_topic.iloc[index]['Is_Focus_Category']:
        topic_sequence = []
        for j in range(1, 4):
            if index-j < 0:
                break
            if not df_final_topic.iloc[index-j]['Is_Focus_Category']:
                topic_sequence.append(df_final_topic.iloc[index-j]['Topic_Component'])
        sequences_leading_to_focus_topic.append(tuple(reversed(topic_sequence)))
        
sequence_counts = Counter(sequences_leading_to_focus_topic)
top10_sequences = sequence_counts.most_common(10)

# Consolidate duplicates
consolidated_counts = {}
for seq, count in top10_sequences:
  consolidated_seq = tuple(set(seq))
  if consolidated_seq not in consolidated_counts:
    consolidated_counts[consolidated_seq] = 0
  consolidated_counts[consolidated_seq] += count
  
# Translate codes
translated_counts = {}
for seq, count in consolidated_counts.items():
  translated_seq = tuple(topic_codes.get(code, 'Unknown') for code in seq)
  if translated_seq not in translated_counts:
    translated_counts[translated_seq] = 0
  translated_counts[translated_seq] += count
  
# Get total count
total_count = sum(count for seq, count in top10_sequences)  

# Print results with percentage
print("Top 10 Sequences Leading to Focus Topic (Translated):")
for translated_seq, count in translated_counts.items():
  percentage = round(count/total_count * 100, 2)
  print(f"{translated_seq}: {count} times ({percentage}%)")

Top 10 Sequences Leading to Focus Topic (Translated):
('Financial Status of Firm',): 39 times (27.66%)
('Management Personnel Issues',): 35 times (24.82%)
('The Union and Contract Issues',): 24 times (17.02%)
('Marketing and Sales',): 9 times (6.38%)
('Plant Production and Manufacturing Process Issues',): 9 times (6.38%)
('Employee Benefits',): 9 times (6.38%)
('ESOP Participation',): 8 times (5.67%)
('Routine Board Functions',): 8 times (5.67%)


In [12]:
total_count = sum(sequence_counts.values())
print(total_count)

270


In [13]:
# Reapplying the get_relevant_rows function with index reset
df_final_topic = df_final_topic.reset_index(drop=True)

# Extracting '4h' rows and their preceding rows
focus_rows = df_final_topic[df_final_topic['Is_Focus_Category']]

# Finding the indices of the '4h' rows
focus_indices = focus_rows.index

# Creating a list to store instances where the topic is different
different_topic_instances = []

# Iterating through the focus indices to compare topics
for index in focus_indices:
    if index > 0 and df_final_topic.loc[index - 1, 'Topic_Component'] != df_final_topic.loc[index, 'Topic_Component']:
        different_topic_instances.append({
            'Preceding_Sentence': df_final_topic.loc[index - 1, 'Standardized_Sentence'],
            'Preceding_Topic': df_final_topic.loc[index - 1, 'Topic_Component'],
            'Current_Sentence': df_final_topic.loc[index, 'Standardized_Sentence'],
            'Current_Topic': df_final_topic.loc[index, 'Topic_Component']
        })

# Count of different topic instances
count_different_topics = len(different_topic_instances)
count_different_topics

55

In [14]:
topic_components = [
    {
        'Preceding_Topic': entry['Preceding_Topic'],
        'Current_Topic': entry['Current_Topic']
    }
    for entry in different_topic_instances
]

topic_components

[{'Preceding_Topic': '03', 'Current_Topic': '09'},
 {'Preceding_Topic': '09', 'Current_Topic': '05'},
 {'Preceding_Topic': '09', 'Current_Topic': '03'},
 {'Preceding_Topic': '02', 'Current_Topic': '06'},
 {'Preceding_Topic': '02', 'Current_Topic': '08'},
 {'Preceding_Topic': '01', 'Current_Topic': '07'},
 {'Preceding_Topic': '11', 'Current_Topic': '10'},
 {'Preceding_Topic': '07', 'Current_Topic': '09'},
 {'Preceding_Topic': '11', 'Current_Topic': '08'},
 {'Preceding_Topic': '03', 'Current_Topic': '05'},
 {'Preceding_Topic': '05', 'Current_Topic': '03'},
 {'Preceding_Topic': '05', 'Current_Topic': '09'},
 {'Preceding_Topic': '05', 'Current_Topic': '04'},
 {'Preceding_Topic': '06', 'Current_Topic': '01'},
 {'Preceding_Topic': '10', 'Current_Topic': '01'},
 {'Preceding_Topic': '09', 'Current_Topic': '12'},
 {'Preceding_Topic': '03', 'Current_Topic': '08'},
 {'Preceding_Topic': '07', 'Current_Topic': '11'},
 {'Preceding_Topic': '07', 'Current_Topic': '11'},
 {'Preceding_Topic': '10', 'Cur

# Only one sentence preceding 4h and the topic is management personal issues

In [15]:
df['Topic_Category'] = df['Tag'].apply(
    lambda x: x.split(',')[3].replace(']', '') if len(x.split(',')) > 3 else None
)
df['Tag_Component'] = df['Tag'].apply(
    lambda x: x.split(',')[2] if len(x.split(',')) > 2 else None
)

relevant_instances = []

for i in range(1, len(df) - 1):
    prev_row = df.iloc[i - 1]
    current_row = df.iloc[i]
    next_row = df.iloc[i + 1]

    if prev_row['Tag_Component'] == '4h' and current_row['Topic_Category'] == '04' and next_row['Tag_Component'] == '4h' and current_row['Tag_Component'] != '4h':
        relevant_instances.append((prev_row['Tag'], current_row['Tag'], next_row['Tag'],
                                      prev_row['Standardized_Sentence'], current_row['Standardized_Sentence'], next_row['Standardized_Sentence']))

columns = ['Prev_Tag', 'Current_Tag', 'Next_Tag', 'Prev_Sentence', 'Current_Sentence', 'Next_Sentence']
df_relevant = pd.DataFrame(relevant_instances, columns=columns)

df_relevant

Unnamed: 0,Prev_Tag,Current_Tag,Next_Tag,Prev_Sentence,Current_Sentence,Next_Sentence
0,"[c:115,08,4h,04]","[c:116,03,4a,04]","[c:117,11,4h,04]",frye said herb when youre not here anymore,and herb said when shouldnt i be here i dont p...,ralph says well you might die i looked at swis...
1,"[c:208,03,4h,04]","[c:209,31,1b,04]","[c:210,25,4h,04]",herb says well theres nothing like taking chances,mr peterson said that mr tamborello expressed ...,mr taylor said that he did not get the same re...
2,"[c:120,01,4h,04]","[c:121,01,1c,04]","[c:122,12,4h,04]",swisher gets mad at tove and,says that he wants to know why i think that,i explain that wally should have gotten out he...
3,"[c:127,14,4h,04]","[c:128,20,4a,04]","[c:129,14,4h,04]",he looks at bass and he says now why would you...,bass speaks back and says i just want to look ...,sid says what do you want to look at them for ...


In [16]:
df['Topic_Category'] = df['Tag'].apply(
    lambda x: x.split(',')[3].replace(']', '') if len(x.split(',')) > 3 else None
)
df['Tag_Component'] = df['Tag'].apply(
    lambda x: x.split(',')[2] if len(x.split(',')) > 2 else None
)


relevant_instances = []

for i in range(1, len(df)):
    prev_row = df.iloc[i - 1]
    current_row = df.iloc[i]

    if current_row['Tag_Component'] == '4h' and prev_row['Topic_Category'] == '05':
        relevant_instances.append((prev_row['Tag'], current_row['Tag'],
                                  prev_row['Standardized_Sentence'], current_row['Standardized_Sentence']))

columns = ['Prev_Tag', 'Current_Tag', 'Prev_Sentence', 'Current_Sentence']
df_relevant = pd.DataFrame(relevant_instances, columns=columns)

df_relevant

Unnamed: 0,Prev_Tag,Current_Tag,Prev_Sentence,Current_Sentence
0,"[c:18,12,4d,05]","[c:19,04,4h,05]",tove then suggests that morrell might be sitti...,john lambert thinks this is an unlikely problem
1,"[c:40,12,1b,05]","[c:41,02,4h,05]",tove to mac mac why dont you say something abo...,mac we dont need more processed meat this is a...
2,"[c:41,02,4h,05]","[c:42,04,4h,05]",mac we dont need more processed meat this is a...,at that point john lambert just sighs but nods...
3,"[c:43,01,3a,05]","[c:44,02,4h,05]",at this point swisher interrupts by telling jo...,mac fights back by saying that people are buyi...
4,"[c:33,04,4d,05]","[c:34,11,4h,05]",john their pricing,ralph i should say it is their pricing
5,"[c:18,08,4h,05]","[c:19,11,4h,05]",fry is getting aggressive towards john and att...,ralph attacks the diffused language in the fin...
6,"[c:41,16,05,05]","[c:42,02,4h,03]",walt cunningham is also saying something,john it is as if the workers feel that someone...
7,"[c:61,04,4a,05]","[c:62,20,4h,09]",john says we are putting them in an impossible...,glen bass takes an active role in this labor c...
8,"[c:33,04,4b,05]","[c:34,11,4h,05]",then he says as an aside that all these charts...,ralph asks john for the exact meaning of all t...
9,"[c:35,04,1b,05]","[c:36,02,4h,05]",he is now being dragged through the same quest...,mac looks up and smiles and says this is all a...


# Probability Analysis

In [17]:
# If you have a topic 04, what is the probability that the next sentence is 4h !!!! 
# VS
# If the current sentence is 4h, what the probability that the preceding sentence is 04 

In [18]:
df = df.copy()

# Split tags
split_tags = df['Tag'].str.strip('[]').str.split(',', expand=True)

# Extract components
df['Verbal'] = split_tags[2]  
df['Topic'] = split_tags[3]

# Calculate probabilities
topic_04 = df[df['Topic'] == '04']
next_4h_count = (topic_04['Verbal'].shift(-1) == '4h').sum()
total_topic_04 = len(topic_04)
prob_next_4h_given_04 = next_4h_count / total_topic_04 if total_topic_04 > 0 else 0

verbal_4h = df[df['Verbal'] == '4h']
prev_04_count = (verbal_4h['Topic'].shift(1) == '04').sum() 
total_verbal_4h = len(verbal_4h)
prob_prev_04_given_4h = prev_04_count / total_verbal_4h if total_verbal_4h > 0 else 0

# Convert to percentage and round
pct_next_4h_given_04 = round(prob_next_4h_given_04 * 100, 2)  
pct_prev_04_given_4h = round(prob_prev_04_given_4h * 100, 2)

print(f"Probability next sentence is 4h given topic is 04: {pct_next_4h_given_04}%")
print(f"Probability previous sentence is 04 given sentence is 4h: {pct_prev_04_given_4h}%")

Probability next sentence is 4h given topic is 04: 11.32%
Probability previous sentence is 04 given sentence is 4h: 19.51%


# Topic Transfer

In [19]:
df['Tag_Component'] = df['Tag'].apply(
    lambda x: x.split(',')[2] if len(x.split(',')) > 2 else None
)

cluster_id = 0
current_cluster_size = 0
cluster_sentences = []

for i, row in df.iterrows():
    if row['Tag_Component'] == '4h':
        if current_cluster_size == 0:
            # Start a new cluster
            cluster_id += 1
        current_cluster_size += 1
        cluster_sentences.append((cluster_id, row['Tag'], row['Standardized_Sentence']))
    else:
        if current_cluster_size > 1:
            # Reset cluster size if we just finished a cluster of more than one '4h'
            current_cluster_size = 0
        elif current_cluster_size == 1:
            # Remove last sentence if it was a standalone '4h'
            if cluster_sentences and cluster_sentences[-1][0] == cluster_id:
                cluster_sentences.pop()
            current_cluster_size = 0

# Create a new DataFrame from the cluster sentences
df_clusters = pd.DataFrame(cluster_sentences, columns=['Cluster_ID', 'Tag', 'Standardized_Sentence'])

# Filter out clusters with only one sentence
df_clusters = df_clusters[df_clusters['Cluster_ID'].duplicated(keep=False)]

display(df_clusters)

print("====" * 20)

df_clusters['Topic_Category'] = df_clusters['Tag'].apply(
    lambda x: x.split(',')[3].replace(']', '') if len(x.split(',')) > 3 else None
)

# Check if the 'Topic_Category' is consistent within each cluster
inconsistent_clusters = []

# Group by 'Cluster_ID' and check each cluster
for cluster_id, group in df_clusters.groupby('Cluster_ID'):
    if group['Topic_Category'].nunique() > 1:
        # If there is more than one unique topic in the cluster, it's inconsistent
        inconsistent_clusters.append(group)

# Combine all inconsistent clusters into a single DataFrame
df_inconsistent_clusters = pd.concat(inconsistent_clusters) if inconsistent_clusters else pd.DataFrame()

df_inconsistent_clusters['Translated_Category'] = df_inconsistent_clusters['Topic_Category'].map(topic_codes)

df_inconsistent_clusters


Unnamed: 0,Cluster_ID,Tag,Standardized_Sentence
0,7,"[c:35,09,4h,03]",jack thomas sums it up and says it sounds like...
1,7,"[c:36,02,4h,03]",mac indicates disagreement
2,7,"[c:37,04,4h,03]",john indicates disagreement
3,8,"[c:41,02,4h,05]",mac we dont need more processed meat this is a...
4,8,"[c:42,04,4h,05]",at that point john lambert just sighs but nods...
...,...,...,...
95,265,"[c:21,22,4h,03]",laverne is putting in some objections
96,265,"[c:22,25,4h,03]",and lyle says he will sue next time somebody o...
97,267,"[c:79,15,4h,06]",bob fulton and lyle fight over this
98,267,"[c:80,25,4h,06]",bob fulton and lyle fight over this




Unnamed: 0,Cluster_ID,Tag,Standardized_Sentence,Topic_Category,Translated_Category
7,28,"[c:83,01,4h,10]",swisher says lets not get into an argument ove...,10,The Union and Contract Issues
8,28,"[c:84,26,4h,11]",at this point chuck mueller gets agitated and ...,11,Supervision
20,68,"[c:67,03,4h,11]",herb says at that point that the foremen are a...,11,Supervision
21,68,"[c:68,12,4h,11]",i say that i cant see any reason why the forem...,11,Supervision
22,68,"[c:69,11,4h,07]",ralph gets at herb and wants to force him to s...,7,ESOP Participation
46,131,"[c:106,25,4h,04]",you dont go into the bargaining table with fig...,4,Management Personnel Issues
47,131,"[c:107,01,4h,10]",swisher then reminds lyle that louis anderson ...,10,The Union and Contract Issues
57,163,"[c:176,14,4h,10]",sid oberman jumps in and says the ham boning l...,10,The Union and Contract Issues
58,163,"[c:177,11,4h,04]",ralph says we have lost over 15 million dollar...,4,Management Personnel Issues
59,164,"[c:185,08,4h,09]",art says to lyle hey youre suppose to fix thes...,9,Plant Production and Manufacturing Process Issues


In [20]:
df_inconsistent_clusters['Cluster_ID'].unique() # 6

array([ 28,  68, 131, 163, 164, 194])

In [21]:
df_clusters['Cluster_ID'].unique()    # total 40, 15% of 4h clusters had topic transfer

array([  7,   8,  19,  28,  33,  35,  44,  46,  58,  68,  73,  80,  91,
        94,  95, 112, 122, 126, 131, 141, 145, 152, 160, 163, 164, 177,
       179, 183, 193, 194, 203, 216, 220, 222, 234, 240, 249, 253, 254,
       260, 265, 267])

# Work with AI

## possible prompts: <br>

The following transcripts including the sentences in the board meetings across serveral months. You will see within the brackets, the third component code 4h represents cases of disaggrement, conflict, and attack. Look at 3 sentences that takes place prior to each 4h case. You should look only at the those 3 sentences, what the moods or sentiments are? what we mean by mood or sentiment is the tone of the emotions and the feeling of tension or comfort. produce an output that includes a copy of the sentences and also followed by your description of the moods or sentiments. Describe your reasoning before giving your answer.

============ <br>
**Is there a more common pattern across all transcripts? Not case by case.** <br>
**what are the 3 most typical examples patterns among all 4h cases.** <br>
**find out 3 most common patterns** <br>
===========

## result

Here are 10 examples with expanded analysis of the mood/sentiment:

1. Sentences: 
"[c:13,02,4i,03]",mac clams up completely and gets red <br>
"[c:14,11,4h,03]",ralph answers irritably that we cannot only go into one problem we have to attack all problems at once <br>
"[c:15,11,1b,10]",he then begins to talk about his sub committees report<br>

Analysis: Mac's lack of response and obvious discomfort creates an awkward and tense mood. Ralph becomes irritated with the lack of substantive answers and harshly states that many problems need to be addressed, not just one. His curt response heightens the strained atmosphere. Overall, Mac's avoidance escalates frustration and leads to Ralph's overt impatience. 

2. Sentences:
"[c:10,02,4i,10]",he does not answer directly<br>
"[c:11,11,4h,10]",ralph presses him for a more substantial response by standing up and saying he refuses to believe the managers have behaved responsibly and that someone should be fired<br>
"[c:12,03,4i,10]",herb chooses not to respond <br>

Analysis: The mood grows increasingly tense as Mac avoids answering directly. Ralph becomes exasperated with the vague responses and bluntly demands accountability and change, even rising from his seat to emphasize his point. However, Herb declines to engage further, deflecting the confrontation. The lack of direct answers leaves the matter unresolved, sustaining frustration.

3. Sentences:
"[c:19,04,4h,05]",john lambert says that that would be an excellent sales point  if people could learn that at rath the quality is really high because employees made their own judgements about when productivity should continue to insure high quality could be used very effectively in advertising<br>
"[c:20,04,4b,05]",john then goes on to talk about 3rd quarter planning sales meetings and recent developments including the promotions involving five leading international union presidents and the labor press <br> 
"[c:21,04,1b,05]",theres a new sales entity which sets up a separate unit  degroot with 5 people and a phone capacity to do carlot business this is warehouse solicitation hpi cisco foods<br>

Analysis: Despite Glen's valid concern about quality issues, John ignores it and shifts focus to highlighting positives like promotional campaigns. The rapid change of subject creates an uneasy mood, as the core problem is avoided. John seems to want to distract from criticism by emphasizing successes, but the underlying tension persists.

4. Sentences:
"[c:10,09,4h,12]",jack doesnt see the reason why we are not buying hogs when they are there<br>
"[c:11,04,1b,12]",john explains why there is an underlying criticism here of the buying operation<br>
"[c:12,03,4a,12]",herb picks this up and responds to that  says that he has talked to pihl and mac and they dont feel that the outlying buying stations is a problem<br>

Analysis: Jack bluntly questions their purchasing decisions, creating immediate tension. John tries to provide more context to justify their approach. However, Herb gives a vague, broad response that fails to directly address the criticism. This likely further frustrates Jack and sustains the strained mood.

5. Sentences:
"[c:08,25,4h,03]",lyle says that people dont trust him anymore sliced bacon has deteriorated while lyle worked with the cutandkill and the superintendents are quitting because mac stopped sending info over to lyle at the union hall so lyle could use it<br>
"[c:09,25,1b,07]",another thing that has happened is that the leadership in the action teams the informal leaders have broken down stewards and foremen do not sit down with one another without screaming<br>
"[c:10,25,4d,07]",there is no talking between people now there has to be more clear communication<br>

Analysis: Lyle vents frustration over the breakdown in collaboration and trust, setting an anxious tone. Descriptions of conflicts persisting create further unease. The demand for improved communication highlights unresolved issues and leaves uncertainty hanging.

6. Sentences:  
"[c:11,20,4h,09]",glen also says that when they send out these new products to the new business connections those products have to be the best they can possibly make and they cannot afford to screw that up<br>
"[c:12,04,4c,05]",john agrees that that would be an excellent sales point  if people could learn that at rath the quality is really high because employees made their own judgements about when productivity should continue to insure high quality could be used very effectively in advertising<br>
"[c:13,04,1b,05]",he and mr lambert commented on plans for bumper stickers displays and coupons<br>

Analysis: Glen raises an important concern about quality control, creating unease. However, John agrees in a superficial way then immediately shifts focus to unrelated marketing plans. The rapid topic change seems to dismiss Glen's substantive point, leaving the concern unresolved. 

7. Sentences:
"[c:21,12,4h,07]",tove at that point asks a question to herb its a question about the information  trying to find out what kind of information was coming to the group and what kind of information was not coming to the group and who stopped it  <br>
"[c:22,03,4i,07]",herb says that lyle and ivan piel will work out the specific goals and communicate that to the department<br>
"[c:23,03,4g,10]",he says he needs the full support of the union especially lyle taylor doing his best to get people to try and get along<br>

Analysis: Tova tries to pinpoint issues around information sharing, but Herb gives a vague response about setting goals rather than addressing the communication breakdown. This likely further frustrates Tova. Herb then shifts to discussing needing union support, rather than acknowledging the concerns.

8. Sentences:
"[c:59,08,4h,03]",frye makes general comments and says that we have a problem and at this point there is general laughter to the fact that frye says we have a problem it is well understood that we have a problem indeed<br>
"[c:60,02,4h,03]",mac indicates disagreement<br>
"[c:61,04,4h,03]",john indicates disagreement<br>

Analysis: Frye states the obvious about their troubles, leading to uncomfortable laughter. However, both Mac and John seem defensive and hurriedly indicate disagreement, escalating tension rather than diffusing it. Their sensitivity to criticism creates an uneasy mood.

9. Sentences:
"[c:23,08,4i,07]",look you have 3 workers on the board why cant they tell you  <br>
"[c:24,11,4h,09]",mac you said that last months problems were due to labor problems that some of those problems were due to poor maintenance problems and lack of spare parts how do you explain that<br>
"[c:25,02,4a,09]",mac answers can i make one more observation last week we had a 1 12 hour meeting on improvement in production<br>

Analysis: Art dismissively questions why more communication is needed, setting a impatient tone. Ralph harshly presses Mac for clarity on past problems, putting him on the defensive. Mac tries to redirect to recent positive steps, but the tension persists from the unresolved issues.

10. Sentences:
"[c:09,11,4h,05]",ralph what do you mean by hope<br>
"[c:10,04,4h,05]",he decided to take the turn and interrupts mac by asking if he can take 15 minutes now to talk about the abattoir problem  as he has industry wide data<br>
"[c:11,04,1b,05]",john then launches into his talk  <br>

Analysis: Ralph bluntly questions John's vague statement, creating tension. John hurriedly interrupts and changes subject before providing clarification. His attempt to redirect comes across as an avoidance tactic, leaving Ralph's concern unaddressed. This unease follows into the next topic.

--------------------

## another prompt

The following transcripts including the sentences in the board meetings across serveral months. You will see within the brackets, the third component represents verbal contribution categories. A code '4h' represents cases of disagreement, conflict, and attack. Look at the conversation that takes place before each 4h case. what is the most common pattern of discussion immediately prior to a 4h? When we talked about patterns of discussion immediately prior to a 4h, can you use the verbal contribution categories of those preceding sentences? You should get the pattern from not only one preceding sentence.

## result

Based on reviewing the transcripts, the most common pattern of discussion immediately prior to a "4h" (disagreement/conflict/attack) seems to be:

1. Someone makes a statement or raises a question (often coded as "1b", "1c", "1d", or "4d").

2. There is some back and forth discussion with multiple people chiming in (coded as "05"). 

3. The disagreement/"4h" then emerges, often in response to or building on the initial statement/question and the subsequent discussion.

Some examples:

- "[c:11,02,1b,03]", Mac gives an explanation 
- "[c:12,08,05,03]", Discussion between Mac, John, and Fry  
- "[c:13,11,4h,03]", Ralph still wants to know why the plant is losing money

- "[c:09,04,1b,05]", John explains the retailer adjustments process
- "[c:10,08,05,05]", Fry and John discuss 
- "[c:11,11,4h,05]", Ralph attacks the diffuse language in the financial statements

- "[c:69,03,1b,10]", Herb reports on meetings with workers
- "[c:70,11,05,10]", Ralph and Dick Clark ask about preparation time for meetings
- "[c:71,11,4h,10]", Ralph argues more time is needed

So in summary, the pattern tends to be: statement/question -> group discussion -> disagreement/conflict emerging from that discussion. The categories leading up to the 4h are often 1b, 1c, 1d, 4d, 05.

In [22]:
# is those patterns specific to 4hs? How about non-4h cases? What will be the patterns?

# Four Major Tasks

## Task 1

- Frequency of people who make 4h comment
- Frequency of people who just precede (the 1 prior statement) a 4h comment 
- Interaction: Who speaks about what topics?

### Data

In [23]:
df['Tag_Component'] = df['Tag'].apply(lambda x: x.split(',')[2] if len(x.split(',')) > 2 else None)

# Set the focus category here
focus_category = '4h'
df['Is_Focus_Category'] = df['Tag_Component'] == focus_category

def get_relevant_rows(df):
    relevant_rows = []
    skip_flag = False  

    for i in range(len(df)):
        if df.iloc[i]['Is_Focus_Category']:
            if skip_flag:
                continue
            skip_flag = True  

            for j in range(1, 4):
                if i-j < 0:
                    break  
                if not df.iloc[i-j]['Is_Focus_Category']:
                    relevant_rows.append(df.iloc[i-j])

            relevant_rows.append(df.iloc[i])

        else:
            skip_flag = False  

    return pd.DataFrame(relevant_rows).sort_index()

df_final = get_relevant_rows(df)

### People who speak 4h

In [24]:
data = {
    'Name': ['Chuck Swisher', 'Emmet “Mac” MacGuire', 'Herb Epstein', 'John Lambert', 'Wally Rath, regional sales managers, & Robert Cray & John DeGroat', 'Bob Soleday, Mowry, Bloomfield,(beginning Jan. 1985)', 'Ivan Pihl', 'Art Frye', 'Jack Thomas', 'Harold Rath', 'Ralph Helstein', 'Tove Hammer', 'Len Dodson', 'Sid Oberman', 'Bob Fulton', 'Walter Cunningham', 'Dick Clarke', 'Phyllis Walters', 'Earl Murray', 'Glen Bass', 'Clark Towne', 'LaVerne Patrie', 'Peter Bruskern', 'Bob Kavangh', 'Lyle Taylor', 'Chuck Mueller', 'Gene Redmond & other union officials', 'Jim Miller', 'Ron Peterson, William Scogland, Charles McCarthy, Tom Mandler, Wes Hall, Ravel', 'Berthold', 'Rudnick', 'Gerjerts, Bill Wait', 'Jack Curtis', 'Bruce Wilson', 'John Stevens, Greg Kohn, Lewis Rudel', 'Potential Business deal people', 'ESOP consultants (Chris Meek & Warner Woodworth, W.F. Whyte)', 'ESOP trustees, Larry Wrede, Jim Anderson, Cox', 'Wayne Wright', 'Insurance people & other consultants', 'Leroy Grittman, Tobias, Mary Frost', 'Swisher & Cohrt, Law firm (Steve Weidner)'],
    'Id Number': ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44']
}

person_df = pd.DataFrame(data)
person_df

Unnamed: 0,Name,Id Number
0,Chuck Swisher,1
1,Emmet “Mac” MacGuire,2
2,Herb Epstein,3
3,John Lambert,4
4,"Wally Rath, regional sales managers, & Robert ...",5
5,"Bob Soleday, Mowry, Bloomfield,(beginning Jan....",6
6,Ivan Pihl,7
7,Art Frye,8
8,Jack Thomas,9
9,Harold Rath,10


In [25]:
# Extract the second component (Topic) from the 'Tag' field
df['Tag_Component'] = df['Tag'].apply(lambda x: x.split(',')[1].strip('[').strip() if len(x.split(',')) > 1 else None)

# Apply the get_relevant_rows function to get rows relevant to the '4h' focus category
df_final_person = get_relevant_rows(df)

# Filter the DataFrame for rows where 'Is_Focus_Category' is True
df_focus_true = df_final_person[df_final_person['Is_Focus_Category']]

# Count the occurrences of each unique 'Topic_Component' in these rows
person_component_counts = df_focus_true['Tag_Component'].value_counts()

# Calculate the total count for percentage calculation
total_count = person_component_counts.sum()

# Calculate the percentage
person_component_counts_percentage = {
  key: {
    'Count': value, 
    'Percentage': (value / total_count) * 100
  }
  for key, value in person_component_counts.items() 
}

for person, data in person_component_counts_percentage.items():
  percentage = round(data['Percentage'], 2)
  data['Percentage'] = f"{percentage}%"

for person, data in person_component_counts_percentage.items():
  print(f"{person}: {data['Count']} times ({data['Percentage']})")

11: 54 times (20.0%)
25: 35 times (12.96%)
08: 29 times (10.74%)
12: 25 times (9.26%)
15: 18 times (6.67%)
01: 17 times (6.3%)
14: 12 times (4.44%)
03: 11 times (4.07%)
02: 10 times (3.7%)
20: 9 times (3.33%)
13: 8 times (2.96%)
26: 6 times (2.22%)
22: 6 times (2.22%)
09: 5 times (1.85%)
04: 5 times (1.85%)
19: 5 times (1.85%)
06: 3 times (1.11%)
40: 3 times (1.11%)
17: 2 times (0.74%)
18: 2 times (0.74%)
31: 2 times (0.74%)
05: 1 times (0.37%)
32: 1 times (0.37%)
21: 1 times (0.37%)


In [26]:
#person_component_counts_percentage
counts_df = pd.DataFrame.from_dict(person_component_counts_percentage, orient='index').reset_index()
counts_df.rename(columns={'index': 'Id Number'}, inplace=True)

id_to_name = {row['Id Number']: row['Name'] for index, row in person_df.iterrows()}

counts_df['Name'] = counts_df['Id Number'].apply(lambda x: id_to_name.get(x, "Unknown"))
counts_df

Unnamed: 0,Id Number,Count,Percentage,Name
0,11,54,20.0%,Ralph Helstein
1,25,35,12.96%,Lyle Taylor
2,8,29,10.74%,Art Frye
3,12,25,9.26%,Tove Hammer
4,15,18,6.67%,Bob Fulton
5,1,17,6.3%,Chuck Swisher
6,14,12,4.44%,Sid Oberman
7,3,11,4.07%,Herb Epstein
8,2,10,3.7%,Emmet “Mac” MacGuire
9,20,9,3.33%,Glen Bass


### People who speak 4h and their topics

In [27]:
person_topic = df_focus_true[['Tag_Component', 'Topic_Component']]
person_topic = person_topic.rename(columns={'Tag_Component': 'Person_Component'})
person_topic['Topic_Component'] = person_topic['Topic_Component'].map(topic_codes)
#display(person_topic)

grouped_count = person_topic.groupby(['Person_Component', 'Topic_Component']).size().reset_index(name='Count')
sorted_grouped_count = grouped_count.sort_values(by='Count', ascending=False)
#sorted_grouped_count

In [28]:
sorted_grouped_count['Person_Name'] = sorted_grouped_count['Person_Component'].apply(lambda x: id_to_name.get(str(x), "Unknown"))
sorted_grouped_count
#sorted_grouped_count.to_csv('/Users/ningyuhan/Desktop/people_who_speak_4h_and_topics.csv', index= False)

Unnamed: 0,Person_Component,Topic_Component,Count,Person_Name
26,08,Financial Status of Firm,14,Art Frye
45,11,The Union and Contract Issues,11,Ralph Helstein
41,11,Management Personnel Issues,10,Ralph Helstein
40,11,Financial Status of Firm,10,Ralph Helstein
95,25,Management Personnel Issues,8,Lyle Taylor
...,...,...,...,...
64,14,Supervision,1,Sid Oberman
66,15,ESOP Participation,1,Bob Fulton
68,15,Employee Stock Ownership Plan (ESOP) Financial,1,Bob Fulton
72,15,Political Relations with the Community,1,Bob Fulton


In [29]:
total_count = sorted_grouped_count['Count'].sum()
total_count

270

### People who speak before 4h

In [30]:
sequences_leading_to_focus_person = []

for index in range(len(df_final_person)):
    if df_final_person.iloc[index]['Is_Focus_Category']:
        person_sequence = []
        if index-1 >= 0 and not df_final_person.iloc[index-1]['Is_Focus_Category']:
            person_sequence.append(df_final_person.iloc[index-1]['Tag_Component'])
        sequences_leading_to_focus_person.append(tuple(person_sequence))
        
sequence_counts = Counter(sequences_leading_to_focus_person)
top10_sequences = sequence_counts.most_common(10)
#sequence_counts
len(sequence_counts)

28

In [31]:
sequence_df = pd.DataFrame([{'Id Number': k[0], 'Count': v} for k, v in sequence_counts.items()])

sequence_df['Name'] = sequence_df['Id Number'].apply(lambda x: id_to_name.get(x, "Unknown"))
sorted_sequence_df = sequence_df.sort_values(by='Count', ascending=False)
sorted_sequence_df
#sorted_sequence_df.to_csv('/Users/ningyuhan/Desktop/people_who_speak_BEFORE_4h.csv', index= False)

Unnamed: 0,Id Number,Count,Name
14,3,45,Herb Epstein
3,12,31,Tove Hammer
10,25,29,Lyle Taylor
6,1,26,Chuck Swisher
0,11,17,Ralph Helstein
4,15,14,Bob Fulton
1,2,12,Emmet “Mac” MacGuire
2,8,11,Art Frye
7,20,10,Glen Bass
9,19,10,Earl Murray


## Task 2

- TopicM, TopicM, TopicY  -- 4h – LONE Y is enough to trigger 4h 
- TopicZ, Topic Z, Topic Z – 4h  - SUSTAINED Z needed to trigger 4h
- NOT CASE: TopicM, Topic M, TopicZ – 4h

### Data

In [32]:
df_task2= df_final[['Tag', 'Standardized_Sentence', 'Is_4h', 'Tag_Component']]

### Extrack topics of those preceding sentences

In [33]:
df_task2 = df_task2.copy()

# Extracting 'Sentence_Number' and 'Topic_Category'
df_task2['Sentence_Number'] = df_task2['Tag'].apply(lambda x: int(x.split(',')[0].split(':')[1]))
df_task2['Topic_Category'] = df_task2['Tag'].apply(lambda x: x.split(',')[3].strip(']'))

In [34]:
df_preceding_topics = pd.read_csv('/Users/ningyuhan/Desktop/df_task2.csv')

# Initialize an empty list to store the 'preceding_topics' for the '4h' instance
preceding_topics_for_4h = []

# Track the index of the last '4h' instance
last_4h_index = -1  

# Iterate over the DataFrame to find the '4h' instances and get their 'preceding_topics'
for index, row in df_preceding_topics.iterrows():
    if row['Tag_Component'] == '4h':
        # Get the preceding topics in that range, including duplicates
        preceding_topics = df_preceding_topics.iloc[last_4h_index + 1:index]['Topic_Category'].tolist()
        
        # Update the last '4h' index to the current index
        last_4h_index = index

        # Append the 'preceding_topics' to the list for '4h' instance
        preceding_topics_for_4h.append(preceding_topics)
    else:
        # Append None for rows that are not '4h' instances
        preceding_topics_for_4h.append(None)

# Assign the 'preceding_topics_for_4h' to the DataFrame
df_preceding_topics['Preceding_Topics'] = preceding_topics_for_4h
df_we_need = df_preceding_topics[df_preceding_topics['Tag_Component'] == '4h'][['Tag','Standardized_Sentence','Tag_Component', 'Preceding_Topics']]
display(df_we_need)

Unnamed: 0,Tag,Standardized_Sentence,Tag_Component,Preceding_Topics
3,"[c:13,13,4h,01]",dodson got mad at ralph,4h,"[4, 4, 1]"
7,"[c:50,02,4h,09]",everybody talked about the armed camp atmosphe...,4h,"[5, 7, 9]"
9,"[c:03,01,4h,01]",swisher argues that if one wants aint it then ...,4h,[1]
13,"[c:19,04,4h,05]",john lambert thinks this is an unlikely problem,4h,"[5, 5, 5]"
15,"[c:21,04,4h,09]",the essence of johns remarks are that there ar...,4h,[3]
...,...,...,...,...
962,"[c:45,25,4h,09]",but lyle says no it can be a hardcutits a flas...,4h,"[9, 9, 9]"
966,"[c:79,15,4h,06]",bob fulton and lyle fight over this,4h,"[3, 3, 6]"
970,"[c:85,25,4h,06]",lyle gets mad at tove for trying to stop him f...,4h,"[6, 6, 6]"
972,"[c:87,12,4h,06]",tove gets angry at lyle and turns on him and s...,4h,[6]


### Preceding Sentences have the same topic

In [35]:
df_we_need['Sameness'] = df_we_need['Preceding_Topics'].apply(
    lambda x: 'Same' if len(set(x)) == 1 else 'Different'
    if isinstance(x, list) and x else 'Undefined'
)

df_same = df_we_need[df_we_need['Sameness'] == 'Same'].drop('Sameness', axis=1)
df_different = df_we_need[df_we_need['Sameness'] == 'Different'].drop('Sameness', axis=1)

display(df_same)

Unnamed: 0,Tag,Standardized_Sentence,Tag_Component,Preceding_Topics
9,"[c:03,01,4h,01]",swisher argues that if one wants aint it then ...,4h,[1]
13,"[c:19,04,4h,05]",john lambert thinks this is an unlikely problem,4h,"[5, 5, 5]"
15,"[c:21,04,4h,09]",the essence of johns remarks are that there ar...,4h,[3]
19,"[c:29,02,4h,05]",but mac also says that rath has new products w...,4h,"[9, 9, 9]"
27,"[c:41,02,4h,05]",mac we dont need more processed meat this is a...,4h,"[5, 5, 5]"
...,...,...,...,...
958,"[c:21,22,4h,03]",laverne is putting in some objections,4h,"[3, 3, 3]"
962,"[c:45,25,4h,09]",but lyle says no it can be a hardcutits a flas...,4h,"[9, 9, 9]"
970,"[c:85,25,4h,06]",lyle gets mad at tove for trying to stop him f...,4h,"[6, 6, 6]"
972,"[c:87,12,4h,06]",tove gets angry at lyle and turns on him and s...,4h,[6]


### Preceding Sentences have different topics

In [36]:
display(df_different)

Unnamed: 0,Tag,Standardized_Sentence,Tag_Component,Preceding_Topics
3,"[c:13,13,4h,01]",dodson got mad at ralph,4h,"[4, 4, 1]"
7,"[c:50,02,4h,09]",everybody talked about the armed camp atmosphe...,4h,"[5, 7, 9]"
23,"[c:35,09,4h,03]",jack thomas sums it up and says it sounds like...,4h,"[5, 5, 9]"
33,"[c:76,01,4h,01]",and swisher answers emphatically no it will not,4h,"[10, 1, 1]"
41,"[c:44,04,4h,09]",john no john then goes on to explain what spec...,4h,"[12, 9, 9]"
...,...,...,...,...
929,"[c:128,22,4h,04]",he says that grittman let the best two people ...,4h,"[9, 3, 10]"
936,"[c:52,25,4h,02]",lyle then goes on to say that the board should...,4h,"[11, 11, 4]"
950,"[c:146,12,4h,10]",tove makes this comment of course they will be...,4h,"[10, 3, 10]"
954,"[c:162,20,4h,06]",why should the company pay it just because the...,4h,"[5, 6, 6]"


### Only two sentences between 4hs: 4h, s1, s2, 4h

In [37]:
df_same_two_elements = df_same[df_same['Preceding_Topics'].apply(lambda x: len(x) == 2)]
df_different_two_elements = df_different[df_different['Preceding_Topics'].apply(lambda x: len(x) == 2)]

display(df_same_two_elements)

Unnamed: 0,Tag,Standardized_Sentence,Tag_Component,Preceding_Topics
64,"[c:171,08,4h,08]",fry and ralph at the same time big companies d...,4h,"[8, 8]"
147,"[c:52,11,4h,03]",ralph retorts with the fact that borrowing mon...,4h,"[3, 3]"
174,"[c:59,11,4h,04]",ralph feels that epstein doesnt have to ask fo...,4h,"[4, 4]"
248,"[c:74,11,4h,07]",ralph says to herb that you have to show peopl...,4h,"[7, 7]"
321,"[c:172,14,4h,04]",sid oberman says whats your point in a very ho...,4h,"[4, 4]"
328,"[c:188,25,4h,01]",lyle laughs and says youre just like the inter...,4h,"[1, 1]"
351,"[c:55,11,4h,10]",ralph continues to press herb for figures and now,4h,"[10, 10]"
402,"[c:43,11,4h,04]",and says that he refuses to believe that the m...,4h,"[9, 9]"
416,"[c:66,11,4h,10]",ralph says that he refuses to give anybody any...,4h,"[1, 1]"
435,"[c:107,12,4h,01]",tove gets very angry at earl and tells him tha...,4h,"[1, 1]"


In [38]:
display(df_different_two_elements)

Unnamed: 0,Tag,Standardized_Sentence,Tag_Component,Preceding_Topics
243,"[c:67,03,4h,11]",herb says at that point that the foremen are a...,4h,"[7, 11]"
411,"[c:61,15,4h,10]",bob fulton argues against sid but im not sure ...,4h,"[3, 10]"
458,"[c:75,08,4h,04]",the economy has had very little effect on rath...,4h,"[4, 10]"
511,"[c:198,11,4h,10]",ralph says as his counsel i will not let him a...,4h,"[8, 10]"


### If there are three sentences between 4hs, extract the last 2

**For example:[A,B,B] -> [B,B]**

In [39]:
def extract_last_two_if_three(lst):
    if len(lst) == 3:  
        return lst[1:] 
    return lst 

df_we_need['Last_Two_If_Three'] = df_we_need['Preceding_Topics'].apply(extract_last_two_if_three)

df_last_two_if_3 = df_we_need[['Preceding_Topics', 'Last_Two_If_Three']]
display(df_last_two_if_3)

Unnamed: 0,Preceding_Topics,Last_Two_If_Three
3,"[4, 4, 1]","[4, 1]"
7,"[5, 7, 9]","[7, 9]"
9,[1],[1]
13,"[5, 5, 5]","[5, 5]"
15,[3],[3]
...,...,...
962,"[9, 9, 9]","[9, 9]"
966,"[3, 3, 6]","[3, 6]"
970,"[6, 6, 6]","[6, 6]"
972,[6],[6]


In [40]:
df_last_two_if_3['Sameness'] = df_last_two_if_3['Last_Two_If_Three'].apply(
    lambda x: 'Same' if len(set(x)) == 1 else 'Different'
    if isinstance(x, list) and x else 'Undefined'
)

df_same_new = df_last_two_if_3[df_last_two_if_3['Sameness'] == 'Same'].drop('Sameness', axis=1)
df_different_new = df_last_two_if_3[df_last_two_if_3['Sameness'] == 'Different'].drop('Sameness', axis=1)

df_same_two_elements_new = df_same_new[df_same_new['Last_Two_If_Three'].apply(lambda x: len(x) == 2)]
df_different_two_elements_new = df_different_new[df_different_new['Last_Two_If_Three'].apply(lambda x: len(x) == 2)]
display(df_same_two_elements_new)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_last_two_if_3['Sameness'] = df_last_two_if_3['Last_Two_If_Three'].apply(


Unnamed: 0,Preceding_Topics,Last_Two_If_Three
13,"[5, 5, 5]","[5, 5]"
19,"[9, 9, 9]","[9, 9]"
27,"[5, 5, 5]","[5, 5]"
33,"[10, 1, 1]","[1, 1]"
37,"[5, 5, 5]","[5, 5]"
...,...,...
954,"[5, 6, 6]","[6, 6]"
958,"[3, 3, 3]","[3, 3]"
962,"[9, 9, 9]","[9, 9]"
970,"[6, 6, 6]","[6, 6]"


In [41]:
print(len(df_different_two_elements_new))
display(df_different_two_elements_new) # 180+52= 232 -270 38 

52


Unnamed: 0,Preceding_Topics,Last_Two_If_Three
3,"[4, 4, 1]","[4, 1]"
7,"[5, 7, 9]","[7, 9]"
23,"[5, 5, 9]","[5, 9]"
49,"[8, 8, 13]","[8, 13]"
53,"[6, 6, 2]","[6, 2]"
86,"[7, 1, 7]","[1, 7]"
102,"[11, 11, 10]","[11, 10]"
108,"[9, 7, 11]","[7, 11]"
126,"[9, 4, 7]","[4, 7]"
130,"[3, 3, 5]","[3, 5]"


### compare the same last topic between those 2 tables

In [42]:
df_different_two_elements_new['Last_Element'] = df_different_two_elements_new['Last_Two_If_Three'].apply(lambda x: x[-1] if x else None)
df_same_two_elements_new['Last_Element'] = df_same_two_elements_new['Last_Two_If_Three'].apply(lambda x: x[-1] if x else None)

display(df_different_two_elements_new.head(5))
display(df_same_two_elements_new.head(5))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_same_two_elements_new['Last_Element'] = df_same_two_elements_new['Last_Two_If_Three'].apply(lambda x: x[-1] if x else None)


Unnamed: 0,Preceding_Topics,Last_Two_If_Three,Last_Element
3,"[4, 4, 1]","[4, 1]",1
7,"[5, 7, 9]","[7, 9]",9
23,"[5, 5, 9]","[5, 9]",9
49,"[8, 8, 13]","[8, 13]",13
53,"[6, 6, 2]","[6, 2]",2


Unnamed: 0,Preceding_Topics,Last_Two_If_Three,Last_Element
13,"[5, 5, 5]","[5, 5]",5
19,"[9, 9, 9]","[9, 9]",9
27,"[5, 5, 5]","[5, 5]",5
33,"[10, 1, 1]","[1, 1]",1
37,"[5, 5, 5]","[5, 5]",5


In [43]:
df_same_counts = df_same_two_elements_new['Last_Element'].value_counts()
df_same_counts = df_same_counts.reset_index()

df_same_counts.columns = ['Number', 'Count']
df_same_counts

Unnamed: 0,Number,Count
0,3,38
1,4,37
2,10,31
3,1,14
4,9,13
5,5,12
6,7,11
7,8,10
8,6,9
9,11,3


In [44]:
df_different_counts = df_different_two_elements_new['Last_Element'].value_counts()
df_different_counts = df_different_counts.reset_index()

df_different_counts.columns = ['Number', 'Count']
df_different_counts

Unnamed: 0,Number,Count
0,10,14
1,9,7
2,3,7
3,5,6
4,7,5
5,1,3
6,11,3
7,6,3
8,4,2
9,13,1


In [45]:
joined_df = pd.merge(df_same_counts, df_different_counts, on='Number', how='outer', suffixes=('same', 'different'))
joined_df.columns = ['topic_number', 'Count_same', 'Count_different']
joined_df['topic_number_str'] = joined_df['topic_number'].apply(lambda x: f"{x:02}")
joined_df['topic_category'] = joined_df['topic_number_str'].map(topic_codes)
df_final_task2 = joined_df[['topic_number','Count_same','Count_different','topic_category']]
df_final_task2

Unnamed: 0,topic_number,Count_same,Count_different,topic_category
0,3,38.0,7.0,Financial Status of Firm
1,4,37.0,2.0,Management Personnel Issues
2,10,31.0,14.0,The Union and Contract Issues
3,1,14.0,3.0,Routine Board Functions
4,9,13.0,7.0,Plant Production and Manufacturing Process Issues
5,5,12.0,6.0,Marketing and Sales
6,7,11.0,5.0,ESOP Participation
7,8,10.0,,Employee Benefits
8,6,9.0,3.0,Employee Stock Ownership Plan (ESOP) Financial
9,11,3.0,3.0,Supervision


<div style="background-color: #FFFF00; padding: 10px;">
    <strong>same:</strong> [5,5] -> 4h <br>
    <strong>different:</strong>[5,1] -> 4h
</div>

## Task 3

- Is conflict a precursor to decision diversion (human identified) (4i - stonewalling)(4f-change the subject)
- HOW OFTEN DOES 4i and 4f occur?
- TOPIC BEFORE VS AFTER 4i/4f?

### Data

In [46]:
df = pd.read_csv('/Users/ningyuhan/Desktop/combined_dataframe.csv')
def standardize_text(text):
    if pd.isna(text):
        return text  
    # Remove non-ASCII characters directly
    text = ''.join([char for char in text if ord(char) < 128])
    text = text.strip().lower()  
    # Remove all non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text

df['Standardized_Sentence'] = df['Sentence'].apply(standardize_text)

def standardize_tag(tag):
    if pd.isna(tag):
        return tag
    # Remove all spaces and extra characters
    tag = tag.replace(' ', '')
    # Ensure consistency in formatting
    tag = ','.join(tag.split(','))
    return tag

df['Tag'] = df['Tag'].apply(standardize_tag)

# Now, your DataFrame should have the "Tag" column standardized
df_unique = df.drop_duplicates(subset=['Standardized_Sentence']).copy()
df_unique['Tag'] = df_unique['Tag'].apply(
    lambda x: x.replace('4h,30]', '4h,03]') if '4h,30]' in x else x
)

def correct_typo(tag):
    parts = tag.strip('[]').split(',')
    if len(parts) > 3 and len(parts[3]) == 1:
        parts[3] = '0' + parts[3]  # Add a leading zero if it's a single digit
    return '[' + ','.join(parts) + ']'

df_unique['Tag'] = df_unique['Tag'].apply(correct_typo)

df_unique = df_unique.drop('Sentence', axis=1)

df_unique['Tag_Component'] = df_unique['Tag'].apply(lambda x: x.split(',')[2] if len(x.split(',')) > 2 else None)

df_unique

Unnamed: 0,Tag,Standardized_Sentence,Tag_Component
0,"[c:01,01,1b,01]",swisher opens the meeting,1b
1,"[c:02,08,1a,01]",it was moved by mr frey,1a
2,"[c:03,10,2a,01]",seconded by mr rath that the minutes of the ju...,2a
3,"[c:04,02,1b,03]",mr mcguire informed the board that the may sta...,1b
4,"[c:05,02,4d,03]",he did indicate the preliminary weekly results...,4d
...,...,...,...
4852,"[c:104,22,4e3,06]",they also need concessions from the workers and,4e3
4853,"[c:105,22,4e3,06]",they will offer 725 per hour of which 100 wil...,4e3
4854,"[c:106,22,4e3,08]",six months of insurance,4e3
4855,"[c:107,22,4e3,10]",so its very close to the deal lyle is proposin...,4e3


### Sentences immediately after each 4h

In [47]:
def get_following_rows(df):
    following_rows = []
    previous_was_focus = False

    for i in range(len(df) - 1):
        if df.iloc[i]['Is_Focus_Category']:
            if previous_was_focus:
                # Skip if the previous row was also '4h' to avoid consecutive '4h'
                previous_was_focus = True
                continue
            else:
                # Get the next row if it's not '4h'
                if not df.iloc[i+1]['Is_Focus_Category']:
                    following_rows.append(df.iloc[i+1])
                previous_was_focus = True
        else:
            previous_was_focus = False

    return pd.DataFrame(following_rows)

# Assuming df_unique is your initial DataFrame
df_unique['Is_Focus_Category'] = df_unique['Tag_Component'] == '4h'
adjusted_selected_df = get_following_rows(df_unique)
df_final = adjusted_selected_df.drop_duplicates(subset='Standardized_Sentence')
df_final

Unnamed: 0,Tag,Standardized_Sentence,Tag_Component,Is_Focus_Category
13,"[c:14,12,2b,01]",the discussion on compensation was most unfort...,2b,False
51,"[c:02,08,4d,01]",in response to frys question of the power of t...,4d,False
53,"[c:04,15,4d,01]",the issue is resolved by robert fulton who arg...,4d,False
69,"[c:20,15,4g,03]",he wants us to stop spinning our wheels on ext...,4g,False
71,"[c:22,04,1c,09]",and he wants to know why four million pounds p...,1c,False
...,...,...,...,...
4729,"[c:163,15,4i,06]",in the end we defer the issue to find out what...,4i,False
4794,"[c:46,25,1b,09]",the money required just to do the planning for...,1b,False
4834,"[c:86,25,4e1,06]",and the board has a responsibility towards the...,4e1,False
4836,"[c:88,25,4c,06]",lyle backs down when i get angry,4c,False


### 4i and 4f immediately after 4h

In [48]:
filtered_df_new = df_final[df_final['Tag_Component'].isin(['4i', '4f'])]
print(len(filtered_df_new))
filtered_df_new

18


Unnamed: 0,Tag,Standardized_Sentence,Tag_Component,Is_Focus_Category
347,"[c:159,02,4i,06]",mac clams up completely and gets red,4i,False
423,"[c:53,08,4i,07]",art frye says why bother with these details ye...,4i,False
468,"[c:98,08,4i,07]",frye is responding look you put 7 8 people on...,4i,False
699,"[c:86,13,4i,04]",len suggested wed table the issue until we cou...,4i,False
898,"[c:90,03,4i,02]",herb is now explaining to chuck and lyle what ...,4i,False
1153,"[c:30,03,4i,05]",herb says he has a plan to increase by one mil...,4i,False
1198,"[c:75,03,4i,07]",he tells chuck swisher that we will have somet...,4i,False
1361,"[c:91,03,4i,10]",herb chooses not to respond now,4i,False
2539,"[c:27,01,4i,03]",swisher says to frye you have to take our word...,4i,False
2558,"[c:46,01,4i,03]",swisher says thats all right lyle over here wi...,4i,False


### 4i and 4f in the big dateframe

In [49]:
filtered_df = df_unique[df_unique['Tag_Component'].isin(['4i', '4f'])]
filtered_df

Unnamed: 0,Tag,Standardized_Sentence,Tag_Component,Is_Focus_Category
47,"[c:48,01,4i,07]",the consensus of the board was that time did n...,4i,False
62,"[c:13,02,4i,03]",mac says that it was a selling and inventory p...,4i,False
105,"[c:56,02,4i,10]",mac does not answer immediately,4i,False
109,"[c:60,15,4f,10]",bob fulton speaks up again and asks why we can...,4f,False
249,"[c:61,02,4i,09]",but gets no more a response out of mac,4i,False
...,...,...,...,...
4690,"[c:121,15,4i,03]",bob fulton says that this will be an insiders ...,4i,False
4719,"[c:150,25,4i,03]",very curtly that that is their businessnot our...,4i,False
4720,"[c:151,08,4f,04]",art says that he wants to say something,4f,False
4729,"[c:163,15,4i,06]",in the end we defer the issue to find out what...,4i,False


### Topic before and after 4i/4f

In [50]:
sentences_before_list = []
sentences_after_list = []

# Iterate through df_unique using iterrows()
for index, row in df_unique.iterrows():
    if row['Tag_Component'] in ['4i', '4f']:
        # Add sentence before if it exists
        if index > df_unique.index[0]:
            prev_index = df_unique.index[df_unique.index.get_loc(index) - 1]
            sentences_before_list.append(df_unique.loc[prev_index])
        # Add sentence after if it exists
        if index < df_unique.index[-1]:
            next_index = df_unique.index[df_unique.index.get_loc(index) + 1]
            sentences_after_list.append(df_unique.loc[next_index])

# Create DataFrames from the collected rows
sentences_before = pd.DataFrame(sentences_before_list)
sentences_after = pd.DataFrame(sentences_after_list)

# Reset index for the new DataFrames
sentences_before.reset_index(drop=True, inplace=True)
sentences_after.reset_index(drop=True, inplace=True)

# Display the DataFrames
print("Sentences Before:")
sentences_before

Sentences Before:


Unnamed: 0,Tag,Standardized_Sentence,Tag_Component,Is_Focus_Category
0,"[c:47,04,2b,05]",lambert echoing that,2b,False
1,"[c:12,12,1c,03]",tove asks mcguire why a million dollars were l...,1c,False
2,"[c:55,12,1c,10]",tove mac what are the labor problems,1c,False
3,"[c:59,02,1b,10]",in response to absenteeism occurs mac says tha...,1b,False
4,"[c:60,12,1c,09]",tove rephrases the question,1c,False
...,...,...,...,...
150,"[c:120,20,4h,03]",but bass says hes seen supplies go out of the ...,4h,True
151,"[c:149,12,1c,03]",then how will cj make moneyif we are paying mu...,1c,False
152,"[c:150,25,4i,03]",very curtly that that is their businessnot our...,4i,False
153,"[c:162,20,4h,06]",why should the company pay it just because the...,4h,True


In [51]:
print("\nSentences After:")
sentences_after


Sentences After:


Unnamed: 0,Tag,Standardized_Sentence,Tag_Component,Is_Focus_Category
0,"[c:49,02,1b,09]",mac was asked about conditions in the plant an...,1b,False
1,"[c:14,09,1c,05]",jack thomas wants to know where all the meat i...,1c,False
2,"[c:57,01,1b,10]",swisher interrupts and tells mac to give us so...,1b,False
3,"[c:61,15,4d,10]",there follows an exchange between bob fulton a...,4d,False
4,"[c:62,12,4e2,09]",tove then asks who will do the labor,4e2,False
...,...,...,...,...
150,"[c:122,31,1b,03]",ron peterson comes on the telephone to talk ab...,1b,False
151,"[c:151,08,4f,04]",art says that he wants to say something,4f,False
152,"[c:152,08,4d,04]",art makes the comment about ken bloomfield hes...,4d,False
153,"[c:165,13,2a,06]",seconded by mr dodson and unanimously carried ...,2a,False


In [52]:
def extract_fourth_component(tag_str):
    components = tag_str.strip('[]').split(',')
    return components[3] if len(components) > 3 else None

def count_fourth_components(df):
    fourth_components = df['Tag'].apply(extract_fourth_component)
    return Counter(fourth_components)

def translate_counts(count_dict):
    translated_counts = {topic_codes.get(key, "Unknown"): value for key, value in count_dict.items()}
    return translated_counts

def print_translated_counts_with_percentage(translated_counts):
    total_count = sum(translated_counts.values())
    for key, value in translated_counts.items():
        percentage = (value / total_count) * 100
        print(f"{key}: {value} ({percentage:.2f}%)")

#### topics before 4i/4f

In [53]:
before_counts = count_fourth_components(sentences_before)
after_counts = count_fourth_components(sentences_after)

In [54]:
translated_before_counts = translate_counts(before_counts)

In [55]:
print("Translated Counts in Sentences Before:")
print_translated_counts_with_percentage(translated_before_counts)

Translated Counts in Sentences Before:
Marketing and Sales: 11 (7.10%)
Financial Status of Firm: 31 (20.00%)
The Union and Contract Issues: 32 (20.65%)
Plant Production and Manufacturing Process Issues: 12 (7.74%)
Employee Benefits: 7 (4.52%)
Employee Stock Ownership Plan (ESOP) Financial: 7 (4.52%)
Routine Board Functions: 10 (6.45%)
ESOP Participation: 14 (9.03%)
Supervision: 4 (2.58%)
Management Personnel Issues: 23 (14.84%)
hog procurement: 3 (1.94%)
Political Relations with the Community: 1 (0.65%)


#### topics after 4i/4f

In [56]:
translated_after_counts = translate_counts(after_counts)

In [57]:
print("Translated Counts in Sentences After:")
print_translated_counts_with_percentage(translated_after_counts)

Translated Counts in Sentences After:
Plant Production and Manufacturing Process Issues: 10 (6.45%)
Marketing and Sales: 10 (6.45%)
The Union and Contract Issues: 29 (18.71%)
Employee Benefits: 12 (7.74%)
Employee Stock Ownership Plan (ESOP) Financial: 7 (4.52%)
Routine Board Functions: 12 (7.74%)
ESOP Participation: 14 (9.03%)
Supervision: 2 (1.29%)
Financial Status of Firm: 31 (20.00%)
Management Personnel Issues: 24 (15.48%)
hog procurement: 3 (1.94%)
Political Relations with the Community: 1 (0.65%)


### Presence of 1a

In [58]:
df_unique['Topic_Category'] = df_unique['Tag'].apply(lambda x: x.split(',')[3].strip(']'))

#df_unique.to_csv('/Users/ningyuhan/Desktop/df_unique.csv', index=False)

extracted_df = df_unique[['Tag', 'Standardized_Sentence', 'Tag_Component', 'Topic_Category']]

filtered_df = extracted_df[extracted_df['Tag_Component'] == '1a']
print(len(filtered_df))

116


In [59]:
# 4i/4f -> 1a,2a
# Does 1a,2a has higher probabality to occur after 4i/4f than other vcs

In [60]:
selected_indices = []
for index in filtered_df.index:
    selected_indices.extend([index, index + 1, index + 2, index + 3])

valid_indices = extracted_df.index.tolist()  
valid_selected_indices = [idx for idx in selected_indices if idx in valid_indices]

result_df = extracted_df.loc[valid_selected_indices]
result_df
#result_df.to_csv('/Users/ningyuhan/Desktop/result_df.csv', index=False)

Unnamed: 0,Tag,Standardized_Sentence,Tag_Component,Topic_Category
1,"[c:02,08,1a,01]",it was moved by mr frey,1a,01
2,"[c:03,10,2a,01]",seconded by mr rath that the minutes of the ju...,2a,01
3,"[c:04,02,1b,03]",mr mcguire informed the board that the may sta...,1b,03
4,"[c:05,02,4d,03]",he did indicate the preliminary weekly results...,4d,03
8,"[c:09,15,1a,04]",it was moved by mr fulton,1a,04
...,...,...,...,...
4749,"[c:01,15,1b,01]",fulton opened the meeting we begin with the fi...,1b,01
4809,"[c:61,12,1a,03]",we make a motion to resist the countys claim a...,1a,03
4810,"[c:62,13,2a,03]",and len seconds as bob says,2a,03
4811,"[c:63,15,3a,03]",the foreigners in this case the outsiders tel...,3a,03


### Find 1a instances and the topics of the 3 following sentences

In [61]:
# Correcting the approach to exclude the topic of the "1a" sentence itself from the 'Following_Topics'

# Re-initialize the list to store the 'following_topics' correctly
corrected_following_topics_for_1a = []

# Iterate over the DataFrame again to correct the 'following_topics'
for index in range(len(result_df)):
    if result_df.iloc[index]['Tag_Component'] == '1a':
        # Initialize an empty list for this '1a' instance
        following_topics = []
        
        # Start from the sentence immediately after the current '1a' sentence
        next_index = index + 1
        while next_index < len(result_df) and result_df.iloc[next_index]['Tag_Component'] != '1a':
            # Append the topic of the following sentence
            following_topics.append(result_df.iloc[next_index]['Topic_Category'])
            next_index += 1

        # Append the collected topics for this '1a' instance
        corrected_following_topics_for_1a.append(following_topics)
    else:
        # Append None for rows that are not '1a' instances, to maintain alignment with the DataFrame
        corrected_following_topics_for_1a.append(None)

# Assign the corrected 'following_topics' to the DataFrame
result_df['Corrected_Following_Topics'] = corrected_following_topics_for_1a

# Filter the DataFrame for '1a' instances and select relevant columns for display
corrected_df_we_need = result_df[result_df['Tag_Component'] == '1a'][['Tag', 'Standardized_Sentence', 'Tag_Component', 'Corrected_Following_Topics']]

corrected_df_we_need = corrected_df_we_need.drop_duplicates(subset=['Tag', 'Standardized_Sentence'])
corrected_df_we_need

Unnamed: 0,Tag,Standardized_Sentence,Tag_Component,Corrected_Following_Topics
1,"[c:02,08,1a,01]",it was moved by mr frey,1a,"[01, 03, 03]"
8,"[c:09,15,1a,04]",it was moved by mr fulton,1a,"[04, 01]"
18,"[c:19,10,1a,01]",it was moved by mr rath,1a,[01]
20,"[c:21,11,1a,01]",mr helstein moved to amend mr raths motion to ...,1a,[01]
29,"[c:30,11,1a,01]",it was moved by mr helstein,1a,"[01, 01]"
...,...,...,...,...
4659,"[c:90,08,1a,04]",mr frey proposed the adoption of the following...,1a,"[04, 04, 04]"
4738,"[c:172,13,1a,06]",after which mr dodson moved that the stock be ...,1a,[06]
4740,"[c:174,22,1a,06]",mr patrie then moved that this matter be tabled,1a,[06]
4746,"[c:180,25,1a,01]",mr taylor moved that the meeting be adjourned,1a,"[01, 01, 01]"


### The instances that the topic are not sustained after 1a

In [62]:
# Define a function to check if all elements in a list are the same
def are_elements_unique(lst):
    return len(set(lst)) > 1

# Apply the function to the 'Corrected_Following_Topics' column and create a mask for rows with non-unique elements
non_unique_mask = corrected_df_we_need['Corrected_Following_Topics'].apply(are_elements_unique)

# Use the mask to filter the DataFrame
non_unique_topics_df = corrected_df_we_need[non_unique_mask]
print(len(non_unique_topics_df))
non_unique_topics_df

58


Unnamed: 0,Tag,Standardized_Sentence,Tag_Component,Corrected_Following_Topics
1,"[c:02,08,1a,01]",it was moved by mr frey,1a,"[01, 03, 03]"
8,"[c:09,15,1a,04]",it was moved by mr fulton,1a,"[04, 01]"
37,"[c:38,09,1a,06]",it was moved by mr thomas,1a,"[06, 08]"
139,"[c:90,13,1a,10]",it was moved by mr dodson,1a,"[10, 03, 09]"
163,"[c:114,15,1a,08]",moved by mr fulton,1a,"[08, 06, 06]"
369,"[c:181,02,1a,03]",it was moved by mr mcguire,1a,"[03, 01, 01]"
409,"[c:39,12,1a,01]",it was moved by mrs hammer,1a,"[01, 07, 07]"
718,"[c:105,09,1a,01]",it was moved mr thomas,1a,"[01, 05, 05]"
989,"[c:181,15,1a,01]",it was then moved by mr fulton,1a,"[01, 06, 06]"
1329,"[c:59,03,1a,04]",mr epstein moved,1a,"[04, 01]"


## Task 4

**prompts:** <br>
The following transcripts including the sentences in the board meetings across serveral months. You will see within the brackets, the third component code 4h represents cases of disaggrement, conflict, and attack. Can you look at sentences before each 4h case and provide the three most common patterns leading up to 4h? I also want you to explain why this leads to a 4h and describe how frequently each of these pattern is seen before a 4h.

**answer:** <br>
1. <br>
One person making a suggestion or statement, followed by another person directly questioning or challenging that statement.
For example:
"[c:11,02,1b,09]",mac mentions the problem of hair on hogs. <br>
"[c:12,11,1c,09]",ralph interrupts and asks for an explanation.

This leads to 4h because Ralph is directly challenging Mac's statement, rather than accepting it or building on it. This creates an oppositional dynamic. This pattern is seen quite frequently before 4h labels.

2. <br>
One person giving a vague, general, or "glossy" response, followed by another person pressing for more details or specifics.
For example:
"[c:18,03,4i,12]",herb gives a superficial explanation for hog buying. <br>
"[c:19,09,1c,12]",jack thomas wants to know more about the hog hedging operation.

This leads to 4h because the vague response leaves the other person unsatisfied, so they push harder, putting the original person on the defensive. This pattern is moderately common.

3. <br>
One person interpreting or restating what another said, followed by the original person disagreeing with that interpretation.
For example:
"[c:42,04,2b,05]",john lambert agrees with tove's question. <br>
"[c:43,02,4h,05]",mac disagrees with john's interpretation of tove's question.

This leads to 4h because the original person feels misinterpreted or mischaracterized, leading them to reject the other's statement. This pattern is less frequent than the other two.

Based on the sentences in these documents, I did not categorize any of them as stress level 5 because none of them seemed to indicate an extreme or very high level of stress. Here are some reasons why level 5 was not used:

- The sentences describe meetings, conversations, reports, motions, votes, etc. The content is quite factual and neutral. Even when there is disagreement or conflict, it does not seem to reach an extreme level based on the descriptions.

- The language used is quite formal and proper. There is no strong emotional language that would indicate extreme stress, anger, frustration, etc.

- Many sentences simply report on financial matters, procedures, updates, etc. These are routine business matters that likely do not provoke extreme stress.

- While some sentences mention problems or challenges the company is facing, the tone and language remains measured. There are no dramatic expressions of crisis or urgency.

- Issues are discussed and handled in a orderly fashion through motions, votes, and other standard meeting proceedings. This structured approach diminishes signals of extreme stress.

- Even for difficult issues like union relations, the descriptions focus on the logical exchanges without strong emotional undertones.

- The participants are business professionals having discussions in a corporate board meeting setting. Extreme stress responses would be uncharacteristic and unlikely unless described explicitly.

In summary, the content, tone, language, setting, and participants suggest measured, logical discourse. Without clear indicators of extreme crisis, anger, urgency, etc. using stress level 5 would not seem appropriate based on the descriptions provided in these documents. More explicitsignals of intense emotion would be needed to categorize a sentence as a 5.