**In this notebook, I am trying to identify the precursors (3 and 5 sentences before the desired category) that might lead to the '4h' verbal contribution category, which stands for attack, disagreement, and conflict.**

# Packages

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk import ngrams, FreqDist
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import re
from collections import Counter
from nltk.tokenize import RegexpTokenizer



# Data

In [2]:
df = pd.read_csv('/Users/ningyuhan/Desktop/combined_dataframe.csv')
df.head(5)

Unnamed: 0,Tag,Sentence
0,"[c:01,01,1b,01]",Swisher opens the meeting.
1,"[c:02,08,1a,01]","It was moved by Mr. Frey,"
2,"[c:03,10,2a,01]","seconded by Mr. Rath, that the minutes of the ..."
3,"[c:04,02,1b,03]",Mr. McGuire informed the Board that the May st...
4,"[c:05,02,4d,03]",He did indicate the preliminary weekly results...


# Get 3 tags and sentences before meeting 4h

In [3]:
def find_sentences_with_4h_and_preceding(df, target_component='4h', num_previous=3):
    df['ThirdComponent'] = df['Tag'].apply(lambda x: x.split(',')[2])

    included_sentences = set()
    result = []

    for index in range(len(df)):
        if df.iloc[index]['ThirdComponent'] == target_component:
            # Determine the range of indices to include
            start_index = max(index - num_previous, 0)
            end_index = index + 1  # Include current index

            for idx in range(start_index, end_index):
                # Check to avoid duplicates
                if df.iloc[idx]['Sentence'] not in included_sentences:
                    result.append(df.iloc[idx][['Tag', 'Sentence']].tolist())
                    included_sentences.add(df.iloc[idx]['Sentence'])
    
    return result

# Apply the function to your DataFrame
result = find_sentences_with_4h_and_preceding(df)
df_result = pd.DataFrame(result, columns=["Tag", "Sentence"])

df_result

Unnamed: 0,Tag,Sentence
0,"[c:10,12,2a,04]","seconded by Mrs. Hammer, and unanimously carri..."
1,"[c:11,15,1a,01]","It was moved by Mr. Fulton,"
2,"[c:12,11,2a,01]","seconded by Mr. Helstein, and unanimously carr..."
3,"[c:13,13,4h,01]",Dodson got mad at Ralph
4,"[c:47,04,2b,05]",Lambert echoing that.
...,...,...
1022,"[c:87,12,4h,06]",Tove gets angry at Lyle and turns on him and s...
1023,"[c:98,22,4e3,04]","which includes Walker, who is an old CEO."
1024,"[c:99,12,1c,04]",I ask if he wasn't the one who was fired and w...
1025,"[c:100,22,1b,04]","Laverne says he wasnÕt fired,"


## Word Frequency Analysis Before meeting 4h

In [8]:
from nltk import download

# Ensure necessary NLTK resources are downloaded
download('punkt', quiet=True)
download('stopwords', quiet=True)

# Function to clean and tokenize sentences
def clean_and_tokenize(sentence):
    if pd.isnull(sentence):
        return []
    
    sentence = re.sub(r'\W+', ' ', str(sentence)).lower()
    words = word_tokenize(sentence)
    stop_words = set(stopwords.words('english'))
    return [word for word in words if word.isalpha() and word not in stop_words]

# Lists to store preceding sentences and tags
preceding_sentences = []
preceding_tags = []

# Iterate through the DataFrame
for index, row in df_result.iterrows():
    if '4h' in row['Tag']:
        # Get up to 3 preceding sentences and tags
        for i in range(max(0, index-3), index):
            preceding_sentences.append(df_result.iloc[i]['Sentence'])
            preceding_tags.append(df_result.iloc[i]['Tag'])

# List comprehension to tokenize and clean all preceding sentences
all_words = [word for sentence in preceding_sentences for word in clean_and_tokenize(sentence)]

# Calculate frequency distribution of words
word_freq = FreqDist(all_words)
print("Most common words:", word_freq.most_common(10))

# Generate bigrams from the list of words
bigrams = ngrams(all_words, 2)

# Calculate frequency distribution of bigrams
bigram_freq = FreqDist(bigrams)
print("Most common bigrams:", bigram_freq.most_common(10))


Most common words: [('says', 462), ('herb', 270), ('lyle', 258), ('people', 172), ('board', 136), ('ralph', 130), ('ó', 127), ('wants', 125), ('tove', 120), ('swisher', 111)]
Most common bigrams: [(('herb', 'says'), 55), (('lyle', 'says'), 35), (('million', 'dollars'), 34), (('bob', 'fulton'), 33), (('ralph', 'says'), 27), (('swisher', 'says'), 25), (('tove', 'says'), 24), (('wants', 'know'), 20), (('dick', 'clark'), 19), (('says', 'wants'), 17)]


## Tag Sequence Analysis

In [9]:
def extract_third_component(tag):
    match = re.findall(r'\[c:\d+,\d+,(\w+),\d+\]', tag)
    if match:
        return match[0]
    else:
        return None  

def analyze_third_component_sequences(df):
    preceding_tags = []

    for index, row in df.iterrows():
        if '4h' in row['Tag']:
            # Get up to 3 preceding tags
            start_index = max(0, index - 3)
            for i in range(start_index, index):
                tag = df.iloc[i]['Tag']
                third_component = extract_third_component(tag)
                if third_component:
                    preceding_tags.append(third_component)

    third_component_seq = [" -> ".join(preceding_tags[i:i+3]) for i in range(len(preceding_tags) - 2)]
    third_component_seq_freq = Counter(third_component_seq)

    return third_component_seq_freq

# Analyzing third component sequences
third_component_seq_freq = analyze_third_component_sequences(df_result)

# Display most common third component sequences
print("Most common third component sequences:", third_component_seq_freq.most_common(10))

Most common third component sequences: [('4h -> 4h -> 4h', 20), ('4d -> 1b -> 4d', 11), ('4h -> 4d -> 4h', 10), ('4d -> 4d -> 4h', 9), ('1b -> 4d -> 1b', 9), ('1b -> 1c -> 1b', 9), ('1c -> 1b -> 1c', 9), ('4d -> 4d -> 4d', 9), ('1b -> 1c -> 4d', 8), ('4d -> 1c -> 4d', 8)]


# Get 5 tags and sentences before meeting 4h

In [11]:
def find_sentences_with_4h_and_preceding(df, target_component='4h', num_previous=5):
    df['ThirdComponent'] = df['Tag'].apply(lambda x: x.split(',')[2])

    included_sentences = set()
    result = []

    for index in range(len(df)):
        if df.iloc[index]['ThirdComponent'] == target_component:
            # Determine the range of indices to include
            start_index = max(index - num_previous, 0)
            end_index = index + 1  # Include current index

            for idx in range(start_index, end_index):
                # Check to avoid duplicates
                if df.iloc[idx]['Sentence'] not in included_sentences:
                    result.append(df.iloc[idx][['Tag', 'Sentence']].tolist())
                    included_sentences.add(df.iloc[idx]['Sentence'])
    
    return result

result = find_sentences_with_4h_and_preceding(df)
df_result_5 = pd.DataFrame(result, columns=["Tag", "Sentence"])
df_result_5

Unnamed: 0,Tag,Sentence
0,"[c:08,01,4e1,04]",Swisher would like to be able to assure the of...
1,"[c:09,15,1a,04]","it was moved by Mr. Fulton,"
2,"[c:10,12,2a,04]","seconded by Mrs. Hammer, and unanimously carri..."
3,"[c:11,15,1a,01]","It was moved by Mr. Fulton,"
4,"[c:12,11,2a,01]","seconded by Mr. Helstein, and unanimously carr..."
...,...,...
1371,"[c:97,22,4f,03]",Then comes LaverneÕs proposal for Reorganization
1372,"[c:98,22,4e3,04]","which includes Walker, who is an old CEO."
1373,"[c:99,12,1c,04]",I ask if he wasn't the one who was fired and w...
1374,"[c:100,22,1b,04]","Laverne says he wasnÕt fired,"


## Word Frequency Analysis

In [14]:
# Download necessary NLTK resources quietly
download('punkt', quiet=True)
download('stopwords', quiet=True)

# Improved tokenizer
tokenizer = RegexpTokenizer(r"\b\w[\w']+\b")

# Use NLTK's stop words
stop_words = set(stopwords.words('english'))

def clean_and_tokenize(sentence):
    if pd.isnull(sentence):
        return []
    
    sentence = sentence.lower()
    words = tokenizer.tokenize(sentence)
    return [word for word in words if word not in stop_words]

# Assuming df_result_5 is your DataFrame
# Extract preceding sentences
preceding_sentences = [df_result_5.iloc[i]['Sentence'] for index, row in df_result_5.iterrows() if '4h' in row['Tag'] for i in range(max(0, index-3), index)]

# Word Frequency Analysis
all_words = [word for sentence in preceding_sentences for word in clean_and_tokenize(sentence)]
word_freq = FreqDist(all_words)

# Display most common words
print("Most common words:", word_freq.most_common(10))

# N-gram Analysis
bigrams = ngrams(all_words, 2)
bigram_freq = FreqDist(bigrams)
print("Most common bigrams:", bigram_freq.most_common(10))

Most common words: [('says', 463), ('herb', 264), ('lyle', 250), ('people', 171), ('board', 137), ('ralph', 128), ('wants', 126), ('tove', 115), ('swisher', 108), ('get', 103)]
Most common bigrams: [(('herb', 'says'), 55), (('lyle', 'says'), 35), (('million', 'dollars'), 34), (('bob', 'fulton'), 32), (('ralph', 'says'), 27), (('swisher', 'says'), 25), (('tove', 'says'), 24), (('wants', 'know'), 20), (('dick', 'clark'), 18), (('says', 'wants'), 17)]


## Tag Sequence Analysis

In [13]:
def extract_third_component(tag):
    match = re.findall(r'\[c:\d+,\d+,(\w+),\d+\]', tag)
    return match[0] if match else None

def analyze_third_component_sequences_with_five(df):
    preceding_tags = []

    for index, row in df.iterrows():
        if '4h' in row['Tag']:
            # Get up to 5 preceding tags
            start_index = max(0, index - 5)
            for i in range(start_index, index):
                tag = df.iloc[i]['Tag']
                third_component = extract_third_component(tag)
                if third_component:
                    preceding_tags.append(third_component)

    # Create sequences of 5 third components
    third_component_seq = [" -> ".join(preceding_tags[i:i+5]) for i in range(len(preceding_tags) - 4)]
    third_component_seq_freq = Counter(third_component_seq)

    return third_component_seq_freq

# Analyzing third component sequences
third_component_seq_freq = analyze_third_component_sequences_with_five(df_result_5)

# Display most common third component sequences
most_common_sequences = third_component_seq_freq.most_common(10)
most_common_sequences



[('4h -> 4h -> 4h -> 4h -> 4h', 7),
 ('4d -> 4d -> 4d -> 4d -> 4d', 5),
 ('1b -> 1b -> 4d -> 1c -> 4d', 3),
 ('4h -> 4d -> 4h -> 4d -> 4h', 3),
 ('4d -> 1b -> 1b -> 1b -> 4d', 3),
 ('1b -> 1c -> 1b -> 1c -> 1b', 3),
 ('1c -> 1b -> 1c -> 1b -> 1c', 3),
 ('1b -> 4d -> 1b -> 1b -> 1b', 3),
 ('1b -> 4h -> 1b -> 4d -> 1c', 2),
 ('4h -> 1b -> 4d -> 1c -> 1b', 2)]