In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the speaker/speech table; the relative path is resolved against the
# kernel's current working directory.
data = pd.read_csv("speaker-speech.csv")

In [3]:
# Join the two-word citizen names with an underscore so that each speaker
# label is a single token (literal text replacement, no regex).
data['speaker'] = (
    data['speaker']
    .str.replace('First Citizen', 'First_Citizen', regex=False)
    .str.replace('Second Citizen', 'Second_Citizen', regex=False)
)

data

Unnamed: 0,speaker,speech,speech_cleaned
0,First_Citizen,"Before we proceed any further, hear me speak.",Before we proceed any further hear me speak
1,All,"Speak, speak.",Speak speak
2,First_Citizen,You are all resolved rather to die than to fam...,You are all resolved rather to die than to famish
3,All,Resolved. resolved.,Resolved resolved
4,First_Citizen,"First, you know Caius Marcius is chief enemy t...",First you know Caius Marcius is chief enemy to...
5,All,"We know't, we know't.",We knowt we knowt
6,First_Citizen,"Let us kill him, and we'll have corn at our ow...",Let us kill him and well have corn at our own ...
7,All,"No more talking on't; let it be done: away, away!",No more talking ont let it be done away away
8,Second_Citizen,"One word, good citizens.",One word good citizens
9,First_Citizen,"We are accounted poor citizens, the patricians...",We are accounted poor citizens the patricians ...


# Dealing with speakers

In [4]:
# Ordered sequence of who speaks, one entry per row of `data`
speaker = data['speaker'].to_list()

In [5]:
# Distinct speakers in order of first appearance
# (dict keys drop duplicates while keeping insertion order)

states = [*dict.fromkeys(speaker)]
states

['First_Citizen', 'All', 'Second_Citizen', 'MENENIUS', 'MARCIUS']

In [6]:
# Map every speaker to its position in `states`; the same position is used
# as that speaker's row/column index in the count matrix
idx_map = dict(zip(states, range(len(states))))
idx_map

{'First_Citizen': 0,
 'All': 1,
 'Second_Citizen': 2,
 'MENENIUS': 3,
 'MARCIUS': 4}

In [7]:
# Square integer matrix of zeros: one row and one column per distinct speaker
n = len(states)
count_matrix = np.zeros(shape=(n, n), dtype=int)

In [8]:
# Display the freshly initialised matrix (all zeros at this point)
count_matrix

array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]])

In [9]:
# Fill transition counts: one increment per consecutive (speaker, next speaker) pair.
# Start from a fresh zero matrix so that re-running this cell does not add the
# same transitions on top of the counts left by a previous run.
count_matrix = np.zeros((n, n), dtype=int)

# idx_map is built from the distinct values of `speaker`, so every entry is a
# valid key; indexing directly (instead of silently skipping unknown names)
# makes a stale or overwritten `speaker` fail loudly with a KeyError.
for current, next_ in zip(speaker, speaker[1:]):
    count_matrix[idx_map[current], idx_map[next_]] += 1

count_matrix

array([[ 0,  5,  4, 13,  1],
       [ 4,  0,  2,  0,  0],
       [ 5,  1,  0,  0,  0],
       [12,  0,  0,  0,  5],
       [ 1,  0,  0,  4,  0]])

In [10]:
# Label the count matrix with speaker names, then turn every row into
# probabilities by dividing it by its own total. A row whose total is zero
# yields NaN (0/0), which is replaced by 0.
df = (
    pd.DataFrame(count_matrix, index=states, columns=states)
    .pipe(lambda counts: counts.div(counts.sum(axis=1), axis=0))
    .fillna(0)
)

df

Unnamed: 0,First_Citizen,All,Second_Citizen,MENENIUS,MARCIUS
First_Citizen,0.0,0.217391,0.173913,0.565217,0.043478
All,0.666667,0.0,0.333333,0.0,0.0
Second_Citizen,0.833333,0.166667,0.0,0.0,0.0
MENENIUS,0.705882,0.0,0.0,0.0,0.294118
MARCIUS,0.2,0.0,0.0,0.8,0.0


# Dealing with the speeches

In [11]:
# Reuse the ordered list of distinct speakers built in the first section
# (this binds a second name to the same list object; it is not a copy)
unique_speaker = states
unique_speaker

['First_Citizen', 'All', 'Second_Citizen', 'MENENIUS', 'MARCIUS']

In [12]:
# Collect every cleaned speech per speaker, in the order the rows appear in `data`.
# A boolean mask selects all of a speaker's rows in one vectorised step, rather
# than probing data.loc[i, ...] for each i in range(len(data)) once per speaker,
# which is slow and only works when the index labels are exactly 0..len(data)-1.
data_dict = {
    talking: data.loc[data['speaker'] == talking, 'speech_cleaned'].tolist()
    for talking in unique_speaker
}

In [13]:
# Convert each speaker's list of speeches into a single space-separated string.
# The iteration variable is deliberately not named `speaker`: that name holds
# the list of speakers created earlier, and rebinding it to a dictionary key
# would break any later re-run of the cells that read that list.
# Values that are already strings are left as they are, so running this cell a
# second time does not join their individual characters with spaces.
data_dict = {
    name: speeches if isinstance(speeches, str) else ' '.join(speeches)
    for name, speeches in data_dict.items()
}

First_Citizen = data_dict['First_Citizen']
All = data_dict['All']
Second_Citizen = data_dict['Second_Citizen']
MENENIUS = data_dict['MENENIUS']
MARCIUS = data_dict['MARCIUS']

In [14]:
# Break each speaker's combined text into a list of words, splitting on
# whitespace, e.g. 'One word good citizens' -> ['One', 'word', 'good', 'citizens']

First_Citizen, All, Second_Citizen, MENENIUS, MARCIUS = (
    text.split()
    for text in (First_Citizen, All, Second_Citizen, MENENIUS, MARCIUS)
)

In [19]:
# Lower-case every word so that capitalisation alone never makes the same word
# count as two different ones; anything that is not a string is kept unchanged
MARCIUS_2 = [
    item.lower() if isinstance(item, str) else item
    for item in MARCIUS
]


In [22]:
def transition_matrix(l):
    """Build a first-order transition-probability table from a sequence.

    Parameters
    ----------
    l : iterable of hashable
        Observed states in order (for example a list of words or speaker
        names). Any iterable is accepted, including one-shot iterators and
        generators; it is materialised into a list once.

    Returns
    -------
    pandas.DataFrame
        Square frame whose index and columns are the distinct states in order
        of first appearance. Entry ``[a, b]`` is the number of times ``a`` is
        immediately followed by ``b`` divided by the number of times ``a`` is
        followed by anything. A state that is never followed by another
        element gets a row of zeros.
    """
    # Materialise first: the sequence is walked twice (de-duplication and
    # pairing), which a generator or other one-shot iterable cannot support.
    sequence = list(l)

    # Distinct states in order of first appearance
    states = list(dict.fromkeys(sequence))

    # Position of each state along both axes of the matrix
    idx_map = {state: i for i, state in enumerate(states)}

    n = len(states)
    count_matrix = np.zeros((n, n), dtype=int)

    # Every element of `sequence` is a key of idx_map by construction, so the
    # pairs can be indexed directly without a membership test.
    for current, next_ in zip(sequence, sequence[1:]):
        count_matrix[idx_map[current], idx_map[next_]] += 1

    df = pd.DataFrame(count_matrix, index=states, columns=states)

    # Row-wise normalisation; a row total of 0 produces NaN (0/0), which is
    # replaced by 0 so the result contains only finite numbers.
    df = df.div(df.sum(axis=1), axis=0).fillna(0)

    return df

In [27]:
# Word-to-word transition probabilities computed from MARCIUS's lower-cased words
MARCIUS_3 = transition_matrix(MARCIUS_2)
MARCIUS_3

Unnamed: 0,thanks,whats,the,matter,you,dissentious,rogues,that,rubbing,poor,...,win,throw,greater,themes,insurrections,arguing,go,get,home,fragments
thanks,0.0,1.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
whats,0.0,0.0,0.500000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
the,0.0,0.0,0.000000,0.095238,0.0,0.000000,0.0,0.0,0.0,0.047619,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
matter,0.0,0.0,0.000000,0.000000,0.5,0.000000,0.0,0.5,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
you,0.0,0.0,0.071429,0.000000,0.0,0.071429,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.071429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
arguing,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.000000,0.000000
go,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.000000,0.000000
get,0.0,0.0,0.000000,0.000000,1.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
home,0.0,0.0,0.000000,0.000000,1.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000


In [28]:
# True when at least one entry of the word-transition table differs from zero
has_non_zero = MARCIUS_3.ne(0).any(axis=None)

print(has_non_zero)

True
