In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv("speaker-speech.csv")

In [3]:
data['speaker'] = data['speaker'].str.replace('First Citizen', 'First_Citizen', regex=False)
data['speaker'] = data['speaker'].str.replace('Second Citizen', 'Second_Citizen', regex=False)

data

Unnamed: 0,speaker,speech,speech_cleaned
0,First_Citizen,"Before we proceed any further, hear me speak.",Before we proceed any further hear me speak
1,All,"Speak, speak.",Speak speak
2,First_Citizen,You are all resolved rather to die than to fam...,You are all resolved rather to die than to famish
3,All,Resolved. resolved.,Resolved resolved
4,First_Citizen,"First, you know Caius Marcius is chief enemy t...",First you know Caius Marcius is chief enemy to...
5,All,"We know't, we know't.",We knowt we knowt
6,First_Citizen,"Let us kill him, and we'll have corn at our ow...",Let us kill him and well have corn at our own ...
7,All,"No more talking on't; let it be done: away, away!",No more talking ont let it be done away away
8,Second_Citizen,"One word, good citizens.",One word good citizens
9,First_Citizen,"We are accounted poor citizens, the patricians...",We are accounted poor citizens the patricians ...


# Dealing with speakers

In [4]:
speaker = data['speaker'].tolist()

In [5]:
# Get unique states in desired order

states = list(dict.fromkeys(speaker))
states

['First_Citizen', 'All', 'Second_Citizen', 'MENENIUS', 'MARCIUS']

In [6]:
# find the index of each state
idx_map = {state: i for i, state in enumerate(states)}
idx_map

{'First_Citizen': 0,
 'All': 1,
 'Second_Citizen': 2,
 'MENENIUS': 3,
 'MARCIUS': 4}

In [7]:
# Initialize zero matrix
n = len(states)
count_matrix = np.zeros((n, n), dtype=int)

In [8]:
count_matrix

array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]])

In [9]:
# Fill transition counts
for current, next_ in zip(speaker, speaker[1:]):
    if current in idx_map and next_ in idx_map:
        i, j = idx_map[current], idx_map[next_]
        count_matrix[i, j] += 1

count_matrix

array([[ 0,  5,  4, 13,  1],
       [ 4,  0,  2,  0,  0],
       [ 5,  1,  0,  0,  0],
       [12,  0,  0,  0,  5],
       [ 1,  0,  0,  4,  0]])

In [10]:
# Create DataFrame
df = pd.DataFrame(count_matrix, index=states, columns=states)


# Convert counts to probabilities row-wise
df = df.div(df.sum(axis=1), axis=0).fillna(0)

df

Unnamed: 0,First_Citizen,All,Second_Citizen,MENENIUS,MARCIUS
First_Citizen,0.0,0.217391,0.173913,0.565217,0.043478
All,0.666667,0.0,0.333333,0.0,0.0
Second_Citizen,0.833333,0.166667,0.0,0.0,0.0
MENENIUS,0.705882,0.0,0.0,0.0,0.294118
MARCIUS,0.2,0.0,0.0,0.8,0.0


# Dealing with the speeches

In [53]:
unique_speaker = states
unique_speaker

['First_Citizen', 'All', 'Second_Citizen', 'MENENIUS', 'MARCIUS']

In [61]:
data_dict = {}

for talking in unique_speaker:
    data_dict[talking] = []  # Initialize the list for each speaker
    for i in range(len(data)):
        if data.loc[i, 'speaker'] == talking:
            data_dict[talking].append(data.loc[i, 'speech_cleaned'])

In [62]:
# Convert each list into a single space-separated string
for speaker in data_dict:
    data_dict[speaker] = ' '.join(data_dict[speaker])

First_Citizen = data_dict['First_Citizen']
All = data_dict['All']
Second_Citizen = data_dict['Second_Citizen']
MENENIUS = data_dict['MENENIUS']
MARCIUS = data_dict['MARCIUS']

In [63]:
Second_Citizen

'One word good citizens Would you proceed especially against Caius Marcius Consider you what services he has done for his country Nay but speak not maliciously What he cannot help in his nature you account a vice in him You must in no way say he is covetous Worthy Menenius Agrippa one that hath always loved the people'

In [64]:
# l = ['One word good citizens Would you proceed']
# split_words = l[0].split()

First_Citizen = First_Citizen.split()
All = All.split()
Second_Citizen = Second_Citizen.split()
MENENIUS = MENENIUS.split()
MARCIUS = MARCIUS.split()

In [69]:
# Make every letter a small case letter so that there's no distinction between the same word that is differentiated by capital letters
MARCIUS_2 = []

for item in All:
    if isinstance(item, str):
        MARCIUS_2.append(item.lower())
    else:
        MARCIUS_2.append(item)


In [70]:
def transition_matrix(l):
    
    # Get unique states in desired order
    states = list(dict.fromkeys(l))

    # find the index of each state
    idx_map = {state: i for i, state in enumerate(states)}

    # Initialize zero matrix
    n = len(states)
    count_matrix = np.zeros((n, n), dtype=int)

    # Fill transition counts
    for current, next_ in zip(speaker, speaker[1:]):
        if current in idx_map and next_ in idx_map:
            i, j = idx_map[current], idx_map[next_]
            count_matrix[i, j] += 1

    # Create DataFrame
    df = pd.DataFrame(count_matrix, index=states, columns=states)


    # Convert counts to probabilities row-wise
    df = df.div(df.sum(axis=1), axis=0).fillna(0)

    return df

In [75]:
MENENIUS_3 = transition_matrix(MENENIUS_2)
MENENIUS_3

Unnamed: 0,speak,resolved,we,knowt,no,more,talking,ont,let,it,...,him,first,hes,a,very,dog,to,the,commonalty,come
speak,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
resolved,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
we,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
knowt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
no,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
more,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
talking,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ont,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
let,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
it,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [2]:
import numpy as np
import pandas as pd

def transition_matrix(seq, sort_states=True, normalize=False):
    """
    Creates a transition matrix from a sequence of states.

    Parameters:
    - seq: list of states (e.g., ['a', 'b', 'c'])
    - sort_states: if True, states are sorted alphabetically
                   if False, the order of first appearance is preserved
    - normalize: if True, returns a row-normalized probability matrix

    Returns:
    - pandas.DataFrame of transitions (counts or probabilities)
    """
    if len(seq) < 2:
        raise ValueError("Sequence must have at least two elements")

    # Get unique states in desired order
    if sort_states:
        states = sorted(set(seq))
    else:
        states = list(dict.fromkeys(seq))  # preserves first occurrence order

    idx_map = {state: i for i, state in enumerate(states)}
    n = len(states)

    # Initialize zero matrix
    count_matrix = np.zeros((n, n), dtype=int)

    # Fill transition counts
    for current, next_ in zip(seq, seq[1:]):
        if current in idx_map and next_ in idx_map:
            i, j = idx_map[current], idx_map[next_]
            count_matrix[i, j] += 1

    # Create DataFrame
    df = pd.DataFrame(count_matrix, index=states, columns=states)

    if normalize:
        # Convert counts to probabilities row-wise
        df = df.div(df.sum(axis=1), axis=0).fillna(0)

    return df

# ── Example usage ─────────────────────────────────────
x = ['a', 'b', 'd', 'a', 'c', 'b', 'a', 'd']

# Transition counts
transition_counts = transition_matrix(x, sort_states=True, normalize=False)
print("🔢 Transition Counts:")
print(transition_counts)

# Transition probabilities
transition_probs = transition_matrix(x, sort_states=True, normalize=True)
print("\n📊 Transition Probabilities:")
print(transition_probs.round(2))


🔢 Transition Counts:
   a  b  c  d
a  0  1  1  1
b  1  0  0  1
c  0  1  0  0
d  1  0  0  0

📊 Transition Probabilities:
     a     b     c     d
a  0.0  0.33  0.33  0.33
b  0.5  0.00  0.00  0.50
c  0.0  1.00  0.00  0.00
d  1.0  0.00  0.00  0.00
