# Data Preparation

In [1]:
import numpy as np
import pandas as pd

# Load the raw speaker/speech table; later cells read its 'speaker' and 'speech_cleaned' columns
data = pd.read_csv("speaker-speech.csv")

The speakers "First Citizen" and "Second Citizen" have spaces in their names. Since we'll be working with these names from both a dataframe and a list perspective, it's better to replace the spaces with underscores until the final datasets are produced.

In [2]:
# Swap the space for an underscore in the two multi-word speaker names
# (plain-text replacement, no regular expressions)
data['speaker'] = (
    data['speaker']
    .str.replace('First Citizen', 'First_Citizen', regex=False)
    .str.replace('Second Citizen', 'Second_Citizen', regex=False)
)

# Creating the transition matrices

In this step we separate the process into three sub-processes:


1) Create an algorithm to calculate a transition matrix:


In this process we code up instructions for how to take a list, determine the unique states that exist within that list, and then build the transition matrix with the probability of moving from one state to another.


2) Create the transition matrix for how the speakers follow one another:

We only need to convert the column "speaker" into a list and then run it through the algorithm to create a transition matrix


3) Create transition matrices for each speaker:

In this instance, we first create lists that contain all the different speeches for each speaker. 
    
Then we ensure that all the speeches are in one cell, so we can separate the words within the speeches to create one long list of the words from all the speeches by that speaker. These are further run through a step that ensures all words are lower case, so the transition matrix algorithm doesn't detect the same word as multiple states due to capitalization of some of the words.

Finally, we run the different lists through the algorithm to obtain the final transition probabilities of each word for the different speakers.

## 1) Algorithm for creating a transition matrix

In [3]:
def transition_matrix(l):
    """Build a first-order transition probability matrix from a sequence.

    Parameters
    ----------
    l : list
        Ordered observations (states). Each adjacent pair (l[k], l[k + 1])
        counts as one transition.

    Returns
    -------
    pandas.DataFrame
        Square table whose rows and columns are the distinct states in order
        of first appearance. Entry (row, column) is the fraction of
        transitions leaving `row` that went to `column`. A state with no
        outgoing transitions gets a row of zeros.
    """
    # Assign each distinct state the position at which it first appears;
    # a dict keeps insertion order, so its keys double as the ordered labels
    position = {}
    for state in l:
        position.setdefault(state, len(position))
    labels = list(position)

    # Tally every observed (current -> following) pair
    size = len(labels)
    counts = np.zeros((size, size), dtype=int)
    for origin, target in zip(l, l[1:]):
        counts[position[origin], position[target]] += 1

    # Label the tallies, then divide each row by its total; rows that sum to
    # zero yield NaN from the division and are reset to 0
    table = pd.DataFrame(counts, index=labels, columns=labels)
    row_totals = table.sum(axis=1)
    return table.div(row_totals, axis=0).fillna(0)

## 2) Speakers Transition Matrix

In [4]:
# Extract the sequence of speakers from the dataframe as a plain Python list
speaker = data['speaker'].to_list()

# Build the speaker-to-speaker transition matrix from that sequence
speaker_transition_matrix = transition_matrix(speaker)

In [5]:
# Display the speaker transition probabilities (rows: current speaker, columns: next speaker)
speaker_transition_matrix

Unnamed: 0,First_Citizen,All,Second_Citizen,MENENIUS,MARCIUS
First_Citizen,0.0,0.217391,0.173913,0.565217,0.043478
All,0.666667,0.0,0.333333,0.0,0.0
Second_Citizen,0.833333,0.166667,0.0,0.0,0.0
MENENIUS,0.705882,0.0,0.0,0.0,0.294118
MARCIUS,0.2,0.0,0.0,0.8,0.0


In [6]:
# Save the matrix to CSV. The index is written as well because it holds the
# "from" speaker label of each row; dropping it would leave the rows
# unlabeled in the file.

speaker_transition_matrix.to_csv("speaker_transition_matrix.csv", index=True)

## 3) Speeches Transition Matrices

### a) Collect each speaker's speeches into a list

In [6]:
# Map each speaker to the list of their speeches, in the order the rows
# appear in `data`.

# Unique speakers in order of first appearance. Derived straight from the
# dataframe column so this cell does not depend on the `speaker` variable
# still holding the list of speakers when it is (re-)run.
unique_speaker = list(dict.fromkeys(data['speaker'].tolist()))

# A boolean mask per speaker selects that speaker's rows in one vectorized
# step instead of visiting every row with a scalar .loc lookup.
data_dict = {
    talking: data.loc[data['speaker'] == talking, 'speech_cleaned'].tolist()
    for talking in unique_speaker
}

### b) Convert each speaker's speeches into a single list of words

In [7]:
# `speaker_name` (rather than `speaker`) is the loop variable so that the
# `speaker` list created from the dataframe column is not overwritten.
for speaker_name, speeches in data_dict.items():

    # Join all of the speaker's speeches into one string, then split on
    # whitespace to obtain a single flat list of words
    words = ' '.join(speeches).split()

    # Replace the list of speeches with the flat word list
    data_dict[speaker_name] = words

    # Also expose the word list as a module-level variable named after the
    # speaker (e.g. First_Citizen); the later cells refer to these names
    globals()[speaker_name] = words

### c) Ensure every word is lower case

In [8]:
# Normalise every word to lower case

# The per-speaker word lists, in a fixed order
all_lists = [First_Citizen, All, Second_Citizen, MENENIUS, MARCIUS]

# Overwrite each list's contents in place (slice assignment) so that every
# existing reference to the list sees the lower-cased words
for word_list in all_lists:
    word_list[:] = [word.lower() for word in word_list]

### d) Calculate transition matrices

In [27]:
# Word lists for each speaker
all_lists = [First_Citizen, All, Second_Citizen, MENENIUS, MARCIUS]

# Dictionary keys to use for the lists above, in the same order
speaker_names = ['first_citizen', 'all', 'second_citizen', 'menenius', 'marcius']

# Build one transition matrix per speaker, keyed by the speaker's name
transition_matrices = {
    label: transition_matrix(words)
    for label, words in zip(speaker_names, all_lists)
}

In [28]:
# Inspect the word-to-word transition matrix built from the MARCIUS word list
transition_matrices['marcius']

Unnamed: 0,thanks,whats,the,matter,you,dissentious,rogues,that,rubbing,poor,...,win,throw,greater,themes,insurrections,arguing,go,get,home,fragments
thanks,0.0,1.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
whats,0.0,0.0,0.500000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
the,0.0,0.0,0.000000,0.095238,0.0,0.000000,0.0,0.0,0.0,0.047619,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
matter,0.0,0.0,0.000000,0.000000,0.5,0.000000,0.0,0.5,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
you,0.0,0.0,0.071429,0.000000,0.0,0.071429,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.071429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
arguing,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.000000,0.000000
go,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.000000,0.000000
get,0.0,0.0,0.000000,0.000000,1.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
home,0.0,0.0,0.000000,0.000000,1.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000


In [29]:
# Write each speaker's transition matrix to its own CSV file (row labels included)

for name in transition_matrices:
    transition_matrices[name].to_csv(f"{name}_transition_matrix.csv", index=True)