# Season 13 data preparation

This is the second version of the software, created to handle larger amounts of data. Here, the alignment is performed on every episode of the thirteenth season of *Grey's Anatomy*. The notebook takes two inputs:

 
1.   An XLSX file containing the data from episode 13x01 to 13x24.
2.   A ZIP file containing the subtitles of each episode.


# Libraries

Besides pandas and re, numpy and several standard-library modules (tempfile, zipfile, shutil, glob, os) are imported to extract and organize the files.

In [None]:
import pandas as pd
import numpy as np
import tempfile
import zipfile
import shutil
import glob
import os
import re

In [None]:
# Turning off pandas' chained-assignment check for the rest of the notebook
pd.set_option('mode.chained_assignment', None)

# Importing the files

In [None]:
# Asking for the location of the season workbook and loading it into a DataFrame

excel_path = input('Enter .xlsx file path: ')
df_excel = pd.read_excel(io=excel_path)

Enter .xlsx file path: /content/GAS12.xlsx


In [None]:
# Structure of the .xlsx file

df_excel

Unnamed: 0,Series,Season,Codice,Inizio,Fine,PP,SP,MC
0,GA,GAS11,GAS11E01,00:00:00,00:00:27,0,0,0
1,GA,GAS11,GAS11E01,00:00:27,00:00:48,0,6,0
2,GA,GAS11,GAS11E01,00:00:48,00:00:58,0,6,0
3,GA,GAS11,GAS11E01,00:00:58,00:01:11,0,6,0
4,GA,GAS11,GAS11E01,00:01:11,00:01:16,0,6,0
...,...,...,...,...,...,...,...,...
1831,GA,GAS11,GAS11E24,00:37:08,00:37:48,0,6,0
1832,GA,GAS11,GAS11E24,00:37:48,00:38:12,0,6,0
1833,GA,GAS11,GAS11E24,00:38:12,00:40:25,0,6,0
1834,GA,GAS11,GAS11E24,00:40:25,00:41:16,0,6,0


In [None]:
# Unzipping the contents of the .zip file into a flat temporary directory

temp_dir = tempfile.TemporaryDirectory()
zip_path = input('Enter .zip file path: ')

with zipfile.ZipFile(zip_path) as z:
    for zip_info in z.infolist():
        # Directory entries carry no data of their own; only files are extracted
        if zip_info.is_dir():
            continue

        # Dropping the folder part of the member name so that every file lands
        # directly in temp_dir. NOTE: two members with the same base name in
        # different folders overwrite each other.
        zip_info.filename = os.path.basename(zip_info.filename)
        z.extract(zip_info, temp_dir.name)

print('Temp directory path: ' + str(temp_dir.name))

Enter .zip file path: /content/subtitles_season_12.zip
Temp directory path: /tmp/tmp7q710f4y


In [None]:
# Collecting the extracted .srt files, sorted by path

srt_paths = sorted(glob.glob(os.path.join(temp_dir.name, '*.srt')))

# Cleaning the data

Before aligning the files, the following cleaning was performed on the Excel file:
*   Timestamps containing milliseconds were rounded to the nearest second
*   Typographical errors were fixed (e.g. "NA ", "000:09:20,487")
*   NaNs were replaced with 0s
*   Floats were converted to integers





In [None]:
# Removing the columns that the alignment below does not read

df_excel = df_excel.drop(
    columns=['Season', 'N_segmento', 'Durata', 'PP_rel', 'SP_rel', 'MC_rel', 'Note'],
)

In [None]:
# Start timestamps: text -> datetime (unparseable values become NaT) ->
# rounded to the nearest second -> plain time-of-day objects

df_excel['Inizio'] = (
    pd.to_datetime(df_excel['Inizio'].astype(str), errors='coerce')
    .dt.round('1s')
    .dt.time
)

In [None]:
# End timestamps: same pipeline as the start timestamps, after repairing the
# one malformed value '000:09:20,487' (line 614 of the workbook)

df_excel['Fine'] = (
    pd.to_datetime(
        df_excel['Fine'].astype(str).replace('000:09:20,487', '00:09:20'),
        errors='coerce',
    )
    .dt.round('1s')
    .dt.time
)

In [None]:
# Fixing other typographical errors
#
# The cleaned column is assigned back explicitly. Calling
# `df_excel.PP.replace(..., inplace=True)` mutates an intermediate Series and
# is not guaranteed to reach df_excel (it is a no-op under pandas Copy-on-Write).

for label_column in ('PP', 'SP', 'MC'):
    df_excel[label_column] = df_excel[label_column].replace('NA ', np.nan)

df_excel['MC'] = df_excel['MC'].replace('SP10', np.nan)  # Fixing a value in line 786

In [None]:
# Every missing value in the table becomes 0

df_excel = df_excel.fillna(value=0)

In [None]:
# Casting the three label columns to integers

for label_column in ('PP', 'SP', 'MC'):
    df_excel[label_column] = df_excel[label_column].astype(int)

In [None]:
# Structure of the .xlsx file after cleaning

df_excel

Unnamed: 0,Series,Season,Codice,Inizio,Fine,PP,SP,MC
0,GA,GAS12,GAS12E01,00:00:00,00:00:07,0,0,0
1,GA,GAS12,GAS12E01,00:00:07,00:00:27,0,6,0
2,GA,GAS12,GAS12E01,00:00:27,00:00:29,0,0,0
3,GA,GAS12,GAS12E01,00:00:29,00:00:56,0,6,0
4,GA,GAS12,GAS12E01,00:00:56,00:01:10,0,6,0
...,...,...,...,...,...,...,...,...
1491,GA,GAS12,GAS12E24,00:38:59,00:39:58,0,6,0
1492,GA,GAS12,GAS12E24,00:39:58,00:40:13,0,6,0
1493,GA,GAS12,GAS12E24,00:40:13,00:40:17,0,6,0
1494,GA,GAS12,GAS12E24,00:40:17,00:41:23,0,6,0


# Splitting the Excel file by episode

The idea behind the following section is to split the Excel file so that each episode is stored as a separate DataFrame. This way, the procedure followed in episode_12x01_data_preparation.ipynb can be reimplemented for multiple episodes as well.

In [None]:
# Finding where each episode starts and ends
#
# Both lists hold inclusive row positions: episode k spans
# episode_start[k]..episode_end[k]. The positions are used as .loc labels
# further down, which assumes df_excel still has its default 0..n-1 index.

codes = df_excel['Codice'].tolist()
last_row = len(codes) - 1

# A row closes an episode when it is the final row of the table or when the
# next row carries a different episode code
episode_end = [
    idx for idx, code in enumerate(codes)
    if idx == last_row or code != codes[idx + 1]
]

# The first episode starts at row 0; every other one starts right after the
# previous episode's last row
episode_start = [0] + [end + 1 for end in episode_end[:-1]]

In [None]:
# Boundaries of each episode

print(episode_start)
print(episode_end)

[0, 68, 130, 196, 265, 324, 382, 450, 512, 558, 653, 719, 782, 857, 922, 978, 1046, 1113, 1170, 1228, 1279, 1336, 1387, 1433]
[67, 129, 195, 264, 323, 381, 449, 511, 557, 652, 718, 781, 856, 921, 977, 1045, 1112, 1169, 1227, 1278, 1335, 1386, 1432, 1496]


In [None]:
# Slicing the season DataFrame into one DataFrame per episode
# (.loc slices include both boundaries)

df_episodes_list = [
    df_excel.loc[first_row:last_row]
    for first_row, last_row in zip(episode_start, episode_end)
]

print(f'Total episodes: {len(df_episodes_list)}')

Total episodes: 24


# Defining the functions

The following functions reimplement the procedure that was described in episode_12x01_data_preparation.ipynb. 

In [None]:
# Midpoint of every subtitle's timespan; the alignment compares this midpoint
# with the boundaries of the Excel segments

def average_time(average_times, start_times_datetime, end_times_datetime):
    """Append the midpoint of each (start, end) pair to ``average_times``.

    ``start_times_datetime`` and ``end_times_datetime`` are walked in lockstep
    and must support ``start + (end - start) / 2``. Nothing is returned; the
    results are added to the caller's list in order.
    """
    average_times.extend(
        begin + (finish - begin) / 2
        for begin, finish in zip(start_times_datetime, end_times_datetime)
    )

The data extracted from the subtitles and the Excel file is passed to either align_segments() or align_subs(). When the code is executed, the user is asked whether to align the data by segment (i.e. group the subtitles that belong to the same segment into a single row) or by subtitle (i.e. keep each subtitle on its own row).

In [None]:
# Option [1] of alignment_type(): one output row per Excel segment, holding the
# concatenated text of every subtitle that falls inside it

def align_segments(df_subs, codice, inizio, fine, pp, sp, mc, aligned_subs):
    """Group subtitles by Excel segment and append one DataFrame per segment.

    A subtitle belongs to a segment when its 'Average' value lies in the
    half-open interval (start, end]. The texts of the matching subtitles are
    concatenated without a separator into 'Segment text'. A segment without
    matching subtitles contributes an empty DataFrame. Results are appended to
    ``aligned_subs``; nothing is returned.
    """
    column_order = ['Code', 'Segment start', 'Segment end', 'PP', 'SP', 'MC', 'Segment text']

    segments = zip(codice, inizio, fine, pp, sp, mc)
    for code, seg_start, seg_end, pp_label, sp_label, mc_label in segments:

        after_start = df_subs['Average'] > seg_start
        up_to_end = df_subs['Average'] <= seg_end
        inside = df_subs.loc[after_start & up_to_end].drop(['Average'], axis=1)

        # Every matching row gets the same PP value, so grouping on PP folds
        # them into a single group whose texts are joined together
        inside['PP'] = pp_label
        merged = (
            inside.groupby('PP')['Subtitle text']
            .agg(''.join)
            .reset_index(name='Segment text')
        )

        merged['SP'] = sp_label
        merged['MC'] = mc_label
        merged['Segment start'] = seg_start
        merged['Segment end'] = seg_end
        merged['Code'] = code

        aligned_subs.append(merged[column_order])

In [None]:
# If the user selects option [2] when alignment_type() is called, the data is aligned by subtitle

def align_subs(df_subs, codice, inizio, fine, pp, sp, mc, aligned_subs):
    """Label every subtitle with the Excel segment it falls into.

    For each segment (one element from each of ``codice``, ``inizio``,
    ``fine``, ``pp``, ``sp`` and ``mc``), the rows of ``df_subs`` whose
    'Average' value lies in (start, end] are selected and annotated with the
    segment's code and PP/SP/MC values. One DataFrame per segment is appended
    to ``aligned_subs`` (empty when no subtitle matches). ``df_subs`` itself
    is left untouched and nothing is returned.
    """
    output_columns = ['Code', 'Subtitle start', 'Subtitle end', 'PP', 'SP', 'MC', 'Subtitle text']

    segments = zip(codice, inizio, fine, pp, sp, mc)
    for code, seg_start, seg_end, pp_label, sp_label, mc_label in segments:

        in_segment = (df_subs['Average'] > seg_start) & (df_subs['Average'] <= seg_end)

        # Explicit copy: the columns added below must never be written into a
        # slice of df_subs, and the code should not rely on the notebook-wide
        # chained-assignment warning being switched off
        matched = df_subs.loc[in_segment].copy()

        matched['Code'] = code
        matched['PP'] = pp_label
        matched['SP'] = sp_label
        matched['MC'] = mc_label

        # Stripping a trailing '000' turns a 6-digit fraction such as
        # '00:00:00.804000' into milliseconds ('00:00:00.804'); values without
        # that suffix are left as they are
        matched['Subtitle start'] = matched['Start'].astype(str).str.replace(r'000$', '', regex=True)
        matched['Subtitle end'] = matched['End'].astype(str).str.replace(r'000$', '', regex=True)

        aligned_subs.append(matched[output_columns])

For each episode, the process is the same as before:

1.   Parsing and storing subtitles in a DataFrame
2.   Calculating the average of the subtitles' timespans
3.   Using the average to perform the alignment either by segment or by subtitle
4.   Storing the episode's DataFrame in a list 
5.   Repeating the process until all of the episodes have been aligned

In [None]:
import codecs

# Timing line of an .srt entry, e.g. "00:00:01,000 --> 00:00:02,500"
_SRT_TIMING = re.compile(r'[0-9]{2}:[0-9]{2}:[0-9]{2},[0-9]{3} -->')


def _parse_srt_lines(lines):
    """Split raw .srt lines into start strings, end strings and subtitle texts."""
    starts, ends, blocks = [], [], [[]]

    for line in lines:
        if _SRT_TIMING.match(line):
            fields = line.split(' ')
            starts.append(fields[0])
            ends.append(fields[2])
            # The line right before a timing line is the entry's sequence
            # number, not subtitle text, so it is removed from the previous block
            if blocks[-1]:
                blocks[-1].pop()
            blocks.append([])
        else:
            blocks[-1].append(line)

    # blocks[0] only holds what preceded the first timing line. The lines of
    # each block are joined with spaces; in the usual layout the blank
    # separator line is part of the block, so a text ends with a space.
    texts = [' '.join(block) for block in blocks[1:]]
    return starts, ends, texts


def process_files(xlsx_file, srt_file, reply, aligned_episodes):
    """Align one episode's subtitles with its Excel segments.

    Parameters:
        xlsx_file: DataFrame of the episode's segments; the columns Codice,
            Inizio, Fine, PP, SP and MC are read.
        srt_file: path of the episode's .srt file (read as UTF-8, undecodable
            bytes are replaced).
        reply: '1' to align by segment (align_segments), '2' to align by
            subtitle (align_subs).
        aligned_episodes: list that receives the episode's aligned DataFrame.

    Raises:
        ValueError: if ``reply`` is neither '1' nor '2'.
    """
    # Failing early with a clear message; otherwise nothing would be aligned
    # and pd.concat would fail on an empty list further down
    if reply not in ('1', '2'):
        raise ValueError(f"reply must be '1' (by segment) or '2' (by subtitle), got {reply!r}")

    # The file is only needed for reading the raw lines
    with codecs.open(srt_file, 'r', 'utf-8', 'replace') as f:
        lines = f.read().splitlines()

    # Parsing .srt file

    start_strings, end_strings, texts = _parse_srt_lines(lines)
    starts = pd.to_datetime(start_strings, format='%H:%M:%S,%f')
    ends = pd.to_datetime(end_strings, format='%H:%M:%S,%f')

    # Averaging timespans

    midpoints = []
    average_time(midpoints, starts, ends)

    # Creating .srt DataFrame

    df_subs = pd.concat(
        [
            pd.Series(starts, name='Start').dt.time,
            pd.Series(ends, name='End').dt.time,
            # to_datetime keeps a datetime dtype even when the file has no subtitles
            pd.Series(pd.to_datetime(midpoints), name='Average').dt.time,
            pd.Series(texts, name='Subtitle text', dtype='string'),
        ],
        axis=1,
    ).reset_index(drop=True)

    # Matching subtitles with segments

    aligned_subs = []
    aligner = align_segments if reply == '1' else align_subs
    aligner(
        df_subs,
        xlsx_file.Codice, xlsx_file.Inizio, xlsx_file.Fine,
        xlsx_file.PP, xlsx_file.SP, xlsx_file.MC,
        aligned_subs,
    )

    # Appending aligned episode to episode list, renumbered from 0

    aligned_file = pd.concat(aligned_subs, ignore_index=True)
    aligned_episodes.append(aligned_file)

# Aligning the episodes

Calling the functions described above to perform the alignment.

In [None]:
aligned_episodes = []

def alignment_type(question):
    """Ask how to align the data, then align every episode.

    ``question`` is shown until the answer is '1' (by segment) or '2' (by
    subtitle). Each episode DataFrame in ``df_episodes_list`` is paired with
    the .srt path at the same position in ``srt_paths`` and handed to
    process_files(), which appends its result to ``aligned_episodes``.
    """
    reply = str(input(question)).lower().strip()

    # Any other answer would only fail later, inside process_files
    while reply not in ('1', '2'):
        print('Please answer 1 or 2.')
        reply = str(input(question)).lower().strip()

    # zip() stops at the shorter list, so unmatched items would be dropped silently
    if len(df_episodes_list) != len(srt_paths):
        print(
            f'Warning: {len(df_episodes_list)} episodes in the Excel file but '
            f'{len(srt_paths)} .srt files; only the first '
            f'{min(len(df_episodes_list), len(srt_paths))} pairs are aligned.'
        )

    print('Aligning episodes...')

    for xlsx, srt in zip(df_episodes_list, srt_paths):
        process_files(xlsx, srt, reply, aligned_episodes)

In [None]:
alignment_type('Align by segment [1] or Align by subtitle [2]? ') 

Align by segment [1] or Align by subtitle [2]? 2
Aligning episodes...


The list containing each episode's DataFrame is merged with pd.concat() to create a single DataFrame with all of the aligned data.

In [None]:
# Stacking the per-episode DataFrames and renumbering the rows from 0
season_df = pd.concat(aligned_episodes, ignore_index=True)
season_df

Unnamed: 0,Code,Subtitle start,Subtitle end,PP,SP,MC,Subtitle text
0,GAS12E01,00:00:00.804,00:00:02.701,0,0,0,Each of you comes here today hopeful...
1,GAS12E01,00:00:02.702,00:00:04.619,0,0,0,I have five rules. Memorize them.
2,GAS12E01,00:00:04.620,00:00:07.288,0,0,0,Can anybody name...
3,GAS12E01,00:00:07.289,00:00:08.456,0,6,0,"<i>So, you might be thinking...</i>"
4,GAS12E01,00:00:08.457,00:00:10.291,0,6,0,"Rule number five... When I move, you move."
...,...,...,...,...,...,...,...
22067,GAS12E24,00:42:00.204,00:42:02.305,0,6,0,Riggs.
22068,GAS12E24,00:42:03.808,00:42:06.977,0,6,0,I really like him.
22069,GAS12E24,00:42:07.011,00:42:09.079,0,6,0,I think he likes me.
22070,GAS12E24,00:42:09.113,00:42:11.081,0,6,0,<i>Or maybe...</i>


# Deleting temporary files

Deleting the temporary directory where the extracted .srt files were stored.

In [None]:
# temp_dir is a tempfile.TemporaryDirectory; cleanup() removes the directory
# and tells the object it is gone, so nothing tries to delete it again later
temp_dir.cleanup()

# Exporting to Excel

In [None]:
def yes_or_no(question, output_path='season_12_with_subtitles.xlsx'):
    """Ask a yes/no question and export ``season_df`` to Excel on "yes".

    Parameters:
        question: text shown to the user; ' [y/n]: ' is appended.
        output_path: where the workbook is written.
            NOTE(review): the default name says season 12 while the notebook
            header says season 13 - confirm which one is intended.

    Any answer that does not start with 'y' (including an empty one) leaves
    everything untouched. Nothing is returned.
    """
    reply = str(input(question + ' [y/n]: ')).lower().strip()

    # startswith() also copes with an empty answer, where reply[0] would raise
    if reply.startswith('y'):
        season_df.to_excel(output_path)
        print('Exported to Excel')

In [None]:
yes_or_no('Export to Excel?') 

Export to Excel? [y/n]: y
