# **Season 13 alignment**

**Input:** season 13 .xlsx file (coding), season 13 .srt files (subtitles)

**Output:** season 13 subtitles aligned with the coding file

# Splitting .xlsx file by episode

In [1]:
import pandas as pd
import numpy as np
import glob
import os
import re

In [2]:
pd.options.mode.chained_assignment = None 

In [3]:
# Opening season 13 .xlsx file 

excelpath = input('Enter .xlsx file path: ')
df_excel = pd.read_excel(excelpath)
df_excel = df_excel.drop(['Season', 'N_segmento', 'Durata', 'PP_rel', 'SP_rel', 'MC_rel', 'Note'], axis=1)

Enter .xlsx file path: /content/coding_season_13.xlsx


In [5]:
# Normalizing timestamps

df_excel['Inizio'] = df_excel['Inizio'].astype(str)
df_excel['Inizio'] = pd.to_datetime(df_excel['Inizio'], errors = 'coerce')
df_excel['Inizio'] = df_excel['Inizio'].dt.round('1s')
df_excel['Inizio'] = df_excel['Inizio'].dt.time

In [6]:
df_excel['Fine'] = df_excel['Fine'].astype(str)
df_excel['Fine'] = df_excel['Fine'].apply(lambda x : '00:09:20' if x=='000:09:20,487' else x) # Fixing a value in line 614
df_excel['Fine'] = pd.to_datetime(df_excel['Fine'], errors = 'coerce')
df_excel['Fine'] = df_excel['Fine'].dt.round('1s')
df_excel['Fine'] = df_excel['Fine'].dt.time

In [7]:
# Cleaning up labels

df_excel.PP.replace('NA ', np.nan, inplace=True)
df_excel.SP.replace('NA ', np.nan, inplace=True)
df_excel.MC.replace('NA ', np.nan, inplace=True)
df_excel.MC.replace('SP10', np.nan, inplace=True) # Fixing a value in line 786

In [8]:
# Replacing NaNs with 0s

df_excel = df_excel.fillna(0)

In [9]:
df_excel

Unnamed: 0,Codice,Inizio,Fine,PP,SP,MC
0,GAS13E01,00:00:00,00:00:44,0.0,0.0,0.0
1,GAS13E01,00:00:44,00:00:49,0.0,0.0,0.0
2,GAS13E01,00:00:49,00:02:18,0.0,6.0,0.0
3,GAS13E01,00:02:18,00:02:36,2.0,2.0,2.0
4,GAS13E01,00:02:36,00:03:18,0.0,6.0,0.0
...,...,...,...,...,...,...
1575,GAS13E24,00:41:23,00:41:43,0.0,0.0,0.0
1576,GAS13E24,00:41:43,00:41:47,0.0,0.0,6.0
1577,GAS13E24,00:41:47,00:41:59,0.0,0.0,0.0
1578,GAS13E24,00:41:59,00:42:10,0.0,0.0,6.0


In [10]:
# Finding where each episode starts and ends

episode_start = []
episode_end = []

episode_start.append(0)

for idx, val in enumerate(df_excel.Codice):

    try:

      if df_excel.Codice[idx] != df_excel.Codice[idx+1]:
        episode_end.append(idx)
        episode_start.append(idx+1)
    
    except Exception:
      
      episode_end.append(len(df_excel.Codice))

In [11]:
# Boundaries of each episode

print(episode_start)
print(episode_end)

[0, 47, 112, 162, 221, 278, 331, 387, 443, 497, 542, 601, 654, 714, 818, 893, 963, 1034, 1098, 1171, 1261, 1324, 1392, 1474]
[46, 111, 161, 220, 277, 330, 386, 442, 496, 541, 600, 653, 713, 817, 892, 962, 1033, 1097, 1170, 1260, 1323, 1391, 1473, 1580]


In [12]:
# Storing each episode as a separate df

df_episodes_list = []

for ep_start, ep_end in zip(episode_start, episode_end):

  df_episode = df_excel.loc[ep_start:ep_end]
  df_episodes_list.append(df_episode)

print('Total episodes: ' + str(len(df_episodes_list)))

Total episodes: 24


In [13]:
# Exporting the dfs as .xlsx files

folderpath = "/content/coding_season_13" 
os.mkdir(folderpath)
excel = 'xlsx'  

for i, df in enumerate(df_episodes_list):
    filename = "{}.{}".format(i+1, excel)
    filepath = os.path.join(folderpath, filename)
    df.to_excel(filepath)

# Defining the functions

In [14]:
def average_time(average_times, start_times_datetime, end_times_datetime):

  for s, e in zip(start_times_datetime, end_times_datetime):
    ts1 = s
    ts2 = e
    average_time = ts1+(ts2-ts1)/2
    average_times.append(average_time)

In [30]:
def align_subs(df_subs, inizio, fine, pp, sp, mc, aligned_subs):

  for start, end, pp, sp, mc in zip(inizio, fine, pp, sp, mc):
    
    mask = (df_subs['Average'] > start) & (df_subs['Average'] <= end)
    mask_df = df_subs.loc[mask]

    mask_df = mask_df.drop(['Average'], axis=1)
    mask_df['PP'] = pp
    mask_df = mask_df.groupby("PP")
    mask_df = mask_df["Subtitle"].agg(lambda column: "".join(column))
    mask_df = mask_df.reset_index(name="Subtitle")
    mask_df['SP'] = sp
    mask_df['MC'] = mc
    #mask_df['Start'] = start   # To add start/end timestamps as well
    #mask_df['End'] = end
    mask_df = mask_df[['Start', 'End', 'Subtitle', 'PP', 'SP', 'MC']]
    aligned_subs.append(mask_df)

In [31]:
aligned_episodes = []

def process_files(xlsx_file, srt_file, aligned_episodes):

  df_excel = pd.read_excel(xlsx_file)
  df_excel = df_excel.reset_index(drop=True)
  df_excel = df_excel.drop('Unnamed: 0', axis=1)
  df_excel['Inizio'] =  pd.to_datetime(df_excel['Inizio']).dt.time
  df_excel['Fine'] = pd.to_datetime(df_excel['Fine']).dt.time


  with open(srt_file, 'r') as f:
    subs = f.read().splitlines()

    # Parsing .srt file

    re_pattern = r'[0-9]{2}:[0-9]{2}:[0-9]{2},[0-9]{3} -->'
    regex = re.compile(re_pattern)

    start_times = list(filter(regex.search, subs))
    start_times = [time.split(' ')[0] for time in start_times]
    start_times_datetime1 = pd.to_datetime(start_times, format="%H:%M:%S,%f")
    start_times_datetime2 = pd.Series(start_times_datetime1, name='Start').dt.time

    end_times = list(filter(regex.search,subs))
    end_times = [time.split(' ')[2] for time in end_times]
    end_times_datetime1 = pd.to_datetime(end_times, format="%H:%M:%S,%f")
    end_times_datetime2 = pd.Series(end_times_datetime1, name='End').dt.time

    subtitles = [[]]

    for sub in subs:
        if re.match(re_pattern, sub):
          subtitles[-1].pop()
          subtitles.append([])
        else:
          subtitles[-1].append(sub)

    subtitles = subtitles[1:]
    subtitles = [' '.join(x) for x in subtitles]
    subtitles = pd.Series(subtitles, name='Subtitle')

    # Averaging timespans

    average_times = []
    average_time(average_times, start_times_datetime1, end_times_datetime1)
    average_times = pd.Series(average_times, name='Average').dt.time

    # Creating .srt DataFrame

    df_subs = pd.concat([average_times, subtitles], axis=1)
    df_subs = df_subs.reset_index(drop=True)

    # Matching subtitles with segments

    aligned_subs = []
    align_subs(df_subs, df_excel.Inizio, df_excel.Fine, df_excel.PP, df_excel.SP, df_excel.MC, aligned_subs)

    # Appending aligned episode to episode list

    aligned_file = pd.concat(aligned_subs)
    aligned_file.reset_index(drop=True)
    aligned_episodes.append(aligned_file)


# Preparing files

In [17]:
!unzip /content/subtitles_season_13.zip

Archive:  /content/subtitles_season_13.zip
   creating: season_13/
  inflating: season_13/13x01.srt     
  inflating: season_13/13x02.srt     
  inflating: season_13/13x03.srt     
  inflating: season_13/13x04.srt     
  inflating: season_13/13x05.srt     
  inflating: season_13/13x06.srt     
  inflating: season_13/13x07.srt     
  inflating: season_13/13x08.srt     
  inflating: season_13/13x09.srt     
  inflating: season_13/13x10.srt     
  inflating: season_13/13x11.srt     
  inflating: season_13/13x12.srt     
  inflating: season_13/13x13.srt     
  inflating: season_13/13x14.srt     
  inflating: season_13/13x15.srt     
  inflating: season_13/13x16.srt     
  inflating: season_13/13x17.srt     
  inflating: season_13/13x18.srt     
  inflating: season_13/13x19.srt     
  inflating: season_13/13x20.srt     
  inflating: season_13/13x21.srt     
  inflating: season_13/13x22.srt     
  inflating: season_13/13x23.srt     
  inflating: season_13/13x24.srt     


In [18]:
# Subtitle paths 

srt_paths = []

os.rename('/content/season_13','/content/subtitles_season_13')

for filename in sorted(glob.glob(os.path.join('/content/subtitles_season_13', '*.srt'))):
  srt_paths.append(filename)

In [19]:
# Excel paths

xlsx_paths = []

sort_dir = sorted(os.listdir('/content/coding_season_13/'), key=lambda x: int(x.replace(".xlsx", "")))

for filename in sort_dir:
  dir_path = '/content/coding_season_13/' + filename
  xlsx_paths.append(dir_path)

# Aligning the episodes

In [32]:
for xlsx, srt in zip(xlsx_paths, srt_paths):
  process_files(xlsx, srt, aligned_episodes)

In [33]:
season_df = pd.concat(aligned_episodes)
season_df = season_df.reset_index(drop=True)
season_df

Unnamed: 0,Start,End,Subtitle,PP,SP,MC
0,00:00:00,00:00:44,"<i>Previously on ""Grey's Anatomy""...</i> I wan...",0,0,0
1,00:00:44,00:00:49,♪,0,0,0
2,00:00:49,00:02:18,♪ I ain't got no problem ♪ ♪ That's for real ♪...,0,6,0
3,00:02:18,00:02:36,[Siren wails] Isaac: What do we got? We got a ...,2,2,2
4,00:02:36,00:03:18,Two champagnes. You got it. I thought you were...,0,6,0
...,...,...,...,...,...,...
1466,00:40:58,00:41:23,"[Engine starts] <i>Nobody wakes up thinking, ""...",0,0,6
1467,00:41:23,00:41:43,"<i>Sometimes, we wake up, we face our fears......",0,0,0
1468,00:41:43,00:41:47,<i>We take them by the hand.</i> ♪♪,0,0,6
1469,00:41:59,00:42:10,"♪ So far away ♪ <i>- And we stand there, waiti...",0,0,6


In [34]:
season_df.to_excel('/content/season_13.xlsx')