<a href="https://colab.research.google.com/github/SunkaraboinaPraveenKumar/Machine_Learning_Projects/blob/main/Theme_classification_development.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
from transformers import pipeline
from nltk import sent_tokenize
import nltk
import torch

In [13]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [14]:
# Model configuration: checkpoint name and the device inference should run on
model_name="facebook/bart-large-mnli"
# Use CUDA device index 0 when a GPU is visible, otherwise the string 'cpu'
device = 'cpu' if not torch.cuda.is_available() else 0

In [15]:
def load_model(device):
    """Build the zero-shot classification pipeline used for theme scoring.

    Args:
        device: Forwarded unchanged to ``pipeline`` as its ``device`` argument.

    Returns:
        The object returned by ``pipeline("zero-shot-classification", ...)``,
        created for the module-level ``model_name`` checkpoint.
    """
    pipeline_kwargs = {"model": model_name, "device": device}
    return pipeline("zero-shot-classification", **pipeline_kwargs)

In [16]:
theme_classifier=load_model(device)

In [17]:
theme_list=["friendship","hope","sacrifice","battle","self development","betrayal","love","dialogue"]

In [18]:
theme_classifier(
    "I gave him a right hook then a left jab",
    theme_list,
    multi_label=True
)

{'sequence': 'I gave him a right hook then a left jab',
 'labels': ['battle',
  'self development',
  'hope',
  'sacrifice',
  'dialogue',
  'betrayal',
  'love',
  'friendship'],
 'scores': [0.9121255278587341,
  0.4749981760978699,
  0.0878177061676979,
  0.04499982297420502,
  0.020132659003138542,
  0.012040399014949799,
  0.004292286932468414,
  0.0028172098100185394]}

In [19]:
!unzip /content/subtitles.zip

Archive:  /content/subtitles.zip
   creating: subtitles/
  inflating: subtitles/Naruto Season 1 - 01.ass  
  inflating: subtitles/Naruto Season 1 - 02.ass  
  inflating: subtitles/Naruto Season 1 - 03.ass  
  inflating: subtitles/Naruto Season 1 - 04.ass  
  inflating: subtitles/Naruto Season 1 - 05.ass  
  inflating: subtitles/Naruto Season 1 - 06.ass  
  inflating: subtitles/Naruto Season 1 - 07.ass  
  inflating: subtitles/Naruto Season 1 - 08.ass  
  inflating: subtitles/Naruto Season 1 - 09.ass  
  inflating: subtitles/Naruto Season 1 - 10.srt  
  inflating: subtitles/Naruto Season 1 - 11.srt  
  inflating: subtitles/Naruto Season 1 - 12.ass  
  inflating: subtitles/Naruto Season 1 - 13.ass  
  inflating: subtitles/Naruto Season 1 - 14.ass  
  inflating: subtitles/Naruto Season 1 - 15.ass  
  inflating: subtitles/Naruto Season 1 - 16.ass  
  inflating: subtitles/Naruto Season 1 - 17.ass  
  inflating: subtitles/Naruto Season 1 - 18.ass  
  inflating: subtitles/Naruto Season 1 - 19

In [20]:
#Load dataset
from glob import glob
files=glob("/content/subtitles/*.ass")

In [21]:
files[:5]

['/content/subtitles/Naruto Season 3 - 62.ass',
 '/content/subtitles/Naruto Season 5 - 114.ass',
 '/content/subtitles/Naruto Season 4 - 97.ass',
 '/content/subtitles/Naruto Season 1 - 01.ass',
 '/content/subtitles/Naruto Season 3 - 55.ass']

In [22]:
# The subtitle text contains non-ASCII punctuation (curly quotes, ellipses), so
# decode explicitly as UTF-8 instead of relying on the platform default encoding.
with open(files[0],'r',encoding='utf-8') as file:
  lines=file.readlines()
# Drop the first 27 lines of the file (presumably the .ass header/style section
# that precedes the subtitle events -- confirm against other files).
lines=lines[27:]
# Each remaining line is comma-separated; keep everything from the tenth field
# onward and re-join it, because the spoken text itself can contain commas.
lines=[",".join(line.split(",")[9:]) for line in lines ]

In [23]:
lines[:2]

['Yeah, turn your sadness into kindness,\\Nyour uniqueness into strength\n',
 'Even if you get lost, let’s start walking\n']

In [24]:
#replace \n with space
lines=[line.replace("\n"," ") for line in lines]

In [25]:
lines[:2]

['Yeah, turn your sadness into kindness,\\Nyour uniqueness into strength ',
 'Even if you get lost, let’s start walking ']

In [26]:
" ".join(lines[:10])

"Yeah, turn your sadness into kindness,\\Nyour uniqueness into strength  Even if you get lost, let’s start walking  One more time  Are you really happy  when you're always meeting\\Nto someone's expectations?  Even if you change to\\Nwho you really wanna be  Will you always have a great smile?  Dreaming of only “the beginning,”\\NI wake up  and someday, I'll live on my own.  Yes, precious things are never in shape "

In [27]:
int(files[0].split('-')[-1].split('.')[0].strip())

62

In [28]:
import pandas as pd

In [29]:
def load_subtitles_dataset(dataset_path):
  """Load every ``.ass`` subtitle file in ``dataset_path`` into a DataFrame.

  Only files matching ``*.ass`` are read; other subtitle formats in the same
  directory (e.g. ``.srt``) are ignored.

  Args:
    dataset_path: Directory that contains the subtitle files.

  Returns:
    A DataFrame with one row per file and two columns:
    ``episode`` (int parsed from the text after the last ``-`` in the file
    name) and ``script`` (the file's subtitle text joined into one string).

  Raises:
    ValueError: If a file name does not end in ``- <number>.<ext>``.
  """
  import os  # local import: only this function needs it

  # Honour the caller's directory instead of a hard-coded Colab path.
  subtitles_paths=glob(os.path.join(dataset_path,'*.ass'))
  scripts=[]
  episodes=[]
  for path in subtitles_paths:
    # Decode explicitly as UTF-8: the text contains non-ASCII punctuation and
    # the platform default encoding is not UTF-8 everywhere.
    with open(path,'r',encoding='utf-8') as file:
      lines=file.readlines()
    # Skip the first 27 lines (presumably the .ass header section -- confirm
    # that every file has the same header length).
    lines=lines[27:]
    # Keep everything from the tenth comma-separated field onward; re-join
    # because the spoken text may itself contain commas.
    lines=[",".join(line.split(",")[9:]) for line in lines]
    lines=[line.replace("\n"," ") for line in lines]
    scripts.append(" ".join(lines))
    # Parse the episode number from the file name only, so '-' or '.' in the
    # directory part of the path cannot interfere.
    file_name=os.path.basename(path)
    episodes.append(int(file_name.split('-')[-1].split('.')[0].strip()))
  df=pd.DataFrame.from_dict({
      "episode":episodes,
      "script":scripts
  })
  return df


In [30]:
dataset_path='/content/subtitles'
df=load_subtitles_dataset(dataset_path)

In [31]:
df.head()

Unnamed: 0,episode,script
0,62,"Yeah, turn your sadness into kindness,\Nyour u..."
1,114,Connecting old words\Nthat have been used up ...
2,97,We are Fighting Dreamers aiming high Fighting...
3,1,"A long time ago, a powerful demon fox\Nappeare..."
4,55,"Yeah, turn your sadness into kindness,\Nyour u..."


In [32]:
# Run Model
script=df.iloc[0]['script']

In [33]:
script

'Yeah, turn your sadness into kindness,\\Nyour uniqueness into strength  Even if you get lost, let’s start walking  One more time  Are you really happy  when you\'re always meeting\\Nto someone\'s expectations?  Even if you change to\\Nwho you really wanna be  Will you always have a great smile?  Dreaming of only “the beginning,”\\NI wake up  and someday, I\'ll live on my own.  Yes, precious things are never in shape  You never know what it is\\Nwhen you get it or lose it  Yeah, turn your sadness into kindness,\\Nyour uniqueness into strength  Even if you get lost,\\Nlet\'s start walking  One more time  Your destiny is to lose to me.\\NFor sure.  You don’t know until you try!  Your father was killed a long time ago  and I don’t know the extent\\Nof the pain you’ve felt…  But it’s a huge mistake for you to think\\Nthat has determined your entire destiny!  You’re hopeless…  Proctor…it’s over.  You failure…  A Failure\'s True Power  Don’t…run away.  I won’t…run…  I’ll…stand by my own word

In [34]:
script_sentences=sent_tokenize(script)

In [35]:
script_sentences[:3]

["Yeah, turn your sadness into kindness,\\Nyour uniqueness into strength  Even if you get lost, let’s start walking  One more time  Are you really happy  when you're always meeting\\Nto someone's expectations?",
 'Even if you change to\\Nwho you really wanna be  Will you always have a great smile?',
 "Dreaming of only “the beginning,”\\NI wake up  and someday, I'll live on my own."]

In [36]:
# Group the tokenized sentences into chunks of `sentence_batch_size` and join
# each chunk into a single string for the classifier.
sentence_batch_size=20
script_batches_list=[
    " ".join(script_sentences[start:start+sentence_batch_size])
    for start in range(0,len(script_sentences),sentence_batch_size)
]

In [37]:
script_batches_list

["Yeah, turn your sadness into kindness,\\Nyour uniqueness into strength  Even if you get lost, let’s start walking  One more time  Are you really happy  when you're always meeting\\Nto someone's expectations? Even if you change to\\Nwho you really wanna be  Will you always have a great smile? Dreaming of only “the beginning,”\\NI wake up  and someday, I'll live on my own. Yes, precious things are never in shape  You never know what it is\\Nwhen you get it or lose it  Yeah, turn your sadness into kindness,\\Nyour uniqueness into strength  Even if you get lost,\\Nlet's start walking  One more time  Your destiny is to lose to me.\\NFor sure. You don’t know until you try! Your father was killed a long time ago  and I don’t know the extent\\Nof the pain you’ve felt…  But it’s a huge mistake for you to think\\Nthat has determined your entire destiny! You’re hopeless…  Proctor…it’s over. You failure…  A Failure's True Power  Don’t…run away. I won’t…run…  I’ll…stand by my own words…  That’s m

In [38]:
theme_output=theme_classifier(
    script_batches_list[:2],
    theme_list,
    multi_label=True
)

In [39]:
# Wrangle Output
# Gather every batch's score under its theme label,
# e.g. battle: [0.514899498,0.2156498]
themes={}
for output in theme_output:
  for label,score in zip(output['labels'],output['scores']):
    themes.setdefault(label,[]).append(score)


In [40]:
import numpy as np

In [41]:
# Collapse each theme's list of per-batch scores into its mean.
# Note: this rebinds `themes` from {label: [scores]} to {label: mean score}.
themes={key:np.mean(np.array(value)) for key,value in themes.items()}

In [42]:
themes

{'self development': 0.8777180910110474,
 'sacrifice': 0.8917644023895264,
 'dialogue': 0.9312507212162018,
 'betrayal': 0.9295525252819061,
 'battle': 0.787899523973465,
 'friendship': 0.29046486504375935,
 'love': 0.27664252184331417,
 'hope': 0.3169528990983963}

In [43]:
def get_themes_inference(script, sentence_batch_size=20):
  """Score one script against ``theme_list`` with the zero-shot classifier.

  The script is split into sentences, the sentences are grouped into batches
  of ``sentence_batch_size`` and joined into one string per batch, every batch
  is classified with ``multi_label=True``, and the per-batch scores are
  averaged per theme.

  Args:
    script: Full text of one episode.
    sentence_batch_size: Number of sentences joined into each classifier
      input. Defaults to 20.

  Returns:
    Dict mapping each theme label to its mean score over all batches. Empty
    when the script contains no sentences.

  Raises:
    ValueError: If ``sentence_batch_size`` is smaller than 1.
  """
  if sentence_batch_size < 1:
    raise ValueError("sentence_batch_size must be a positive integer")

  script_sentences=sent_tokenize(script)
  script_batches_list=[
      " ".join(script_sentences[start:start+sentence_batch_size])
      for start in range(0,len(script_sentences),sentence_batch_size)
  ]
  # Nothing to classify: skip the model call instead of passing it an empty list.
  if not script_batches_list:
    return {}

  # Run Model
  theme_output=theme_classifier(
      script_batches_list,
      theme_list,
      multi_label=True
  )
  # Defensive: if the classifier hands back a bare dict for a single input,
  # wrap it so the loop below always iterates over per-batch result dicts.
  if isinstance(theme_output,dict):
    theme_output=[theme_output]

  # Wrangle output: collect every batch's score under its theme label.
  themes={}
  for output in theme_output:
    for label,score in zip(output['labels'],output['scores']):
      themes.setdefault(label,[]).append(score)

  return {key:np.mean(np.array(value)) for key,value in themes.items()}

In [44]:
# Keep only the first two rows (presumably to keep the demo inference quick).
# .copy() makes the subset an independent frame, so assigning the theme columns
# to it later does not raise pandas' SettingWithCopyWarning.
df=df.head(2).copy()

In [45]:
df

Unnamed: 0,episode,script
0,62,"Yeah, turn your sadness into kindness,\Nyour u..."
1,114,Connecting old words\Nthat have been used up ...


In [46]:
output_themes=df['script'].apply(get_themes_inference)

In [47]:
output_themes

Unnamed: 0,script
0,"{'self development': 0.6297365591994354, 'sacr..."
1,"{'battle': 0.7392865214496851, 'friendship': 0..."


In [48]:
theme_df=pd.DataFrame(output_themes.tolist())

In [49]:
theme_df

Unnamed: 0,self development,sacrifice,dialogue,betrayal,battle,friendship,love,hope
0,0.629737,0.66997,0.693673,0.538314,0.845163,0.189756,0.189148,0.27879
1,0.644001,0.781588,0.909088,0.613735,0.739287,0.506531,0.157669,0.291027


In [50]:
# Add one column per theme label (theme_df's column names) to df.
# NOTE(review): pandas aligns this assignment on row index labels; df and
# theme_df both appear to be indexed 0..1 here -- confirm if df is re-sliced.
df[theme_df.columns]=theme_df
df

Unnamed: 0,episode,script,self development,sacrifice,dialogue,betrayal,battle,friendship,love,hope
0,62,"Yeah, turn your sadness into kindness,\Nyour u...",0.629737,0.66997,0.693673,0.538314,0.845163,0.189756,0.189148,0.27879
1,114,Connecting old words\Nthat have been used up ...,0.644001,0.781588,0.909088,0.613735,0.739287,0.506531,0.157669,0.291027
