## Projekt token 

In [64]:
# @hidden_cell
# The project token is an authorization token that is used to access project resources like data sources, connections, and used by platform APIs.
# NOTE(review): the id/token strings below look like redacted placeholders — supply real values
# (ideally not hardcoded) before running.
from project_lib import Project
# `project` is the handle the later cells call save_data() on to store the CSV outputs.
project = Project(project_id='project_id', project_access_token='access_token')
# NOTE(review): `pc` is not referenced by any of the visible cells — confirm it is still needed.
pc = project.project_context

from ibm_watson_studio_lib import access_project_or_space
# NOTE(review): `wslib` is not referenced by any of the visible cells — confirm it is still needed.
wslib = access_project_or_space({'token':'token'})

## Inštalácia knižníc


In [None]:
pip install --upgrade ibm-watson

In [None]:
pip install pysrt

## Import knižníc

In [67]:
from os.path import join, dirname
import json
import os, types
import pandas as pd
import pysrt
import re
import matplotlib.pyplot as plt
import numpy as np
from ibm_watson import NaturalLanguageUnderstandingV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from ibm_watson.natural_language_understanding_v1 import Features, EmotionOptions, KeywordsOptions

## Authenticator 

In [68]:
# Build the IAM authenticator for the Natural Language Understanding service.
# NOTE(review): 'authenticator' looks like a redacted placeholder for the API key — supply the
# real value (ideally not hardcoded) before running.
authenticator = IAMAuthenticator('authenticator')
# Client used by the emotion-analysis cell; the API version is pinned to 2021-03-25.
natural_language_understanding = NaturalLanguageUnderstandingV1(
    version='2021-03-25',
    authenticator=authenticator
)
# NOTE(review): 'service_url' looks like a redacted placeholder for the instance URL.
natural_language_understanding.set_service_url('service_url')

## Import .srt súboru na analýzu

In [69]:

import pandas as pd
from botocore.client import Config
import ibm_boto3

# NOTE(review): module-level function named like the iterator dunder; it always returns 0 and
# nothing in the visible cells calls it. Looks like a leftover of generated code — confirm
# before removing.
def __iter__(self): return 0

# @hidden_cell
# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.
# You might want to remove those credentials before you share the notebook.

# S3-compatible Cloud Object Storage client authenticated through IAM (OAuth signature).
# NOTE(review): 'api_key' looks like a redacted placeholder for the real API key.
cos_client = ibm_boto3.client(service_name='s3',
    ibm_api_key_id='api_key',
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3.private.eu-de.cloud-object-storage.appdomain.cloud')

# Bucket and object key of the subtitle file to analyse.
bucket = 'textanalysiswithwatsonnaturallang-donotdelete-pr-qxyrqmwcy2vfyu'
object_key = 'TheShapeOfWaterGoldenBeard.srt'

# load data of type "application/octet-stream" into a botocore.response.StreamingBody object.
# Please read the documentation of ibm_boto3 and pandas to learn more about the possibilities to load the data.
# ibm_boto3 documentation: https://ibm.github.io/ibm-cos-sdk-python/
# pandas documentation: http://pandas.pydata.org/

# NOTE(review): this streaming body is never read or closed in the visible cells; the file is
# fetched again with download_file() in the next cell — confirm whether this request is needed.
streaming_body_1 = cos_client.get_object(Bucket=bucket, Key=object_key)['Body']

In [70]:
# Download the subtitle object from the bucket; the object key is reused as the third argument
# (presumably the local file name — the parsing cell opens a file with this same name).
cos_client.download_file(bucket, object_key, object_key)

## Spracovanie .srt súboru 

In [71]:
def clean_subs(text):
    """Strip subtitle markup and annotations from one subtitle's text.

    Line breaks become single spaces and every hyphen is dropped; afterwards
    HTML-style tags, [bracketed] spans and #hash-delimited# spans are removed,
    in that order. Whitespace left behind by the removals is not collapsed.
    """
    # Character-level substitutions first: newline -> space, hyphen -> nothing.
    flattened = text.replace('\n', ' ').replace('-', '')

    # Non-greedy patterns applied one after another, so each pattern sees the
    # output of the previous one: tags, then square brackets, then hash pairs.
    for pattern in (r'<.*?>', r'\[.*?\]', r'\#.*?\#'):
        flattened = re.sub(pattern, '', flattened)
    return flattened

In [None]:
# Parse the downloaded .srt file with pysrt.
subs = pysrt.open('TheShapeOfWaterGoldenBeard.srt', encoding='iso-8859-1')
movie_name = 'The Shape Of Water'  # Movie title: stored in the 'Movie' column and used in output file names

# One row per subtitle: position, start/end as "h:m:s", start time in whole seconds,
# raw subtitle text and the movie title.
matrix = []
for i, sub in enumerate(subs):
    start, end = sub.start, sub.end
    # Start time of the subtitle in seconds (milliseconds are not included).
    st_sec = start.hours * 3600 + start.minutes * 60 + start.seconds
    matrix.append([
        i,
        "{}:{}:{}".format(start.hours, start.minutes, start.seconds),
        "{}:{}:{}".format(end.hours, end.minutes, end.seconds),
        st_sec,
        sub.text,
        movie_name,
    ])

columns = ['Index', 'Start Time', 'End Time','Start Time (sec)', 'Text', 'Movie']  # Column names of the table
df = pd.DataFrame(matrix, columns=columns)

df['Text'] = df['Text'].apply(clean_subs)  # Clean every subtitle text

# Drop subtitles that are empty after cleaning. The explicit .copy() makes the result an
# independent frame, so assigning new columns to it later does not raise
# SettingWithCopyWarning.
df = df[df['Text'].str.strip().str.len() > 0].copy()

df.head()

In [None]:
# Store the cleaned subtitle table in the project as an intermediate CSV (without the
# DataFrame index), replacing any existing file with the same name.
project.save_data(data=df.to_csv(index=False),file_name=movie_name +'_Mid_Output.csv',overwrite=True)

## Analýza emócií datasetu

In [None]:
# Emotion keys read from each response, in the order the result columns are created.
emotion_keys = ('sadness', 'joy', 'fear', 'disgust', 'anger')

# Collect one result record per subtitle and attach them to the table in a single step
# instead of writing individual cells with df.loc inside the loop.
records = []
for index, row in df.iterrows():
    # Analyse the current row of the dataset; the language is fixed to English.
    response = natural_language_understanding.analyze(
        text=row['Text'],
        features=Features(emotion=EmotionOptions()),
        language='en',
    ).get_result()

    # The result is used directly; serialising it to JSON and parsing it back is not needed.
    emotions = response['emotion']['document']['emotion']
    record = {
        'Analysis': json.dumps(response, indent=2),  # Full response kept as pretty-printed JSON
        'Language': response['language'],            # Language of the analysed text
    }
    # One column per emotion: 'Sadness', 'Joy', 'Fear', 'Disgust', 'Anger'.
    for key in emotion_keys:
        record[key.capitalize()] = emotions[key]
    records.append(record)

# Align the results with the subtitle rows by index. assign() returns a new frame and
# overwrites same-named columns, so re-running the cell is safe.
analysis = pd.DataFrame(records, index=df.index)
df = df.assign(**{name: analysis[name] for name in analysis.columns})

# Save the resulting table as a .csv file in the project.
project.save_data(data=df.to_csv(index=False),file_name=movie_name +'_Output.csv',overwrite=True)