**IMPORTING NECESSARY PACKAGES**

In [None]:
import sqlite3
import pandas as pd

**MOUNTING THE DRIVE**

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# database file under MyDrive can be opened by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**CONNECTING TO THE DATABASE**

In [None]:
# Open the subtitles database stored on the mounted Drive.
conn = sqlite3.connect('/content/drive/MyDrive/eng_subtitles_database.db')
cursor = conn.cursor()

# List every table recorded in the SQLite schema catalogue.
table_names = cursor.execute(
    "SELECT name FROM sqlite_master WHERE type='table'"
).fetchall()
print(table_names)

[('zipfiles',)]


**PRINTING THE NAME OF THE FIELDS PRESENT IN THE DATABASE**

In [None]:
# Each row returned by PRAGMA table_info describes one column of `zipfiles`;
# the element at position 1 is printed as the field name.
cols = cursor.execute("PRAGMA table_info('zipfiles')").fetchall()
for column_info in cols:
    print(column_info[1])

num
name
content


**OPENING THE CONTENTS OF THE DATABASE IN DATAFRAME**

In [None]:
# Load the whole `zipfiles` table into a DataFrame and preview the first rows.
zipfiles_query = "SELECT * FROM zipfiles"
df = pd.read_sql_query(zipfiles_query, conn)
df.head()

Unnamed: 0,num,name,content
0,9180533,the.message.(1976).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x1c\xa9\x...
1,9180583,here.comes.the.grump.s01.e09.joltin.jack.in.bo...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x17\xb9\x...
2,9180592,yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00L\xb9\x99V...
3,9180594,yumis.cells.s02.e14.episode.2.14.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00U\xa9\x99V...
4,9180600,broker.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x001\xa9\x99V...


**SAMPLING THE DATABASE**

Since the database is very large and my computing resources are limited, I work with a random 30% sample of the dataset.

In [None]:
# Keep a random 30% of the rows. The seed is fixed so that Restart & Run All
# draws the same sample every time; without it each run selects different rows.
# Note: this overwrites `df`, so re-running only this cell samples again from
# the already-reduced frame.
df = df.sample(frac=0.3, random_state=42)

In [None]:
# (rows, columns) of the sampled frame.
df.shape

(24749, 3)

**DECODING THE SUBTITLE CONTENTS**

Each `content` value is a ZIP archive stored as raw bytes. To read the subtitle text, the archive has to be unzipped and the extracted bytes decoded into a string.

In [None]:
import zipfile
import io

# Number of archives decoded so far (incremented on every decode_method call).
count = 0

def decode_method(binary_data):
    """Extract the first file of an in-memory ZIP archive and return it as text.

    Parameters
    ----------
    binary_data : bytes
        Raw bytes of a ZIP archive.

    Returns
    -------
    str
        Text of the first archive member. The bytes are decoded as UTF-8 with
        any leading byte-order mark removed; if they are not valid UTF-8 they
        are decoded as Latin-1 instead, which accepts every byte value.

    Raises
    ------
    zipfile.BadZipFile
        If ``binary_data`` is not a valid ZIP archive.
    IndexError
        If the archive contains no members.
    """
    global count
    count += 1

    # Decompress the binary data using the zipfile module
    with io.BytesIO(binary_data) as buffer:
        with zipfile.ZipFile(buffer, 'r') as zip_file:
            # Assuming there's only one file in the ZIP archive
            subtitle_content = zip_file.read(zip_file.namelist()[0])

    # Decoding UTF-8 bytes as Latin-1 produces mojibake (e.g. the BOM shows up
    # as "ï»¿"), so try UTF-8 first and only fall back for legacy encodings.
    try:
        return subtitle_content.decode('utf-8-sig')
    except UnicodeDecodeError:
        return subtitle_content.decode('latin-1')

**APPLYING THE DECODE METHOD FOR EACH CONTENT IN THE DATASET**

In [None]:
# Unzip and decode every archive, then store the resulting text in a new column.
decoded_subtitles = df['content'].apply(decode_method)
df['file_content'] = decoded_subtitles

df.head()

Unnamed: 0,num,name,content,file_content
81860,9518997,criminal.minds.s10.e20.a.place.at.the.table.(2...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x08\xa5\x...,ï»¿[Script Info]\r\nTitle: Default file\r\nScr...
72424,9478687,ghost.in.the.shell.stand.alone.complex.s01.e09...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xfba\x9aV...,"ï»¿1\r\n00:00:04,963 --> 00:00:10,093\r\nAngel..."
71937,9477051,halo.s01.e04.homecoming.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x08_\x9aV...,ï»¿[Script Info]\r\nTitle: Default file\r\nScr...
56135,9416541,make.my.day.s01.e07.episode.1.7.(2023).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x981\x9aV...,"1\r\n00:00:06,000 --> 00:00:12,074\r\nSupport ..."
74675,9488980,australian.survivor.s04.e11.episode.4.11.(2019...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xe4\x80\x...,"ï»¿00:00:00,000 --> 00:00:04,162\n<font color=..."


**PRINTING THE FILE CONTENT OF THE FIRST SAMPLED ROW (INDEX LABEL 81860 IN THIS RUN)**

In [None]:
# Select by position instead of by index label: the labels present in `df`
# depend on which rows the random sample picked, so a hard-coded label raises
# KeyError whenever the sample differs. Position 0 is the first row of df.head().
print(df['file_content'].iloc[0])

ï»¿[Script Info]
Title: Default file
ScriptType: v4.00+
WrapStyle: 0
PlayResX: 720
PlayResY: 480
ScaledBorderAndShadow: yes
Audio File: 
Video File: 
Video Aspect Ratio: 0
Video Zoom: 6
Video Position: 0

[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Dialogue1,Tahoma,32,&H00FDFDFD,&H000000FF,&H1F000000,&HC7000000,0,0,0,0,100,100,0,0,1,1.4,1.7,2,60,60,15,1
Style: Dialogue2,Tahoma,33,&H00FDFDFD,&H000000FF,&H1F000000,&HC7000000,0,0,0,0,100,100,0,0,1,1.4,1.7,2,60,60,15,1
Style: Dialogue4,Tahoma,31,&H00FDFDFD,&H000000FF,&H1F000000,&HC7000000,0,0,0,0,100,100,0,0,1,1.4,1.7,2,60,60,15,1
Style: Dialogue3,Tahoma,33,&H00FDFDFD,&H000000FF,&H1F000000,&HC7000000,0,0,0,0,100,100,0,0,1,1.4,1.7,2,60,60,15,1

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR,

**PRINTING THE FILE CONTENT OF THE SECOND SAMPLED ROW (INDEX LABEL 72424 IN THIS RUN)**

In [None]:
# Select by position instead of by index label: the labels present in `df`
# depend on which rows the random sample picked, so a hard-coded label raises
# KeyError whenever the sample differs. Position 1 is the second row of df.head().
print(df['file_content'].iloc[1])

ï»¿1
00:00:04,963 --> 00:00:10,093
Angels and demons were circling
over my head

2
00:00:10,636 --> 00:00:15,557
Cutting through thorns and milky ways

3
00:00:16,099 --> 00:00:21,271
You don't know happiness,
whatever the case

4
00:00:22,064 --> 00:00:28,028
If you can't understand its calling

5
00:00:29,238 --> 00:00:33,283
<i>Watch in awe, watch in awe</i>

6
00:00:34,034 --> 00:00:36,370
Heavenly glory

7
00:00:36,662 --> 00:00:39,206
Heavenly glory

8
00:00:39,581 --> 00:00:41,833
Watch in awe

9
00:00:42,501 --> 00:00:44,836
Watch in awe

10
00:00:45,128 --> 00:00:47,798
Heavenly glory

11
00:00:48,257 --> 00:00:50,634
Heavenly glory

12
00:00:50,717 --> 00:00:56,473
I am Calling Calling out!

13
00:00:56,765 --> 00:01:02,688
Spirits, I am calling!!!

14
00:01:02,854 --> 00:01:08,235
To be yourself much longer...

15
00:01:08,485 --> 00:01:13,657
Calling Calling

16
00:01:14,116 --> 00:01:19,288
in the depths of lo

**SAVING THE DATASET WITH DECODED CONTENT AS A CSV FILE**

In [None]:
# Write every column of the frame to a CSV on Drive, leaving out the index and
# passing a backslash as the escape character.
file_path = '/content/drive/MyDrive/eng_movie_subtitles.csv'
df.to_csv(path_or_buf=file_path, escapechar='\\', index=False)