# ETL

This notebook extracts data from SQLite and MongoDB, transforms it, and loads/saves the result into a dataframe.

### a) Gather Data from SQLite (`tiktok.db`)

Extract data from the SQLite database (`tiktok.db`) to get each video's duration, view count, like count, and comment count, then compute total engagement as the sum of views, likes, and comments.

In [10]:
import os
import sqlite3
import pandas as pd

# The SQLite file lives in the "data" folder, resolved relative to the working directory.
folder_path = "data"
db_file_path = os.path.join(folder_path, "tiktok.db")


In [11]:
# Connect to the SQLite database.
# sqlite3.connect() silently creates a new, empty database file when the path
# does not exist, which would only surface later as a confusing "no such table"
# error when the query runs. Fail early with a clear message instead.
if not os.path.isfile(db_file_path):
    raise FileNotFoundError(f"SQLite database not found: {db_file_path}")
conn = sqlite3.connect(db_file_path)

In [14]:
# Inner-join Videos with VideoMetrics on video_id and select the duration plus
# the view/like/comment counts, keeping only videos with a positive, non-null duration.
query = """
SELECT v.video_duration_sec AS tiktok_duration_sec,
        m.video_view_count AS tiktok_view_count,
        m.video_like_count AS tiktok_like_count,
        m.video_comment_count AS tiktok_comment_count
FROM Videos v
JOIN VideoMetrics m
ON v.video_id = m.video_id
WHERE v.video_duration_sec IS NOT NULL AND
        v.video_duration_sec > 0;
"""

In [15]:
# Run the query and load the result set into a DataFrame.
# The connection is closed in `finally` so it is released even when the
# query raises (e.g. a missing table or a malformed statement).
try:
    tiktok_df = pd.read_sql_query(query, conn)
finally:
    conn.close()


In [16]:
# Preview the first five rows of the extracted data
tiktok_df.head(n=5)

Unnamed: 0,tiktok_duration_sec,tiktok_view_count,tiktok_like_count,tiktok_comment_count
0,59,343296.0,19425.0,0.0
1,32,140877.0,77355.0,684.0
2,31,902185.0,97690.0,329.0
3,25,437506.0,239954.0,584.0
4,19,56167.0,34987.0,152.0


In [17]:
# Total engagement per video = views + likes + comments (row-wise).
# A missing value in any of the three columns propagates to the total.
tiktok_df['tiktok_total_engagement'] = (
    tiktok_df['tiktok_view_count']
    .add(tiktok_df['tiktok_like_count'])
    .add(tiktok_df['tiktok_comment_count'])
)

In [18]:
# Preview the first five rows, now including tiktok_total_engagement
tiktok_df.head(n=5)

Unnamed: 0,tiktok_duration_sec,tiktok_view_count,tiktok_like_count,tiktok_comment_count,tiktok_total_engagement
0,59,343296.0,19425.0,0.0,362721.0
1,32,140877.0,77355.0,684.0,218916.0
2,31,902185.0,97690.0,329.0,1000204.0
3,25,437506.0,239954.0,584.0,678044.0
4,19,56167.0,34987.0,152.0,91306.0


In [19]:
# Summary statistics for the numeric columns.
# NOTE(review): the output below reports fewer non-null values for the count and
# engagement columns (19084) than for duration (19382), so some joined rows have
# missing metrics — confirm whether those rows should be dropped or filled.
tiktok_df.describe()

Unnamed: 0,tiktok_duration_sec,tiktok_view_count,tiktok_like_count,tiktok_comment_count,tiktok_total_engagement
count,19382.0,19084.0,19084.0,19084.0,19084.0
mean,32.421732,254708.558688,84304.63603,349.312146,339362.5
std,16.229967,322893.280814,133420.546814,799.638865,437945.1
min,5.0,20.0,0.0,0.0,23.0
25%,18.0,4942.5,810.75,1.0,6013.75
50%,32.0,9954.5,3403.5,9.0,13761.0
75%,47.0,504327.0,125020.0,292.0,660820.5
max,60.0,999817.0,657830.0,9599.0,1656099.0


In [21]:
# Persist the transformed DataFrame as a pickle file in the working directory
tiktok_df.to_pickle(path="tiktok.pkl")

### b) Gather Data from MongoDB