# Gathering Data and Creating Databases

This notebook gathers data from a Kaggle dataset into a SQLite database, and makes API calls whose JSON responses are stored in a local MongoDB instance.

### a) Dataset from kaggle

In [1]:
import os

folder_path = "data"

# Make sure the folder that holds the data exists (data/ is also listed in .gitignore)
if os.path.exists(folder_path):
    print(f"Folder '{folder_path}' already exists.")
else:
    os.makedirs(folder_path)
    print(f"Folder '{folder_path}' created successfully.")

Folder 'data' already exists.


In [2]:
import os
from dotenv import load_dotenv
import pandas as pd

# Load environment variables from the .env file
load_dotenv()

# Set the Kaggle username and key as environment variables for the session
# (a .env file is used instead of the kaggle.json file from Kaggle).
# os.getenv returns None for a missing variable, and assigning None into
# os.environ fails with an unhelpful TypeError, so check each one explicitly.
for credential_name in ("KAGGLE_USERNAME", "KAGGLE_KEY"):
    credential_value = os.getenv(credential_name)
    if not credential_value:
        raise RuntimeError(
            f"{credential_name} is not set; add it to the .env file or the environment "
            "before running this notebook."
        )
    os.environ[credential_name] = credential_value


In [3]:
from kaggle.api.kaggle_api_extended import KaggleApi

def download_kaggle_dataset(owner, dataset_name, download_path="data"):
    """Download and unzip a Kaggle dataset into a local folder.

    Args:
        owner: Kaggle account name that owns the dataset.
        dataset_name: Name of the dataset under that owner.
        download_path: Folder the files are written to; created if missing.
    """
    os.makedirs(download_path, exist_ok=True)

    dataset_slug = f"{owner}/{dataset_name}"
    kaggle_client = KaggleApi()
    kaggle_client.authenticate()
    kaggle_client.dataset_download_files(dataset_slug, path=download_path, unzip=True)


In [4]:
# Fetch the TikTok dataset published by Kaggle user "yakhyojon"
download_kaggle_dataset(owner="yakhyojon", dataset_name="tiktok")

Dataset URL: https://www.kaggle.com/datasets/yakhyojon/tiktok


### b) Dataset into a sqlite DB

ChatGPT assisted in creating the database. More tables would be needed for a properly normalized schema, but normalization is not the focus of this coursework; the goal here is to practice interacting with a database.

In [19]:
import sqlite3
from sqlalchemy import create_engine

# Load the downloaded CSV. The path is built from folder_path, the same way
# the database path is, so the data folder is only named in one place.
csv_path = os.path.join(folder_path, 'tiktok_dataset.csv')
tiktok_data = pd.read_csv(csv_path)

In [20]:
# Build the database path inside the data folder and open a connection to it;
# sqlite3.connect creates the database file if it does not exist yet.
db_file_path = os.path.join(folder_path, "tiktok.db")
conn = sqlite3.connect(db_file_path)


In [21]:
# Open a cursor on the connection and make sure the three tables exist
cursor = conn.cursor()

# 1. Videos table: one row per video, keyed by the dataset's video_id
videos_ddl = '''
CREATE TABLE IF NOT EXISTS Videos (
    video_id INTEGER PRIMARY KEY,
    video_duration_sec INTEGER,
    claim_status TEXT,
    verified_status TEXT,
    video_transcription_text TEXT
)
'''

# 2. Authors table: generated author_id plus the ban status
authors_ddl = '''
CREATE TABLE IF NOT EXISTS Authors (
    author_id INTEGER PRIMARY KEY AUTOINCREMENT,
    author_ban_status TEXT
)
'''

# 3. VideoMetrics table: engagement counts that reference Videos(video_id)
video_metrics_ddl = '''
CREATE TABLE IF NOT EXISTS VideoMetrics (
    metric_id INTEGER PRIMARY KEY AUTOINCREMENT,
    video_id INTEGER,
    video_view_count REAL,
    video_like_count REAL,
    video_share_count REAL,
    video_download_count REAL,
    video_comment_count REAL,
    FOREIGN KEY (video_id) REFERENCES Videos(video_id)
)
'''

# Same creation order as the table numbering above
cursor.execute(videos_ddl)
cursor.execute(authors_ddl)
cursor.execute(video_metrics_ddl)

<sqlite3.Cursor at 0x7d679570cd50>

In [22]:

# Insert data into these tables.
# Re-running this cell against an existing database must not duplicate rows.
# Videos is already protected by INSERT OR IGNORE; Authors and VideoMetrics
# used plain INSERTs, so they are guarded with what is already stored.

# Track unique ban statuses and their author_id, seeded from rows that are
# already in the Authors table
authors = dict(
    cursor.execute('SELECT author_ban_status, author_id FROM Authors').fetchall()
)

# video_ids that already had a metrics row before this run started; taken as a
# snapshot so that rows within a single run are inserted as before
existing_metric_video_ids = {
    stored_video_id
    for (stored_video_id,) in cursor.execute('SELECT video_id FROM VideoMetrics').fetchall()
}

for _, row in tiktok_data.iterrows():
    video_id = row['video_id']

    # Insert into Videos table (ignored when the video_id is already present)
    cursor.execute('''
        INSERT OR IGNORE INTO Videos (video_id, video_duration_sec, claim_status, verified_status, video_transcription_text)
        VALUES (?, ?, ?, ?, ?)
    ''', (video_id, row['video_duration_sec'], row['claim_status'], row['verified_status'], row['video_transcription_text']))

    # Insert into Authors table if this ban status has not been stored yet
    author_ban_status = row['author_ban_status']
    if author_ban_status not in authors:
        cursor.execute('''
            INSERT INTO Authors (author_ban_status)
            VALUES (?)
        ''', (author_ban_status,))
        authors[author_ban_status] = cursor.lastrowid  # Store the author_id for reference

    # Insert into VideoMetrics table unless a previous run already stored
    # metrics for this video
    if video_id in existing_metric_video_ids:
        continue
    cursor.execute('''
        INSERT INTO VideoMetrics (video_id, video_view_count, video_like_count, video_share_count, video_download_count, video_comment_count)
        VALUES (?, ?, ?, ?, ?, ?)
    ''', (video_id, row['video_view_count'], row['video_like_count'], row['video_share_count'], row['video_download_count'], row['video_comment_count']))


In [23]:

# Commit changes so the rows inserted through this connection are written to the database file
conn.commit()

In [24]:

# Close the connection now that the inserts have been committed
conn.close()

In [25]:
# Reconnect and run a test query that joins each video with its metrics
query = '''
SELECT v.video_id, v.video_duration_sec, v.claim_status, m.video_view_count, m.video_like_count
FROM Videos v
JOIN VideoMetrics m ON v.video_id = m.video_id
LIMIT 5
'''

conn = sqlite3.connect(db_file_path)
try:
    cursor = conn.cursor()
    result = cursor.execute(query).fetchall()
    print(result)
finally:
    # Always release the connection, even if the query fails
    conn.close()

[(7017666017, 59, 'claim', 343296.0, 19425.0), (4014381136, 32, 'claim', 140877.0, 77355.0), (9859838091, 31, 'claim', 902185.0, 97690.0), (1866847991, 25, 'claim', 437506.0, 239954.0), (7105231098, 19, 'claim', 56167.0, 34987.0)]
