In [1]:
from flask import Flask 
from flask_sqlalchemy import SQLAlchemy
from sqlalchemy import text as query_text
from sqlalchemy.sql import func
import os
import re


import pandas as pd
import numpy as np

# Flask application object that carries the SQLAlchemy settings for this notebook.
app = Flask(__name__)

# Point SQLAlchemy at the SQLite file `douban.db` in the parent directory of the
# application root, and switch off modification tracking.
app.config.update(
    SQLALCHEMY_DATABASE_URI=f"sqlite:///{os.path.join(app.root_path, '../douban.db')}",
    SQLALCHEMY_TRACK_MODIFICATIONS=False,
)

# Activate an application context for the remainder of the notebook session.
app.app_context().push()

db = SQLAlchemy(app)

def execute_scripts(scripts):
    """Run a batch of SQL statements inside a single transaction.

    Parameters
    ----------
    scripts : str
        One or more SQL statements separated by ``;``. The text is split
        naively on ``;``, so statements whose string literals contain a
        semicolon are not supported.

    Behaviour
    ---------
    Every non-empty statement is executed on ``db.session``. If all of them
    succeed the transaction is committed and a confirmation is printed. If
    any statement (or the commit) raises, the transaction is rolled back and
    the error is printed; the exception is not re-raised.
    """
    # Split the argument itself (not a notebook-level global) and drop the
    # empty fragments left behind by the trailing ';' and blank lines.
    statements = [stmt.strip() for stmt in scripts.split(';') if stmt.strip()]
    try:
        # No explicit begin(): the session opens a transaction on first use,
        # and calling begin() while one is already open raises.
        for stmt in statements:
            db.session.execute(query_text(stmt))
        db.session.commit()
        print('Execute sucessfully.')
    except Exception as e:
        # Roll back on the same session that ran the statements.
        db.session.rollback()
        print(f'Error during transaction: {e}')

In [2]:
# Recreate the douban_movies table so that re-running this cell starts from an
# empty table with the schema declared below.
sql_script = """
DROP TABLE IF EXISTS douban_movies;
CREATE TABLE IF NOT EXISTS douban_movies (
    movie_id INTEGER PRIMARY KEY NOT NULL,
    movie_name TEXT NOT NULL,
    release_date DATETIME,
    country TEXT,
    movie_type TEXT,
    release_year INTEGER,
    description TEXT,
    douban_url TEXT,
    poster TEXT,
    douban_rate FLOAT,
    rating_count INTEGER
);
"""
execute_scripts(sql_script)

movies = pd.read_csv('douban/movies.csv')
cols_str = ['director', 'author', 'actor', 'genre', 'aggregateRating']
# SECURITY NOTE(review): `eval` runs whatever Python expression a CSV cell
# contains. It is kept here so parsing behaviour does not change; if the cells
# are plain Python literals, `ast.literal_eval` would be the safe replacement
# -- confirm against the data before switching.
movies[cols_str] = movies[cols_str].map(eval)


def _pick_movie_type(genres):
    """Return the single genre if there is only one, otherwise the second one.

    An empty genre list yields None (stored as NULL) instead of raising
    IndexError.
    """
    if len(genres) == 0:
        return None
    return genres[0] if len(genres) == 1 else genres[1]


# Derived columns; `description` is used as read from the CSV.
movies['release_date'] = pd.to_datetime(movies.datePublished)
movies['release_year'] = movies.release_date.dt.year
movies['douban_url'] = movies['url'].apply(lambda x: f"https://movie.douban.com{x}")
movies['douban_rate'] = movies['aggregateRating'].apply(lambda x: float(x['ratingValue']))
movies['rating_count'] = movies['aggregateRating'].apply(lambda x: int(x['ratingCount']))
movies['movie_type'] = movies['genre'].apply(_pick_movie_type)

cols = ['movie_id', 'name', 'release_date', 'country', 'movie_type', 'release_year', 'description', 'douban_url', 'image', 'douban_rate', 'rating_count']
# Rename by explicit mapping so the result does not depend on column position.
df = movies[cols].rename(columns={'name': 'movie_name', 'image': 'poster'})
# 'append' inserts into the table created above, keeping its declared schema.
df.to_sql('douban_movies', db.engine, if_exists='append', index=False)

Execute sucessfully.


428

In [3]:
# Recreate the persons table so that re-running this cell starts from an empty
# table with the schema declared below.
sql_script = """
DROP TABLE IF EXISTS persons;
CREATE TABLE IF NOT EXISTS persons (
    person_id INTEGER PRIMARY KEY NOT NULL,
    person_name TEXT NOT NULL,
    person_name_en TEXT,
    gender TEXT,
    birth_date TEXT,
    birth_place TEXT,
    birth_year INTEGER
);
"""
execute_scripts(sql_script)

# Missing values become empty strings so the text columns never hold NaN.
persons = pd.read_csv('douban/celebrities.csv').fillna('')
cols = ['person_id', 'name_cn', 'name_en', '性别', '出生日期', '出生地']
df = persons[cols].rename(columns={
    'name_cn': 'person_name',
    'name_en': 'person_name_en',
    '性别': 'gender',
    '出生日期': 'birth_date',
    '出生地': 'birth_place',
})
# birth_year is the first run of digits in the birth-date text; rows without
# any digits get a missing value (NULL in the database). The nullable Int64
# dtype keeps the values integral for the INTEGER column.
df['birth_year'] = pd.to_numeric(
    df['birth_date'].str.extract(r'(\d+)', expand=False), errors='coerce'
).astype('Int64')

# person_id is the primary key, so keep only the first row per id.
# NOTE(review): assumes duplicate ids, if any, describe the same person -- confirm.
df = df.drop_duplicates(subset='person_id')

# 'append' inserts into the table created above; 'replace' would drop it and
# let pandas recreate it without the primary key and NOT NULL constraints.
df.to_sql('persons', db.engine, if_exists='append', index=False)

Execute sucessfully.


10028

In [4]:
# Recreate the movie_person_association table so that re-running this cell
# starts from an empty table with the schema declared below.
sql_script = """
DROP TABLE IF EXISTS movie_person_association;
CREATE TABLE IF NOT EXISTS movie_person_association (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    movie_id INTEGER  NOT NULL,
    person_id INTEGER NOT NULL,
    movie_role TEXT
);
"""
execute_scripts(sql_script)

relations = pd.read_csv('douban/relations.csv')
# Insert only the columns the table declares; `id` is filled in by
# AUTOINCREMENT. 'append' keeps the table created above, whereas 'replace'
# would drop it and recreate it from the DataFrame, losing `id` and the
# NOT NULL constraints.
relations[['movie_id', 'person_id', 'movie_role']].to_sql(
    'movie_person_association', db.engine, if_exists='append', index=False
)

Execute sucessfully.


13398

In [5]:
# Recreate the movie_actor_association table so that re-running this cell
# starts from an empty table with the schema declared below.
sql_script = """
DROP TABLE IF EXISTS movie_actor_association;
CREATE TABLE IF NOT EXISTS movie_actor_association (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    movie_id INTEGER  NOT NULL,
    person_id INTEGER NOT NULL
);
"""
execute_scripts(sql_script)

relations = pd.read_csv('douban/relations.csv')
# Keep only the rows whose role is 'actor', reduced to the two key columns.
movie_actor_relation = relations.loc[relations.movie_role=='actor', ['movie_id', 'person_id']].reset_index(drop=True)
# 'append' inserts into the table created above so `id` is generated by
# AUTOINCREMENT; 'replace' would drop that table and recreate it without `id`
# or the NOT NULL constraints.
movie_actor_relation.to_sql('movie_actor_association', db.engine, if_exists='append', index=False)

Execute sucessfully.


11819

In [6]:
# Display the relations DataFrame loaded from 'douban/relations.csv' as a visual check.
relations

Unnamed: 0,person_id,movie_id,movie_role
0,1349765,26752088,director
1,1047973,1292052,director
2,1054524,3541415,director
3,1328441,26794435,director
4,1054439,1291561,director
...,...,...,...
13393,1150778,1325958,actor
13394,1054487,1325958,actor
13395,1440990,1325958,actor
13396,1293034,1325958,actor


In [7]:
# Close the SQLAlchemy session now that all tables have been loaded.
db.session.close()