In [53]:
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine

from config import db_password

In [38]:
# Path of the source CSV, relative to the notebook's working directory;
# it is passed to pd.read_csv when the dataset is loaded.
movies_file_path = 'movies.csv'

In [39]:
# Read the original movies dataset from the CSV file into a DataFrame
movies_df = pd.read_csv(filepath_or_buffer=movies_file_path)

In [40]:
# Discard every row that has a missing value in at least one column
movies_df = movies_df.dropna(axis=0, how='any')

In [41]:
# Keep only the date in 'released' by deleting any parenthesised text
# (non-greedy match, so each "(...)" group is removed on its own)
movies_df['released'] = movies_df['released'].str.replace(
    pat=r"\(.*?\)",
    repl="",
    regex=True,
)

In [42]:
# Preview the first five rows after cleaning the 'released' column
movies_df.head(5)

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime
0,The Shining,R,Drama,1980,"June 13, 1980",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000.0,46998772.0,Warner Bros.,146.0
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000.0,58853106.0,Columbia Pictures,104.0
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000.0,538375067.0,Lucasfilm,124.0
3,Airplane!,PG,Comedy,1980,"July 2, 1980",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,3500000.0,83453539.0,Paramount Pictures,88.0
4,Caddyshack,R,Comedy,1980,"July 25, 1980",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,6000000.0,39846344.0,Orion Pictures,98.0


In [43]:
# Parse the cleaned 'released' strings into pandas datetime values
# (the column is replaced in position; no column is dropped here)
movies_df = movies_df.assign(released=pd.to_datetime(movies_df['released']))

In [46]:
# Numeric day of week for each release date (dow: 0 = Monday - 6 = Sunday).
# Uses the vectorised .dt accessor rather than a per-row Python lambda.
movies_df['released_weekday'] = movies_df['released'].dt.weekday

In [47]:
# Preview the first five rows, now including 'released_weekday'
movies_df.head(5)

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime,released_weekday
0,The Shining,R,Drama,1980,1980-06-13,8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000.0,46998772.0,Warner Bros.,146.0,4
1,The Blue Lagoon,R,Adventure,1980,1980-07-02,5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000.0,58853106.0,Columbia Pictures,104.0,2
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,1980-06-20,8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000.0,538375067.0,Lucasfilm,124.0,4
3,Airplane!,PG,Comedy,1980,1980-07-02,7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,3500000.0,83453539.0,Paramount Pictures,88.0,2
4,Caddyshack,R,Comedy,1980,1980-07-25,7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,6000000.0,39846344.0,Orion Pictures,98.0,4


In [48]:
# Add column with the written day of week.
# The numeric code in 'released_weekday' (0 = Monday - 6 = Sunday) is
# translated with a single dictionary lookup instead of one masked
# assignment per day; codes missing from the mapping become NaN.
weekday_names = {
    0: 'Monday',
    1: 'Tuesday',
    2: 'Wednesday',
    3: 'Thursday',
    4: 'Friday',
    5: 'Saturday',
    6: 'Sunday',
}
movies_df['weekday'] = movies_df['released_weekday'].map(weekday_names)


In [57]:
# Remove the descriptive/financial columns that are not part of the export;
# 'name', 'released_weekday' and 'weekday' are among the columns left in place.
movies_df = movies_df.drop(
    columns=[
        'rating', 'genre', 'year', 'released', 'score', 'votes', 'director',
        'writer', 'star', 'country', 'budget', 'gross', 'company', 'runtime',
    ]
)


In [58]:
# Sanity check: number of releases per written day of week
movies_df['weekday'].value_counts()

Friday       4668
Wednesday     490
Thursday      160
Tuesday        38
Saturday       33
Sunday         18
Monday         14
Name: weekday, dtype: int64

In [59]:
# Connection string to PostgreSQL.
# The password is URL-encoded before being embedded so that characters such
# as '@', ':' or '/' in it cannot be mistaken for URL delimiters.
db_string = f"postgresql://postgres:{quote_plus(db_password)}@127.0.0.1:5432/group_project"


In [60]:
# Create the SQLAlchemy database engine from the PostgreSQL connection string;
# this engine is the connection handle handed to DataFrame.to_sql for the export.
engine = create_engine(db_string)

In [61]:
# Export released date day of week to a SQL table.
# if_exists="replace" keeps the cell re-runnable: with the default ("fail")
# a second run raises ValueError because the table already exists.
# NOTE(review): this overwrites any existing "released_dayofweek" table.
movies_df.to_sql(
    name="released_dayofweek",
    con=engine,
    index=False,
    if_exists="replace",
)