In [180]:
#Importing dependencies
from urllib.parse import quote

import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import inspect

import config

In [181]:
# Read the academy_awards_1927-2015 CSV from the Resources folder into a DataFrame
academy_data_csv = "Resources/academy_awards_1927-2015.csv"
academy_data_df = pd.read_csv(filepath_or_buffer=academy_data_csv)

# Preview the first five rows to confirm the file loaded as expected
academy_data_df.head(n=5)

Unnamed: 0,Year,Ceremony,Award,Winner,Name,Film
0,1927/1928,1,Actor,,Richard Barthelmess,The Noose
1,1927/1928,1,Actor,1.0,Emil Jannings,The Last Command
2,1927/1928,1,Actress,,Louise Dresser,A Ship Comes In
3,1927/1928,1,Actress,1.0,Janet Gaynor,7th Heaven
4,1927/1928,1,Actress,,Gloria Swanson,Sadie Thompson


In [182]:
# Read the imdb_top_1000 CSV from the Resources folder into a DataFrame
imdb_top_csv = "Resources/imdb_top_1000.csv"
imdb_top_df = pd.read_csv(filepath_or_buffer=imdb_top_csv)

# Preview the first two rows to confirm the file loaded as expected
imdb_top_df.head(n=2)

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411


<h1>Transform Data</h1>

In [183]:
# Combine the four star columns into one comma-separated "actors" column.
# The columns are selected by name rather than by position: the positional
# slice columns[10:15] covers five columns (Star1..Star4 plus No_of_Votes),
# which appended the vote count to the end of every actors string.
star_columns = ["Star1", "Star2", "Star3", "Star4"]
imdb_top_df['actors'] = imdb_top_df[star_columns].apply(
    # Skip missing stars so no empty entries end up in the joined string
    lambda stars: ','.join(stars.dropna().astype(str)),
    axis=1)

In [184]:
# Drop columns that are not carried forward; the individual star columns are
# redundant once the combined "actors" column exists
dopped_imdb_top_df = imdb_top_df.drop(columns=['Overview', 'Certificate', "Poster_Link", "Star1", "Star2", "Star3", "Star4"])

# Remove the trailing " min" unit from the runtime column. An end-anchored
# pattern is used instead of str.rstrip(' min') because rstrip treats its
# argument as a set of characters (' ', 'm', 'i', 'n'), not as a suffix.
dopped_imdb_top_df["Runtime"] = dopped_imdb_top_df["Runtime"].str.replace(r"\s*min\s*$", "", regex=True)

# Rename column headers to the lower-case names used from here on
# ("actors" already has its final name, so it needs no entry)
imdb_top_1000_df = dopped_imdb_top_df.rename(columns={"Series_Title": "film_name",
                                                "Released_Year": "released_year",
                                                "Runtime": "runtime",
                                                "Genre": "genre",
                                                "IMDB_Rating": "imdb_rating",
                                                "Meta_score": "meta_score",
                                                "Director": "director",
                                                "No_of_Votes": "votes",
                                                "Gross": "gross"
                                                })

# Remove every row whose film_name is "Drishyam" (all occurrences are dropped,
# none is kept).
# NOTE(review): presumably this is the only title that appears more than once
# in the dataset -- confirm, or switch to a generic duplicate check.
# .copy() gives an independent frame so the column assignments in later cells
# do not operate on a slice of the renamed frame.
imdb_top_1000_df = imdb_top_1000_df.loc[imdb_top_1000_df["film_name"] != "Drishyam"].copy()

In [185]:
# gross: strip comma separators, then treat a missing value as 0 so the column
# can be converted to an int data type
gross_no_commas = imdb_top_1000_df['gross'].replace(to_replace=",", value="", regex=True)
imdb_top_1000_df["gross"] = gross_no_commas.fillna(0)

# released_year: swap the text "PG" for "0" so the column can be converted to
# an int data type
year_no_pg = imdb_top_1000_df['released_year'].replace(to_replace="PG", value="0", regex=True)
imdb_top_1000_df["released_year"] = year_no_pg

# Convert the three numeric columns to int
for int_column in ("gross", "released_year", "runtime"):
    imdb_top_1000_df[int_column] = imdb_top_1000_df[int_column].astype('int')

# Preview the converted frame
imdb_top_1000_df.head(n=5)

Unnamed: 0,film_name,released_year,runtime,genre,imdb_rating,meta_score,director,votes,gross,actors
0,The Shawshank Redemption,1994,142,Drama,9.3,80.0,Frank Darabont,2343110,28341469,"Tim Robbins,Morgan Freeman,Bob Gunton,William ..."
1,The Godfather,1972,175,"Crime, Drama",9.2,100.0,Francis Ford Coppola,1620367,134966411,"Marlon Brando,Al Pacino,James Caan,Diane Keato..."
2,The Dark Knight,2008,152,"Action, Crime, Drama",9.0,84.0,Christopher Nolan,2303232,534858444,"Christian Bale,Heath Ledger,Aaron Eckhart,Mich..."
3,The Godfather: Part II,1974,202,"Crime, Drama",9.0,90.0,Francis Ford Coppola,1129952,57300000,"Al Pacino,Robert De Niro,Robert Duvall,Diane K..."
4,12 Angry Men,1957,96,"Crime, Drama",9.0,96.0,Sidney Lumet,689845,4360000,"Henry Fonda,Lee J. Cobb,Martin Balsam,John Fie..."


In [186]:
# Display each column's dtype to verify the int conversions above took effect
imdb_top_1000_df.dtypes

film_name         object
released_year      int32
runtime            int32
genre             object
imdb_rating      float64
meta_score       float64
director          object
votes              int64
gross              int32
actors            object
dtype: object

In [187]:
#Connecting to local database using config for all personal data.
# The username and password are percent-encoded before being placed in the URL
# so that reserved characters in the credentials (e.g. "@", ":", "/") cannot be
# mistaken for URL delimiters.
# NOTE(review): assumes config stores the raw, un-encoded credentials -- confirm.
db_username = quote(str(config.username), safe="")
db_password = quote(str(config.password), safe="")
rds_connection_string = (
    f'{config.protocol}://{db_username}:{db_password}'
    f'@{config.host}:{config.port}/{config.database_name}'
)
engine = create_engine(rds_connection_string)

# Inspector used below to list the tables present in the database
insp = inspect(engine)

In [188]:
# List the table names the inspector reports for the connected database
insp.get_table_names()

['imdb_top_1000', 'academy_award_data']

In [189]:
# Append the cleaned rows to the imdb_top_1000 table; index=False keeps the
# DataFrame index out of the inserted columns.
# NOTE(review): with if_exists='append', re-running this cell inserts the same
# rows again -- confirm the table is emptied (or has a key constraint) first.
imdb_top_1000_df.to_sql(name='imdb_top_1000', con=engine, if_exists='append', index=False)