In [83]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
import json
from scipy import stats as st
from sqlalchemy import create_engine

pd.set_option('display.max_rows', None)

In [84]:
#set the CSV files into a database
IMDB_route = "Resources/movies.csv"
stream_route = "Resources/Stream.csv"
imdb_db = pd.read_csv(IMDB_route)
stream_db = pd.read_csv(stream_route)

In [85]:
# Create the pandas DataFrame 
stream_df = pd.DataFrame(stream_db)

stream_df.head()

Unnamed: 0.1,Unnamed: 0,ID,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,Type,Directors,Genres,Country,Language,Runtime
0,0,1,Inception,2010,13+,8.8,87%,1,0,0,0,0,Christopher Nolan,"Action,Adventure,Sci-Fi,Thriller","United States,United Kingdom","English,Japanese,French",148.0
1,1,2,The Matrix,1999,18+,8.7,87%,1,0,0,0,0,"Lana Wachowski,Lilly Wachowski","Action,Sci-Fi",United States,English,136.0
2,2,3,Avengers: Infinity War,2018,13+,8.5,84%,1,0,0,0,0,"Anthony Russo,Joe Russo","Action,Adventure,Sci-Fi",United States,English,149.0
3,3,4,Back to the Future,1985,7+,8.5,96%,1,0,0,0,0,Robert Zemeckis,"Adventure,Comedy,Sci-Fi",United States,English,116.0
4,4,5,"The Good, the Bad and the Ugly",1966,18+,8.8,97%,1,0,1,0,0,Sergio Leone,Western,"Italy,Spain,West Germany",Italian,161.0


In [86]:
# Create the pandas DataFrame 
imdb_df = pd.DataFrame(imdb_db)

imdb_df = imdb_df.rename(columns={"title": "Title"})


In [87]:
#merge the imdb_df and stream_df together into one dataframe
merged_df = stream_df.merge(imdb_df, left_on='Title', right_on='Title')

#print all the titles out to see what to cut out
print(merged_df.columns.tolist())

['Unnamed: 0', 'ID', 'Title', 'Year', 'Age', 'IMDb', 'Rotten Tomatoes', 'Netflix', 'Hulu', 'Prime Video', 'Disney+', 'Type', 'Directors', 'Genres', 'Country', 'Language', 'Runtime', 'imdb_title_id', 'original_title', 'year', 'date_published', 'genre', 'duration', 'country', 'language', 'director', 'writer', 'production_company', 'actors', 'description', 'avg_vote', 'votes', 'budget', 'usa_gross_income', 'worlwide_gross_income', 'metascore', 'reviews_from_users', 'reviews_from_critics']


In [88]:
#keep the columns I want for my new merged dataframe
merged_df = merged_df[['Title', 'year', 'Age', 'IMDb', 'Rotten Tomatoes', 'metascore', 'description', 'Runtime', 'Netflix', 'Hulu', 'Prime Video', 'Disney+']]



In [89]:
#rename columns to get rid of Querying issue where all of the titles has "" in the title
merged_df = merged_df.rename(columns={'Title': 'title', 'Age':'age', 'IMDb':'IMDb', 'Rotten Tomatoes': 'rotten tomatoes', 
                                      'Runtime': 'runtime', 'Netflix': 'netflix', 'Hulu': 'hulu', 'Prime Video': 'prime video', 'Disney+': 'disney'})

merged_df.head(20)

Unnamed: 0,title,year,age,IMDb,rotten tomatoes,metascore,description,runtime,netflix,hulu,prime video,disney
0,Inception,2010,13+,8.8,87%,74.0,A thief who steals corporate secrets through t...,148.0,1,0,0,0
1,The Matrix,1999,18+,8.7,87%,73.0,A computer hacker learns from mysterious rebel...,136.0,1,0,0,0
2,Avengers: Infinity War,2018,13+,8.5,84%,68.0,The Avengers and their allies must be willing ...,149.0,1,0,0,0
3,Back to the Future,1985,7+,8.5,96%,87.0,"Marty McFly, a 17-year-old high school student...",116.0,1,0,0,0
4,Spider-Man: Into the Spider-Verse,2018,7+,8.4,97%,87.0,Teen Miles Morales becomes Spider-Man of his r...,117.0,1,0,0,0
5,The Pianist,1991,18+,8.5,95%,,"Jean and her sister, played by Macha Grenon, h...",150.0,1,0,1,0
6,The Pianist,2002,18+,8.5,95%,85.0,A Polish Jewish musician struggles to survive ...,150.0,1,0,1,0
7,Django Unchained,2012,18+,8.4,87%,81.0,"With the help of a German bounty hunter, a fre...",165.0,1,0,0,0
8,Raiders of the Lost Ark,1981,7+,8.4,95%,85.0,"In 1936, archaeologist and adventurer Indiana ...",115.0,1,0,0,0
9,Inglourious Basterds,2009,18+,8.3,89%,69.0,"In Nazi-occupied France during World War II, a...",153.0,1,0,0,0


In [90]:
#create the connection to my postgres account to then add the merged_df
rds_connection_string = "ETL_Group:Password@localhost/merged_df"
engine = create_engine(f'postgresql://{rds_connection_string}')

In [91]:
#check that I have the connection and what databases there are
engine.table_names()

['Move_Reviews_DB', 'Movie_Reviews_DB']

In [92]:
#send my merged_df into pgAdmin
merged_df.to_sql(name='movie_reviews', con=engine, index=False)

In [93]:
#check to make sure merged_df went into pgAdim
#query something to make sure it is not in SQL format
pd.read_sql_query("select netflix FROM movie_reviews where title='Back to the Future'", con=engine).head()

Unnamed: 0,netflix
0,1


In [96]:
pd.read_sql_query("select disney FROM movie_reviews where title = 'Mulan'", con=engine).head()

Unnamed: 0,disney
0,1
