# Primary Data Collection

In [1]:
#imports
import pandas as pd
from bs4 import BeautifulSoup
import requests

In [3]:
#reading in csv from kaggle
df = pd.read_csv('data/MoviesOnStreamingPlatforms_updated.csv',index_col=0)

In [4]:
df.head()

Unnamed: 0,ID,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,Type,Directors,Genres,Country,Language,Runtime
0,1,Inception,2010,13+,8.8,87%,1,0,0,0,0,Christopher Nolan,"Action,Adventure,Sci-Fi,Thriller","United States,United Kingdom","English,Japanese,French",148.0
1,2,The Matrix,1999,18+,8.7,87%,1,0,0,0,0,"Lana Wachowski,Lilly Wachowski","Action,Sci-Fi",United States,English,136.0
2,3,Avengers: Infinity War,2018,13+,8.5,84%,1,0,0,0,0,"Anthony Russo,Joe Russo","Action,Adventure,Sci-Fi",United States,English,149.0
3,4,Back to the Future,1985,7+,8.5,96%,1,0,0,0,0,Robert Zemeckis,"Adventure,Comedy,Sci-Fi",United States,English,116.0
4,5,"The Good, the Bad and the Ugly",1966,18+,8.8,97%,1,0,1,0,0,Sergio Leone,Western,"Italy,Spain,West Germany",Italian,161.0


In [5]:
#checking shape of df
df.shape

(16744, 16)

In [6]:
#checking null value counts and data types
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16744 entries, 0 to 16743
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               16744 non-null  int64  
 1   Title            16744 non-null  object 
 2   Year             16744 non-null  int64  
 3   Age              7354 non-null   object 
 4   IMDb             16173 non-null  float64
 5   Rotten Tomatoes  5158 non-null   object 
 6   Netflix          16744 non-null  int64  
 7   Hulu             16744 non-null  int64  
 8   Prime Video      16744 non-null  int64  
 9   Disney+          16744 non-null  int64  
 10  Type             16744 non-null  int64  
 11  Directors        16018 non-null  object 
 12  Genres           16469 non-null  object 
 13  Country          16309 non-null  object 
 14  Language         16145 non-null  object 
 15  Runtime          16152 non-null  float64
dtypes: float64(2), int64(7), object(7)
memory usage: 2.2+ MB


In [5]:
#renaming columns to snake case for ease of use
df.rename(columns={'Rotten Tomatoes':'rt_critic_score','IMDb':'imdb_fan_score','Prime Video':'prime_video',
                  'ID':'id','Title':'movie','Year':'year','Age':'age_rating','Netflix':'netflix',
                  'Hulu':'hulu','Disney+':'disney_plus','Type':'type','Directors':'directors',
                  'Genres':'genres','Country':'country','Language':'language','Runtime':'movie_len_mins'},
         inplace=True)

In [6]:
#dropping column that identifies whether title is movie or tv show, as all observations are movies
df.drop(columns=['type'],inplace=True)

## Scraping Rotten Tomatoes Audience Scores -- NOT USED IN ANALYSIS

In [33]:
#function that generates a link for webscrapping from rotten tomatoes for each of the movies
def rt_links(movie):
    rt_link = 'https://www.rottentomatoes.com/m/'
    
    formated = movie.lower()
    for char in formated:
        if char not in '1234567890abcdefghijklmnopqrstuvwxyz ':
            formated = formated.replace(char,'')
    try:
        formated = formated.replace(' ','_')
        if '__' in formated:
            formated = formated.replace('__','_')
        formated = rt_link+formated
        return formated
    except:
        formated = rt_link+formated
        return formated

In [34]:
#creating column with rotten tomatoes movie link for each observation
df['rt_movie_link'] = df['movie'].apply(rt_links)

In [67]:
#function that scrapes rotten tomatoes website based on link for movie and returns the audience score in percent
def rt_audience_score(link):
    try:
        res = requests.get(link)
        soup = BeautifulSoup(res.content,'lxml')
        return int(soup.find('score-board').attrs['audiencescore'])/100
    except:
        return 'no score'

In [74]:
#creating a sample to test above function
sample = df.sample(100)

In [75]:
#timing how long it will take to pull scores from the entire dataframe
%%time
sample['rt_audience_score'] = sample['rt_movie_link'].apply(rt_audience_score)

CPU times: user 12.5 s, sys: 517 ms, total: 13.1 s
Wall time: 2min 12s


In [80]:
#function to split dataframe into sections so that I can pull RT scores in sections
def df_split(df,sections):
    df_sections = []
    size = df.shape[0]//sections
    start = 0
    end = size
    for num in range(sections):
        df_sections.append(df[start:end])
        start = end
        end = start+size
    df_sections.append(df.iloc[-8:])
    return df_sections

In [81]:
#creating sections using above function
sections = df_split(df,16)

In [99]:
#pulling scores for first section
sections[0]['rt_audience_score'] = sections[0]['rt_movie_link'].apply(rt_audience_score)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [100]:
#pulling scores for second section
sections[1]['rt_audience_score'] = sections[1]['rt_movie_link'].apply(rt_audience_score)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [101]:
#pulling scores for third section
sections[2]['rt_audience_score'] = sections[2]['rt_movie_link'].apply(rt_audience_score)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [103]:
#pulling scores for 4th-7th section 
for i in range(3,7):
    sections[i]['rt_audience_score'] = sections[i]['rt_movie_link'].apply(rt_audience_score)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [104]:
#pulling scores for 8th-11th section
for i in range(7,11):
    sections[i]['rt_audience_score'] = sections[i]['rt_movie_link'].apply(rt_audience_score)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [105]:
#pulling scores for 12th-15th section
for i in range(11,15):
    sections[i]['rt_audience_score'] = sections[i]['rt_movie_link'].apply(rt_audience_score)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [109]:
#pulling scores for 16th section
sections[15]['rt_audience_score'] = sections[15]['rt_movie_link'].apply(rt_audience_score)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [110]:
#pulling final section
sections[16]['rt_audience_score'] = sections[16]['rt_movie_link'].apply(rt_audience_score)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [111]:
#checking the shape of each section to make sure all pulled scores successfully
for i in sections:
    print(i.shape)

(1046, 17)
(1046, 17)
(1046, 17)
(1046, 17)
(1046, 17)
(1046, 17)
(1046, 17)
(1046, 17)
(1046, 17)
(1046, 17)
(1046, 17)
(1046, 17)
(1046, 17)
(1046, 17)
(1046, 17)
(1046, 17)
(8, 17)


In [113]:
#concatenating all sections back into one dataframe with new score column
df = pd.concat(sections)

In [116]:
#exporting dataframe to new csv
df.to_csv('movies_with_rt.csv',index=False)

## Scraping IMDB links, Movie Descriptions

In [77]:
df = pd.read_csv('data/movies_with_rt.csv')

In [86]:
#function that googles phrases to obtain imdb cast link for each movie title
def get_cast_link(movie):
    from googlesearch import search
    query = f'{movie} full cast and crew imdb'
    for link in search(query,stop=1):
        return link

In [87]:
#applying above function to all movies in the dataframe, saving each section to a csv
%%time
count = 8
for section in sections[7:]:
    section['imdb_cast_link'] = section['movie'].apply(get_cast_link)
    section.to_csv(f'data/sections/section_{count}.csv')
    count += 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


CPU times: user 8min 27s, sys: 28.5 s, total: 8min 55s
Wall time: 7h 55min 20s


In [89]:
#combining all sections back into single dataframe with links included
count = 1
for i in range(1,18):
    if count == 1:
        df = pd.read_csv(f'data/sections/section_{count}.csv')
        count += 1
    else:
        sub_df = pd.read_csv(f'data/sections/section_{count}.csv')
        df = pd.concat([df,sub_df])
        count += 1

In [91]:
df.shape

(16744, 19)

In [93]:
df.drop(columns=['Unnamed: 0'],inplace=True)

In [112]:
# getting movie id from cast_link
def imdb_movie_id(link):
    if 'https://www.imdb.com/title/' in link:
        return link.split('/')[4]
    else:
        return 'no ID'

In [99]:
df['imdb_movie_id'] = df['imdb_cast_link'].apply(imdb_movie_id)

In [108]:
# description webscraping
def get_description(movie_id):
    url = f'https://www.imdb.com/title/{movie_id}/'
    res = requests.get(url)
    soup = BeautifulSoup(res.content,'lxml')
    try:
        return soup.find('span',attrs={'role':'presentation','data-testid':'plot-l'}).text
    except:
        return 'No description'

In [106]:
#splitting data into sections for movie description scraping
sections_for_desc = df_split(df,16)

In [None]:
#scraping description data in sections, saving file
%%time
for i in range(1,18):
    if i == 1:
        df = sections_for_desc[i]
        df['movie_desc'] = df['imdb_movie_id'].apply(get_description)
    else:
        sub_df = sections_for_desc[i]
        sub_df['movie_desc'] = sub_df['imdb_movie_id'].apply(get_description)
        df = pd.concat([df,sub_df])
        df.to_csv('data/df_with_movie_desc.csv',index=False)