# Project 2: Web Scraping and API access

In [15]:
!pip install beautifulsoup4



## Part 1: Explore the html for Wikipedia articles. 

### A. Using inspect element, copy the html code for a table.

table {
    display: table;
    border-collapse: separate;
    box-sizing: border-box;
    text-indent: initial;
    unicode-bidi: isolate;
    width: max-content;
    min-width: 100vw;
    font-size: initial;
    font-family: monospace;
    tab-size: 4;
    border-spacing: 0px;
    border-color: gray;
    white-space: pre;
    margin: 0px;
}

### B. Using inspect element, find the html syntax for a link. 

<a class="vector-toc-link" href="#Scope_and_approach">

### C. Using inspect element, find the html syntax for linking an image

<img class="mw-logo-icon" src="/static/images/icons/wikipedia.png" alt="" aria-hidden="true" height="50" width="50">

## Part 2: Explore one Wikipedia page with the beautifulsoup package

In [16]:
import bs4
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [17]:
#save and print the text content of a page with all tags removed
url = "https://en.wikipedia.org/wiki/Information_science"  
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
text_content = soup.get_text()
print(text_content[:1000])  





Information science - Wikipedia




































Jump to content







Main menu





Main menu
move to sidebar
hide



		Navigation
	


Main pageContentsCurrent eventsRandom articleAbout WikipediaContact us





		Contribute
	


HelpLearn to editCommunity portalRecent changesUpload file



















Search











Search















Donate








Appearance
















Create account

Log in








Personal tools





 Create account Log in





		Pages for logged out editors learn more



ContributionsTalk




























Contents
move to sidebar
hide




(Top)





1
Foundations




Toggle Foundations subsection





1.1
Scope and approach








1.2
Definitions






1.2.1
Related terms










1.3
Philosophy of information








1.4
Ontology








1.5
Science or discipline?










2
Careers




Toggle Careers subsection





2.1
Information scientist








2.2
Systems analyst








2.3
Information professional





In [18]:
#download an image with beautifulsoup and save it in this repository
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/8/82/Bibliometrics_definition.svg/440px-Bibliometrics_definition.svg.png"
image_response = requests.get(image_url)
with open("downloaded_image.jpg", 'wb') as file:
    file.write(image_response.content)

In [19]:
#find all the links in a page with beautifulsoup
#print the first 100 characters of ten of these links
links = soup.find_all('a')
for link in links[:10]:  
    print(str(link.get('href'))[:100])  


#bodyContent
/wiki/Main_Page
/wiki/Wikipedia:Contents
/wiki/Portal:Current_events
/wiki/Special:Random
/wiki/Wikipedia:About
//en.wikipedia.org/wiki/Wikipedia:Contact_us
/wiki/Help:Contents
/wiki/Help:Introduction
/wiki/Wikipedia:Community_portal


## Part 3: Downloading scripts

In [20]:
scripts=pd.read_csv('pudding_data.csv')

In [21]:
scripts

Unnamed: 0,imdb_id,script_id,title,year,gross (inflation-adjusted),link
0,tt0019777,4031,The Cocoanuts,1929,,http://www.pages.drexel.edu/~ina22/splaylib/Sc...
1,tt0021884,8521,Frankenstein,1931,298.0,Frankenstein (Florey & Fort) [1931-5-23] [Scan...
2,tt0022054,1086,The Last Flight,1931,,"film_20100519/all_imsdb_05_19_10/Last-Flight,-..."
3,tt0022626,1631,American Madness,1932,,http://www.imsdb.com/Movie Scripts/American Ma...
4,tt0022958,2438,Grand Hotel,1932,,http://www.imsdb.com/Movie Scripts/Grand Hotel...
...,...,...,...,...,...,...
1995,tt3733778,8533,Pay the Ghost,2015,,"Pay The Ghost (Dan Kay, 9-1-09).pdf"
1996,tt3808342,5499,Son of Saul,2015,0.0,http://gointothestory.blcklst.com/wp-content/u...
1997,tt3850214,8056,Dope,2015,18.0,Dope (2013.10.31) [Digital].pdf
1998,tt3859076,5507,Truth,2015,2.0,http://gointothestory.blcklst.com/wp-content/u...


In [22]:
#using the links in the "link" column, download the first 1000 characters of each script
#use requests and bs4, remember to remove all html tags
def download_script_text(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    text = soup.get_text()  
    return text[:1000]  

In [23]:
#add a new column to the df with the text downloaded
#save this new dataframe as "pudding_texts.csv"
scripts['script_text'] = scripts['link'].apply(download_script_text)
scripts.to_csv('pudding_texts.csv', index=False)

KeyboardInterrupt: 

## Part 4: TMDB database

#### Browse the documentation at https://developer.themoviedb.org/reference/intro/getting-started. Create an account to authenticate

In [24]:
#create a dataset of the movies in theaters now. Include metadata fields you are interested in. 
# Create a dataset of the movies in theaters now using TMDB API
api_key = "d7741d3e395a5c77addf43f0321a8305"  
url = f"https://api.themoviedb.org/3/movie/now_playing?api_key={api_key}&language=en-US&page=1"
response = requests.get(url)
movies_data = response.json()['results']

movies = []
for movie in movies_data:
    movies.append({
        'title': movie['title'],
        'release_date': movie['release_date'],
        'vote_average': movie['vote_average'],
        'overview': movie['overview'],
        'poster_path': f"https://image.tmdb.org/t/p/w500{movie['poster_path']}"  # Poster URL
    })

movies_df = pd.DataFrame(movies)
movies_df.to_csv('movies_in_theaters_now.csv', index=False)

In [25]:
#download the movie posters for 10 of these movies and save them to this repository
for i, movie in enumerate(movies[:10]):
    poster_url = movie['poster_path']
    poster_response = requests.get(poster_url)
    
    with open(f"poster_{i+1}.jpg", 'wb') as file:
        file.write(poster_response.content)
