# Project 2: Web Scraping and API access

In [3]:
!pip install beautifulsoup4



## Part 1: Explore the html for Wikipedia articles. 

### A. Using inspect element, copy the html code for a table.

In [4]:
### "<table class="infobox ib-settlement vcard">"

### B. Using inspect element, find the html syntax for a link. 

In [5]:
### "<a class="mw-jump-link" href="#bodyContent">Jump to content</a>"

### C. Using inspect element, find the html syntax for linking an image

In [6]:
### <a href="/wiki/File:Chicago_Skyline_in_September_2023_(cropped).jpg" class="mw-file-description"><img alt="" src="//upload.wikimedia.org/wikipedia/commons/thumb/9/91/Chicago_Skyline_in_September_2023_%28cropped%29.jpg/268px-Chicago_Skyline_in_September_2023_%28cropped%29.jpg" decoding="async" width="268" height="108" class="mw-file-element" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/9/91/Chicago_Skyline_in_September_2023_%28cropped%29.jpg/402px-Chicago_Skyline_in_September_2023_%28cropped%29.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/9/91/Chicago_Skyline_in_September_2023_%28cropped%29.jpg/536px-Chicago_Skyline_in_September_2023_%28cropped%29.jpg 2x" data-file-width="3127" data-file-height="1258"></a>

## Part 2: Explore one Wikipedia page with the beautifulsoup package

In [7]:
import bs4
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [8]:
#save and print the text content of a page with all tags removed

In [9]:
url = "https://en.wikipedia.org/wiki/Chicago"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
text_content = soup.get_text()
print(text_content[:1000])





Chicago - Wikipedia



































Jump to content







Main menu





Main menu
move to sidebar
hide



		Navigation
	


Main pageContentsCurrent eventsRandom articleAbout WikipediaContact us





		Contribute
	


HelpLearn to editCommunity portalRecent changesUpload fileSpecial pages



















Search











Search






















Appearance
















Donate

Create account

Log in








Personal tools





Donate Create account Log in





		Pages for logged out editors learn more



ContributionsTalk




























Contents
move to sidebar
hide




(Top)





1
Etymology and nicknames








2
History




Toggle History subsection





2.1
Beginnings








2.2
19th century








2.3
20th and 21st centuries






2.3.1
1900 to 1939








2.3.2
1940 to 1979








2.3.3
1980 to present












3
Geography




Toggle Geography subsection





3.1
Topography








3.2
Communities








3.3
Streetscape






In [10]:
#download an image with beautifulsoup and save it in this repository
import os
url = "https://en.wikipedia.org/wiki/Chicago"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
image_tag = soup.find("img")

if image_tag:
    image_url = image_tag["src"]
    if image_url.startswith("//"):
        image_url = "https:" + image_url
    elif image_url.startswith("/"):
        image_url = "https://en.wikipedia.org" + image_url

    image_data = requests.get(image_url).content

    with open("wikipedia_image.jpg", "wb") as file:
        file.write(image_data)

    print(f"Image saved: {image_url}")
else:
    print("No image found on the page.")

Image saved: https://en.wikipedia.org/static/images/icons/wikipedia.png


In [11]:
#find all the links in a page with beautifulsoup
#print the first 100 characters of ten of these links

links = [a["href"] for a in soup.find_all("a", href=True)]

full_links = ["https://en.wikipedia.org" + link if link.startswith("/wiki/") else link for link in links]

for i, link in enumerate(full_links[:10]):
    print(f"Link {i+1}: {link[:100]}")


Link 1: #bodyContent
Link 2: https://en.wikipedia.org/wiki/Main_Page
Link 3: https://en.wikipedia.org/wiki/Wikipedia:Contents
Link 4: https://en.wikipedia.org/wiki/Portal:Current_events
Link 5: https://en.wikipedia.org/wiki/Special:Random
Link 6: https://en.wikipedia.org/wiki/Wikipedia:About
Link 7: //en.wikipedia.org/wiki/Wikipedia:Contact_us
Link 8: https://en.wikipedia.org/wiki/Help:Contents
Link 9: https://en.wikipedia.org/wiki/Help:Introduction
Link 10: https://en.wikipedia.org/wiki/Wikipedia:Community_portal


## Part 3: Downloading scripts

In [12]:
scripts=pd.read_csv('pudding_data.csv')

In [13]:
scripts

Unnamed: 0,imdb_id,script_id,title,year,gross (inflation-adjusted),link
0,tt0019777,4031,The Cocoanuts,1929,,http://www.pages.drexel.edu/~ina22/splaylib/Sc...
1,tt0021884,8521,Frankenstein,1931,298.0,Frankenstein (Florey & Fort) [1931-5-23] [Scan...
2,tt0022054,1086,The Last Flight,1931,,"film_20100519/all_imsdb_05_19_10/Last-Flight,-..."
3,tt0022626,1631,American Madness,1932,,http://www.imsdb.com/Movie Scripts/American Ma...
4,tt0022958,2438,Grand Hotel,1932,,http://www.imsdb.com/Movie Scripts/Grand Hotel...
...,...,...,...,...,...,...
1995,tt3733778,8533,Pay the Ghost,2015,,"Pay The Ghost (Dan Kay, 9-1-09).pdf"
1996,tt3808342,5499,Son of Saul,2015,0.0,http://gointothestory.blcklst.com/wp-content/u...
1997,tt3850214,8056,Dope,2015,18.0,Dope (2013.10.31) [Digital].pdf
1998,tt3859076,5507,Truth,2015,2.0,http://gointothestory.blcklst.com/wp-content/u...


In [14]:
pip install lxml

Note: you may need to restart the kernel to use updated packages.


In [15]:
#using the links in the "link" column, download the first 1000 characters of each script
#use requests and bs4, remember to remove all html tags

df = pd.read_csv("pudding_data.csv")

# 初始化新的列
df["script_text"] = ""

# 自定义请求头，模拟浏览器访问
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# 遍历 link 列，下载并解析文本
for index, row in df.iterrows():
    url = row["link"]
    if pd.notna(url) and url.startswith("http"):  # 确保 URL 存在且有效
        try:
            response = requests.get(url, headers=headers, timeout=5)
            response.raise_for_status()  # 确保请求成功

            # 检查 Content-Type，避免解析 PDF、TXT
            content_type = response.headers.get("Content-Type", "")
            if "text/html" not in content_type:
                print(f"Skipping non-HTML content: {url}")
                df.at[index, "script_text"] = "NON-HTML CONTENT"
                continue

            # 解析 HTML，使用 lxml 以提高成功率
            soup = BeautifulSoup(response.text, "lxml")
            text = soup.get_text()

            # 获取前 1000 个字符
            df.at[index, "script_text"] = text[:1000]

        except requests.RequestException as e:
            print(f"Failed to fetch {url}: {e}")
            df.at[index, "script_text"] = "ERROR"  # 标记错误
            
# 保存 DataFrame
df.to_csv("pudding_texts.csv", index=False)

print("Finished downloading and saving scripts.")

Failed to fetch http://www.pages.drexel.edu/~ina22/splaylib/Screenplay-Cocoanuts,_The.pdf: HTTPConnectionPool(host='www.pages.drexel.edu', port=80): Max retries exceeded with url: /~ina22/splaylib/Screenplay-Cocoanuts,_The.pdf (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x0000013B82A94B50>: Failed to resolve 'www.pages.drexel.edu' ([Errno 11001] getaddrinfo failed)"))
Failed to fetch http://www.aellea.com/scripts/WHAT.pdf: 403 Client Error: Forbidden for url: http://www.aellea.com/scripts/WHAT.pdf
Failed to fetch http://www.aellea.com/script/qWAX.htm: 403 Client Error: Forbidden for url: http://www.aellea.com/script/qWAX.htm
Skipping non-HTML content: http://www.dailyscript.com/scripts/Mr%20Smith%20Goes%20To%20Washington.txt
Skipping non-HTML content: http://www.dailyscript.com/scripts/Ninotchka.txt
Failed to fetch http://www.aellea.com/script/itsawonderfullife.txt: 403 Client Error: Forbidden for url: http://www.aellea.com/script/itsawonderfullife.txt


In [16]:
#add a new column to the df with the text downloaded
#save this new dataframe as "pudding_texts.csv"

## Part 4: TMDB database

#### Browse the documentation at https://developer.themoviedb.org/reference/intro/getting-started. Create an account to authenticate

In [19]:
#create a dataset of the movies in theaters now. Include metadata fields you are interested in. 

API_KEY = "82b6a9602146d347451f577295cacf1e"

In [None]:
import requests
import pandas as pd

URL = f"https://api.themoviedb.org/3/movie/now_playing?api_key={API_KEY}&language=en-US&page=1"
response = requests.get(URL)

if response.status_code == 200:
    data = response.json()
    movies = data["results"]

    movie_data = []
    for movie in movies:
        movie_data.append({
            "id": movie["id"],
            "title": movie["title"],
            "release_date": movie["release_date"],
            "overview": movie["overview"],
            "poster_path": movie["poster_path"],
            "vote_average": movie["vote_average"],
            "popularity": movie["popularity"]
        })

    df = pd.DataFrame(movie_data)
    df.to_csv("tmdb_movies.csv", index=False)

    print("Movies dataset saved as 'tmdb_movies.csv'")

else:
    print(f"Error fetching data: {response.status_code}")


Movies dataset saved as 'tmdb_movies.csv'


In [21]:
#download the movie posters for 10 of these movies and save them to this repository

import os

IMAGE_BASE_URL = "https://image.tmdb.org/t/p/w500"

df = pd.read_csv("tmdb_movies.csv")
if not os.path.exists("posters"):
    os.makedirs("posters")

for index, row in df.head(10).iterrows():
    poster_path = row["poster_path"]
    if pd.notna(poster_path):
        image_url = IMAGE_BASE_URL + poster_path
        image_data = requests.get(image_url).content
        with open(f"posters/{row['title'].replace(' ', '_')}.jpg", "wb") as file:
            file.write(image_data)
        
        print(f"Downloaded: {row['title']}")

print("All posters downloaded successfully!")

Downloaded: The Gorge
Downloaded: Flight Risk
Downloaded: Sonic the Hedgehog 3
Downloaded: Captain America: Brave New World
Downloaded: Panda Plan
Downloaded: Companion
Downloaded: My Fault: London
Downloaded: Kraven the Hunter
Downloaded: Paddington in Peru
Downloaded: Dog Man
All posters downloaded successfully!
