In [1]:
from typing import Iterable

from matplotlib.pyplot import title
from scrapy import Request
!pip install scrapy




[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: C:\Users\rails\AppData\Local\Programs\Python\Python312\python.exe -m pip install --upgrade pip


In [2]:
from typing import Iterable
import scrapy
from scrapy.crawler import CrawlerProcess
import json
from scrapy import Request

class WikipediaFilmSpider(scrapy.Spider):
    name = 'wikipedia_film'
    start_urls = ['https://en.wikipedia.org/wiki/']

    def start_requests(self) -> Iterable[Request]:
        url = 'https://en.wikipedia.org/wiki/List_of_highest-grossing_films'
        yield scrapy.Request(url, callback=self.page_parse)

    def page_parse(self, response):
        seen_movies = set()
        tables = response.xpath('//table[contains(@class, "wikitable") and contains(@class, "plainrowheaders")]')

        i = 0
        for table in tables:
            i += 1
            if i > 3:
                break

            rows = table.xpath('.//tbody/tr')

            before = 1925
            for row in rows:
                if i == 3:
                    link = row.xpath('.//td[1]//a/@href').get()
                    name = row.xpath('.//td[1]//a/text()').get()
                    year = row.xpath('.//th[@scope="row"]//a/text()').get()
                    if not year:
                        year = before
                    else:
                        before = year
                else:
                    link = row.xpath('.//th[@scope="row"]//a/@href').get()
                    name = row.xpath('.//th[@scope="row"]//a/text()').get()
                    if i == 1:
                        year = row.xpath('.//td[4]/text()').get()
                    else:
                        year = row.xpath('./td[last()]/text()').get()

                if name and name not in seen_movies:
                    seen_movies.add(name)

                    if link:
                        film_link = 'https://en.wikipedia.org' + link
                        yield scrapy.Request(film_link, callback=self.parse, meta={'name': name, 'link': link, 'year': year})

    def parse(self, response):
        film_data = {}
        film_data['title'] = response.meta['name']
        film_data['link'] = response.meta['link']
        film_data['release_year'] = int(response.meta['year'])

        directors = response.xpath('//th[contains(text(), "Directed by")]/following-sibling::td//a/text()').getall()
        film_data['directors'] = directors if directors else None

        box_office = response.xpath(
            '//th[contains(text(), "Box office")]/following-sibling::td//text()[normalize-space()]').get()
        film_data['box_office'] = box_office.strip().replace("\xa0", " ") if box_office else None

        country = response.xpath('//th[contains(text(), "Country")]/following-sibling::td//text()').get()
        if country:
            film_data['country'] = country.strip()
        else:
            countries = response.xpath(
                '//th[contains(text(), "Countries")]/following-sibling::td//li/text()').getall()
            film_data['countries'] = countries

        yield film_data


process = CrawlerProcess(settings={
    "FEEDS": {
        "films.json": {"format": "json"},
    },
})
process.crawl(WikipediaFilmSpider)
process.start()


2025-02-24 13:30:22 [scrapy.utils.log] INFO: Scrapy 2.12.0 started (bot: scrapybot)
2025-02-24 13:30:22 [scrapy.utils.log] INFO: Versions: lxml 5.3.1.0, libxml2 2.11.7, cssselect 1.2.0, parsel 1.10.0, w3lib 2.3.1, Twisted 24.11.0, Python 3.12.3 (tags/v3.12.3:f6650f9, Apr  9 2024, 14:05:25) [MSC v.1938 64 bit (AMD64)], pyOpenSSL 25.0.0 (OpenSSL 3.4.1 11 Feb 2025), cryptography 44.0.1, Platform Windows-11-10.0.26100-SP0
2025-02-24 13:30:22 [scrapy.addons] INFO: Enabled addons:
[]
2025-02-24 13:30:22 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2025-02-24 13:30:22 [scrapy.extensions.telnet] INFO: Telnet Password: 9da3865f97793862
2025-02-24 13:30:22 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
2025-02-24 13:30:22 [scrapy.crawler] INFO: Overridden settings:
{}
2025-02-24 13:30:23 

In [1]:
import json

with open('films.json', 'r') as f:
    films_data = json.load(f)

In [2]:
films_data

[{'title': 'Avatar',
  'link': '/wiki/Avatar_(2009_film)',
  'release_year': 2009,
  'directors': ['James Cameron'],
  'box_office': '$2.923 billion',
  'countries': ['United Kingdom', 'United States']},
 {'title': 'Star Wars: The Last Jedi',
  'link': '/wiki/Star_Wars:_The_Last_Jedi',
  'release_year': 2017,
  'directors': ['Rian Johnson'],
  'box_office': '$1.334',
  'country': 'United States'},
 {'title': 'Deadpool & Wolverine',
  'link': '/wiki/Deadpool_%26_Wolverine',
  'release_year': 2024,
  'directors': ['Shawn Levy'],
  'box_office': '$1.338 billion',
  'country': 'United States'},
 {'title': 'Harry Potter and the Deathly Hallows – Part 2',
  'link': '/wiki/Harry_Potter_and_the_Deathly_Hallows_%E2%80%93_Part_2',
  'release_year': 2011,
  'directors': ['David Yates'],
  'box_office': '$1.342 billion',
  'countries': ['United Kingdom', 'United States']},
 {'title': 'Black Panther',
  'link': '/wiki/Black_Panther_(film)',
  'release_year': 2018,
  'directors': ['Ryan Coogler'],
 

Now check for missing values

In [3]:
def validate_entries(entries):
    valid_entries = []
    invalid_entries = []

    for entry in entries:
        if (
            'title' in entry and entry['title'] and
            'release_year' in entry and entry['release_year'] and
            'directors' in entry and entry['directors'] and entry['directors'] != [] and
            'box_office' in entry and entry['box_office'] and
            ('countries' in entry and entry['countries'] and entry['countries'] != [] or
             'country' in entry and entry['country'])
        ):
            valid_entries.append(entry)
        else:
            invalid_entries.append(entry)

    return valid_entries, invalid_entries

In [4]:
valid_entries, invalid_entries = validate_entries(films_data)

In [5]:
with open('invalid_entries.json', 'w', encoding='utf-8') as f:
    json.dump(invalid_entries, f, ensure_ascii=False, indent=4)

print(f"Valid entries: {len(valid_entries)}")

Valid entries: 166


In [6]:
len(invalid_entries)

11

In [7]:
invalid_entries

[{'title': 'Moonraker',
  'link': '/wiki/Moonraker_(film)',
  'release_year': 1979,
  'directors': ['Lewis Gilbert'],
  'box_office': '$210.3 million',
  'countries': []},
 {'title': 'Diamonds Are Forever',
  'link': '/wiki/Diamonds_Are_Forever_(film)',
  'release_year': 1971,
  'directors': ['Guy Hamilton'],
  'box_office': '$116 million',
  'countries': []},
 {'title': 'The Bible: In the Beginning',
  'link': '/wiki/The_Bible:_In_the_Beginning',
  'release_year': 1966,
  'directors': ['John Huston'],
  'box_office': '$34.9 million',
  'countries': []},
 {'title': 'From Russia with Love',
  'link': '/wiki/From_Russia_with_Love_(film)',
  'release_year': 1963,
  'directors': ['Terence Young'],
  'box_office': '$79 million',
  'countries': []},
 {'title': 'Goldfinger',
  'link': '/wiki/Goldfinger_(film)',
  'release_year': 1964,
  'directors': ['Guy Hamilton'],
  'box_office': '$125 million',
  'countries': []},
 {'title': 'Lawrence of Arabia',
  'link': '/wiki/Lawrence_of_Arabia_(film)

Since there are very little amount of invalid entries, i will their data by hand (3 minutes)

In [8]:
with open('invalid_entries_filled.json', 'r') as f:
    invalid_entries_filled = json.load(f)

invalid_entries_filled

[{'title': 'Moonraker',
  'link': '/wiki/Moonraker_(film)',
  'release_year': 1979,
  'directors': ['Lewis Gilbert'],
  'box_office': '$210.3 million',
  'countries': ['United Kingdom', 'United States', 'France']},
 {'title': 'Diamonds Are Forever',
  'link': '/wiki/Diamonds_Are_Forever_(film)',
  'release_year': 1971,
  'directors': ['Guy Hamilton'],
  'box_office': '$116 million',
  'countries': ['United Kingdom', 'United States']},
 {'title': 'The Bible: In the Beginning',
  'link': '/wiki/The_Bible:_In_the_Beginning',
  'release_year': 1966,
  'directors': ['John Huston'],
  'box_office': '$34.9 million',
  'countries': ['Italy', 'United States']},
 {'title': 'From Russia with Love',
  'link': '/wiki/From_Russia_with_Love_(film)',
  'release_year': 1963,
  'directors': ['Terence Young'],
  'box_office': '$79 million',
  'countries': ['United Kingdom', 'United States']},
 {'title': 'Goldfinger',
  'link': '/wiki/Goldfinger_(film)',
  'release_year': 1964,
  'directors': ['Guy Hamilt

In [9]:
all_entries = valid_entries + invalid_entries_filled

In [10]:
all_entries

[{'title': 'Avatar',
  'link': '/wiki/Avatar_(2009_film)',
  'release_year': 2009,
  'directors': ['James Cameron'],
  'box_office': '$2.923 billion',
  'countries': ['United Kingdom', 'United States']},
 {'title': 'Star Wars: The Last Jedi',
  'link': '/wiki/Star_Wars:_The_Last_Jedi',
  'release_year': 2017,
  'directors': ['Rian Johnson'],
  'box_office': '$1.334',
  'country': 'United States'},
 {'title': 'Deadpool & Wolverine',
  'link': '/wiki/Deadpool_%26_Wolverine',
  'release_year': 2024,
  'directors': ['Shawn Levy'],
  'box_office': '$1.338 billion',
  'country': 'United States'},
 {'title': 'Harry Potter and the Deathly Hallows – Part 2',
  'link': '/wiki/Harry_Potter_and_the_Deathly_Hallows_%E2%80%93_Part_2',
  'release_year': 2011,
  'directors': ['David Yates'],
  'box_office': '$1.342 billion',
  'countries': ['United Kingdom', 'United States']},
 {'title': 'Black Panther',
  'link': '/wiki/Black_Panther_(film)',
  'release_year': 2018,
  'directors': ['Ryan Coogler'],
 

In [11]:
len(all_entries)

177

# Cleaning

In [16]:
box_oficces = []

count = 0
for entry in all_entries:
    box_oficces.append(entry['box_office'])
    count += 1

box_oficces

['$2.923 billion',
 '$1.334',
 '$1.338 billion',
 '$1.342 billion',
 '$1.35 billion',
 '$1.236 billion',
 '$1.243 billion',
 '$1.159 billion',
 '$1.31 billion',
 '$1.266 billion',
 '$1.280 billion',
 '$1.266',
 '$1.360 billion',
 '$897.5 million',
 '$963.4 million',
 '$1.405 billion',
 '$1.446 billion',
 '$1.006',
 '$935.5 million',
 '$553.7 million',
 '$1.026 billion',
 '$951.6 million',
 '$366.1 million',
 '$546.3 million',
 '$817.4 million',
 '$354.8',
 '$505.7 million',
 '$474.2 million',
 '$394.4 million',
 '$1.453',
 '$320.1 million',
 '$519–520.9',
 '$357.3 million',
 '$388.8 million',
 '$200.1 million',
 '$370 million',
 '$482 million',
 '$396.3 million',
 '$538–549',
 '$389.9',
 '$203.3 million',
 '$257 million',
 '$476.5 million',
 '$225 million',
 '$83.3 million',
 '$75 million (worldwide',
 '$441.3 million',
 '$250–291 million',
 '$173.4 million',
 '$58.5 million',
 '$102.3 million',
 '$104.9 million (North America)',
 '$378 million',
 '$33.7 million',
 '$34.5 million',
 '$

In [38]:
import re

def clean_box_office(box_office_str):
    box_office_str = box_office_str.replace('$', '').replace('US', '').replace('>', '').replace('<', '').strip()
    box_office_str = box_office_str.replace('$', '').replace('US', '').strip()
    box_office_str = re.sub(r'\(.*', '', box_office_str).strip()
    
    if '–' in box_office_str:
        box_office_str = box_office_str.split('–')[1].strip()
    
    if 'billion' not in box_office_str and 'million' not in box_office_str:
        if re.match(r'^\d{1,3}(,\d{3})+$', box_office_str):
            # Handle values with commas, e.g., '6,665,592'
            box_office_str = box_office_str.replace(',', '')
            if int(box_office_str) >= 1_000_000_000:
                box_office_str = str(int(box_office_str) / 1_000_000_000) + ' billion'
            else:
                box_office_str = str(int(box_office_str) / 1_000_000) + ' million'
        else:
            if '.' in box_office_str:
                whole_part = box_office_str.split('.')[0]
            else:
                whole_part = box_office_str
            if whole_part.isdigit() and int(whole_part) < 10:
                box_office_str += ' billion'
            else:
                box_office_str += ' million'
    
    box_office_str += ' $'
    
    return box_office_str

def convert_to_numeric(box_office_str):
    value = box_office_str.replace(' $', '')
    if 'billion' in value:
        return float(value.replace(' billion', '')) * 1_000_000_000
    elif 'million' in value:
        return float(value.replace(' million', '')) * 1_000_000
    return 0

In [50]:
def clean_entries(entries):
    cleaned_entries = []
    for index, entry in enumerate(entries, start=1):
        entry['link'] = 'https://en.wikipedia.org' + entry['link']
        
        if 'country' in entry:
            if 'countries' not in entry:
                entry['countries'] = [entry['country']]
            del entry['country']
        
        
        box_office_str = entry['box_office']
        box_office_str = clean_box_office(box_office_str)
        num_val = convert_to_numeric(box_office_str)
        entry['box_office'] = box_office_str
        entry['num_val_box_office'] = num_val
        
        entry['id'] = index
        
        cleaned_entries.append(entry)
    return cleaned_entries

In [51]:
cleaned_entries = all_entries
cleaned_entries = clean_entries(cleaned_entries)

In [52]:
for entry in cleaned_entries:
    print(entry['box_office'])

2.923 billion $
1.334 billion $
1.338 billion $
1.342 billion $
1.35 billion $
1.236 billion $
1.243 billion $
1.159 billion $
1.31 billion $
1.266 billion $
1.280 billion $
1.266 billion $
1.360 billion $
897.5 million $
963.4 million $
1.405 billion $
1.446 billion $
1.006 billion $
935.5 million $
553.7 million $
1.026 billion $
951.6 million $
366.1 million $
546.3 million $
817.4 million $
354.8 million $
505.7 million $
474.2 million $
394.4 million $
1.453 billion $
320.1 million $
520.9 million $
357.3 million $
388.8 million $
200.1 million $
370 million $
482 million $
396.3 million $
549 million $
389.9 million $
203.3 million $
257 million $
476.5 million $
225 million $
83.3 million $
75 million $
441.3 million $
291 million $
173.4 million $
58.5 million $
102.3 million $
104.9 million $
378 million $
33.7 million $
34.5 million $
146 million $
103.1 million $
50 million $
50.1 million $
72.7 million $
507.1 million $
303 million $
50 million $
44.1 million $
17.5 million

In [53]:
cleaned_entries

[{'title': 'Avatar',
  'link': 'https://en.wikipedia.orghttps://en.wikipedia.orghttps://en.wikipedia.orghttps://en.wikipedia.orghttps://en.wikipedia.orghttps://en.wikipedia.orghttps://en.wikipedia.orghttps://en.wikipedia.orghttps://en.wikipedia.org/wiki/Avatar_(2009_film)',
  'release_year': 2009,
  'directors': ['James Cameron'],
  'box_office': '2.923 billion $',
  'countries': ['United Kingdom', 'United States'],
  'num_val_box_office': 2923000000.0,
  '_id': ObjectId('67bc98e135a9409ecd46d034'),
  'id': 1},
 {'title': 'Star Wars: The Last Jedi',
  'link': 'https://en.wikipedia.orghttps://en.wikipedia.orghttps://en.wikipedia.orghttps://en.wikipedia.orghttps://en.wikipedia.orghttps://en.wikipedia.orghttps://en.wikipedia.orghttps://en.wikipedia.orghttps://en.wikipedia.org/wiki/Star_Wars:_The_Last_Jedi',
  'release_year': 2017,
  'directors': ['Rian Johnson'],
  'box_office': '1.334 billion $',
  'countries': ['United States'],
  'num_val_box_office': 1334000000.0,
  '_id': ObjectId('6

# Database setup

## Changes in structure

### Title - same as in PDF
### Release_year - same as in PDF
### director - list of strings
### box_office - same as in PDF
### num_val_box_office - numeric value of box office
### link to the resource - added
### countries - list of origin countries

In [54]:
!pip install pymongo
#!pip install --upgrade pymongo
#!pip install pymongo==3.4.0
#!python -m pip install "pymongo[srv]"




[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: C:\Users\rails\AppData\Local\Programs\Python\Python312\python.exe -m pip install --upgrade pip


In [55]:
from pymongo import MongoClient


username = "karma"
password = "myPass123"
URL = "@dwassignment1.ycohu.mongodb.net/?retryWrites=true&w=majority&appName=DWAssignment1"

mongo_uri = f"mongodb+srv://{username}:{password}{URL}"

client = MongoClient(mongo_uri)

In [56]:
try:
    client.admin.command("ping")
    print("Successfully connected to the server!")
except Exception as e:
    print(f"Failed to connect: {e}")

Successfully connected to the server!


In [57]:
db = client['films_database']
films_collection = db['films']

In [59]:
result = films_collection.delete_many({})

In [60]:
films_collection.insert_many(cleaned_entries)

InsertManyResult([ObjectId('67bc98e135a9409ecd46d034'), ObjectId('67bc98e135a9409ecd46d035'), ObjectId('67bc98e135a9409ecd46d036'), ObjectId('67bc98e135a9409ecd46d037'), ObjectId('67bc98e135a9409ecd46d038'), ObjectId('67bc98e135a9409ecd46d039'), ObjectId('67bc98e135a9409ecd46d03a'), ObjectId('67bc98e135a9409ecd46d03b'), ObjectId('67bc98e135a9409ecd46d03c'), ObjectId('67bc98e135a9409ecd46d03d'), ObjectId('67bc98e135a9409ecd46d03e'), ObjectId('67bc98e135a9409ecd46d03f'), ObjectId('67bc98e135a9409ecd46d040'), ObjectId('67bc98e135a9409ecd46d041'), ObjectId('67bc98e135a9409ecd46d042'), ObjectId('67bc98e135a9409ecd46d043'), ObjectId('67bc98e135a9409ecd46d044'), ObjectId('67bc98e135a9409ecd46d045'), ObjectId('67bc98e135a9409ecd46d046'), ObjectId('67bc98e135a9409ecd46d047'), ObjectId('67bc98e135a9409ecd46d048'), ObjectId('67bc98e135a9409ecd46d049'), ObjectId('67bc98e135a9409ecd46d04a'), ObjectId('67bc98e135a9409ecd46d04b'), ObjectId('67bc98e135a9409ecd46d04c'), ObjectId('67bc98e135a9409ecd46d0

# Export DB to JSON

In [61]:
from pymongo import MongoClient

username = "karma"
password = "myPass123"
URL = "@dwassignment1.ycohu.mongodb.net/?retryWrites=true&w=majority&appName=DWAssignment1"

mongo_uri = f"mongodb+srv://{username}:{password}{URL}"

client = MongoClient(mongo_uri)

In [62]:
try:
    client.admin.command("ping")
    print("Successfully connected to the server!")
except Exception as e:
    print(f"Failed to connect: {e}")

Successfully connected to the server!


In [63]:
db = client['films_database']
films_collection = db['films']

In [64]:
documents = list(films_collection.find())

In [65]:
for doc in documents:
    doc.pop('_id', None)

output_file_path = 'films_data.json'
with open(output_file_path, 'w', encoding='utf-8') as file:
    json.dump(documents, file, ensure_ascii=False, indent=4)

print(f"Exported {len(documents)} documents to {output_file_path}.")

Exported 177 documents to films_data.json.
