In [1]:
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup
import numpy as np
import json

In [2]:
url = 'https://www.imdb.com/chart/top/'

In [3]:
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
}
response = requests.get(url, headers=headers)

In [4]:
if response.status_code == 200:
    soup = BeautifulSoup(response.text, "html.parser")

    movie_list = soup.find("ul", class_="ipc-metadata-list")

    if movie_list:
        movie_list_dict = {}  

        movie_names = []
        movie_years = []

        for movie in movie_list.find_all("h3", class_="ipc-title__text"):
            movie_name = movie.text.strip()
            movie_names.append(movie_name)

        for movie in movie_list.find_all("span", class_="sc-c7e5f54-8 hgjcbi cli-title-metadata-item"):
            movie_year = movie.text.strip()
            movie_years.append(movie_year)

        movie_list_dict["Movie Names"] = movie_names
        movie_list_dict["Release Years"] = movie_years

        print("Movie list:")
        print(movie_list_dict)
    else:
        print("Movie list not found on the page. Website structure may have changed.")
else:
    print("Failed to retrieve the webpage.")


Movie list:
{'Movie Names': ['1. The Shawshank Redemption', '2. The Godfather', '3. The Dark Knight', '4. The Godfather Part II', '5. 12 Angry Men', "6. Schindler's List", '7. The Lord of the Rings: The Return of the King', '8. Pulp Fiction', '9. The Lord of the Rings: The Fellowship of the Ring', '10. Il buono, il brutto, il cattivo', '11. Forrest Gump', '12. Fight Club', '13. The Lord of the Rings: The Two Towers', '14. Inception', '15. Star Wars: Episode V - The Empire Strikes Back', '16. The Matrix', '17. Goodfellas', "18. One Flew Over the Cuckoo's Nest", '19. Se7en', "20. It's a Wonderful Life", '21. Shichinin no samurai', '22. Spider-Man: Across the Spider-Verse', '23. Interstellar', '24. The Silence of the Lambs', '25. Saving Private Ryan', '26. Cidade de Deus', '27. La vita è bella', '28. The Green Mile', '29. Star Wars', '30. Terminator 2: Judgment Day', '31. Back to the Future', '32. Sen to Chihiro no kamikakushi', '33. The Pianist', '34. Psycho', '35. Gisaengchung', '36. Gl

In [5]:
movies_list = []

for movie_name in movie_list_dict['Movie Names']:
    rank, movie_title = movie_name.split('. ', 1)
    movie_data = {'rank': rank, 'movie_title': movie_title}
    movies_list.append(movie_data)

In [6]:
with open('movie_data.json', 'w') as json_file:
    json.dump(movies_list, json_file)

print("Movie data has been saved to 'movie_data.json'")

Movie data has been saved to 'movie_data.json'


In [7]:
with open('movie_data.json', 'r') as json_file:
    movie_data = json.load(json_file)

In [8]:
formatted_movie_data = [{'Rank': movie['rank'], 'Movie Title': movie['movie_title']} for movie in movie_data]

In [9]:
formatted_movie_data

[{'Rank': '1', 'Movie Title': 'The Shawshank Redemption'},
 {'Rank': '2', 'Movie Title': 'The Godfather'},
 {'Rank': '3', 'Movie Title': 'The Dark Knight'},
 {'Rank': '4', 'Movie Title': 'The Godfather Part II'},
 {'Rank': '5', 'Movie Title': '12 Angry Men'},
 {'Rank': '6', 'Movie Title': "Schindler's List"},
 {'Rank': '7', 'Movie Title': 'The Lord of the Rings: The Return of the King'},
 {'Rank': '8', 'Movie Title': 'Pulp Fiction'},
 {'Rank': '9',
  'Movie Title': 'The Lord of the Rings: The Fellowship of the Ring'},
 {'Rank': '10', 'Movie Title': 'Il buono, il brutto, il cattivo'},
 {'Rank': '11', 'Movie Title': 'Forrest Gump'},
 {'Rank': '12', 'Movie Title': 'Fight Club'},
 {'Rank': '13', 'Movie Title': 'The Lord of the Rings: The Two Towers'},
 {'Rank': '14', 'Movie Title': 'Inception'},
 {'Rank': '15',
  'Movie Title': 'Star Wars: Episode V - The Empire Strikes Back'},
 {'Rank': '16', 'Movie Title': 'The Matrix'},
 {'Rank': '17', 'Movie Title': 'Goodfellas'},
 {'Rank': '18', 'Movi

In [10]:
# Create an empty dictionary to store the converted data
converted_data = {'Rank': [], 'Movie Title': []}

for movie in formatted_movie_data:
    converted_data['Rank'].append(movie['Rank'])
    converted_data['Movie Title'].append(movie['Movie Title'])

print(converted_data)

{'Rank': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129', '130', '131', '132', '133', '134', '135', '136', '137', '138', '139', '140', '141', '142', '143', '144', '145', '146', '147', '148', '149', '150', '151', '152', '153', '154', '155', '156', '157',

In [11]:
years_dic = {}
years_dic['Release Years'] = [year for year in movie_list_dict['Release Years'] if re.match(r'\d{4}', year)]
print(years_dic)

{'Release Years': ['1994', '1972', '2008', '1974', '1957', '1993', '2003', '1994', '2001', '1966', '1994', '1999', '2002', '2010', '1980', '1999', '1990', '1975', '1995', '1946', '1954', '2023', '2014', '1991', '1998', '2002', '1997', '1999', '1977', '1991', '1985', '2001', '2002', '1960', '2019', '2000', '1994', '1994', '1998', '2006', '2014', '2006', '1995', '1988', '1962', '1942', '2023', '2011', '1936', '1988', '1968', '1954', '1979', '1931', '1979', '2000', '2012', '1981', '2008', '2006', '1950', '1957', '2018', '1980', '1940', '1957', '2018', '1986', '2009', '2012', '1999', '1964', '2003', '2017', '1984', '1995', '1981', '1995', '2019', '2019', '1997', '1997', '2016', '1984', '2009', '1963', '1952', '2018', '2000', '2010', '1985', '1983', '2004', '1968', '2012', '1992', '1952', '1962', '1941', '1931', '1960', '1959', '1958', '1944', '2001', '1983', '1987', '1971', '2010', '1995', '2009', '2020', '1962', '1973', '2011', '1989', '1927', '1988', '2007', '2000', '1948', '1997', '1976

In [12]:
time_dic = {}
time_dic['Movie Duration'] = [value for value in movie_list_dict['Release Years'] if re.match(r'\d+h \d+m', value)]
print(time_dic)

{'Movie Duration': ['2h 22m', '2h 55m', '2h 32m', '3h 22m', '1h 36m', '3h 15m', '3h 21m', '2h 34m', '2h 58m', '2h 41m', '2h 22m', '2h 19m', '2h 59m', '2h 28m', '2h 4m', '2h 16m', '2h 25m', '2h 13m', '2h 7m', '2h 10m', '3h 27m', '2h 20m', '2h 49m', '1h 58m', '2h 49m', '2h 10m', '1h 56m', '3h 9m', '2h 1m', '2h 17m', '1h 56m', '2h 5m', '2h 30m', '1h 49m', '2h 12m', '2h 35m', '1h 28m', '1h 50m', '1h 59m', '2h 31m', '1h 46m', '2h 10m', '1h 46m', '1h 29m', '2h 13m', '1h 42m', '1h 52m', '1h 27m', '2h 35m', '2h 45m', '1h 52m', '1h 57m', '1h 27m', '2h 27m', '1h 53m', '2h 45m', '1h 55m', '1h 38m', '2h 17m', '1h 50m', '1h 28m', '2h 29m', '2h 26m', '2h 5m', '1h 56m', '1h 57m', '2h 17m', '2h 33m', '2h 44m', '2h 2m', '1h 35m', '1h 45m', '2h 40m', '1h 21m', '2h 29m', '2h 58m', '3h 1m', '2h 2m', '2h 14m', '2h 6m', '1h 46m', '3h 49m', '2h 50m', '2h 23m', '1h 43m', '2h 6m', '1h 42m', '1h 43m', '2h 22m', '2h 11m', '1h 48m', '2h 29m', '1h 55m', '1h 39m', '2h 23m', '3h 38m', '1h 59m', '1h 57m', '2h 5m', '2

In [13]:
movie_types = {}
movie_types['Movie Type'] = [value for value in movie_list_dict['Release Years'] if re.match(r'[A-Z]', value)]
print(movie_types)

{'Movie Type': ['R', 'R', 'PG-13', 'R', 'Approved', 'R', 'PG-13', 'R', 'PG-13', 'Approved', 'PG-13', 'R', 'PG-13', 'PG-13', 'PG', 'R', 'R', 'R', 'R', 'PG', 'Not Rated', 'PG', 'PG-13', 'R', 'R', 'R', 'PG-13', 'R', 'PG', 'R', 'PG', 'PG', 'R', 'R', 'R', 'R', 'G', 'R', 'R', 'R', 'R', 'PG-13', 'R', 'Not Rated', 'Not Rated', 'PG', 'R', 'R', 'G', 'PG', 'PG-13', 'PG', 'R', 'G', 'R', 'R', 'R', 'PG', 'G', 'R', 'Passed', 'Approved', 'PG-13', 'R', 'G', 'Approved', 'PG', 'R', 'R', 'PG-13', 'R', 'PG', 'R', 'PG', 'PG', 'G', 'R', 'PG-13', 'R', 'PG-13', 'R', 'TV-PG', 'R', 'PG-13', 'Not Rated', 'G', 'R', 'Unrated', 'G', 'Not Rated', 'PG', 'R', 'G', 'R', 'R', 'Not Rated', 'Approved', 'PG', 'Passed', 'Approved', 'Approved', 'PG', 'Passed', 'R', 'R', 'X', 'R', 'R', 'PG', 'PG-13', 'Approved', 'PG', 'PG-13', 'PG-13', 'Not Rated', 'R', 'PG', 'R', 'Not Rated', 'R', 'R', 'R', 'R', 'Not Rated', 'R', 'PG-13', 'PG-13', 'Passed', 'Passed', 'R', 'PG-13', 'PG-13', 'Passed', 'Approved', 'PG', 'R', 'R', 'R', 'R', 'R', 

In [14]:
combined_data = {**converted_data, **years_dic, **time_dic, **movie_types}
print(combined_data)

{'Rank': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129', '130', '131', '132', '133', '134', '135', '136', '137', '138', '139', '140', '141', '142', '143', '144', '145', '146', '147', '148', '149', '150', '151', '152', '153', '154', '155', '156', '157',

In [15]:
max_length = max(len(combined_data['Rank']), len(combined_data['Movie Title']))

In [16]:
for key in combined_data:
    if len(combined_data[key]) < max_length:
        combined_data[key] += [np.nan] * (max_length - len(combined_data[key]))

df = pd.DataFrame(combined_data)

In [17]:
df.head()

Unnamed: 0,Rank,Movie Title,Release Years,Movie Duration,Movie Type
0,1,The Shawshank Redemption,1994,2h 22m,R
1,2,The Godfather,1972,2h 55m,R
2,3,The Dark Knight,2008,2h 32m,PG-13
3,4,The Godfather Part II,1974,3h 22m,R
4,5,12 Angry Men,1957,1h 36m,Approved


In [21]:
df.to_csv('top_250_movies.csv', index=False)