In [None]:
from requests import get
from bs4 import BeautifulSoup
from warnings import warn
from time import sleep
from random import randint
import numpy as np, pandas as pd
import seaborn as sns
import requests
import re
import time

In [None]:
pages = np.arange(1, 9951, 50) 
headers = {'Accept-Language': 'en-US,en;q=0.8'} # If this is not specified, the default language is Mandarin

#initialize empty lists to store the variables scraped
titles = []
years = []
ratings = []
genres = []
runtimes = []
imdb_ratings = []
imdb_ratings_standardized = []
metascores = []
votes = []

for page in pages:
  
   #get request for sci-fi
   response = get("https://www.imdb.com/search/title?genres=sci-fi&"
                  + "start="
                  + str(page)
                  + "&explore=title_type,genres&ref_=adv_prv", headers=headers)
  
   sleep(randint(8,15))
   
   #throw warning for status codes that are not 200
   if response.status_code != 200:
       warn('Request: {}; Status code: {}'.format(requests, response.status_code))

   #parse the content of current iteration of request
   page_html = BeautifulSoup(response.text, 'html.parser')
      
   movie_containers = page_html.find_all('div', class_ = 'lister-item mode-advanced')
  
   #extract the 50 movies for that page
   for container in movie_containers:

       #conditional for all with metascore
       if container.find('div', class_ = 'ratings-metascore') is not None:

           #title
           title = container.h3.a.text
           titles.append(title)

           if container.h3.find('span', class_= 'lister-item-year text-muted unbold') is not None:
            
             #year released
             year = container.h3.find('span', class_= 'lister-item-year text-muted unbold').text # remove the parentheses around the year and make it an integer
             years.append(year)

           else:
             years.append(None) # each of the additional if clauses are to handle type None data, replacing it with an empty string so the arrays are of the same length at the end of the scraping

           if container.p.find('span', class_ = 'certificate') is not None:
            
             #rating
             rating = container.p.find('span', class_= 'certificate').text
             ratings.append(rating)

           else:
             ratings.append("")

           if container.p.find('span', class_ = 'genre') is not None:
            
             #genre
             genre = container.p.find('span', class_ = 'genre').text.replace("\n", "").rstrip().split(',') # remove the whitespace character, strip, and split to create an array of genres
             genres.append(genre)
          
           else:
             genres.append("")

           if container.p.find('span', class_ = 'runtime') is not None:

             #runtime
             time = int(container.p.find('span', class_ = 'runtime').text.replace(" min", "")) # remove the minute word from the runtime and make it an integer
             runtimes.append(time)

           else:
             runtimes.append(None)

           if float(container.strong.text) is not None:

             #IMDB ratings
             imdb = float(container.strong.text) # non-standardized variable
             imdb_ratings.append(imdb)

           else:
             imdb_ratings.append(None)

           if container.find('span', class_ = 'metascore').text is not None:

             #Metascore
             m_score = int(container.find('span', class_ = 'metascore').text) # make it an integer
             metascores.append(m_score)

           else:
             metascores.append(None)

           if container.find('span', attrs = {'name':'nv'})['data-value'] is not None:

             #Number of votes
             vote = int(container.find('span', attrs = {'name':'nv'})['data-value'])
             votes.append(vote)

           else:
               votes.append(None)

       else:
          votes.append(None)

In [None]:
sci_fi_df = pd.DataFrame({'movie': titles,
                      'year': years,
                      'rating': ratings,
                      'genre': genres,
                      'runtime_min': runtimes,
                      'imdb': imdb_ratings,
                      'metascore': metascores,})
                      #'votes': votes}
                      

sci_fi_df.loc[:, 'year'] = sci_fi_df['year'].str[-5:-1] # two more data transformations after scraping
sci_fi_df['n_imdb'] = sci_fi_df['imdb'] * 10
final_df = sci_fi_df.loc[sci_fi_df['year'] != 'ovie']
final_df.loc[:, 'year'] = pd.to_numeric(final_df['year'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [None]:
sci_fi_df

Unnamed: 0,movie,year,rating,genre,runtime_min,imdb,metascore,n_imdb
0,Ant-Man and the Wasp: Quantumania,2023,PG-13,"[Action, Adventure, Comedy]",124,6.5,48,65.0
1,Black Panther: Wakanda Forever,2022,PG-13,"[Action, Adventure, Drama]",161,6.8,67,68.0
2,Infinity Pool,2023,R,"[Crime, Horror, Mystery]",117,6.1,72,61.0
3,Avatar: The Way of Water,2022,PG-13,"[Action, Adventure, Fantasy]",192,7.8,67,78.0
4,Everything Everywhere All at Once,2022,R,"[Action, Adventure, Comedy]",139,8.0,81,80.0
...,...,...,...,...,...,...,...,...
1349,King Car,2021,,"[Fantasy, Sci-Fi]",99,5.5,71,55.0
1350,Closer to God,2014,,"[Horror, Sci-Fi, Thriller]",81,4.6,54,46.0
1351,Uncle Kent 2,2015,Unrated,"[Comedy, Sci-Fi]",73,5.3,54,53.0
1352,4,2004,Not Rated,"[Drama, Mystery, Sci-Fi]",126,6.5,72,65.0


In [None]:
# Downloading imdb top 250 movie's data
#url = 'http://www.imdb.com/chart/top'
url = 'https://www.imdb.com/chart/toptv?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=470df400-70d9-4f35-bb05-8646a1195842&pf_rd_r=H97E3KDAPESFSG3M1N8P&pf_rd_s=right-4&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_ql_6'
#url = 'https://www.imdb.com/search/title/?groups=top_250'
headers = {'Accept-Language': 'en-US,en;q=0.8'}
#response = requests.get(url,headers = hearders)
response = get(url,headers = headers)
soup = BeautifulSoup(response.text, "html.parser")
movies = soup.select('td.titleColumn')
crew = [a.attrs.get('title') for a in soup.select('td.titleColumn a')]
ratings = [b.attrs.get('data-value')
        for b in soup.select('td.posterColumn span[name=ir]')]
 
 
 
 
# create a empty list for storing
# movie information
list = []
 
# Iterating over movies to extract
# each movie's details
for index in range(0, len(movies)):
     
    # Separating movie into: 'place',
    # 'title', 'year'
    movie_string = movies[index].get_text()
    movie = (' '.join(movie_string.split()).replace('.', ''))
    movie_title = movie[len(str(index))+1:-7]
    year = re.search('\((.*?)\)', movie_string).group(1)
    place = movie[:len(str(index))-(len(movie))]
    data = {"place": place,
            "movie_title": movie_title,
            "rating": ratings[index],
            "year": year,
            "star_cast": crew[index],
            }
    list.append(data)
 
# printing movie details with its rating.
for movie in list:
    print(movie['place'], '-', movie['movie_title'], '('+movie['year'] +
        ') -', 'Starring:', movie['star_cast'], movie['rating'])
 
 
##.......##
df = pd.DataFrame(list)
#df.to_csv('imdb_top_250_movies.csv',index=False)
df.to_csv('imdb_top_250_TVShows.csv',index=False)

1 - Planet Earth II (2016) - Starring: David Attenborough, Chadden Hunter 9.434127866969837
2 - Breaking Bad (2008) - Starring: Bryan Cranston, Aaron Paul 9.428984443133594
3 - Planet Earth (2006) - Starring: Sigourney Weaver, David Attenborough 9.413462331901338
4 - Band of Brothers (2001) - Starring: Scott Grimes, Damian Lewis 9.389331491667111
5 - Chernobyl (2019) - Starring: Jessie Buckley, Jared Harris 9.315570339347165
6 - The Wire (2002) - Starring: Dominic West, Lance Reddick 9.288949208580387
7 - Avatar: The Last Airbender (2005) - Starring: Dee Bradley Baker, Zach Tyler Eisen 9.231121072154993
8 - Blue Planet II (2017) - Starring: David Attenborough, Peter Drost 9.230447562838199
9 - The Sopranos (1999) - Starring: James Gandolfini, Lorraine Bracco 9.201739000451113
1 -  Cosmos: A Spacetime Odyssey (2014) - Starring: Neil deGrasse Tyson, Christopher Emerson 9.20167183359827
11 - Cosmos (1980) - Starring: Carl Sagan, Jaromír Hanzlík 9.187163757714634
12 - Our Planet (2019) - S

In [None]:

url = 'http://www.imdb.com/chart/top'
#url = 'https://www.imdb.com/chart/toptv?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=470df400-70d9-4f35-bb05-8646a1195842&pf_rd_r=H97E3KDAPESFSG3M1N8P&pf_rd_s=right-4&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_ql_6'
#url = 'https://www.imdb.com/search/title/?groups=top_250'
headers = {'Accept-Language': 'en-US,en;q=0.8'}
#response = requests.get(url,headers = hearders)
response = get(url,headers = headers)
soup = BeautifulSoup(response.text, "html.parser")
movies = soup.select('td.titleColumn')
crew = [a.attrs.get('title') for a in soup.select('td.titleColumn a')]
ratings = [b.attrs.get('data-value')
        for b in soup.select('td.posterColumn span[name=ir]')]
 
 
 
 
# create a empty list for storing
# movie information
list = []
 
# Iterating over movies to extract
# each movie's details
for index in range(0, len(movies)):
     
    # Separating movie into: 'place',
    # 'title', 'year'
    movie_string = movies[index].get_text()
    movie = (' '.join(movie_string.split()).replace('.', ''))
    movie_title = movie[len(str(index))+1:-7]
    year = re.search('\((.*?)\)', movie_string).group(1)
    place = movie[:len(str(index))-(len(movie))]
    data = {"place": place,
            "movie_title": movie_title,
            "rating": ratings[index],
            "year": year,
            "star_cast": crew[index],
            }
    list.append(data)
 
# printing movie details with its rating.
for movie in list:
    print(movie['place'], '-', movie['movie_title'], '('+movie['year'] +
        ') -', 'Starring:', movie['star_cast'], movie['rating'])
 
 
##.......##
df = pd.DataFrame(list)
df.to_csv('imdb_top_250_movies.csv',index=False)
#df.to_csv('imdb_top_250_TVShows.csv',index=False)

1 - The Shawshank Redemption (1994) - Starring: Frank Darabont (dir.), Tim Robbins, Morgan Freeman 9.235680191697096
2 - The Godfather (1972) - Starring: Francis Ford Coppola (dir.), Marlon Brando, Al Pacino 9.155951663875934
3 - The Dark Knight (2008) - Starring: Christopher Nolan (dir.), Christian Bale, Heath Ledger 8.991261163569767
4 - The Godfather Part II (1974) - Starring: Francis Ford Coppola (dir.), Al Pacino, Robert De Niro 8.983868908887796
5 - 12 Angry Men (1957) - Starring: Sidney Lumet (dir.), Henry Fonda, Lee J. Cobb 8.952796779372571
6 - Schindler's List (1993) - Starring: Steven Spielberg (dir.), Liam Neeson, Ralph Fiennes 8.939751183116414
7 - The Lord of the Rings: The Return of the King (2003) - Starring: Peter Jackson (dir.), Elijah Wood, Viggo Mortensen 8.931891816020805
8 - Pulp Fiction (1994) - Starring: Quentin Tarantino (dir.), John Travolta, Uma Thurman 8.846696316923408
9 - The Lord of the Rings: The Fellowship of the Ring (2001) - Starring: Peter Jackson (d

In [None]:
HEADERS ={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0", "Accept-Encoding":"gzip, deflate", "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "DNT":"1","Connection":"close", "Upgrade-Insecure-Requests":"1"}
#url = "https://www.imdb.com/search/title/?genres=Adventure&sort=user_rating,desc&title_type=feature&num_votes=25000,&pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=5aab685f-35eb-40f3-95f7-c53f09d542c3&pf_rd_r=N97GEQS6R7J9EV7V770D&pf_rd_s=right-6&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_gnr_16"
#url = "https://www.imdb.com/search/title/?groups=top_250"
def get_movies(url, interval, file_name):
# Sending a request to the speciifed URL
  resp = requests.get(url, headers=headers)

  # Converting the response to Beautiful Soup Object
  content = BeautifulSoup(resp.content, 'lxml')
  
  movie_list = []
  # Iterating throught the list of movies 
  for movie in content.select('.lister-item-content'):
          
      try:
          #movietitle = movie.select('.lister-item-header')[0],
          #p = re.compile(r'[（](.*?)[）]', re.S)
          #new_title = re.sub(p,'',str(movietitle))
          #titles = bytes(new_title, encoding = "utf8")  
          # Creating a python dictonary
          data = {
              
              "title":movie.select('.lister-item-header')[0].get_text().strip(),
              "year":movie.select('.lister-item-year')[0].get_text().strip(),
              "certificate":movie.select('.certificate')[0].get_text().strip(),
              "time":movie.select('.runtime')[0].get_text().strip(),
              "genre":movie.select('.genre')[0].get_text().strip(),
              "rating":movie.select('.ratings-imdb-rating')[0].get_text().strip(),
              "metascore":movie.select('.ratings-metascore')[0].get_text().strip(),
              "simple_desc":movie.select('.text-muted')[2].get_text().strip(),
              "votes":movie.select('.sort-num_votes-visible')[0].get_text().strip()
              
                  
          }
      except IndexError:
          continue
      movie_list.append(data)
         
        
  dataframe = pd.DataFrame(movie_list)
  dataframe.to_csv(file_name)    

In [None]:
url = "https://www.imdb.com/search/title/?groups=top_1000"
resp = requests.get(url, headers=headers)
headers = {'Accept-Language': 'en-US,en;q=0.8'}

# Converting the response to Beautiful Soup Object
content = BeautifulSoup(resp.content, 'lxml')
  
movie_list = []
  # Iterating throught the list of movies 
for movie in content.select('.lister-item-content'):
  movie0 = movie.select('.lister-item-header')[0]


In [None]:
#movie0.get_text().strip()
type(movie0)

bs4.element.Tag

In [None]:
p = re.compile(r'[（](.*?)[）]', re.S)
new_title = re.sub(p,'',str(movietitle))
titles = bytes(new_title, encoding = "utf8") 
titles

b'(<h3 class="lister-item-header">\n<span class="lister-item-index unbold text-primary">50.</span>\n<a href="/title/tt0361748/">Inglourious Basterds</a>\n<span class="lister-item-year text-muted unbold">(2009)</span>\n</h3>,)'

In [None]:
url = "https://www.imdb.com/search/title/?groups=top_1000"
get_movies(url, 0, 'Top1000_movies.csv')

In [None]:
url = "https://www.imdb.com/search/title/?release_date=1900-01-01,2023-12-31&count=250&start=4001&ref_=adv_nxt"
get_movies(url, 0, '16.csv')

In [None]:
url = "https://www.imdb.com/search/title/?release_date=1900-01-01,2023-12-31&count=250&start=3751&ref_=adv_nxt"
get_movies(url, 0, '15.csv')

In [None]:
t14 = pd.read_csv('test.csv')
t15 = pd.read_csv('15.csv')
t16 = pd.read_csv('16.csv')
t17 = pd.read_csv('17.csv')
t18 = pd.read_csv('18.csv')
t19 = pd.read_csv('19.csv')
t20 = pd.read_csv('20.csv')
t21 = pd.read_csv('21.csv')
t22 = pd.read_csv('22.csv')
t23 = pd.read_csv('23.csv')
t24 = pd.read_csv('24.csv')
t25 = pd.read_csv('25.csv')
t26 = pd.read_csv('26.csv')
t27 = pd.read_csv('27.csv')
t28 = pd.read_csv('28.csv')
t29 = pd.read_csv('29.csv')
t30 = pd.read_csv('30.csv')
t31 = pd.read_csv('31.csv')
t32 = pd.read_csv('32.csv')
t33 = pd.read_csv('33.csv')
t34 = pd.read_csv('34.csv')
t35 = pd.read_csv('35.csv')
t36 = pd.read_csv('36.csv')
data = pd.concat([t14,t15,t16,t17,t18,t19,t20,t21,t22,t23,t24,t25,t26,t28,t29,t30,t31,t32,t33,t34,t35,t36],axis=0)
data
data.to_csv('top5000.csv')

In [None]:
pip install imdbpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting imdbpy
  Downloading IMDbPY-2022.7.9-py3-none-any.whl (1.2 kB)
Collecting cinemagoer
  Downloading cinemagoer-2022.12.27-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.2/297.2 KB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: cinemagoer, imdbpy
Successfully installed cinemagoer-2022.12.27 imdbpy-2022.7.9


In [None]:
import imdb
ia = imdb.IMDb()

In [None]:
TV250 = pd.read_csv('imdb_top_250_TVShows.csv')
tv250=TV250['movie_title']
tv250_countries = []
for i in range (len(tv250)):
  movie = ia.search_movie(tv250[i])[0]
  ia.update(movie,'main')
  country = movie.get('countries')[0]
  tv250_countries.append(country)
tv250_countries

['United Kingdom',
 'United States',
 'United Kingdom',
 'United Kingdom',
 'United States',
 'United States',
 'United States',
 'United Kingdom',
 'United States',
 'United States',
 'United States',
 'United States',
 'United States',
 'United Kingdom',
 'United States',
 'Japan',
 'United States',
 'Canada',
 'United States',
 'United States',
 'United Kingdom',
 'United States',
 'Australia',
 'United States',
 'India',
 'United States',
 'Japan',
 'United Kingdom',
 'United States',
 'United States',
 'United Kingdom',
 'United Kingdom',
 'Japan',
 'United States',
 'United Kingdom',
 'United States',
 'United Kingdom',
 'United States',
 'Japan',
 'United States',
 'Turkey',
 'Poland',
 'United Kingdom',
 'United States',
 'Japan',
 'United States',
 'United States',
 'United States',
 'United States',
 'United States',
 'United States',
 'India',
 'France',
 'United Kingdom',
 'United States',
 'United Kingdom',
 'West Germany',
 'United States',
 'United States',
 'Turkey',
 '

In [None]:
TV250['countries'] = tv250_countries
TV250.to_csv('top_250_TVShows.csv')

In [None]:
mv250 = pd.read_csv('imdb_top_250_movies.csv')
mv250=mv250['movie_title']
mv250_countries = []
for i in range (len(mv250)):
  movie = ia.search_movie(mv250[i])[0]
  ia.update(movie,'main')
  country = movie.get('countries')[0]
  mv250_countries.append(country)
mv250_countries
mv250['countries'] = mv250_countries
mv250.to_csv('top_250_movies.csv')

In [None]:
mv250 = pd.read_csv('imdb_top_250_movies.csv')
mv250['countries'] = mv250_countries
mv250
mv250.to_csv('top_250_movies.csv')

In [None]:
Top1000 = pd.read_csv('Top1000_movies1.csv')
topname = Top1000['title']
topcountries = []
for i in range (len(topname)):
  if ia.search_movie(topname[i])==[]:
    cotest = 'Unknown'
  else:
    movie = ia.search_movie(topname[i])[0]
    ia.update(movie,'main')
    if movie.get('countries')==None:
      cotest = 'Unknown'
    else:
      cotest = movie.get('countries')[0]
  topcountries.append(cotest)
topcountries

['United States',
 'United Kingdom',
 'United States',
 'United States',
 'United States',
 'United States',
 'United States',
 'United States',
 'Germany',
 'United Kingdom',
 'United States',
 'United Kingdom',
 'United States',
 'United States',
 'United States',
 'United States',
 'United States',
 'United States',
 'United States',
 'Unknown',
 'United States',
 'New Zealand',
 'United States',
 'United States',
 'United States',
 'United States',
 'United States',
 'United States',
 'United States',
 'United States',
 'United States',
 'United States',
 'United States',
 'Germany',
 'United States',
 'Unknown',
 'United States',
 'United States',
 'United States',
 'United Kingdom',
 'United States',
 'South Korea',
 'United Kingdom',
 'United States',
 'United States',
 'United States',
 'United States',
 'Germany',
 'United States',
 'United States',
 'United States',
 'Hong Kong',
 'United States',
 'United States',
 'United Kingdom',
 'United Kingdom',
 'United States',
 'Uni

In [None]:
Top1000['countries'] = topcountries
Top1000
Top1000.to_csv('top_1000_movies.csv')

In [None]:

topname = 'Avengers: Endgame'
if ia.search_movie(topname)==[]:
  cotest = 'Unknown'
else:
  movie = ia.search_movie(topname)[0]
  ia.update(movie,'main')
  if movie.get('countries')==None:
    cotest = 'Unknown'
  else:
    cotest = movie.get('countries')[0]
cotest

'United States'

In [None]:
data=pd.read_csv('top5000.csv')
top5000 = data.reset_index(drop=True)
top5000

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,title,year,certificate,time,genre,rating,metascore,simple_desc,votes
0,0,0,0.0,2.\nAnt-Man and the Wasp: Quantumania\n(2023),(2023),PG-13,124 min,"Action, Adventure, Comedy",6.5,48 \n Metascore,"Scott Lang and Hope Van Dyne, along with Hank ...","Votes:\n71,838"
1,1,1,1.0,3.\nCocaine Bear\n(2023),(2023),R,95 min,"Comedy, Thriller",6.5,54 \n Metascore,"An oddball group of cops, criminals, tourists ...","Votes:\n10,536"
2,2,2,2.0,9.\nThe Whale\n(2022),(2022),R,117 min,Drama,7.8,60 \n Metascore,"A reclusive, morbidly obese English teacher at...","Votes:\n59,077"
3,3,3,3.0,10.\nBabylon\n(I) (2022),(I) (2022),R,189 min,"Comedy, Drama, History",7.3,60 \n Metascore,A tale of outsized ambition and outrageous exc...,"Votes:\n81,750"
4,4,4,4.0,11.\nKnock at the Cabin\n(2023),(2023),R,100 min,"Horror, Mystery, Thriller",6.2,63 \n Metascore,"While vacationing, a girl and her parents are ...","Votes:\n39,406"
...,...,...,...,...,...,...,...,...,...,...,...,...
4987,122,122,,"9,242.\nGigi\n(1958)",(1958),G,115 min,"Comedy, Musical, Romance",6.6,82 \n Metascore,"Weary of the conventions of Parisian society, ...","Votes:\n23,409"
4988,123,123,,"9,244.\nCharlie Says\n(2018)",(2018),R,110 min,"Biography, Crime, Drama",5.9,57 \n Metascore,The tragic tale of an all-American girl who wa...,"Votes:\n4,977\n| Gross:\n$0.04M"
4989,124,124,,"9,246.\nSon in Law\n(1993)",(1993),PG-13,95 min,"Comedy, Drama, Romance",5.8,33 \n Metascore,"Having gotten a taste of college life, a drast...","Votes:\n21,147\n| Gross:\n$36.45M"
4990,125,125,,"9,247.\nThe Secret of My Success\n(1987)",(1987),PG-13,111 min,"Comedy, Romance",6.5,36 \n Metascore,A talented young man can't get an executive po...,"Votes:\n34,798\n| Gross:\n$67.00M"


In [None]:
topname = top5000['title']
topcountries = []
record = []
for i in range (4494,4993):
  if ia.search_movie(topname[i])==[]:
    cotest = 'Unknown'
  else:
    movie = ia.search_movie(topname[i])[0]
    ia.update(movie,'main')
    if movie.get('countries')==None:
      cotest = 'Unknown'
    else:
      cotest = movie.get('countries')[0]
  topcountries.append(cotest)
  record.append(topname[i])
topcountries

KeyError: ignored

In [None]:
len(topcountries)

498

In [None]:
len(topname)

4992

In [None]:
import numpy as np
f=np.array(topcountries)
np.save('f.npy',f)

In [None]:
record[-1]

'8,273.\nAssault on Precinct 13\n(1976)'

In [None]:
topname[4493]

'8,273.\nAssault on Precinct 13\n(1976)'

In [None]:
top5000 = data.reset_index(drop=True)
top5000
topname = top5000['title']
topname[0]

'2.\nAnt-Man and the Wasp: Quantumania\n(2023)'

In [None]:
top5000

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,title,year,certificate,time,genre,rating,metascore,simple_desc,votes
0,0,0,0.0,2.\nAnt-Man and the Wasp: Quantumania\n(2023),(2023),PG-13,124 min,"Action, Adventure, Comedy",6.5,48 \n Metascore,"Scott Lang and Hope Van Dyne, along with Hank ...","Votes:\n71,838"
1,1,1,1.0,3.\nCocaine Bear\n(2023),(2023),R,95 min,"Comedy, Thriller",6.5,54 \n Metascore,"An oddball group of cops, criminals, tourists ...","Votes:\n10,536"
2,2,2,2.0,9.\nThe Whale\n(2022),(2022),R,117 min,Drama,7.8,60 \n Metascore,"A reclusive, morbidly obese English teacher at...","Votes:\n59,077"
3,3,3,3.0,10.\nBabylon\n(I) (2022),(I) (2022),R,189 min,"Comedy, Drama, History",7.3,60 \n Metascore,A tale of outsized ambition and outrageous exc...,"Votes:\n81,750"
4,4,4,4.0,11.\nKnock at the Cabin\n(2023),(2023),R,100 min,"Horror, Mystery, Thriller",6.2,63 \n Metascore,"While vacationing, a girl and her parents are ...","Votes:\n39,406"
...,...,...,...,...,...,...,...,...,...,...,...,...
4987,122,122,,"9,242.\nGigi\n(1958)",(1958),G,115 min,"Comedy, Musical, Romance",6.6,82 \n Metascore,"Weary of the conventions of Parisian society, ...","Votes:\n23,409"
4988,123,123,,"9,244.\nCharlie Says\n(2018)",(2018),R,110 min,"Biography, Crime, Drama",5.9,57 \n Metascore,The tragic tale of an all-American girl who wa...,"Votes:\n4,977\n| Gross:\n$0.04M"
4989,124,124,,"9,246.\nSon in Law\n(1993)",(1993),PG-13,95 min,"Comedy, Drama, Romance",5.8,33 \n Metascore,"Having gotten a taste of college life, a drast...","Votes:\n21,147\n| Gross:\n$36.45M"
4990,125,125,,"9,247.\nThe Secret of My Success\n(1987)",(1987),PG-13,111 min,"Comedy, Romance",6.5,36 \n Metascore,A talented young man can't get an executive po...,"Votes:\n34,798\n| Gross:\n$67.00M"


In [None]:
a=np.load('a.npy')
a=a.tolist()
b=np.load('b.npy')
b=b.tolist()
c=np.load('c.npy')
c=c.tolist()
d=np.load('d.npy')
d=d.tolist()
e=np.load('e.npy')
e=e.tolist()
f=np.load('f.npy')
f=f.tolist()
countries5000 = a+b+c+d+e+f
top5000['countries'] = countries5000
top5000
top5000.to_csv('top_5000_movies.csv')