# Scraping netflix series data from otten Tomatoes

## Importing libraries

In [None]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

## Get request to the required url

In [None]:
url="https://editorial.rottentomatoes.com/guide/best-netflix-shows-and-movies-to-binge-watch-now/"
response=requests.get(url)
print(response.status_code)

200


## creating a soup object

In [None]:
soup=BeautifulSoup(response.content)

In [None]:

def getSeriesNames(obj):
    """this function is used to get the netflix series name.it takes one parameter that is the div that contains the information about the pparticular series"""
    return obj.find("h2").find("a").text
def getPoster(obj):
    """
    this function is used to get the poster source of the netflix series
    it takes one parameter that is the div that contains the information about the particular series.
    """
    return obj.find("img",class_="article_poster")["src"]
def getReleaseYear(obj):
    """
    this function is used to get the release year of the netflix series
    it takes one parameter that is the div that contains the information about the particular series.
    """
    return obj.find("h2").find("span").text.strip(")(")
def getRatings(obj):
    """
    this function is used to get the ratings of the netflix series
    it takes one parameter that is the div that contains the information about the particular series.
    """
    return obj.find("span",class_="tMeterScore").text.strip("%")
def getSynopsis(obj):
    """
    this function is used to get the synopsis of the netflix series
    it takes one parameter that is the div that contains the information about the particular series.
    """
    syn=obj.find("div",class_="synopsis").text[9:]
    syn=syn[0:len(syn)-6]
    return syn
def getCast(obj):
    """
    this function is used to get the cast of the netflix series
    it takes one parameter that is the div that contains the information about the particular series.
    """
    cast=obj.find("div",class_="cast").find_all("a")
    name=""
    for j in cast:
        name+=j.text+","
    return name.strip(",")
def getDirector(obj):
    """
    this function is used to get the director of the netflix series
    it takes one parameter that is the div that contains the information about the particular series.
    """
    director=""
    try:
        a=obj.find("div",class_="director").find_all("a")
        for j in a:
            director+=j.text+","
        return director.strip(",")
    except:
        return np.nan

In [None]:
series_name=[]
posters=[]
release_years=[]
synopsis=[]
ratings=[]
casts=[]
directors=[]
network=[]
genre=[]
language=[]
release_date=[]
def getMoreDetails(obj):
    """
    this function is used to get the more details of the netflix series
    it takes one parameter that is the div that contains the information about the particular series.
    """
    global network
    global genre
    global language
    global release_date
    link=obj.find("h2").find("a")["href"]
    response=requests.get("https:"+link)
    if(response.status_code==200):
        data=BeautifulSoup(response.content)
        for category in data.find_all("div", class_="category-wrap"):
            key = category.find("dt").get_text(strip=True)  # Extract the key (dt content
            value = category.find("dd").get_text("", strip=True) # Extract the value (dd content)
            if(key=="Network"):
                network.append(value)
            elif(key=="Genre"):
                genre.append(value)
            elif(key=="Original Language"):
                language.append(value)
            elif(key=="Release Date"):
                release_date.append(value)
            else:
                continue
    else:
        network.append(np.nan)
        genre.append(np.nan)
        language.append(np.nan)
        release_date.append(np.nan)
    return [network,genre,language,release_date]
#getting all the div tags with class countdown-item that contains the details about the netflix series
# here i is the div that contains the information about the particular series
netflix_series_list=soup.find_all("div",class_="countdown-item")
for i in netflix_series_list:
    series_name.append(getSeriesNames(i))
    posters.append(getPoster(i))
    release_years.append(getReleaseYear(i))
    synopsis.append(getSynopsis(i))
    ratings.append(getRatings(i))
    casts.append(getCast(i))
    directors.append(getDirector(i))
    getMoreDetails(i)

**Creating a dictionary with the scaped data**

In [None]:
series_data={"SeriesName":series_name,
             "Poster":posters,
             "ReleaseYear":release_years,
             "Synopsis":synopsis,
             "Rating":ratings,
             "Cast":casts,
             "Director":directors,
             "Network":network,
             "Genre":genre,
             "Language":language,
             "ReleaseDate":release_date,
}
print(len(series_name))
print(len(posters))
print(len(release_years))
print(len(synopsis))
print(len(ratings))
print(len(casts))
print(len(directors))
print(len(network))
print(len(genre))
print(len(language))
print(len(release_date))


100
100
100
100
100
100
100
100
100
100
100


**converting the dictionary into the dataframe**

In [None]:
netflix_series_data=pd.DataFrame(series_data)

**Exporting the dataframe to csv file**

In [None]:
netflix_series_data.to_csv("netflix_series_data.csv")
print("successfull")

successfull


**Web Scraping Financial News Using Python**

**Importing the libraries**

In [None]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

**Making get request and creating the soup object**

In [None]:
url = "https://www.businesstoday.in/latest/economy"
response = requests.get(url)

**Creating a soup object**

In [None]:
soup=BeautifulSoup(response.content)
news=soup.find("div",class_="section-listing-LHS")
news_data=news.find_all("div",class_="widget-listing")
links=[]
posters=[]
updated=[]
titles=[]
descriptions=[]
for i in news_data:
  links.append(i.find("div",class_="widget-listing-thumb").find("a")["href"])
  posters.append(i.find("div",class_="widget-listing-thumb").find("a").find("img")["data-src"])
  updated.append(i.find("div",class_="widget-listing-content-section").find("span").text[10:])
  titles.append(i.find("div",class_="widget-listing-content-section").find("h2").find("a").text)
  descriptions.append(i.find("div",class_="widget-listing-content-section").find("p").text)

**creating a dictionary using the data**

In [None]:
news={
    "Links":links,
    "Posters":posters,
    "Updated":updated,
    "Titles":titles,
    "Descriptions":descriptions
}

**creating a dataframe using the dictionary**

In [None]:
df=pd.DataFrame(news)
df

Unnamed: 0,Links,Posters,Updated,Titles,Descriptions
0,https://www.businesstoday.in/latest/economy/st...,https://akm-img-a-in.tosshub.com/businesstoday...,"Aug 18, 2024",FPIs turn net sellers; sell shares worth Rs 21...,"From August 1 to 17, FPIs withdrew a net amoun..."
1,https://www.businesstoday.in/latest/economy/st...,https://akm-img-a-in.tosshub.com/businesstoday...,"Aug 16, 2024",‘Did much better than expected’: IMF’s Gita Go...,"""India's growth did much better than we expect..."
2,https://www.businesstoday.in/latest/economy/st...,https://akm-img-a-in.tosshub.com/businesstoday...,"Aug 14, 2024",Wholesale inflation eases to 2.04% in July as ...,The annual rate of inflation for Primary Artic...
3,https://www.businesstoday.in/latest/economy/st...,https://akm-img-a-in.tosshub.com/businesstoday...,"Aug 13, 2024",‘Need more homegrown firms to list’: Nithin Ka...,"Ola Electric, Unicommerce, FirstCry IPOs: Cong..."
4,https://www.businesstoday.in/latest/economy/st...,https://akm-img-a-in.tosshub.com/businesstoday...,"Aug 13, 2024","Indian Railways cancels Rs 30,000-crore tender...",The tender panel found that the company’s bidd...
5,https://www.businesstoday.in/latest/economy/st...,https://akm-img-a-in.tosshub.com/businesstoday...,"Aug 13, 2024",CPI inflation at a 5-yr low: SBI warns about b...,CPI inflation for July: The SBI Ecowrap predic...
6,https://www.businesstoday.in/personal-finance/...,https://akm-img-a-in.tosshub.com/businesstoday...,"Aug 12, 2024",RBI mandates full emergency refunds for NBFC d...,For non-emergency premature withdrawals within...
7,https://www.businesstoday.in/latest/economy/st...,https://akm-img-a-in.tosshub.com/businesstoday...,"Aug 12, 2024",CPI inflation at 59-month low but food price p...,"Month-on-month vegetable inflation rises, anal..."
8,https://www.businesstoday.in/latest/economy/st...,https://akm-img-a-in.tosshub.com/businesstoday...,"Aug 12, 2024",Ministry of Corporate Affairs sets up a dedica...,It has also begun discussions with industry re...
9,https://www.businesstoday.in/latest/economy/st...,https://akm-img-a-in.tosshub.com/businesstoday...,"Aug 10, 2024","Bangladesh update: Garment, Knitted sector is ...",Bangladesh's textile industry is a crucial com...


**Scraping Top Rated Indian Movies from IMDB**

In [None]:
#importing the important libraries
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

In [None]:
# defining the url and headers
url="https://www.imdb.com/india/top-rated-indian-movies/"
headers = {"User-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36","Accept-Language": "en-US,en;q=0.5"}

In [None]:
response=requests.get(url,headers=headers)
if(response.status_code==200):
  soup=BeautifulSoup(response.content)
else:
  print("something went wrong")

In [None]:
movies_list=soup.find_all("li",class_="ipc-metadata-list__item ipc-metadata-list__item--inline ipc-metadata-list-item--link")
posters=[]
names=[]
ratings=[]
directors=[]
for movie in movies_list:
  posters.append(movie.find("img",class_="ipc-image")["src"])
  extra=len(movie.find("span",class_="sc-551fcf62-5 jdAoeV").text)
  names.append(movie.find("span",class_="sc-551fcf62-4 fuEsMb").text[extra:])
  ratings.append(movie.find("span",class_="ipc-rating-star--rating").text)
  link=movie.find("a",class_="ipc-metadata-list-item__icon-link")["href"]
  response=requests.get("https://imdb.com"+link,headers=headers)
  if(response.status_code==200):
    obj=BeautifulSoup(response.content)
    directors.append(obj.find("a",class_="ipc-metadata-list-item__list-content-item ipc-metadata-list-item__list-content-item--link").text)
  else:
    directors.append(np.nan)

['Vidhu Vinod Chopra',
 'Hrishikesh Mukherjee',
 'Mani Ratnam',
 'Nithilan Saminathan',
 'Satyajit Ray',
 'Sundar C.',
 'Mari Selvaraj',
 'Rajkumar Hirani',
 'Rojin Thomas',
 'Fazil',
 'Anurag Kashyap',
 'Madhavan',
 'Madhu C. Narayanan',
 'Kiranraj K',
 'Sibi Malayil',
 'Venkatesh Maha',
 'Aamir Khan',
 'Sathyan Anthikad',
 'Kiran Rao',
 'Nitesh Tiwari',
 'Gowtam Tinnanuri',
 'Sudha Kongara',
 'C. Prem Kumar',
 'Mahesh Manjrekar',
 'Vetrimaaran',
 'Jeethu Joseph',
 'Lokesh Kanagaraj',
 'Kadiri Venkata Reddy',
 'Sathyan Anthikad',
 'Bharathan',
 'Vetrimaaran',
 'T.J. Gnanavel',
 'Mani Ratnam',
 'Hanu Raghavapudi',
 'Jeethu Joseph',
 'S. Shankar',
 'Kundan Shah',
 'Dibakar Banerjee',
 'Satyajit Ray',
 'Pa. Ranjith',
 'Vijay K. Bhaskar',
 'Vetrimaaran',
 'Mohan Raja',
 'Satyajit Ray',
 'Ram Gopal Varma',
 'Anjali Menon',
 'Hrishikesh Mukherjee',
 'Alphonse Puthren',
 'Swaroop Rsj',
 'I.V. Sasi',
 'Nishikant Kamat',
 'Anurag Kashyap',
 'Ram Kumar',
 'Rakeysh Omprakash Mehra',
 'Vijay Anan

In [None]:
movies_dict=dict({
     "poster":posters,
     "MovieName":names,
     "Rating":ratings,
     "Director":directors
 })
movies_df=pd.DataFrame(movies_dict)
movies_df.to_csv("top_rated_indian_movies.csv")

**Scraping book details from books.toscrape.com**

In [None]:
import requests
from bs4 import BeautifulSoup


**Creating a soup object**

In [None]:
posters=[]
names=[]
ratings=[]
prices=[]
availability=[]
for j in range(1,51,1):
  url = "https://books.toscrape.com/catalogue/page-{}.html".format(j)
  headers = {
      "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
      "Accept-Language": "en-US,en;q=0.5",
  }
  response = requests.get(url, headers=headers)
  soup=BeautifulSoup(response.content)
  #getting all the list items where each item is a book
  data=soup.find("section").find_all("div")[1].find("ol",class_="row").find_all("li")
  for i in data:
    posters.append(i.find("article").find("div",class_="image_container").find("img")["src"])
    names.append(i.find("article").find("h3").get_text())
    ratings.append(i.find("article").find("p").get("class")[1])
    prices.append(i.find("article").find("div",class_="product_price").find('p').text)
    availability.append(i.find("article").find("div",class_="product_price").find("p",class_="instock availability").get_text().strip())


In [None]:
#converting the list of lists into dictionary
book_dict={
    "poster":posters,
    "name":names,
    "rating":ratings,
    "price":prices,
    "availability":availability
}

In [None]:
#converting it into datadframe
import pandas as pd
book_df=pd.DataFrame(book_dict)
book_df

Unnamed: 0,poster,name,rating,price,availability
0,../media/cache/2c/da/2cdad67c44b002e7ead0cc356...,A Light in the ...,Three,£51.77,In stock
1,../media/cache/26/0c/260c6ae16bce31c8f8c95dadd...,Tipping the Velvet,One,£53.74,In stock
2,../media/cache/3e/ef/3eef99c9d9adef34639f51066...,Soumission,One,£50.10,In stock
3,../media/cache/32/51/3251cf3a3412f53f339e42cac...,Sharp Objects,Four,£47.82,In stock
4,../media/cache/be/a5/bea5697f2534a2f86a3ef27b5...,Sapiens: A Brief History ...,Five,£54.23,In stock
...,...,...,...,...,...
995,../media/cache/96/ee/96ee77d71a31b7694dac6855f...,Alice in Wonderland (Alice's ...,One,£55.53,In stock
996,../media/cache/09/7c/097cb5ecc6fb3fbe1690cf0cb...,"Ajin: Demi-Human, Volume 1 ...",Four,£57.06,In stock
997,../media/cache/1b/5f/1b5ff86f3c75e51e24c573d3f...,A Spy's Devotion (The ...,Five,£16.97,In stock
998,../media/cache/2b/41/2b4161c5b72a4ae386b644682...,1st to Die (Women's ...,One,£53.98,In stock


In [None]:
#exporting the dataframe to csv file
book_df.to_csv("books.csv")

In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np

# Initialize lists to store data
links = []
posters = []
authors = []
dates = []
ratings = []
prices = []
names = []
delivery_dates = []

# URL for Amazon search
for page in range(1,101,1):
  url = 'https://www.amazon.com/s?k=best+machine+learning+books&qid=1724405523&ref=sr_pg_{}'.format(page)

  # Headers to mimic a real browser
  custom_headers = {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
      'Accept-Language': 'da, en-gb, en',
      'Accept-Encoding': 'gzip, deflate, br',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
      'Referer': 'https://www.google.com/'
  }

  # Sending the request
  response = requests.get(url, headers=custom_headers)
  if response.status_code == 200:
    # Creating the soup object
    soup = BeautifulSoup(response.content, 'html.parser')
    res_data = soup.find_all("div", class_="s-result-item")
    for item in res_data:
        # Extract links
        a_tag = item.find("a", class_="a-link-normal")
        if a_tag:
          links.append("https://www.amazon.com" + a_tag["href"])
        else:
          links.append(np.nan)
        # Extract posters
        img_tag = item.find("img", class_="s-image")
        if img_tag and img_tag.has_attr("src"):
          posters.append(img_tag["src"])
        else:
          posters.append(np.nan)
        # Extract names
        name_tag = item.find("span", class_="a-size-medium")
        if name_tag:
          names.append(name_tag.text.strip())
        else:
          names.append(np.nan)
        # Extract authors
        author_div = item.find("div", class_="a-row a-size-base a-color-secondary")
        if author_div:
          author_links = author_div.find_all("a")
          authors.append(", ".join([author.text for author in author_links]))
        else:
          authors.append(np.nan)
        # Extract dates
        date_tag = item.find("span", class_="a-size-base a-color-secondary a-text-normal")
        if date_tag:
          dates.append(date_tag.text.strip())
        else:
          dates.append(np.nan)
        # Extract ratings
        rating_tag = item.find("span", class_="a-icon-alt")
        if rating_tag:
          ratings.append(rating_tag.text.split()[0])  # Extract the numeric rating
        else:
          ratings.append(np.nan)
        # Extract prices
        price_whole = item.find("span", class_="a-price-whole")
        price_fraction = item.find("span", class_="a-price-fraction")
        if price_whole and price_fraction:
          prices.append(price_whole.text + price_fraction.text)
        else:
          prices.append(np.nan)
        # Extract delivery dates
        delivery_tag = item.find("span", class_="a-color-base a-text-bold")
        if delivery_tag:
          delivery_dates.append(delivery_tag.text.strip())
        else:
          delivery_dates.append(np.nan)
print("yes")

yes


In [2]:
#converting lists into the dictionary
product_details={
    "links":links,
    "posters":posters,
    "names":names,
    "authors":authors,
    "date_of_release":dates,
    "ratings":ratings,
    "prices":prices,
}
print("yes")

yes


In [3]:
#converting to dataframe
import pandas as pd
df=pd.DataFrame(product_details)

In [4]:
df.dropna(inplace=True)

In [5]:
# printing the dataframe
df.reset_index(drop=True,inplace=True)
df.to_csv("machine_learning_books.csv")
print("yes")

yes
