# Webscraping Netnaija 
### Author: Sachimugu 
Email: <a href="mailto:sachimugu@outlook.com"> Email</a>

### Importing all relevant libraries

In [1]:
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm_notebook

**pip install all these if you don't have them**

### Creating reusable functions

In [2]:
#function to load a url and parse it content
def browse(url): #browse function
    page= requests.get(url) #to load a url
    soup= BeautifulSoup(page.content, 'lxml') # parse it content
    return soup


#function to get the next page link
def nextpage(soup): #nextpage function
    try:
        nxtpage=soup.find('a', class_="next page-numbers", href=True)['href']
        return nxtpage #return the page link
    except:
        return #return nothing if at the last page
        

In [3]:
#gets last page
url="https://www.thenetnaija.com/videos/movies"
soup=browse(url) 
x= int(soup.find('ul', class_='pagination').text[7:-1])-1

### Loop to get the links of all pages

In [4]:
#Base url
url="https://www.thenetnaija.com/videos/movies" #open on your browser to get a feel of the site architecture

page_link=[] # empty list to store list of all pages

for i in tqdm_notebook(range(x), desc= 'Loading....'):
    soup=browse(url) # load the base url
    url=nextpage(soup)# get next page on base url append it to page list and become the new base url
    if not url:
        break
    page_link.append(url)
    

Loading....:   0%|          | 0/171 [00:00<?, ?it/s]

In [5]:
# how many listing-page URLs were collected
len(page_link)

171

### Loop to get the links to all movies on each page

In [6]:
movie_links=[] # empty list to store movie links form each page
for page in tqdm_notebook(page_link, desc='Loading...'): # a for loop to get each page from the page list
    soup= browse(page) # load each page and parse
    
    # this series of code get all link to movies on each page and append it to movie_links
    video_files=soup.find("div", class_="video-files")
    class_info=video_files.findAll("div", class_="info")
    for x in class_info:
        link=x.find("a", href=True)['href']
        movie_links.append(link)
        

Loading...:   0%|          | 0/171 [00:00<?, ?it/s]

### Empty lists to store the data we need about each movie

In [7]:
# One independent, initially empty list per piece of movie data.
titles, movie_linkss, movie_types = [], [], []  # title, link, video type
time_of_uplos, movie_lengths = [], []  # date of upload, length of movie
num_of_comments, mo_summarys = [], []  # number of comments, movie summary
Genres, Release_Dates, Starss = [], [], []  # genre, release date, actors and actresses
Languages, Subtitles, imdb_links = [], [], []  # language, available subtitle, imdb link

### Getting data about each movie

In [9]:

for link in tqdm_notebook(movie_links, desc='Loading'):
    soup= browse(link) # browse movie link and parse
    
    #This series of code get the requried data and append to the data list
    try:
        title=soup.find('h1', class_="page-h1").text 
        titles.append(title)
    except:
        titles.append(' ')
    post_meta=soup.find("div", class_="post-meta")
    try:
        movie_link=post_meta.find('a', href=True)['href']
        movie_linkss.append(movie_link)
    except:
        movie_links.append(' ')
    meta_one=soup.findAll('span', class_='meta-one')
    try:
        movie_type=meta_one[0].text.split()
        movie_types.append(movie_type)
    except:
        movie_types.append(' ')
    x=meta_one[1].text.split()
    try:
        time_of_uplo=' '.join(x)
        time_of_uplos.append(time_of_uplo)
    except:
        time_of_uplos.append(' ')
    try:
        movie_length=meta_one[2].text.split()
        movie_lengths.append(movie_length)
    except:
        movie_lengths.append(' ')
    try:
        num_of_comment=meta_one[3].text.split()
        num_of_comments.append(num_of_comment)
    except:
        num_of_comments.append('0')
    try:
        mo_summary=soup.find('p').next_element
        mo_summarys.append(mo_summary)
    except:
        mo_summarys.append(' ')
    try:
        block=soup.find('blockquote', class_='quote-content')
        y=block.findAll('p')
        
        try:
            Genre=y[1].text.split(':')[1:]
            Genres.append(Genre)
        except:
            Genres.append('missing')
        try:
            Release_Date=y[2].text
            Release_Dates.append(Release_Date)
        except:
            Release_Dates.append('missing')

        try:
            Stars=y[3].text.split(':')[1:]
            Starss.append(Stars)
        except:
            Starss.append('missing')

        try:
            Language=y[5].text.split(':')[1:]
            Languages.append(Language)
        except:
              Languages.append('missing')
        try:
            Subtitle=y[6].text.split(':')[1:]
            Subtitles.append(Subtitle)
        except:
            Subtitles.append('missing')

    except:
        Genres.append('missing')
        Release_Dates.append('missing')
        Starss.append('missing')
        Languages.append('missing')
        Subtitles.append('missing')
            
   
    try:
        imdb_link=block.find('a', href=True)['href']
        imdb_links.append(imdb_link)
    except:
         imdb_links.append('missing')
        

Loading:   0%|          | 0/3073 [00:00<?, ?it/s]

### Creating a table of all data with pandas dataframe

In [10]:
# Column name -> list of scraped values, in the order the columns should appear
columns = {
    "titles": titles,
    "movie_types": movie_types,
    "time_of_uplos": time_of_uplos,
    "movie_lengths": movie_lengths,
    "num_of_comments": num_of_comments,
    "Genres": Genres,
    "Release_Dates": Release_Dates,
    "Starss": Starss,
    "Languages": Languages,
    "Subtitles": Subtitles,
    "movie_linkss": movie_linkss,
    "imdb_links": imdb_links,
    "mo_summarys": mo_summarys,
}
df = pd.DataFrame(columns)

### Save data in CSV and Excel format

In [11]:
df.to_csv('./Data/netnaija_movie.csv')
df.to_excel('./Data/netnaija_movie.xlsx')

In [12]:
# summary of the table: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3073 entries, 0 to 3072
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   titles           3073 non-null   object
 1   movie_types      3073 non-null   object
 2   time_of_uplos    3073 non-null   object
 3   movie_lengths    3073 non-null   object
 4   num_of_comments  3073 non-null   object
 5   Genres           3073 non-null   object
 6   Release_Dates    3073 non-null   object
 7   Starss           3073 non-null   object
 8   Languages        3073 non-null   object
 9   Subtitles        3073 non-null   object
 10  movie_linkss     3073 non-null   object
 11  imdb_links       3073 non-null   object
 12  mo_summarys      3073 non-null   object
dtypes: object(13)
memory usage: 312.2+ KB


In [13]:
# preview the first five rows of the scraped table
df.head(5)

Unnamed: 0,titles,movie_types,time_of_uplos,movie_lengths,num_of_comments,Genres,Release_Dates,Starss,Languages,Subtitles,movie_linkss,imdb_links,mo_summarys
0,Clean (2022),[Movies],Jan 28,[01:33:27],[56],"[ Crime, Drama, Thriller]","Release Date: Jan 28, 2022 (United States)","[ Adrien Brody, Glenn Fleshler, Richie Merritt]",[ English],[ English],https://www.thenetnaija.co/videos/movies,https://www.secure.zabira.ng/register?utm_sour...,[Revenge is a dirty business.]
1,The Ice Age Adventures of Buck Wild (2022),[Movies],Jan 28,[01:21:05],[81],"[ Adventure, Animation, Comedy, Family]","Release Date: Jan 28, 2022 (United States)","[ Simon Pegg, Vincent Tong, Aaron Harris]",[ English],[ English],https://www.thenetnaija.co/videos/movies,https://www.secure.zabira.ng/register?utm_sour...,[Disney+ gets wild.]
2,In the Forest (2022),[Movies],Jan 28,[01:23:11],[39],"[ Horror, Thriller]","Release Date: Jan 28, 2022 (United States)","[ Debbon Ayer, Cristina Spruell, Lyman Ward]",[ English],[ English],https://www.thenetnaija.co/videos/movies,https://www.secure.zabira.ng/register?utm_sour...,Helen and her daughter Emily reluctantly accom...
3,Twice Bitten (2021),[Movies],Jan 28,[01:28:07],[18],"[ Jul 11, 2021 (United States)]","Stars: LisaRaye McCoy, Ledisi, Kevin A. Walton...",[ Twice.Bitten.2021.1080p.AMZN.WEBRip.DDP2.0.x...,[ English],"[ https, //www.imdb.com/title/tt14191916/]",https://www.thenetnaija.co/videos/movies,https://www.secure.zabira.ng/register?utm_sour...,An unmitigated and sexy conman targets his nex...
4,The Fallout (2022),[Movies],Jan 27,[01:36:00],[34],[ Drama],"Release Date: Jan 27, 2022 (United States)","[ Jenna Ortega, Shailene Woodley, Julie Bowen]",[ English],[ English],https://www.thenetnaija.co/videos/movies,https://www.secure.zabira.ng/register?utm_sour...,[Nothing will ever be the same.]
