# What Makes a Good Nigerian Movie?

## Importing Libraries

In [343]:
import requests
import re
import numpy as np
import urllib.request
import pandas as pd
import csv
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from bs4 import BeautifulSoup as bs

## Web Scraping Function

In [344]:
def get_movie_data(url):
    page_text = requests.get(url).text
    doc = bs(page_text, "html.parser")
    movies= doc.find(class_ = "lister list detail sub-list")
    movie_containers = movies.find_all(class_= "lister-item mode-advanced")
    movie_names=[]
    for container in movie_containers:
        title= container.h3.a.text
        movie_names.append(title)
    year_containers = movies.find_all(class_ = "lister-item-year text-muted unbold")
    movie_years=[]
    for year in year_containers:
        movie_years.append(year.text) 
    duration_containers = movies.find_all(class_ = "runtime")
    movie_duration=[]
    for duration in duration_containers:
        movie_duration.append(duration.text)
    genre_containers = movies.find_all(class_ = "genre")
    movie_genre=[]
    for genre in genre_containers:
        movie_genre.append(genre.text)
    rating_containers = movies.find_all(class_ = "inline-block ratings-imdb-rating")
    movie_rating=[]
    for rating in rating_containers:
        movie_rating.append(rating.text)
    star_containers = movies.find_all(class_ = "lister-item-content")
    movie_stars=[]
    for star in star_containers:
        p_links = star.find_all("a", id= False)
        movie_stars.append(p_links[12:])
    votes_containers = movies.find_all(class_ = "sort-num_votes-visible")
    movie_votes=[]
    for vote in votes_containers:
        movie_votes.append(vote.text)
    return pd.DataFrame(list(zip(movie_names, movie_years, movie_duration, movie_genre, movie_rating, movie_stars, movie_votes
                          )), columns=["Names","Years", "Duration", "Genre", "Rating", "Stars", "Votes"])


## Data Cleaning Function

In [345]:
def clean_dataframe(df):
    def cleaner1(text):
        return re.sub('\n', '',text)
    df["Genre"] = df["Genre"].apply(cleaner1)
    df["Rating"] = df["Rating"].apply(cleaner1)
    df["Votes"] = df["Votes"].apply(cleaner1)
    def cleaner2(text):
        return re.sub('[^0-9]', '',text)
    df["Votes"] = df["Votes"].apply(cleaner2)
    df["Duration"] = df["Duration"].apply(cleaner2)
    df["Years"] = df["Years"].apply(cleaner2)
    df["Stars"]=df["Stars"].astype(str)
    def cleaner3(text):
        return re.sub('<.*?>','',text)
    df['Stars'] = df['Stars'].apply(cleaner3)
    def cleaner4(text):
        return re.sub('\[','',text)
    df['Stars'] = df['Stars'].apply(cleaner4)
    def cleaner4(text):
        return re.sub('\[','',text)
    df['Stars'] = df['Stars'].apply(cleaner4)
    def cleaner5(text):
        return text[-4:]
    df["Years"] = df["Years"].apply(cleaner5)
    return df

## Function to Change the Datatype of the columns

In [346]:
def change_datatypes(df):
    df["Duration"] = df["Duration"].astype(int)
    df["Rating"] = df["Rating"].astype(float)
    df["Votes"] = df["Votes"].astype(int)

In [347]:
# Creating lists for the urls and dataframe names
#url_numbers=[1,48,98,148,198,248,298,348,398,448,498]
url_numbers=[1,51,101,151,201,251,301,351,401,451,501]
df_name= ["df1","df2","df3","df4","df5","df6","df7","df8","df9","df10","df11"]
urls = []
URL = 'https://www.imdb.com/search/title/?country_of_origin=NG&sort=user_rating,asc&start='
for i in url_numbers:
    url = URL + str(i)
    urls.append(url)
print(urls)

['https://www.imdb.com/search/title/?country_of_origin=NG&sort=user_rating,asc&start=1', 'https://www.imdb.com/search/title/?country_of_origin=NG&sort=user_rating,asc&start=51', 'https://www.imdb.com/search/title/?country_of_origin=NG&sort=user_rating,asc&start=101', 'https://www.imdb.com/search/title/?country_of_origin=NG&sort=user_rating,asc&start=151', 'https://www.imdb.com/search/title/?country_of_origin=NG&sort=user_rating,asc&start=201', 'https://www.imdb.com/search/title/?country_of_origin=NG&sort=user_rating,asc&start=251', 'https://www.imdb.com/search/title/?country_of_origin=NG&sort=user_rating,asc&start=301', 'https://www.imdb.com/search/title/?country_of_origin=NG&sort=user_rating,asc&start=351', 'https://www.imdb.com/search/title/?country_of_origin=NG&sort=user_rating,asc&start=401', 'https://www.imdb.com/search/title/?country_of_origin=NG&sort=user_rating,asc&start=451', 'https://www.imdb.com/search/title/?country_of_origin=NG&sort=user_rating,asc&start=501']


In [348]:
# Scraping data using the urls:
try:
    for i, j in zip(urls,df_name):
        globals()[j] = get_movie_data(i)
    print("Images Retrieved!")
except:
    print("Failed to retrieve images")

Images Retrieved!


In [349]:
df1.head()

Unnamed: 0,Names,Years,Duration,Genre,Rating,Stars,Votes
0,Blue,(VI) (2020),91 min,\nDrama,\n\n1.4\n,"[[God'spower Okoh], [Ray Emodi], [Tina Mba], [...",\nVotes:\n22\n
1,The Same Day,(2012),90 min,\nCrime,\n\n1.7\n,"[[Uche Chika], [Don Alphonso], [Michael Arg], ...",\nVotes:\n29\n
2,Chain,(IV) (2018),74 min,\nDrama,\n\n1.9\n,"[[Awal Abdulfatai Rahmat], [Chisimdi Benjamin]...",\nVotes:\n13\n
3,Rising Moon,(2005 Video),85 min,\nDrama,\n\n2.1\n,"[[Add a Plot], [Andy Nwakalor], [Akume Akume],...",\nVotes:\n10\n
4,The Python,(2003 Video),118 min,"\nDrama, Horror",\n\n2.2\n,"[[Add a Plot], [Amayo Uzo Philips], [Stephen A...",\nVotes:\n5\n


In [350]:
# Concatenating the 11 dataframes created:
series = [df1,df2,df3,df4,df5,df6,df7,df8,df9,df10,df11]
new_df = pd.concat(series)
new_df.head()

Unnamed: 0,Names,Years,Duration,Genre,Rating,Stars,Votes
0,Blue,(VI) (2020),91 min,\nDrama,\n\n1.4\n,"[[God'spower Okoh], [Ray Emodi], [Tina Mba], [...",\nVotes:\n22\n
1,The Same Day,(2012),90 min,\nCrime,\n\n1.7\n,"[[Uche Chika], [Don Alphonso], [Michael Arg], ...",\nVotes:\n29\n
2,Chain,(IV) (2018),74 min,\nDrama,\n\n1.9\n,"[[Awal Abdulfatai Rahmat], [Chisimdi Benjamin]...",\nVotes:\n13\n
3,Rising Moon,(2005 Video),85 min,\nDrama,\n\n2.1\n,"[[Add a Plot], [Andy Nwakalor], [Akume Akume],...",\nVotes:\n10\n
4,The Python,(2003 Video),118 min,"\nDrama, Horror",\n\n2.2\n,"[[Add a Plot], [Amayo Uzo Philips], [Stephen A...",\nVotes:\n5\n


In [351]:
# Making a copy of the dataframe for cleaning
mdf = new_df.copy()
mdf.head()

Unnamed: 0,Names,Years,Duration,Genre,Rating,Stars,Votes
0,Blue,(VI) (2020),91 min,\nDrama,\n\n1.4\n,"[[God'spower Okoh], [Ray Emodi], [Tina Mba], [...",\nVotes:\n22\n
1,The Same Day,(2012),90 min,\nCrime,\n\n1.7\n,"[[Uche Chika], [Don Alphonso], [Michael Arg], ...",\nVotes:\n29\n
2,Chain,(IV) (2018),74 min,\nDrama,\n\n1.9\n,"[[Awal Abdulfatai Rahmat], [Chisimdi Benjamin]...",\nVotes:\n13\n
3,Rising Moon,(2005 Video),85 min,\nDrama,\n\n2.1\n,"[[Add a Plot], [Andy Nwakalor], [Akume Akume],...",\nVotes:\n10\n
4,The Python,(2003 Video),118 min,"\nDrama, Horror",\n\n2.2\n,"[[Add a Plot], [Amayo Uzo Philips], [Stephen A...",\nVotes:\n5\n


In [352]:
mdf = clean_dataframe(mdf)

In [353]:
mdf.head()

Unnamed: 0,Names,Years,Duration,Genre,Rating,Stars,Votes
0,Blue,2020,91,Drama,1.4,"God'spower Okoh, Ray Emodi, Tina Mba, Odera Ol...",22
1,The Same Day,2012,90,Crime,1.7,"Uche Chika, Don Alphonso, Michael Arg, Pat Att...",29
2,Chain,2018,74,Drama,1.9,"Awal Abdulfatai Rahmat, Chisimdi Benjamin, And...",13
3,Rising Moon,2005,85,Drama,2.1,"Add a Plot, Andy Nwakalor, Akume Akume, Nzubec...",10
4,The Python,2003,118,"Drama, Horror",2.2,"Add a Plot, Amayo Uzo Philips, Stephen Ahanaon...",5


In [354]:
change_datatypes(mdf)

In [355]:
mdf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 384 entries, 0 to 16
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Names     384 non-null    object 
 1   Years     384 non-null    object 
 2   Duration  384 non-null    int32  
 3   Genre     384 non-null    object 
 4   Rating    384 non-null    float64
 5   Stars     384 non-null    object 
 6   Votes     384 non-null    int32  
dtypes: float64(1), int32(2), object(4)
memory usage: 21.0+ KB


In [357]:
mdf.to_csv("Nigerian_movies_data.csv", index=False)