# What Makes a Good Nigerian Movie?

## Importing Libraries

In [14]:
import requests
import re
import numpy as np
import urllib.request
import pandas as pd
import csv
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from bs4 import BeautifulSoup as bs

## Web Scraping Function

In [93]:
def get_movie_data(url):
    page_text = requests.get(url).text
    doc = bs(page_text, "html.parser")
    movies= doc.find(class_ = "lister list detail sub-list")
    movie_containers = movies.find_all(class_= "lister-item mode-advanced")
    movie_names=[]
    for container in movie_containers:
        title= container.h3.a.text
        movie_names.append(title)
    year_containers = movies.find_all(class_ = "lister-item-year text-muted unbold")
    movie_years=[]
    for year in year_containers:
        movie_years.append(year.text) 
    duration_containers = movies.find_all(class_ = "runtime")
    movie_duration=[]
    for duration in duration_containers:
        movie_duration.append(duration.text)
    genre_containers = movies.find_all(class_ = "genre")
    movie_genre=[]
    for genre in genre_containers:
        movie_genre.append(genre.text)
    rating_containers = movies.find_all(class_ = "inline-block ratings-imdb-rating")
    movie_rating=[]
    for rating in rating_containers:
        movie_rating.append(rating.text)
    star_containers = movies.find_all(class_ = "lister-item-content")
    movie_stars=[]
    for star in star_containers:
        p_links = star.find_all("a", id= False)
        movie_stars.append(p_links[12:])
    votes_containers = movies.find_all(class_ = "sort-num_votes-visible")
    movie_votes=[]
    for vote in votes_containers:
        movie_votes.append(vote.text)
    return pd.DataFrame(list(zip(movie_names, movie_years, movie_duration, movie_genre, movie_rating, movie_stars, movie_votes
                          )), columns=["Names","Years", "Duration", "Genre", "Rating", "Stars", "Votes"])


## Data Cleaning Function

In [112]:
def clean_dataframe(df):
    def cleaner_g1_y1(text):
        return re.sub('[^A-Za-z0-9]', ' ',text)
    df['Genre'] = df['Genre'].apply(cleaner_g1_y1)
    df['Years'] = df['Years'].apply(cleaner_g1_y1)
    def cleaner_v1(text):
        return re.sub('[^A-Za-z0-9]', '',text)
        df['Votes'] = df['Votes'].apply(cleaner_v1)
    def cleaner_r1(text):
        return re.sub('\n', '',text)
    df['Rating'] = df['Rating'].apply(cleaner_r1)
    def cleaner_y2(text):
        return re.sub('[I, \s]', '',text)
    df['Years'] = df['Years'].apply(cleaner_y2)
    def cleaner_s1(text):
        text = str(text)[1:-1]             
        return text
    df['Stars'] = df['Stars'].apply(cleaner_s1)
    def cleaner_d1_v2_s2(text):
        return re.sub('[^0-9]', '',text)
    df['Duration'] = df['Duration'].apply(cleaner_d1_v2_s2)
    df['Votes'] = df['Votes'].apply(cleaner_d1_v2_s2)
    df['Years'] = df['Years'].apply(cleaner_d1_v2_s2)
    def cleaner_s3(text):
        text = re.sub('<.*?>','',text)            
        return text
    df['Stars'] = df['Stars'].apply(cleaner_s3)
    return df.head()

In [95]:
#url_numbers=[1,48,98,148,198,248,298,348,398,448,498]
url_numbers=[1,51,101,151,201,251,301,351,401,451,501]
#url_numbers={"df1":1, "df2":51, "df3":101, "df4":151, "df5":201, "df6":251,"df7":301, "df8":351, "df9":401, "df10":451, "df11":501}
df_name= ["df1","df2","df3","df4","df5","df6","df7","df8","df9","df10","df11"]
urls = []
URL = 'https://www.imdb.com/search/title/?country_of_origin=NG&sort=user_rating,asc&start='
for i in url_numbers:
    url = URL + str(i)
    urls.append(url)
print(urls)

['https://www.imdb.com/search/title/?country_of_origin=NG&sort=user_rating,asc&start=1', 'https://www.imdb.com/search/title/?country_of_origin=NG&sort=user_rating,asc&start=51', 'https://www.imdb.com/search/title/?country_of_origin=NG&sort=user_rating,asc&start=101', 'https://www.imdb.com/search/title/?country_of_origin=NG&sort=user_rating,asc&start=151', 'https://www.imdb.com/search/title/?country_of_origin=NG&sort=user_rating,asc&start=201', 'https://www.imdb.com/search/title/?country_of_origin=NG&sort=user_rating,asc&start=251', 'https://www.imdb.com/search/title/?country_of_origin=NG&sort=user_rating,asc&start=301', 'https://www.imdb.com/search/title/?country_of_origin=NG&sort=user_rating,asc&start=351', 'https://www.imdb.com/search/title/?country_of_origin=NG&sort=user_rating,asc&start=401', 'https://www.imdb.com/search/title/?country_of_origin=NG&sort=user_rating,asc&start=451', 'https://www.imdb.com/search/title/?country_of_origin=NG&sort=user_rating,asc&start=501']


In [96]:
url_df= dict(zip(urls,df_name))
print(url_df)

{'https://www.imdb.com/search/title/?country_of_origin=NG&sort=user_rating,asc&start=1': 'df1', 'https://www.imdb.com/search/title/?country_of_origin=NG&sort=user_rating,asc&start=51': 'df2', 'https://www.imdb.com/search/title/?country_of_origin=NG&sort=user_rating,asc&start=101': 'df3', 'https://www.imdb.com/search/title/?country_of_origin=NG&sort=user_rating,asc&start=151': 'df4', 'https://www.imdb.com/search/title/?country_of_origin=NG&sort=user_rating,asc&start=201': 'df5', 'https://www.imdb.com/search/title/?country_of_origin=NG&sort=user_rating,asc&start=251': 'df6', 'https://www.imdb.com/search/title/?country_of_origin=NG&sort=user_rating,asc&start=301': 'df7', 'https://www.imdb.com/search/title/?country_of_origin=NG&sort=user_rating,asc&start=351': 'df8', 'https://www.imdb.com/search/title/?country_of_origin=NG&sort=user_rating,asc&start=401': 'df9', 'https://www.imdb.com/search/title/?country_of_origin=NG&sort=user_rating,asc&start=451': 'df10', 'https://www.imdb.com/search/ti

In [98]:
df1 = get_movie_data(urls[0])
df2 = get_movie_data(urls[1])
df3 = get_movie_data(urls[2])
df4 = get_movie_data(urls[3])
df5 = get_movie_data(urls[4])
df6 = get_movie_data(urls[5])
df7 = get_movie_data(urls[6])
df8 = get_movie_data(urls[7])
df9 = get_movie_data(urls[8])
df10 = get_movie_data(urls[9])
df11 = get_movie_data(urls[10])

In [107]:
series = [df1,df2,df3,df4,df5,df6,df7,df8,df9,df10,df11]
new_df = pd.concat(series)
new_df.head()

Unnamed: 0,Names,Years,Duration,Genre,Rating,Stars,Votes
0,Blue,(VI) (2020),91 min,\nDrama,\n\n1.4\n,"[[God'spower Okoh], [Ray Emodi], [Tina Mba], [...",\nVotes:\n22\n
1,The Same Day,(2012),90 min,\nCrime,\n\n1.7\n,"[[Uche Chika], [Don Alphonso], [Michael Arg], ...",\nVotes:\n29\n
2,Chain,(IV) (2018),74 min,\nDrama,\n\n1.9\n,"[[Awal Abdulfatai Rahmat], [Chisimdi Benjamin]...",\nVotes:\n13\n
3,Rising Moon,(2005 Video),85 min,\nDrama,\n\n2.1\n,"[[Add a Plot], [Andy Nwakalor], [Akume Akume],...",\nVotes:\n10\n
4,The Python,(2003 Video),118 min,"\nDrama, Horror",\n\n2.2\n,"[[Add a Plot], [Amayo Uzo Philips], [Stephen A...",\nVotes:\n5\n


In [116]:
movie_df = clean_dataframe(new_df)
movie_df.head()
print(len(movie_df))

5


In [115]:
movie_df.to_csv("movie_data.csv")

In [117]:
new_df.to_csv("movies_data.csv")

In [118]:
new_df.head()

Unnamed: 0,Names,Years,Duration,Genre,Rating,Stars,Votes
0,Blue,2020,91,Drama,1.4,"d'spower Okoh, Ray Emodi, Tina Mba, Odera Oliv...",22
1,The Same Day,2012,90,Crime,1.7,"he Chika, Don Alphonso, Michael Arg, Pat Attah...",29
2,Chain,2018,74,Drama,1.9,"al Abdulfatai Rahmat, Chisimdi Benjamin, Andre...",13
3,Rising Moon,2005,85,Drama,2.1,"d a Plot, Andy Nwakalor, Akume Akume, Nzubechi...",10
4,The Python,2003,118,Drama Horror,2.2,"d a Plot, Amayo Uzo Philips, Stephen Ahanaonu,...",5


In [119]:
len(new_df)

384