# Import libraries and imdb dataset

In [48]:
import pandas as pd
import numpy as np
import random

In [49]:
imdb = pd.read_csv(r'C:\Users\oskarkrahe\Documents\Git\imdb-web-scraping\dataset\imdb.csv')
imdb.head(3)

Unnamed: 0,director,duration,genre,num_critic_reviews,num_user_reviews,rating,stars,summary,title,url,votes,year
0,Prabhu Deva,135.0,"Action,Crime,Thriller",22,2.5K,1.9,"Salman Khan,Disha Patani,Randeep Hooda,Jackie ...","""After taking the dreaded gangster Gani Bhai, ...",Radhe,https://www.imdb.com/title/tt10888594/?ref_=ad...,178458,2021
1,M. Night Shyamalan,103.0,"Action,Adventure,Family",291,1.7K,1.9,"Noah Ringer,Nicola Peltz Beckham,Jackson Rathb...","""Follows the adventures of Aang, a young succe...",The Last Airbender,https://www.imdb.com/title/tt0938283/?ref_=adv...,166760,2010
2,David Benioff,80.0,"Action,Adventure,Drama",361,5.6K,1.9,"Emilia Clarke,Peter Dinklage,Kit Harington,Len...","""Nine noble families fight for control over th...",Game of Thrones,https://www.imdb.com/title/tt0944947/?ref_=adv...,253670,2011


# Sanitization process
The objective of this process is to create a sanitized dataset that is not vinculated to IMDb so that we can upload the file to Zenodo. The real dataset will be provided via Github private.

In [50]:
# Generate a new dataset.
movies_tv_shows = imdb.copy()

# Remove the 'url' column.
movies_tv_shows.drop(columns=['url'], inplace=True)

# Substitute 'rating' with a random number between 0 and 10 rounded to 2 digits.
movies_tv_shows['rating'] = round(np.random.uniform(0, 10), ndigits=2)

# Substitute 'votes' with random numbers between 150000  to 500000 rounded to 0 digits. 
movies_tv_shows['votes'] = round(np.random.uniform(150000, 500000))


Modifying the summary is more tricky: even though film and tv shows descriptions are pretty similar in all websites, I am not one hundred percent sure these summarys we're not written specifically for IMDb, so the data could still be related to them. We're going to remove the summarys by Lorem Ipsum text to avoid that chance.

In [51]:
# Lorem ipsum word list
word_list = ['lorem', 'ipsum', 'dolor', 'sit', 'amet', 'consectetur',
             'adipiscing', 'elit', 'integer', 'nec', 'odio', 'praesent']

# Function that replaces any word in the summary column by a random word in word_list
def replace_with_random_words(text, words):
    split_text = text.split()
    new_text = ' '.join([random.choice(words) for _ in split_text])
    return f'"{new_text}"'

movies_tv_shows['summary'] = movies_tv_shows['summary'].apply(lambda x: replace_with_random_words(x, word_list))

In [52]:
movies_tv_shows.head(10)

Unnamed: 0,director,duration,genre,num_critic_reviews,num_user_reviews,rating,stars,summary,title,votes,year
0,Prabhu Deva,135.0,"Action,Crime,Thriller",22,2.5K,9.6,"Salman Khan,Disha Patani,Randeep Hooda,Jackie ...","""elit sit odio consectetur elit integer lorem ...",Radhe,337703,2021
1,M. Night Shyamalan,103.0,"Action,Adventure,Family",291,1.7K,9.6,"Noah Ringer,Nicola Peltz Beckham,Jackson Rathb...","""lorem adipiscing consectetur ipsum praesent p...",The Last Airbender,337703,2010
2,David Benioff,80.0,"Action,Adventure,Drama",361,5.6K,9.6,"Emilia Clarke,Peter Dinklage,Kit Harington,Len...","""nec consectetur dolor odio sit integer nec lo...",Game of Thrones,337703,2011
3,Sam Taylor-Johnson,125.0,"Drama,Romance,Thriller",409,1.6K,9.6,"Dakota Johnson,Jamie Dornan,Jennifer Ehle,Eloi...","""integer praesent dolor adipiscing dolor praes...",Fifty Shades of Grey,337703,2015
4,Josh Trank,100.0,"Action,Adventure,Sci-Fi",420,967,9.6,"Miles Teller,Kate Mara,Michael B. Jordan,Jamie...","""praesent lorem sit praesent adipiscing elit c...",Fantastic Four,337703,2015
5,Chris Weitz,130.0,"Adventure,Drama,Fantasy",301,800,9.6,"Kristen Stewart,Robert Pattinson,Taylor Lautne...","""nec adipiscing lorem elit lorem integer odio ...",The Twilight Saga: New Moon,337703,2009
6,M. Night Shyamalan,100.0,"Action,Adventure,Sci-Fi",280,934,9.6,"Jaden Smith,David Denman,Will Smith,Sophie Oko...","""consectetur integer elit nec amet lorem nec a...",After Earth,337703,2013
7,Bill Condon,117.0,"Adventure,Drama,Fantasy",328,438,9.6,"Kristen Stewart,Robert Pattinson,Taylor Lautne...","""adipiscing nec ipsum odio nec amet amet sit p...",The Twilight Saga: Breaking Dawn - Part 1,337703,2011
8,David Slade,124.0,"Action,Adventure,Drama",289,510,9.6,"Kristen Stewart,Robert Pattinson,Taylor Lautne...","""odio elit praesent praesent dolor praesent in...",The Twilight Saga: Eclipse,337703,2010
9,M. Night Shyamalan,91.0,"Adventure,Drama,Sci-Fi",191,1.5K,9.6,"Mark Wahlberg,Zooey Deschanel,John Leguizamo,A...","""ipsum elit elit ipsum lorem integer praesent ...",The Happening,337703,2008


# Save the sanitized dataset

In [53]:
movies_tv_shows.to_csv(r'C:\Users\oskarkrahe\Documents\Git\imdb-web-scraping\dataset\movies_tv_shows.csv', index=False)