## Module 1 Project - Data Cleaning

In this notebook, I imported and cleaned up some of the supplied datasets.



In [1]:
#import all libraries that I'll need
from bs4 import BeautifulSoup
import requests
import re
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import seaborn as sns
import itertools

In [2]:
#import datasets from cloned repo
base_df = pd.read_csv('Data/zippedData/title.ratings.csv')
base_df.head()

Unnamed: 0,tconst,averagerating,numvotes
0,tt10356526,8.3,31
1,tt10384606,8.9,559
2,tt1042974,6.4,20
3,tt1043726,4.2,50352
4,tt1060240,6.5,21


In [3]:
prin_df = pd.read_csv('Data/zippedData/title.principals.csv')
prin_df.head() # not sure what this data means, will ignore for the time being unless I can find a use

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0111414,1,nm0246005,actor,,"[""The Man""]"
1,tt0111414,2,nm0398271,director,,
2,tt0111414,3,nm3739909,producer,producer,
3,tt0323808,10,nm0059247,editor,,
4,tt0323808,1,nm3579312,actress,,"[""Beth Boothby""]"


In [4]:
prin_df.category.unique()

array(['actor', 'director', 'producer', 'editor', 'actress', 'composer',
       'cinematographer', 'writer', 'self', 'production_designer',
       'archive_footage', 'archive_sound'], dtype=object)

In [5]:
len(prin_df.characters.unique())

174763

In [6]:
prin_df.ordering.unique()

array([ 1,  2,  3, 10,  4,  5,  6,  7,  8,  9], dtype=int64)

In [7]:
basics_df = pd.read_csv('Data/zippedData/title.basics.csv')
basics_df.head() # This seems to be pretty useful data about titles and genres that I can use

Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama"
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.0,"Biography,Drama"
2,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122.0,Drama
3,tt0069204,Sabse Bada Sukh,Sabse Bada Sukh,2018,,"Comedy,Drama"
4,tt0100275,The Wandering Soap Opera,La Telenovela Errante,2017,80.0,"Comedy,Drama,Fantasy"


In [8]:
root_df = base_df.set_index('tconst').join(basics_df.set_index('tconst'), how='inner')
root_df.head() # combined the 2 dfs using the tconst as my index with an inner join

Unnamed: 0_level_0,averagerating,numvotes,primary_title,original_title,start_year,runtime_minutes,genres
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
tt10356526,8.3,31,Laiye Je Yaarian,Laiye Je Yaarian,2019,117.0,Romance
tt10384606,8.9,559,Borderless,Borderless,2019,87.0,Documentary
tt1042974,6.4,20,Just Inès,Just Inès,2010,90.0,Drama
tt1043726,4.2,50352,The Legend of Hercules,The Legend of Hercules,2014,99.0,"Action,Adventure,Fantasy"
tt1060240,6.5,21,Até Onde?,Até Onde?,2011,73.0,"Mystery,Thriller"


In [9]:
#len(root_df.original_title.unique())
#len(root_df.primary_title.unique())
print(len(root_df.original_title.unique()), len(root_df.primary_title.unique())) 
# I want to see what title is listed for most of the movies

71097 69993


In [10]:
del root_df['primary_title'] # removing primary title column as it's mostly redundant with original_title
root_df.head(10)

Unnamed: 0_level_0,averagerating,numvotes,original_title,start_year,runtime_minutes,genres
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
tt10356526,8.3,31,Laiye Je Yaarian,2019,117.0,Romance
tt10384606,8.9,559,Borderless,2019,87.0,Documentary
tt1042974,6.4,20,Just Inès,2010,90.0,Drama
tt1043726,4.2,50352,The Legend of Hercules,2014,99.0,"Action,Adventure,Fantasy"
tt1060240,6.5,21,Até Onde?,2011,73.0,"Mystery,Thriller"
tt1069246,6.2,326,Habana Eva,2010,106.0,"Comedy,Romance"
tt1094666,7.0,1613,Hamill,2010,108.0,"Biography,Drama,Sport"
tt1130982,6.4,571,Avant l'aube,2011,104.0,"Drama,Thriller"
tt1156528,7.2,265,Circus Fantasticus,2011,77.0,"Drama,War"
tt1161457,4.2,148,The Vanquisher,2016,90.0,"Action,Adventure,Sci-Fi"


In [11]:
del root_df['numvotes'] # deleting numvotes column as that data isn't useful for the purpose of my investigations
root_df.head(10)

Unnamed: 0_level_0,averagerating,original_title,start_year,runtime_minutes,genres
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
tt10356526,8.3,Laiye Je Yaarian,2019,117.0,Romance
tt10384606,8.9,Borderless,2019,87.0,Documentary
tt1042974,6.4,Just Inès,2010,90.0,Drama
tt1043726,4.2,The Legend of Hercules,2014,99.0,"Action,Adventure,Fantasy"
tt1060240,6.5,Até Onde?,2011,73.0,"Mystery,Thriller"
tt1069246,6.2,Habana Eva,2010,106.0,"Comedy,Romance"
tt1094666,7.0,Hamill,2010,108.0,"Biography,Drama,Sport"
tt1130982,6.4,Avant l'aube,2011,104.0,"Drama,Thriller"
tt1156528,7.2,Circus Fantasticus,2011,77.0,"Drama,War"
tt1161457,4.2,The Vanquisher,2016,90.0,"Action,Adventure,Sci-Fi"


In [12]:
akas_df = pd.read_csv('Data/zippedData/title.akas.csv')
akas_df.head() # importing title.akas.csv to see what kind of data it holds

Unnamed: 0,title_id,ordering,title,region,language,types,attributes,is_original_title
0,tt0369610,10,Джурасик свят,BG,bg,,,0.0
1,tt0369610,11,Jurashikku warudo,JP,,imdbDisplay,,0.0
2,tt0369610,12,Jurassic World: O Mundo dos Dinossauros,BR,,imdbDisplay,,0.0
3,tt0369610,13,O Mundo dos Dinossauros,BR,,,short title,0.0
4,tt0369610,14,Jurassic World,FR,,imdbDisplay,,0.0


In [13]:
crew_df = pd.read_csv('Data/zippedData/title.crew.csv')
crew_df.head() # don't know what this stuff means yet

Unnamed: 0,tconst,directors,writers
0,tt0285252,nm0899854,nm0899854
1,tt0438973,,"nm0175726,nm1802864"
2,tt0462036,nm1940585,nm1940585
3,tt0835418,nm0151540,"nm0310087,nm0841532"
4,tt0878654,"nm0089502,nm2291498,nm2292011",nm0284943


In [14]:
name_df = pd.read_csv('Data/zippedData/name.basics.csv')
name_df.head() # looks like the name.basics.csv can work with the title.crew.csv to get some useful data from. 

Unnamed: 0,nconst,primary_name,birth_year,death_year,primary_profession,known_for_titles
0,nm0061671,Mary Ellen Bauder,,,"miscellaneous,production_manager,producer","tt0837562,tt2398241,tt0844471,tt0118553"
1,nm0061865,Joseph Bauer,,,"composer,music_department,sound_department","tt0896534,tt6791238,tt0287072,tt1682940"
2,nm0062070,Bruce Baum,,,"miscellaneous,actor,writer","tt1470654,tt0363631,tt0104030,tt0102898"
3,nm0062195,Axel Baumann,,,"camera_department,cinematographer,art_department","tt0114371,tt2004304,tt1618448,tt1224387"
4,nm0062798,Pete Baxter,,,"production_designer,art_department,set_decorator","tt0452644,tt0452692,tt3458030,tt2178256"


In [15]:
gross_df = pd.read_csv('Data/zippedData/bom.movie_gross.csv')
gross_df.head()

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010


In [16]:
print(len(gross_df['title']), len(root_df['original_title'])) # compare the lengths of titles in gross table vs root

3387 73856


In [17]:
print(len(gross_df['studio']))

3387


In [18]:
movies_df = pd.read_csv('Data/zippedData/tmdb.movies.csv')
movies_df.head()

Unnamed: 0.1,Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610
2,2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368
3,3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
4,4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186


In [19]:
len(movies_df.title)

26517

In [20]:
movbud_df = pd.read_csv('Data/zippedData/tn.movie_budgets.csv')
movbud_df.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"


In [21]:
len(movbud_df.movie)

5782

In [22]:
root_df.head()

Unnamed: 0_level_0,averagerating,original_title,start_year,runtime_minutes,genres
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
tt10356526,8.3,Laiye Je Yaarian,2019,117.0,Romance
tt10384606,8.9,Borderless,2019,87.0,Documentary
tt1042974,6.4,Just Inès,2010,90.0,Drama
tt1043726,4.2,The Legend of Hercules,2014,99.0,"Action,Adventure,Fantasy"
tt1060240,6.5,Até Onde?,2011,73.0,"Mystery,Thriller"


In [23]:
# will join root_df to movbud_df to create a main working database 
main_df = root_df.set_index('original_title').join(movbud_df.set_index('movie'), how='inner')
main_df.head()

Unnamed: 0,averagerating,start_year,runtime_minutes,genres,id,release_date,production_budget,domestic_gross,worldwide_gross
#Horror,3.0,2015,101.0,"Crime,Drama,Horror",16,"Nov 20, 2015","$1,500,000",$0,$0
10 Cloverfield Lane,7.2,2016,103.0,"Drama,Horror,Mystery",54,"Mar 11, 2016","$5,000,000","$72,082,999","$108,286,422"
10 Days in a Madhouse,6.7,2015,111.0,Drama,48,"Nov 11, 2015","$12,000,000","$14,616","$14,616"
12 Rounds,8.1,2017,,"Action,Drama,Romance",37,"Mar 27, 2009","$20,000,000","$12,234,694","$17,306,648"
12 Strong,6.6,2018,130.0,"Action,Drama,History",64,"Jan 19, 2018","$35,000,000","$45,819,713","$71,118,378"


In [24]:
main_df.set_index('id')

Unnamed: 0_level_0,averagerating,start_year,runtime_minutes,genres,release_date,production_budget,domestic_gross,worldwide_gross
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
16,3.0,2015,101.0,"Crime,Drama,Horror","Nov 20, 2015","$1,500,000",$0,$0
54,7.2,2016,103.0,"Drama,Horror,Mystery","Mar 11, 2016","$5,000,000","$72,082,999","$108,286,422"
48,6.7,2015,111.0,Drama,"Nov 11, 2015","$12,000,000","$14,616","$14,616"
37,8.1,2017,,"Action,Drama,Romance","Mar 27, 2009","$20,000,000","$12,234,694","$17,306,648"
64,6.6,2018,130.0,"Action,Drama,History","Jan 19, 2018","$35,000,000","$45,819,713","$71,118,378"
...,...,...,...,...,...,...,...,...
26,6.1,2015,96.0,"Animation,Comedy,Drama","Aug 11, 2006","$35,000,000","$11,989,328","$12,506,188"
26,5.4,2016,,Horror,"Aug 11, 2006","$35,000,000","$11,989,328","$12,506,188"
57,8.0,2016,108.0,"Adventure,Animation,Comedy","Mar 4, 2016","$150,000,000","$341,268,248","$1,019,429,616"
82,6.7,2013,110.0,"Crime,Drama,Thriller","Dec 31, 2013","$16,000,000",$0,"$1,844,228"


In [25]:
main_df # whoops. Shouldn't have changed index to title. will retry with another join. 

Unnamed: 0,averagerating,start_year,runtime_minutes,genres,id,release_date,production_budget,domestic_gross,worldwide_gross
#Horror,3.0,2015,101.0,"Crime,Drama,Horror",16,"Nov 20, 2015","$1,500,000",$0,$0
10 Cloverfield Lane,7.2,2016,103.0,"Drama,Horror,Mystery",54,"Mar 11, 2016","$5,000,000","$72,082,999","$108,286,422"
10 Days in a Madhouse,6.7,2015,111.0,Drama,48,"Nov 11, 2015","$12,000,000","$14,616","$14,616"
12 Rounds,8.1,2017,,"Action,Drama,Romance",37,"Mar 27, 2009","$20,000,000","$12,234,694","$17,306,648"
12 Strong,6.6,2018,130.0,"Action,Drama,History",64,"Jan 19, 2018","$35,000,000","$45,819,713","$71,118,378"
...,...,...,...,...,...,...,...,...,...
Zoom,6.1,2015,96.0,"Animation,Comedy,Drama",26,"Aug 11, 2006","$35,000,000","$11,989,328","$12,506,188"
Zoom,5.4,2016,,Horror,26,"Aug 11, 2006","$35,000,000","$11,989,328","$12,506,188"
Zootopia,8.0,2016,108.0,"Adventure,Animation,Comedy",57,"Mar 4, 2016","$150,000,000","$341,268,248","$1,019,429,616"
Zulu,6.7,2013,110.0,"Crime,Drama,Thriller",82,"Dec 31, 2013","$16,000,000",$0,"$1,844,228"


In [26]:
root_df

Unnamed: 0_level_0,averagerating,original_title,start_year,runtime_minutes,genres
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
tt10356526,8.3,Laiye Je Yaarian,2019,117.0,Romance
tt10384606,8.9,Borderless,2019,87.0,Documentary
tt1042974,6.4,Just Inès,2010,90.0,Drama
tt1043726,4.2,The Legend of Hercules,2014,99.0,"Action,Adventure,Fantasy"
tt1060240,6.5,Até Onde?,2011,73.0,"Mystery,Thriller"
...,...,...,...,...,...
tt9805820,8.1,Caisa,2018,84.0,Documentary
tt9844256,7.5,Code Geass: Lelouch of the Rebellion Episode III,2018,120.0,"Action,Animation,Sci-Fi"
tt9851050,4.7,Sisters,2019,,"Action,Drama"
tt9886934,7.0,The Projectionist,2019,81.0,Documentary


In [27]:
movbud_df

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"
...,...,...,...,...,...,...
5777,78,"Dec 31, 2018",Red 11,"$7,000",$0,$0
5778,79,"Apr 2, 1999",Following,"$6,000","$48,482","$240,495"
5779,80,"Jul 13, 2005",Return to the Land of Wonders,"$5,000","$1,338","$1,338"
5780,81,"Sep 29, 2015",A Plague So Pleasant,"$1,400",$0,$0


In [28]:
root_df.rename(columns={'original_title':'title'}, inplace=True)
movbud_df.rename(columns={'movie':'title'}, inplace=True)
# rename the title column to make uniform

In [29]:
root_df # check to make sure it worked

Unnamed: 0_level_0,averagerating,title,start_year,runtime_minutes,genres
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
tt10356526,8.3,Laiye Je Yaarian,2019,117.0,Romance
tt10384606,8.9,Borderless,2019,87.0,Documentary
tt1042974,6.4,Just Inès,2010,90.0,Drama
tt1043726,4.2,The Legend of Hercules,2014,99.0,"Action,Adventure,Fantasy"
tt1060240,6.5,Até Onde?,2011,73.0,"Mystery,Thriller"
...,...,...,...,...,...
tt9805820,8.1,Caisa,2018,84.0,Documentary
tt9844256,7.5,Code Geass: Lelouch of the Rebellion Episode III,2018,120.0,"Action,Animation,Sci-Fi"
tt9851050,4.7,Sisters,2019,,"Action,Drama"
tt9886934,7.0,The Projectionist,2019,81.0,Documentary


In [30]:
root_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 73856 entries, tt10356526 to tt9894098
Data columns (total 5 columns):
averagerating      73856 non-null float64
title              73856 non-null object
start_year         73856 non-null int64
runtime_minutes    66236 non-null float64
genres             73052 non-null object
dtypes: float64(2), int64(1), object(2)
memory usage: 3.4+ MB


In [31]:
movbud_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 6 columns):
id                   5782 non-null int64
release_date         5782 non-null object
title                5782 non-null object
production_budget    5782 non-null object
domestic_gross       5782 non-null object
worldwide_gross      5782 non-null object
dtypes: int64(1), object(5)
memory usage: 271.2+ KB


In [32]:
main_df = pd.merge(root_df, movbud_df, on='title',how='left') # left joined the movbud to root to add what budget info is available

In [33]:
main_df

Unnamed: 0,averagerating,title,start_year,runtime_minutes,genres,id,release_date,production_budget,domestic_gross,worldwide_gross
0,8.3,Laiye Je Yaarian,2019,117.0,Romance,,,,,
1,8.9,Borderless,2019,87.0,Documentary,,,,,
2,6.4,Just Inès,2010,90.0,Drama,,,,,
3,4.2,The Legend of Hercules,2014,99.0,"Action,Adventure,Fantasy",42.0,"Jan 10, 2014","$70,000,000","$18,848,538","$58,953,319"
4,6.5,Até Onde?,2011,73.0,"Mystery,Thriller",,,,,
...,...,...,...,...,...,...,...,...,...,...
73954,8.1,Caisa,2018,84.0,Documentary,,,,,
73955,7.5,Code Geass: Lelouch of the Rebellion Episode III,2018,120.0,"Action,Animation,Sci-Fi",,,,,
73956,4.7,Sisters,2019,,"Action,Drama",57.0,"Dec 18, 2015","$30,000,000","$87,044,645","$106,030,660"
73957,7.0,The Projectionist,2019,81.0,Documentary,,,,,


In [34]:
main_df.to_csv('Data/zippedData/cleaned.main.csv')
# export the slightly cleaned up df to csv so I'll have a better starting point later

In [35]:
imdb_df = pd.read_csv('Data/IMDBscrape-DESKTOP-NROVL9F.csv')
imdb_df.head() # import scraped imdb data

Unnamed: 0,ids,titles,genres,directors,budget,openingWeekendUSA,grossUSA,cumulativeWorldwideGross,metascore,metacritic,m-rating,imdbVotes,imdbScore,year
0,tt4520988,Frozen II,"Animation, Adventure, Comedy, Family, Fantasy,...","Chris Buck, Jennifer Lee","$150,000,000 (estimated)","$130,263,358, 24 November 2019","$476,774,707","$1,446,412,221",64,47,PG,100731,7.0 / 10,2019
1,tt2762506,Bacurau,"Action, Adventure, Mystery, Sci-Fi, Thriller, ...","Juliano Dornelles, Kleber Mendonça Filho",,"$14,691, 8 March 2020","$58,115","$3,423,433",82,26,none,11259,7.7 / 10,2019
2,tt10199640,Beanpole,"Drama, War",Kantemir Balagov,,"$10,008, 2 February 2020","$187,024","$1,941,180",84,25,none,4357,7.2 / 10,2019
3,tt9351980,American Factory,Documentary,"Steven Bognar, Julia Reichert",,,,,86,23,none,13211,7.4 / 10,2019
4,tt7549996,Judy,"Biography, Drama, Romance",Rupert Goold,,"$2,916,548, 29 September 2019","$24,313,888","$41,515,546",66,46,PG-13,28699,6.9 / 10,2019


In [36]:
imdb2_df = pd.read_csv('Data/all2.csv') # import scraped IMDB director data

In [37]:
imdb2_df.head()

Unnamed: 0,ids,release_date,writers,stars,runtime_mins
0,tt4520988,2019-11-20,"Jennifer Lee, Chris Buck, Kristen Anderson-Lop...","Kristen Bell, Idina Menzel, Josh Gad, Jonathan...",103
1,tt2762506,2019-08-29,"Kleber Mendonça Filho, Juliano Dornelles","Bárbara Colen, Thomas Aquino, Silvero Pereira,...",131
2,tt10199640,2019-06-20,"Kantemir Balagov, Aleksandr Terekhov","Viktoria Miroshnichenko, Vasilisa Perelygina, ...",130
3,tt9351980,2019-08-21,,"Junming 'Jimmy' Wang, Robert Allen, Sherrod Br...",110
4,tt7549996,2019-09-27,"Tom Edge, Peter Quilter","Renée Zellweger, Jessie Buckley, Finn Wittrock...",118


In [38]:
imdb_df = pd.merge(imdb_df, imdb2_df, on='ids',how='left')
imdb_df #merged the two imdb scraped dfs

Unnamed: 0,ids,titles,genres,directors,budget,openingWeekendUSA,grossUSA,cumulativeWorldwideGross,metascore,metacritic,m-rating,imdbVotes,imdbScore,year,release_date,writers,stars,runtime_mins
0,tt4520988,Frozen II,"Animation, Adventure, Comedy, Family, Fantasy,...","Chris Buck, Jennifer Lee","$150,000,000 (estimated)","$130,263,358, 24 November 2019","$476,774,707","$1,446,412,221",64,47,PG,100731,7.0 / 10,2019,2019-11-20,"Jennifer Lee, Chris Buck, Kristen Anderson-Lop...","Kristen Bell, Idina Menzel, Josh Gad, Jonathan...",103
1,tt2762506,Bacurau,"Action, Adventure, Mystery, Sci-Fi, Thriller, ...","Juliano Dornelles, Kleber Mendonça Filho",,"$14,691, 8 March 2020","$58,115","$3,423,433",82,26,none,11259,7.7 / 10,2019,2019-08-29,"Kleber Mendonça Filho, Juliano Dornelles","Bárbara Colen, Thomas Aquino, Silvero Pereira,...",131
2,tt10199640,Beanpole,"Drama, War",Kantemir Balagov,,"$10,008, 2 February 2020","$187,024","$1,941,180",84,25,none,4357,7.2 / 10,2019,2019-06-20,"Kantemir Balagov, Aleksandr Terekhov","Viktoria Miroshnichenko, Vasilisa Perelygina, ...",130
3,tt9351980,American Factory,Documentary,"Steven Bognar, Julia Reichert",,,,,86,23,none,13211,7.4 / 10,2019,2019-08-21,,"Junming 'Jimmy' Wang, Robert Allen, Sherrod Br...",110
4,tt7549996,Judy,"Biography, Drama, Romance",Rupert Goold,,"$2,916,548, 29 September 2019","$24,313,888","$41,515,546",66,46,PG-13,28699,6.9 / 10,2019,2019-09-27,"Tom Edge, Peter Quilter","Renée Zellweger, Jessie Buckley, Finn Wittrock...",118
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
973,tt1386925,The Dead,"Action, Adventure, Drama, Horror, Sci-Fi, Thri...","Howard J. Ford, Jonathan Ford",,,,"$4,891",59,6,R,8622,5.7 / 10,2010,2010-08-30,"Howard J. Ford, Jonathan Ford","Rob Freeman, Prince David Oseia, David Dontoh,...",105
974,tt0758752,Love & Other Drugs,"Comedy, Drama, Romance",Edward Zwick,"$30,000,000 (estimated)","$9,739,161, 28 November 2010","$32,367,005","$102,820,008",55,38,R,173855,6.7 / 10,2010,2010-11-24,"Charles Randolph, Edward Zwick, Marshall Hersk...","Jake Gyllenhaal, Anne Hathaway, Judy Greer, Ol...",112
975,tt1504320,The King's Speech,"Biography, Drama, History",Tom Hooper,"$15,000,000 (estimated)","$355,450, 28 November 2010","$138,797,449","$427,374,317",88,41,R,617619,8.0 / 10,2010,2010-12-16,David Seidler,"Colin Firth, Geoffrey Rush, Helena Bonham Cart...",118
976,tt1001526,Megamind,"Animation, Action, Comedy, Family, Sci-Fi",Tom McGrath,"$130,000,000 (estimated)","$46,016,833, 7 November 2010","$148,415,853","$321,885,765",63,33,PG,217780,7.2 / 10,2010,2010-10-28,"Alan Schoolcraft, Brent Simons","Will Ferrell, Jonah Hill, Brad Pitt, Tina Fey",95


In [39]:
# export the new df as csv 
imdb_df.to_csv('Data/zippedData/cleaned.imdb.csv')

In [40]:
len(imdb_df['grossUSA'])

978

In [41]:
imdb_df['grossUSA'].isnull().sum() # see how many missing values we have for revenue

138

In [42]:
imdb_df['cumulativeWorldwideGross'].isnull().sum() 
# less missing values for cumulative WW gross. Will probably use this over USA

51

In [43]:
imdb_df.head()

Unnamed: 0,ids,titles,genres,directors,budget,openingWeekendUSA,grossUSA,cumulativeWorldwideGross,metascore,metacritic,m-rating,imdbVotes,imdbScore,year,release_date,writers,stars,runtime_mins
0,tt4520988,Frozen II,"Animation, Adventure, Comedy, Family, Fantasy,...","Chris Buck, Jennifer Lee","$150,000,000 (estimated)","$130,263,358, 24 November 2019","$476,774,707","$1,446,412,221",64,47,PG,100731,7.0 / 10,2019,2019-11-20,"Jennifer Lee, Chris Buck, Kristen Anderson-Lop...","Kristen Bell, Idina Menzel, Josh Gad, Jonathan...",103
1,tt2762506,Bacurau,"Action, Adventure, Mystery, Sci-Fi, Thriller, ...","Juliano Dornelles, Kleber Mendonça Filho",,"$14,691, 8 March 2020","$58,115","$3,423,433",82,26,none,11259,7.7 / 10,2019,2019-08-29,"Kleber Mendonça Filho, Juliano Dornelles","Bárbara Colen, Thomas Aquino, Silvero Pereira,...",131
2,tt10199640,Beanpole,"Drama, War",Kantemir Balagov,,"$10,008, 2 February 2020","$187,024","$1,941,180",84,25,none,4357,7.2 / 10,2019,2019-06-20,"Kantemir Balagov, Aleksandr Terekhov","Viktoria Miroshnichenko, Vasilisa Perelygina, ...",130
3,tt9351980,American Factory,Documentary,"Steven Bognar, Julia Reichert",,,,,86,23,none,13211,7.4 / 10,2019,2019-08-21,,"Junming 'Jimmy' Wang, Robert Allen, Sherrod Br...",110
4,tt7549996,Judy,"Biography, Drama, Romance",Rupert Goold,,"$2,916,548, 29 September 2019","$24,313,888","$41,515,546",66,46,PG-13,28699,6.9 / 10,2019,2019-09-27,"Tom Edge, Peter Quilter","Renée Zellweger, Jessie Buckley, Finn Wittrock...",118


In [44]:
del imdb_df['imdbVotes'] # remove column I don't need data from

In [45]:
imdb_df.shape # find dimensions of df

(978, 17)

In [46]:
main_df.shape

(73959, 10)

In [47]:
main_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73959 entries, 0 to 73958
Data columns (total 10 columns):
averagerating        73959 non-null float64
title                73959 non-null object
start_year           73959 non-null int64
runtime_minutes      66330 non-null float64
genres               73154 non-null object
id                   2638 non-null float64
release_date         2638 non-null object
production_budget    2638 non-null object
domestic_gross       2638 non-null object
worldwide_gross      2638 non-null object
dtypes: float64(3), int64(1), object(6)
memory usage: 6.2+ MB


In [48]:
main_df.head() 
# will start working on removing all rows where budget data isnt available

Unnamed: 0,averagerating,title,start_year,runtime_minutes,genres,id,release_date,production_budget,domestic_gross,worldwide_gross
0,8.3,Laiye Je Yaarian,2019,117.0,Romance,,,,,
1,8.9,Borderless,2019,87.0,Documentary,,,,,
2,6.4,Just Inès,2010,90.0,Drama,,,,,
3,4.2,The Legend of Hercules,2014,99.0,"Action,Adventure,Fantasy",42.0,"Jan 10, 2014","$70,000,000","$18,848,538","$58,953,319"
4,6.5,Até Onde?,2011,73.0,"Mystery,Thriller",,,,,


In [49]:
main_df.isna().sum() #find how many rows are missing data

averagerating            0
title                    0
start_year               0
runtime_minutes       7629
genres                 805
id                   71321
release_date         71321
production_budget    71321
domestic_gross       71321
worldwide_gross      71321
dtype: int64

In [50]:
budgets_df = main_df[~main_df['production_budget'].isnull()]
# drop all rows where the production budget isn't present

In [51]:
budgets_df.head()

Unnamed: 0,averagerating,title,start_year,runtime_minutes,genres,id,release_date,production_budget,domestic_gross,worldwide_gross
3,4.2,The Legend of Hercules,2014,99.0,"Action,Adventure,Fantasy",42.0,"Jan 10, 2014","$70,000,000","$18,848,538","$58,953,319"
10,5.1,Baggage Claim,2013,96.0,Comedy,38.0,"Sep 27, 2013","$8,500,000","$21,569,509","$22,885,836"
16,7.6,Moneyball,2011,133.0,"Biography,Drama,Sport",15.0,"Sep 23, 2011","$50,000,000","$75,605,492","$111,300,835"
17,6.5,Hereafter,2010,129.0,"Drama,Fantasy,Romance",61.0,"Oct 15, 2010","$50,000,000","$32,746,941","$108,660,270"
20,7.2,21 Jump Street,2012,109.0,"Action,Comedy,Crime",44.0,"Mar 16, 2012","$42,000,000","$138,447,667","$202,812,429"


In [52]:
budgets_df # now I am only left with movies with financial data available

Unnamed: 0,averagerating,title,start_year,runtime_minutes,genres,id,release_date,production_budget,domestic_gross,worldwide_gross
3,4.2,The Legend of Hercules,2014,99.0,"Action,Adventure,Fantasy",42.0,"Jan 10, 2014","$70,000,000","$18,848,538","$58,953,319"
10,5.1,Baggage Claim,2013,96.0,Comedy,38.0,"Sep 27, 2013","$8,500,000","$21,569,509","$22,885,836"
16,7.6,Moneyball,2011,133.0,"Biography,Drama,Sport",15.0,"Sep 23, 2011","$50,000,000","$75,605,492","$111,300,835"
17,6.5,Hereafter,2010,129.0,"Drama,Fantasy,Romance",61.0,"Oct 15, 2010","$50,000,000","$32,746,941","$108,660,270"
20,7.2,21 Jump Street,2012,109.0,"Action,Comedy,Crime",44.0,"Mar 16, 2012","$42,000,000","$138,447,667","$202,812,429"
...,...,...,...,...,...,...,...,...,...,...
73690,7.5,Edmond,2018,110.0,"Comedy,Drama,History",17.0,"Jul 14, 2006","$10,000,000","$131,719","$241,719"
73716,7.9,Closure,2018,90.0,"Comedy,Drama",95.0,"Mar 17, 2015","$100,000",$0,$0
73815,7.4,Teefa in Trouble,2018,155.0,"Action,Comedy,Crime",8.0,"Jul 20, 2018","$1,500,000",$0,"$98,806"
73890,7.3,Heroes,2019,88.0,Documentary,12.0,"Oct 24, 2008","$400,000","$655,538","$655,538"


In [53]:
budgets_df.isna().sum() 
# check for more missing values

averagerating          0
title                  0
start_year             0
runtime_minutes      106
genres                 5
id                     0
release_date           0
production_budget      0
domestic_gross         0
worldwide_gross        0
dtype: int64

In [54]:
imdb_df.isna().sum()
# looking at missing values in this df

ids                           0
titles                        0
genres                        0
directors                     0
budget                      223
openingWeekendUSA           148
grossUSA                    138
cumulativeWorldwideGross     51
metascore                     0
metacritic                    0
m-rating                      0
imdbScore                     0
year                          0
release_date                  0
writers                      25
stars                         0
runtime_mins                  0
dtype: int64

In [55]:
imdb_df = imdb_df[~imdb_df['budget'].isnull()]

In [56]:
imdb_df.isna().sum()

ids                          0
titles                       0
genres                       0
directors                    0
budget                       0
openingWeekendUSA           57
grossUSA                    53
cumulativeWorldwideGross    13
metascore                    0
metacritic                   0
m-rating                     0
imdbScore                    0
year                         0
release_date                 0
writers                      0
stars                        0
runtime_mins                 0
dtype: int64

In [57]:
imdb_df.head()

Unnamed: 0,ids,titles,genres,directors,budget,openingWeekendUSA,grossUSA,cumulativeWorldwideGross,metascore,metacritic,m-rating,imdbScore,year,release_date,writers,stars,runtime_mins
0,tt4520988,Frozen II,"Animation, Adventure, Comedy, Family, Fantasy,...","Chris Buck, Jennifer Lee","$150,000,000 (estimated)","$130,263,358, 24 November 2019","$476,774,707","$1,446,412,221",64,47,PG,7.0 / 10,2019,2019-11-20,"Jennifer Lee, Chris Buck, Kristen Anderson-Lop...","Kristen Bell, Idina Menzel, Josh Gad, Jonathan...",103
5,tt3224458,A Beautiful Day in the Neighborhood,"Biography, Drama",Marielle Heller,"$25,000,000 (estimated)","$13,251,238, 24 November 2019","$61,696,436","$67,448,968",80,50,PG,7.3 / 10,2019,2019-11-22,"Micah Fitzerman-Blue, Noah Harpster, Tom Junod","Tom Hanks, Matthew Rhys, Chris Cooper, Susan K...",109
6,tt6398184,Downton Abbey,"Drama, Romance",Michael Engler,"$20,000,000 (estimated)","$31,033,665, 22 September 2019","$96,854,135","$194,133,989",64,42,PG,7.4 / 10,2019,2019-09-12,Julian Fellowes,"Matthew Goode, Michelle Dockery, Joanne Frogga...",122
9,tt8579674,1917,"Drama, War",Sam Mendes,"$100,000,000 (estimated)","$576,216, 29 December 2019","$152,119,699","$347,819,699",78,57,R,8.4 / 10,2019,2020-01-02,"Sam Mendes, Krysty Wilson-Cairns","Dean-Charles Chapman, George MacKay, Daniel Ma...",119
12,tt4154796,Avengers: Endgame,"Action, Adventure, Drama, Sci-Fi","Anthony Russo, Joe Russo","$356,000,000 (estimated)","$357,115,007, 28 April 2019","$858,373,000","$2,797,800,564",78,57,PG-13,8.4 / 10,2019,2019-04-24,"Christopher Markus, Stephen McFeely, Stan Lee,...","Robert Downey Jr., Chris Evans, Mark Ruffalo, ...",181


In [58]:
imdb_df.rename(columns={'titles':'title'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


In [59]:
imdb_df.head()
# check to see if it worked

Unnamed: 0,ids,title,genres,directors,budget,openingWeekendUSA,grossUSA,cumulativeWorldwideGross,metascore,metacritic,m-rating,imdbScore,year,release_date,writers,stars,runtime_mins
0,tt4520988,Frozen II,"Animation, Adventure, Comedy, Family, Fantasy,...","Chris Buck, Jennifer Lee","$150,000,000 (estimated)","$130,263,358, 24 November 2019","$476,774,707","$1,446,412,221",64,47,PG,7.0 / 10,2019,2019-11-20,"Jennifer Lee, Chris Buck, Kristen Anderson-Lop...","Kristen Bell, Idina Menzel, Josh Gad, Jonathan...",103
5,tt3224458,A Beautiful Day in the Neighborhood,"Biography, Drama",Marielle Heller,"$25,000,000 (estimated)","$13,251,238, 24 November 2019","$61,696,436","$67,448,968",80,50,PG,7.3 / 10,2019,2019-11-22,"Micah Fitzerman-Blue, Noah Harpster, Tom Junod","Tom Hanks, Matthew Rhys, Chris Cooper, Susan K...",109
6,tt6398184,Downton Abbey,"Drama, Romance",Michael Engler,"$20,000,000 (estimated)","$31,033,665, 22 September 2019","$96,854,135","$194,133,989",64,42,PG,7.4 / 10,2019,2019-09-12,Julian Fellowes,"Matthew Goode, Michelle Dockery, Joanne Frogga...",122
9,tt8579674,1917,"Drama, War",Sam Mendes,"$100,000,000 (estimated)","$576,216, 29 December 2019","$152,119,699","$347,819,699",78,57,R,8.4 / 10,2019,2020-01-02,"Sam Mendes, Krysty Wilson-Cairns","Dean-Charles Chapman, George MacKay, Daniel Ma...",119
12,tt4154796,Avengers: Endgame,"Action, Adventure, Drama, Sci-Fi","Anthony Russo, Joe Russo","$356,000,000 (estimated)","$357,115,007, 28 April 2019","$858,373,000","$2,797,800,564",78,57,PG-13,8.4 / 10,2019,2019-04-24,"Christopher Markus, Stephen McFeely, Stan Lee,...","Robert Downey Jr., Chris Evans, Mark Ruffalo, ...",181


In [60]:
budgets_df.head()
# budget df has same column name.. all should be good to merge.

Unnamed: 0,averagerating,title,start_year,runtime_minutes,genres,id,release_date,production_budget,domestic_gross,worldwide_gross
3,4.2,The Legend of Hercules,2014,99.0,"Action,Adventure,Fantasy",42.0,"Jan 10, 2014","$70,000,000","$18,848,538","$58,953,319"
10,5.1,Baggage Claim,2013,96.0,Comedy,38.0,"Sep 27, 2013","$8,500,000","$21,569,509","$22,885,836"
16,7.6,Moneyball,2011,133.0,"Biography,Drama,Sport",15.0,"Sep 23, 2011","$50,000,000","$75,605,492","$111,300,835"
17,6.5,Hereafter,2010,129.0,"Drama,Fantasy,Romance",61.0,"Oct 15, 2010","$50,000,000","$32,746,941","$108,660,270"
20,7.2,21 Jump Street,2012,109.0,"Action,Comedy,Crime",44.0,"Mar 16, 2012","$42,000,000","$138,447,667","$202,812,429"


In [61]:
# print shapes of each df
print(imdb_df.shape, budgets_df.shape)

(755, 17) (2638, 10)


In [62]:
imdbbudgets_df = pd.merge(budgets_df, imdb_df, on='title',how='left')
# merge the two dfs on the title with a left join

In [63]:
imdbbudgets_df.shape

(2660, 26)

In [64]:
imdbbudgets_df.head()

Unnamed: 0,averagerating,title,start_year,runtime_minutes,genres_x,id,release_date_x,production_budget,domestic_gross,worldwide_gross,...,cumulativeWorldwideGross,metascore,metacritic,m-rating,imdbScore,year,release_date_y,writers,stars,runtime_mins
0,4.2,The Legend of Hercules,2014,99.0,"Action,Adventure,Fantasy",42.0,"Jan 10, 2014","$70,000,000","$18,848,538","$58,953,319",...,,,,,,,,,,
1,5.1,Baggage Claim,2013,96.0,Comedy,38.0,"Sep 27, 2013","$8,500,000","$21,569,509","$22,885,836",...,,,,,,,,,,
2,7.6,Moneyball,2011,133.0,"Biography,Drama,Sport",15.0,"Sep 23, 2011","$50,000,000","$75,605,492","$111,300,835",...,"$110,206,216",87.0,42.0,PG-13,7.6 / 10,2011.0,2011-09-23,"Steven Zaillian, Aaron Sorkin, Stan Chervin, M...","Brad Pitt, Robin Wright, Jonah Hill, Philip Se...",133.0
3,6.5,Hereafter,2010,129.0,"Drama,Fantasy,Romance",61.0,"Oct 15, 2010","$50,000,000","$32,746,941","$108,660,270",...,"$106,956,330",56.0,42.0,PG-13,6.4 / 10,2011.0,2010-10-22,Peter Morgan,"Matt Damon, Cécile de France, Bryce Dallas How...",129.0
4,7.2,21 Jump Street,2012,109.0,"Action,Comedy,Crime",44.0,"Mar 16, 2012","$42,000,000","$138,447,667","$202,812,429",...,"$201,585,328",69.0,41.0,R,7.2 / 10,2012.0,2012-03-14,"Michael Bacall, Jonah Hill, Patrick Hasburgh, ...","Jonah Hill, Channing Tatum, Ice Cube, Brie Larson",109.0


In [65]:
imdbbudgets_df.isna().sum()
# look for more missing data

averagerating                  0
title                          0
start_year                     0
runtime_minutes              107
genres_x                       5
id                             0
release_date_x                 0
production_budget              0
domestic_gross                 0
worldwide_gross                0
ids                         1972
genres_y                    1972
directors                   1972
budget                      1972
openingWeekendUSA           1981
grossUSA                    1981
cumulativeWorldwideGross    1976
metascore                   1972
metacritic                  1972
m-rating                    1972
imdbScore                   1972
year                        1972
release_date_y              1972
writers                     1972
stars                       1972
runtime_mins                1972
dtype: int64

In [66]:
# deleting redundant columns and columns that I don't need

del imdbbudgets_df['genres_y']
del imdbbudgets_df['grossUSA']
del imdbbudgets_df['budget']
del imdbbudgets_df['openingWeekendUSA']
del imdbbudgets_df['cumulativeWorldwideGross']
del imdbbudgets_df['release_date_y']
del imdbbudgets_df['runtime_mins']
del imdbbudgets_df['stars']

In [67]:
imdbbudgets_df.head()

Unnamed: 0,averagerating,title,start_year,runtime_minutes,genres_x,id,release_date_x,production_budget,domestic_gross,worldwide_gross,ids,directors,metascore,metacritic,m-rating,imdbScore,year,writers
0,4.2,The Legend of Hercules,2014,99.0,"Action,Adventure,Fantasy",42.0,"Jan 10, 2014","$70,000,000","$18,848,538","$58,953,319",,,,,,,,
1,5.1,Baggage Claim,2013,96.0,Comedy,38.0,"Sep 27, 2013","$8,500,000","$21,569,509","$22,885,836",,,,,,,,
2,7.6,Moneyball,2011,133.0,"Biography,Drama,Sport",15.0,"Sep 23, 2011","$50,000,000","$75,605,492","$111,300,835",tt1210166,Bennett Miller,87.0,42.0,PG-13,7.6 / 10,2011.0,"Steven Zaillian, Aaron Sorkin, Stan Chervin, M..."
3,6.5,Hereafter,2010,129.0,"Drama,Fantasy,Romance",61.0,"Oct 15, 2010","$50,000,000","$32,746,941","$108,660,270",tt1212419,Clint Eastwood,56.0,42.0,PG-13,6.4 / 10,2011.0,Peter Morgan
4,7.2,21 Jump Street,2012,109.0,"Action,Comedy,Crime",44.0,"Mar 16, 2012","$42,000,000","$138,447,667","$202,812,429",tt1232829,"Phil Lord, Christopher Miller",69.0,41.0,R,7.2 / 10,2012.0,"Michael Bacall, Jonah Hill, Patrick Hasburgh, ..."


In [68]:
imdbbudgets_df.isna().sum()
#print(imdbbudgets_df.shape)

averagerating           0
title                   0
start_year              0
runtime_minutes       107
genres_x                5
id                      0
release_date_x          0
production_budget       0
domestic_gross          0
worldwide_gross         0
ids                  1972
directors            1972
metascore            1972
metacritic           1972
m-rating             1972
imdbScore            1972
year                 1972
writers              1972
dtype: int64

In [69]:
imdbbudgets_df.to_csv('Data/zippedData/cleaned.needrating.csv')
# saving my work to an exported csv

In [70]:
imdbbudgets_df.shape

(2660, 18)

In [71]:
# delete the columns I won't need after I merge this data with my webscraped data
del imdbbudgets_df['averagerating']
del imdbbudgets_df['start_year']
del imdbbudgets_df['runtime_minutes']
del imdbbudgets_df['release_date_x']
del imdbbudgets_df['id']
del imdbbudgets_df['directors']
del imdbbudgets_df['metascore']
del imdbbudgets_df['metacritic']
del imdbbudgets_df['m-rating']
del imdbbudgets_df['imdbScore']
del imdbbudgets_df['year']
del imdbbudgets_df['writers']

In [72]:
imdbbudgets_df

Unnamed: 0,title,genres_x,production_budget,domestic_gross,worldwide_gross,ids
0,The Legend of Hercules,"Action,Adventure,Fantasy","$70,000,000","$18,848,538","$58,953,319",
1,Baggage Claim,Comedy,"$8,500,000","$21,569,509","$22,885,836",
2,Moneyball,"Biography,Drama,Sport","$50,000,000","$75,605,492","$111,300,835",tt1210166
3,Hereafter,"Drama,Fantasy,Romance","$50,000,000","$32,746,941","$108,660,270",tt1212419
4,21 Jump Street,"Action,Comedy,Crime","$42,000,000","$138,447,667","$202,812,429",tt1232829
...,...,...,...,...,...,...
2655,Edmond,"Comedy,Drama,History","$10,000,000","$131,719","$241,719",
2656,Closure,"Comedy,Drama","$100,000",$0,$0,
2657,Teefa in Trouble,"Action,Comedy,Crime","$1,500,000",$0,"$98,806",
2658,Heroes,Documentary,"$400,000","$655,538","$655,538",


In [73]:
imdbbudgets_df.to_csv('Data/zippedData/cleaned.needrating.csv')
# save most recent DF to csv so I can import it to my other Notebook and merge it