In [None]:
import pandas as pd
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, iplot


In [None]:
df = pd.read_csv('netflix_titles.csv')

In [None]:
df.info

<bound method DataFrame.info of        show_id  ...                                        description
0     81145628  ...  Before planning an awesome wedding for his gra...
1     80117401  ...  Jandino Asporaat riffs on the challenges of ra...
2     70234439  ...  With the help of three human allies, the Autob...
3     80058654  ...  When a prison ship crash unleashes hundreds of...
4     80125979  ...  When nerdy high schooler Dani finally attracts...
...        ...  ...                                                ...
6229  80000063  ...  This parody of first-person shooter games, mil...
6230  70286564  ...  Marc Maron stars as Marc Maron, who interviews...
6231  80116008  ...  Nursery rhymes and original music for children...
6232  70281022  ...  Set during the Russian Revolution, this comic ...
6233  70153404  ...  This hit sitcom follows the merry misadventure...

[6234 rows x 12 columns]>

In [None]:
df.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

In [None]:
df['date_added']

0       September 9, 2019
1       September 9, 2016
2       September 8, 2018
3       September 8, 2018
4       September 8, 2017
              ...        
6229                  NaN
6230                  NaN
6231                  NaN
6232                  NaN
6233                  NaN
Name: date_added, Length: 6234, dtype: object

In [None]:
#converting date_added to proper date timne column
df["date_added"] = pd.to_datetime(df['date_added'])


In [None]:
df['date_added']

0      2019-09-09
1      2016-09-09
2      2018-09-08
3      2018-09-08
4      2017-09-08
          ...    
6229          NaT
6230          NaT
6231          NaT
6232          NaT
6233          NaT
Name: date_added, Length: 6234, dtype: datetime64[ns]

In [None]:
#seperating month & year from date_added column
df['year_added'] = df['date_added'].dt.year
df['month_added'] = df['date_added'].dt.month


In [None]:
df['duration']

0           90 min
1           94 min
2         1 Season
3         1 Season
4           99 min
           ...    
6229    13 Seasons
6230     4 Seasons
6231        60 min
6232     2 Seasons
6233    10 Seasons
Name: duration, Length: 6234, dtype: object

In [None]:
#CREATING A NEW COLUMN FOR HAVING NUMBER OF SEASONS COUNT, HERE SEASON LESS THAN 1 IS PUT BLANK
df['season_count'] = df.apply(lambda x : x['duration'].split(" ")[0] if "Season" in x['duration'] else "", axis = 1)


In [None]:
df['season_count']

0         
1         
2        1
3        1
4         
        ..
6229    13
6230     4
6231      
6232     2
6233    10
Name: season_count, Length: 6234, dtype: object

In [None]:
#showing only seasons which were for less than 1 season i.e only were for mins
df['duration'] = df.apply(lambda x : x['duration'].split(" ")[0] if "Season" not in x['duration'] else "", axis = 1)
df['duration']

0       90
1       94
2         
3         
4       99
        ..
6229      
6230      
6231    60
6232      
6233      
Name: duration, Length: 6234, dtype: object

In [None]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,year_added,month_added,season_count
0,81145628,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China",2019-09-09,2019,TV-PG,90.0,"Children & Family Movies, Comedies",Before planning an awesome wedding for his gra...,2019.0,9.0,
1,80117401,Movie,Jandino: Whatever it Takes,,Jandino Asporaat,United Kingdom,2016-09-09,2016,TV-MA,94.0,Stand-Up Comedy,Jandino Asporaat riffs on the challenges of ra...,2016.0,9.0,
2,70234439,TV Show,Transformers Prime,,"Peter Cullen, Sumalee Montano, Frank Welker, J...",United States,2018-09-08,2013,TV-Y7-FV,,Kids' TV,"With the help of three human allies, the Autob...",2018.0,9.0,1.0
3,80058654,TV Show,Transformers: Robots in Disguise,,"Will Friedle, Darren Criss, Constance Zimmer, ...",United States,2018-09-08,2016,TV-Y7,,Kids' TV,When a prison ship crash unleashes hundreds of...,2018.0,9.0,1.0
4,80125979,Movie,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",United States,2017-09-08,2017,TV-14,99.0,Comedies,When nerdy high schooler Dani finally attracts...,2017.0,9.0,


In [None]:
col = "type"
grouped = df[col].value_counts().reset_index()
grouped = grouped.rename(columns = {col : "count", "index" : col})

grouped
# Movies in the data - 4265 
# TV Shows in the data - 1969

Unnamed: 0,type,count
0,Movie,4265
1,TV Show,1969


In [None]:
## plot
trace = go.Pie(labels=grouped[col], values=grouped['count'], pull=[0.05, 0], marker=dict(colors=["#6ad49b", "#a678de"]))
layout = go.Layout(title="", height=400, legend=dict(x=0.1, y=1.1))
fig = go.Figure(data = [trace], layout = layout)
iplot(fig)
#68.4% Movies 
# 31.6% TV Shows
#2/3rd of the content on netflix is movies and remaining 33% of them are TV Shows.

In [None]:
d1 = df[df["type"] == "TV Show"]
d2 = df[df["type"] == "Movie"]

In [None]:
d1['year_added']

2       2018.0
3       2018.0
5       2017.0
8       2017.0
26      2018.0
         ...  
6228       NaN
6229       NaN
6230       NaN
6232       NaN
6233       NaN
Name: year_added, Length: 1969, dtype: float64

In [None]:
col = "year_added"


In [None]:
vc1 = d1[col].value_counts().reset_index()


In [None]:
vc1 = vc1.rename(columns = {col : "count", "index" : col})

In [None]:
vc1['percent'] = vc1['count'].apply(lambda x : 100*x/sum(vc1['count']))

In [None]:
vc1 = vc1.sort_values(col)
vc1

Unnamed: 0,year_added,count,percent
9,2008.0,1,0.051046
8,2012.0,3,0.153139
7,2013.0,6,0.306279
6,2014.0,6,0.306279
5,2015.0,32,1.633486
3,2016.0,192,9.800919
2,2017.0,387,19.754977
1,2018.0,492,25.114855
0,2019.0,803,40.990301
4,2020.0,37,1.888719


In [None]:
vc1

Unnamed: 0,year_added,count,percent
9,2008.0,1,0.051046
8,2012.0,3,0.153139
7,2013.0,6,0.306279
6,2014.0,6,0.306279
5,2015.0,32,1.633486
3,2016.0,192,9.800919
2,2017.0,387,19.754977
1,2018.0,492,25.114855
0,2019.0,803,40.990301
4,2020.0,37,1.888719


In [None]:
vc2 = d2[col].value_counts().reset_index()


In [None]:
vc2 = vc2.rename(columns = {col : "count", "index" : col})


In [None]:
vc2['percent'] = vc2['count'].apply(lambda x : 100*x/sum(vc2['count']))


In [None]:
vc2 = vc2.sort_values(col)


In [None]:
vc2['count']

12       1
10       2
11       1
7       13
9        4
8        6
6       19
5       58
3      264
2      913
1     1290
0     1546
4      147
Name: count, dtype: int64

In [None]:
trace1 = go.Scatter(x=vc1[col], y=vc1["count"], name="TV Shows", marker=dict(color="#a678de"))
trace2 = go.Scatter(x=vc2[col], y=vc2["count"], name="Movies", marker=dict(color="#6ad49b"))


In [None]:
data = [trace1, trace2]
layout = go.Layout(title="Content added over the years", legend=dict(x=0.1, y=1.1, orientation="h"))


In [None]:
fig = go.Figure(data, layout=layout)
fig.show()

In [None]:
#The growth in number of movies on netflix is much higher than that of TV shows.

In [None]:
col = "release_year"
vc1 = d1[col].value_counts().reset_index()
vc1 = vc1.rename(columns = {col : "count", "index" : col})
vc1['percent'] = vc1['count'].apply(lambda x : 100*x/sum(vc1['count']))
vc1 = vc1.sort_values(col)
vc1

Unnamed: 0,release_year,count,percent
44,1925,1,0.050787
32,1946,1,0.050787
33,1963,1,0.050787
34,1967,1,0.050787
35,1968,1,0.050787
36,1972,1,0.050787
31,1974,1,0.050787
38,1977,1,0.050787
39,1979,1,0.050787
37,1981,1,0.050787


In [None]:
vc2 = d2[col].value_counts().reset_index()
vc2 = vc2.rename(columns = {col : "count", "index" : col})
vc2['percent'] = vc2['count'].apply(lambda x : 100*x/sum(vc2['count']))
vc2 = vc2.sort_values(col)
vc2

Unnamed: 0,release_year,count,percent
57,1942,2,0.046893
55,1943,3,0.070340
51,1944,3,0.070340
52,1945,3,0.070340
61,1946,2,0.046893
...,...,...,...
2,2016,593,13.903869
0,2017,682,15.990621
1,2018,646,15.146542
3,2019,400,9.378664


In [None]:
trace1 = go.Bar(x=vc1[col], y=vc1["count"], name="TV Shows", marker=dict(color="#a678de"))
trace2 = go.Bar(x=vc2[col], y=vc2["count"], name="Movies", marker=dict(color="#6ad49b"))
data = [trace1, trace2]
layout = go.Layout(title="Content added over the years", legend=dict(x=0.1, y=1.1, orientation="h"))
fig = go.Figure(data, layout=layout)
fig.show()

In [None]:
#About 1300 new movies were added in both 2018 and 2019. The growth in content started from 2013. Netflix kept on adding different movies and tv shows on its platform over the years.

In [None]:
col = 'month_added'
vc1 = d1[col].value_counts().reset_index()
vc1 = vc1.rename(columns = {col : "count", "index" : col})
vc1['percent'] = vc1['count'].apply(lambda x : 100*x/sum(vc1['count']))
vc1 = vc1.sort_values(col)
vc1

Unnamed: 0,month_added,count,percent
7,1.0,152,7.759061
11,2.0,115,5.870342
6,3.0,160,8.167432
8,4.0,139,7.095457
10,5.0,133,6.789178
9,6.0,133,6.789178
5,7.0,161,8.218479
3,8.0,175,8.933129
4,9.0,172,8.77999
1,10.0,205,10.464523


In [None]:
trace1 = go.Bar(x=vc1[col], y=vc1["count"], name="TV Shows", marker=dict(color="#a678de"))
data = [trace1]
layout = go.Layout(title="In which month, the conent is added the most?", legend=dict(x=0.1, y=1.1, orientation="h"))
fig = go.Figure(data, layout=layout)
fig.show()

In [None]:
#Some of the oldest movies on Netflix

small = df.sort_values("release_year", ascending = True)
small = small[small['duration'] != ""]
small[['title', "release_year"]][:15]
small

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,year_added,month_added,season_count
2011,60027945,Movie,Prelude to War,Frank Capra,,United States,2017-03-31,1942,TV-PG,52,"Classic Movies, Documentaries",Frank Capra's documentary chronicles the rise ...,2017.0,3.0,
2013,60027942,Movie,The Battle of Midway,John Ford,"Henry Fonda, Jane Darwell",United States,2017-03-31,1942,TV-G,18,"Classic Movies, Documentaries",Director John Ford captures combat footage of ...,2017.0,3.0,
2022,80119186,Movie,Undercover: How to Operate Behind Enemy Lines,John Ford,,United States,2017-03-31,1943,TV-PG,61,"Classic Movies, Documentaries",This World War II-era training film dramatizes...,2017.0,3.0,
2023,70013050,Movie,Why We Fight: The Battle of Russia,"Frank Capra, Anatole Litvak",,United States,2017-03-31,1943,TV-14,82,Documentaries,This installment of Frank Capra's acclaimed do...,2017.0,3.0,
2026,70022548,Movie,WWII: Report from the Aleutians,John Huston,,United States,2017-03-31,1943,NR,45,Documentaries,Filmmaker John Huston narrates this Oscar-nomi...,2017.0,3.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3518,81088083,Movie,Ghost Stories,"Anurag Kashyap, Dibakar Banerjee, Karan Johar,...","Janhvi Kapoor, Sobhita Dhulipala, Sukant Goel,...",India,2020-01-01,2020,TV-MA,145,"Horror Movies, International Movies, Thrillers","The directors of Emmy-nominated ""Lust Stories""...",2020.0,1.0,
3249,81006825,Movie,All the Freckles in the World,Yibrán Asuad,"Hánssel Casillas, Loreto Peralta, Andrea Sutto...",Mexico,2020-01-03,2020,TV-14,90,"Comedies, International Movies, Romantic Movies",Thirteen-year-old José Miguel is immune to 199...,2020.0,1.0,
3352,81127902,Movie,A Fall from Grace,Tyler Perry,"Crystal Fox, Phylicia Rashad, Cicely Tyson, Br...",,2020-01-17,2020,TV-MA,121,"Dramas, Thrillers","When gentle, law-abiding Grace confesses to ki...",2020.0,1.0,
3195,80233408,Movie,"Live Twice, Love Once",Maria Ripoll,"Oscar Martínez, Inma Cuesta, Mafalda Carbonell...",Spain,2020-01-07,2020,TV-MA,102,"Comedies, Dramas, International Movies",When Emilio (Oscar Martínez) is diagnosed with...,2020.0,1.0,


In [None]:
#Some of the oldest TV Shows on Netflix
small = df.sort_values("release_year", ascending = True)
small = small[small['season_count'] != ""]
small[['title', "release_year"]][:15]
small

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,year_added,month_added,season_count
4292,81030762,TV Show,Pioneers: First Women Filmmakers*,,,,2018-12-30,1925,TV-PG,,TV Shows,This collection restores films from women who ...,2018.0,12.0,1
4079,80161851,TV Show,Pioneers of African-American Cinema,"Oscar Micheaux, Spencer Williams, Richard E. N...",,United States,2017-02-01,1946,TV-14,,TV Shows,This newly preserved collection features more ...,2017.0,2.0,1
5981,70172488,TV Show,The Twilight Zone (Original Series),,Rod Serling,United States,2017-07-01,1963,TV-PG,,"Classic & Cult TV, TV Sci-Fi & Fantasy","Hosted by creator Rod Serling, this groundbrea...",2017.0,7.0,4
5980,70155574,TV Show,The Andy Griffith Show,,"Andy Griffith, Ron Howard, Don Knotts, Frances...",United States,2017-07-01,1967,TV-G,,"Classic & Cult TV, TV Comedies",Homespun humor and easygoing Sheriff Andy Tayl...,2017.0,7.0,8
5704,70136140,TV Show,Star Trek,,"William Shatner, Leonard Nimoy, DeForest Kelle...",United States,2017-10-01,1968,TV-PG,,"Classic & Cult TV, TV Action & Adventure, TV S...","Led by unflappable Capt. Kirk, the crew of the...",2017.0,10.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3354,81062580,TV Show,Nailed It! Germany,,"Angelina Kirsch, Bernd Siefert",,2020-01-17,2020,TV-14,,"International TV Shows, Reality TV",Home cooks try – and inevitably fail – to re-c...,2020.0,1.0,1
3189,81039393,TV Show,Cheer,,,United States,2020-01-08,2020,TV-MA,,"Docuseries, Reality TV, Teen TV Shows",This gripping docuseries follows the ups and d...,2020.0,1.0,1
1315,81034946,TV Show,Maradona in Mexico,,Diego Armando Maradona,"Argentina, United States, Mexico",2019-11-13,2020,TV-MA,,"Docuseries, Spanish-Language TV Shows","In this docuseries, soccer great Diego Maradon...",2019.0,11.0,1
3379,81062828,TV Show,Killer Inside: The Mind of Aaron Hernandez,,Aaron Hernandez,United States,2020-01-15,2020,TV-MA,,"Crime TV Shows, Docuseries","Via interviews with friends, players and insid...",2020.0,1.0,1


In [None]:
country_codes = {'afghanistan': 'AFG',
 'albania': 'ALB',
 'algeria': 'DZA',
 'american samoa': 'ASM',
 'andorra': 'AND',
 'angola': 'AGO',
 'anguilla': 'AIA',
 'antigua and barbuda': 'ATG',
 'argentina': 'ARG',
 'armenia': 'ARM',
 'aruba': 'ABW',
 'australia': 'AUS',
 'austria': 'AUT',
 'azerbaijan': 'AZE',
 'bahamas': 'BHM',
 'bahrain': 'BHR',
 'bangladesh': 'BGD',
 'barbados': 'BRB',
 'belarus': 'BLR',
 'belgium': 'BEL',
 'belize': 'BLZ',
 'benin': 'BEN',
 'bermuda': 'BMU',
 'bhutan': 'BTN',
 'bolivia': 'BOL',
 'bosnia and herzegovina': 'BIH',
 'botswana': 'BWA',
 'brazil': 'BRA',
 'british virgin islands': 'VGB',
 'brunei': 'BRN',
 'bulgaria': 'BGR',
 'burkina faso': 'BFA',
 'burma': 'MMR',
 'burundi': 'BDI',
 'cabo verde': 'CPV',
 'cambodia': 'KHM',
 'cameroon': 'CMR',
 'canada': 'CAN',
 'cayman islands': 'CYM',
 'central african republic': 'CAF',
 'chad': 'TCD',
 'chile': 'CHL',
 'china': 'CHN',
 'colombia': 'COL',
 'comoros': 'COM',
 'congo democratic': 'COD',
 'Congo republic': 'COG',
 'cook islands': 'COK',
 'costa rica': 'CRI',
 "cote d'ivoire": 'CIV',
 'croatia': 'HRV',
 'cuba': 'CUB',
 'curacao': 'CUW',
 'cyprus': 'CYP',
 'czech republic': 'CZE',
 'denmark': 'DNK',
 'djibouti': 'DJI',
 'dominica': 'DMA',
 'dominican republic': 'DOM',
 'ecuador': 'ECU',
 'egypt': 'EGY',
 'el salvador': 'SLV',
 'equatorial guinea': 'GNQ',
 'eritrea': 'ERI',
 'estonia': 'EST',
 'ethiopia': 'ETH',
 'falkland islands': 'FLK',
 'faroe islands': 'FRO',
 'fiji': 'FJI',
 'finland': 'FIN',
 'france': 'FRA',
 'french polynesia': 'PYF',
 'gabon': 'GAB',
 'gambia, the': 'GMB',
 'georgia': 'GEO',
 'germany': 'DEU',
 'ghana': 'GHA',
 'gibraltar': 'GIB',
 'greece': 'GRC',
 'greenland': 'GRL',
 'grenada': 'GRD',
 'guam': 'GUM',
 'guatemala': 'GTM',
 'guernsey': 'GGY',
 'guinea-bissau': 'GNB',
 'guinea': 'GIN',
 'guyana': 'GUY',
 'haiti': 'HTI',
 'honduras': 'HND',
 'hong kong': 'HKG',
 'hungary': 'HUN',
 'iceland': 'ISL',
 'india': 'IND',
 'indonesia': 'IDN',
 'iran': 'IRN',
 'iraq': 'IRQ',
 'ireland': 'IRL',
 'isle of man': 'IMN',
 'israel': 'ISR',
 'italy': 'ITA',
 'jamaica': 'JAM',
 'japan': 'JPN',
 'jersey': 'JEY',
 'jordan': 'JOR',
 'kazakhstan': 'KAZ',
 'kenya': 'KEN',
 'kiribati': 'KIR',
 'north korea': 'PRK',
 'south korea': 'KOR',
 'kosovo': 'KSV',
 'kuwait': 'KWT',
 'kyrgyzstan': 'KGZ',
 'laos': 'LAO',
 'latvia': 'LVA',
 'lebanon': 'LBN',
 'lesotho': 'LSO',
 'liberia': 'LBR',
 'libya': 'LBY',
 'liechtenstein': 'LIE',
 'lithuania': 'LTU',
 'luxembourg': 'LUX',
 'macau': 'MAC',
 'macedonia': 'MKD',
 'madagascar': 'MDG',
 'malawi': 'MWI',
 'malaysia': 'MYS',
 'maldives': 'MDV',
 'mali': 'MLI',
 'malta': 'MLT',
 'marshall islands': 'MHL',
 'mauritania': 'MRT',
 'mauritius': 'MUS',
 'mexico': 'MEX',
 'micronesia': 'FSM',
 'moldova': 'MDA',
 'monaco': 'MCO',
 'mongolia': 'MNG',
 'montenegro': 'MNE',
 'morocco': 'MAR',
 'mozambique': 'MOZ',
 'namibia': 'NAM',
 'nepal': 'NPL',
 'netherlands': 'NLD',
 'new caledonia': 'NCL',
 'new zealand': 'NZL',
 'nicaragua': 'NIC',
 'nigeria': 'NGA',
 'niger': 'NER',
 'niue': 'NIU',
 'northern mariana islands': 'MNP',
 'norway': 'NOR',
 'oman': 'OMN',
 'pakistan': 'PAK',
 'palau': 'PLW',
 'panama': 'PAN',
 'papua new guinea': 'PNG',
 'paraguay': 'PRY',
 'peru': 'PER',
 'philippines': 'PHL',
 'poland': 'POL',
 'portugal': 'PRT',
 'puerto rico': 'PRI',
 'qatar': 'QAT',
 'romania': 'ROU',
 'russia': 'RUS',
 'rwanda': 'RWA',
 'saint kitts and nevis': 'KNA',
 'saint lucia': 'LCA',
 'saint martin': 'MAF',
 'saint pierre and miquelon': 'SPM',
 'saint vincent and the grenadines': 'VCT',
 'samoa': 'WSM',
 'san marino': 'SMR',
 'sao tome and principe': 'STP',
 'saudi arabia': 'SAU',
 'senegal': 'SEN',
 'serbia': 'SRB',
 'seychelles': 'SYC',
 'sierra leone': 'SLE',
 'singapore': 'SGP',
 'sint maarten': 'SXM',
 'slovakia': 'SVK',
 'slovenia': 'SVN',
 'solomon islands': 'SLB',
 'somalia': 'SOM',
 'south africa': 'ZAF',
 'south sudan': 'SSD',
 'spain': 'ESP',
 'sri lanka': 'LKA',
 'sudan': 'SDN',
 'suriname': 'SUR',
 'swaziland': 'SWZ',
 'sweden': 'SWE',
 'switzerland': 'CHE',
 'syria': 'SYR',
 'taiwan': 'TWN',
 'tajikistan': 'TJK',
 'tanzania': 'TZA',
 'thailand': 'THA',
 'timor-leste': 'TLS',
 'togo': 'TGO',
 'tonga': 'TON',
 'trinidad and tobago': 'TTO',
 'tunisia': 'TUN',
 'turkey': 'TUR',
 'turkmenistan': 'TKM',
 'tuvalu': 'TUV',
 'uganda': 'UGA',
 'ukraine': 'UKR',
 'united arab emirates': 'ARE',
 'united kingdom': 'GBR',
 'united states': 'USA',
 'uruguay': 'URY',
 'uzbekistan': 'UZB',
 'vanuatu': 'VUT',
 'venezuela': 'VEN',
 'vietnam': 'VNM',
 'virgin islands': 'VGB',
 'west bank': 'WBG',
 'yemen': 'YEM',
 'zambia': 'ZMB',
 'zimbabwe': 'ZWE'}


In [None]:
## countries 
from collections import Counter
colorscale = ["#f7fbff", "#ebf3fb", "#deebf7", "#d2e3f3", "#c6dbef", "#b3d2e9", "#9ecae1",
    "#85bcdb", "#6baed6", "#57a0ce", "#4292c6", "#3082be", "#2171b5", "#1361a9",
    "#08519c", "#0b4083", "#08306b"
]


In [None]:
def geoplot(ddf):
    country_with_code, country = {}, {}
    shows_countries = ", ".join(ddf['country'].dropna()).split(", ")
    for c,v in dict(Counter(shows_countries)).items():
        code = ""
        if c.lower() in country_codes:
            code = country_codes[c.lower()]
        country_with_code[code] = v
        country[c] = v

    data = [dict(
            type = 'choropleth',
            locations = list(country_with_code.keys()),
            z = list(country_with_code.values()),
            colorscale = [[0,"rgb(5, 10, 172)"],[0.65,"rgb(40, 60, 190)"],[0.75,"rgb(70, 100, 245)"],\
                        [0.80,"rgb(90, 120, 245)"],[0.9,"rgb(106, 137, 247)"],[1,"rgb(220, 220, 220)"]],
            autocolorscale = False,
            reversescale = True,
            marker = dict(
                line = dict (
                    color = 'gray',
                    width = 0.5
                ) ),
            colorbar = dict(
                autotick = False,
                title = ''),
          ) ]

    layout = dict(
        title = '',
        geo = dict(
            showframe = False,
            showcoastlines = False,
            projection = dict(
                type = 'Mercator'
            )
        )
    )

    fig = dict( data=data, layout=layout )
    iplot( fig, validate=False, filename='d3-world-map' )
    return country


In [None]:
country_vals = geoplot(df)
tabs = Counter(country_vals).most_common(25)


In [None]:
labels = [_[0] for _ in tabs][::-1]
values = [_[1] for _ in tabs][::-1]
trace1 = go.Bar(y=labels, x=values, orientation="h", name="", marker=dict(color="#a678de"))


In [None]:
data = [trace1]
layout = go.Layout(title="Countries with most content", height=700, legend=dict(x=0.1, y=1.1, orientation="h"))
fig = go.Figure(data, layout=layout)
fig.show()

In [None]:
#Distribution of Movie Duration
import plotly.figure_factory as ff
x1 = d2['duration'].fillna(0.0).astype(float)
fig = ff.create_distplot([x1], ['a'], bin_size=0.7, curve_type='normal', colors=["#6ad49b"])
fig.update_layout(title_text='Distplot with Normal Distribution')
fig.show()

In [None]:
#TV Shows with many seasons
col = 'season_count'
vc1 = d1[col].value_counts().reset_index()
vc1 = vc1.rename(columns = {col : "count", "index" : col})
vc1['percent'] = vc1['count'].apply(lambda x : 100*x/sum(vc1['count']))
vc1 = vc1.sort_values(col)

vc1

Unnamed: 0,season_count,count,percent
0,1,1321,67.089893
10,10,3,0.152362
9,11,3,0.152362
13,12,2,0.101574
11,13,2,0.101574
14,14,1,0.050787
12,15,2,0.101574
1,2,304,15.439309
2,3,158,8.024378
3,4,61,3.098019


In [None]:
trace1 = go.Bar(x=vc1[col], y=vc1["count"], name="TV Shows", marker=dict(color="#a678de"))
data = [trace1]
layout = go.Layout(title="Seasons", legend=dict(x=0.1, y=1.1, orientation="h"))
fig = go.Figure(data, layout=layout)
fig.show()

In [None]:
#The ratings of the content 

col = "rating"

vc1 = d1[col].value_counts().reset_index()
vc1 = vc1.rename(columns = {col : "count", "index" : col})
vc1['percent'] = vc1['count'].apply(lambda x : 100*x/sum(vc1['count']))
vc1 = vc1.sort_values(col)
vc1

Unnamed: 0,rating,count,percent
9,G,1,0.050839
7,NR,16,0.813421
10,PG,1,0.050839
8,R,2,0.101678
1,TV-14,660,33.553635
5,TV-G,69,3.50788
0,TV-MA,679,34.519573
2,TV-PG,269,13.675648
3,TV-Y,102,5.185562
4,TV-Y7,100,5.083884


In [None]:
vc2 = d2[col].value_counts().reset_index()
vc2 = vc2.rename(columns = {col : "count", "index" : col})
vc2['percent'] = vc2['count'].apply(lambda x : 100*x/sum(vc2['count']))
vc2 = vc2.sort_values(col)
vc2

Unnamed: 0,rating,count,percent
10,G,36,0.845666
13,NC-17,2,0.046981
5,NR,202,4.745126
6,PG,183,4.298802
4,PG-13,286,6.718346
2,R,506,11.886305
1,TV-14,1038,24.383369
7,TV-G,80,1.879258
0,TV-MA,1348,31.665492
3,TV-PG,432,10.147992


In [None]:
trace1 = go.Bar(x=vc1[col], y=vc1["count"], name="TV Shows", marker=dict(color="#a678de"))
trace2 = go.Bar(x=vc2[col], y=vc2["count"], name="Movies", marker=dict(color="#6ad49b"))
data = [trace1, trace2]
layout = go.Layout(title="Content added over the years", legend=dict(x=0.1, y=1.1, orientation="h"))
fig = go.Figure(data, layout=layout)
fig.show()

In [None]:
#top Categories

col = "listed_in"
categories = ", ".join(d2['listed_in']).split(", ")
counter_list = Counter(categories).most_common(50)
labels = [_[0] for _ in counter_list][::-1]
values = [_[1] for _ in counter_list][::-1]
trace1 = go.Bar(y=labels, x=values, orientation="h", name="TV Shows", marker=dict(color="#a678de"))

data = [trace1]
layout = go.Layout(title="Content added over the years", legend=dict(x=0.1, y=1.1, orientation="h"))
fig = go.Figure(data, layout=layout)
fig.show()

In [None]:
#Top Actors on Netflix with Most Movies
def country_trace(country, flag = "movie"):
    df["from_us"] = df['country'].fillna("").apply(lambda x : 1 if country.lower() in x.lower() else 0)
    small = df[df["from_us"] == 1]
    if flag == "movie":
        small = small[small["duration"] != ""]
    else:
        small = small[small["season_count"] != ""]
    cast = ", ".join(small['cast'].fillna("")).split(", ")
    tags = Counter(cast).most_common(25)
    tags = [_ for _ in tags if "" != _[0]]

    labels, values = [_[0]+"  " for _ in tags], [_[1] for _ in tags]
    trace = go.Bar(y=labels[::-1], x=values[::-1], orientation="h", name="", marker=dict(color="#a678de"))
    return trace


In [None]:
from plotly.subplots import make_subplots
traces = []
titles = ["United States", "","India","", "United Kingdom", "Canada","", "Spain","", "Japan"]
for title in titles:
    if title != "":
        traces.append(country_trace(title))

fig = make_subplots(rows=2, cols=5, subplot_titles=titles)
fig.add_trace(traces[0], 1,1)
fig.add_trace(traces[1], 1,3)
fig.add_trace(traces[2], 1,5)
fig.add_trace(traces[3], 2,1)
fig.add_trace(traces[4], 2,3)
fig.add_trace(traces[5], 2,5)

fig.update_layout(height=1200, showlegend=False)
fig.show()

In [None]:
#Top Actors on Netflix with Most TV Shows
titles = ["United States","", "India"]
for title in titles:
    if title != "":
        traces.append(country_trace(title, flag="tv_shows"))

fig = make_subplots(rows=1, cols=3, subplot_titles=titles)
fig.add_trace(traces[0], 1,1)
fig.add_trace(traces[1], 1,3)

fig.update_layout(height=600, showlegend=False)
fig.show()

In [None]:
small = df[df["type"] == "Movie"]
small = small[small["country"] == "India"]

col = "director"
categories = ", ".join(small[col].fillna("")).split(", ")
counter_list = Counter(categories).most_common(12)
counter_list = [_ for _ in counter_list if _[0] != ""]
labels = [_[0] for _ in counter_list][::-1]
values = [_[1] for _ in counter_list][::-1]
trace1 = go.Bar(y=labels, x=values, orientation="h", name="TV Shows", marker=dict(color="orange"))

data = [trace1]
layout = go.Layout(title="Movie Directors from India with most content", legend=dict(x=0.1, y=1.1, orientation="h"))
fig = go.Figure(data, layout=layout)
fig.show()

In [None]:
small = df[df["type"] == "Movie"]
small = small[small["country"] == "United States"]

col = "director"
categories = ", ".join(small[col].fillna("")).split(", ")
counter_list = Counter(categories).most_common(12)
counter_list = [_ for _ in counter_list if _[0] != ""]
labels = [_[0] for _ in counter_list][::-1]
values = [_[1] for _ in counter_list][::-1]
trace1 = go.Bar(y=labels, x=values, orientation="h", name="TV Shows", marker=dict(color="orange"))

data = [trace1]
layout = go.Layout(title="Movie Directors from India with most content", legend=dict(x=0.1, y=1.1, orientation="h"))
fig = go.Figure(data, layout=layout)
fig.show()

In [None]:
#Standup Comedies 
tag = "jay karas"
df["relevant"] = df['director'].fillna("").apply(lambda x : 1 if tag in x.lower() else 0)
small = df[df["relevant"] == 1]
small[['title', 'release_year', 'listed_in']]

Unnamed: 0,title,release_year,listed_in
620,Anjelah Johnson: Not Fancy,2015,Stand-Up Comedy
773,Christina P: Mother Inferior,2017,Stand-Up Comedy
1580,Ali Wong: Baby Cobra,2016,Stand-Up Comedy
1781,Ali Wong: Hard Knock Wife,2018,Stand-Up Comedy
2485,Bill Burr: You People Are All the Same,2012,Stand-Up Comedy
2613,Adam Devine: Best Time of Our Lives,2019,Stand-Up Comedy
2892,Demetri Martin: The Overthinker,2018,Stand-Up Comedy
3193,Tom Segura: Mostly Stories,2016,Stand-Up Comedy
3237,Bill Burr: Walk Your Way Out,2017,Stand-Up Comedy
3453,Tom Segura: Disgraceful,2018,Stand-Up Comedy


In [None]:
tag = "Stand-Up Comedy"
df["relevant"] = df['listed_in'].fillna("").apply(lambda x : 1 if tag.lower() in x.lower() else 0)
small = df[df["relevant"] == 1]
small[small["country"] == "United States"][["title", "country","release_year"]].head(10)

Unnamed: 0,title,country,release_year
53,Marc Maron: Too Real,United States,2017
113,Def Comedy Jam 25,United States,2017
126,Jeff Dunham: Beside Himself,United States,2019
134,Iliza Shlesinger: Confirmed Kills,United States,2016
181,Jerry Before Seinfeld,United States,2017
202,Sebastian Maniscalco: What's Wrong with People?,United States,2012
210,Cedric the Entertainer: Live from the Ville,United States,2016
260,Norm Macdonald Has a Show,United States,2018
288,Jeff Dunham: Relative Disaster,United States,2017
289,Daniel Sloss: Live Shows,United States,2018


In [None]:
tag = "Stand-Up Comedy"
df["relevant"] = df['listed_in'].fillna("").apply(lambda x : 1 if tag.lower() in x.lower() else 0)
small = df[df["relevant"] == 1]
small[small["country"] == "India"][["title", "country","release_year"]].head(10)

Unnamed: 0,title,country,release_year
2937,Aditi Mittal: Things They Wouldn't Let Me Say,India,2017
5420,Gangs of Hassepur,India,2014


In [None]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,year_added,month_added,season_count,from_us,relevant
0,81145628,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China",2019-09-09,2019,TV-PG,90.0,"Children & Family Movies, Comedies",Before planning an awesome wedding for his gra...,2019.0,9.0,,1,0
1,80117401,Movie,Jandino: Whatever it Takes,,Jandino Asporaat,United Kingdom,2016-09-09,2016,TV-MA,94.0,Stand-Up Comedy,Jandino Asporaat riffs on the challenges of ra...,2016.0,9.0,,0,1
2,70234439,TV Show,Transformers Prime,,"Peter Cullen, Sumalee Montano, Frank Welker, J...",United States,2018-09-08,2013,TV-Y7-FV,,Kids' TV,"With the help of three human allies, the Autob...",2018.0,9.0,1.0,0,0
3,80058654,TV Show,Transformers: Robots in Disguise,,"Will Friedle, Darren Criss, Constance Zimmer, ...",United States,2018-09-08,2016,TV-Y7,,Kids' TV,When a prison ship crash unleashes hundreds of...,2018.0,9.0,1.0,0,0
4,80125979,Movie,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",United States,2017-09-08,2017,TV-14,99.0,Comedies,When nerdy high schooler Dani finally attracts...,2017.0,9.0,,0,0


In [None]:
tag = "Stand-Up Comedy"
df["relevant"] = df['listed_in'].fillna("").apply(lambda x : 1 if tag.lower() in x.lower() else 0)
small = df[df["relevant"] == 1]
small[small["country"] == "India"][["title", "country","release_year"]].head(10)

In [None]:
col = "rating"

vc1 = df[col].value_counts().reset_index()
vc1 = vc1.rename(columns = {col : "count", "index" : col})
vc1['percent'] = vc1['count'].apply(lambda x : 100*x/sum(vc1['count']))
vc1 = vc1.sort_values(col)
vc1
small[small["country"] == "India"][["title", "country","release_year"]].head(10)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,year_added,month_added,season_count,from_us,relevant
2937,80183329,Movie,Aditi Mittal: Things They Wouldn't Let Me Say,Fazila Allana,Aditi Mittal,India,2017-07-18,2017,TV-MA,63.0,Stand-Up Comedy,Trailblazing comic Aditi Mittal mixes topical ...,2017.0,7.0,,1,1
5420,80122235,TV Show,Gangs of Hassepur,,"Ragini Khanna, Mandira Bedi, Tanishaa Mukerji",India,2017-04-15,2014,TV-PG,,"International TV Shows, Stand-Up Comedy & Talk...",Training a keen eye on hot-button political to...,2017.0,4.0,1.0,1,1


In [None]:
d1[df["country"] == "India"]
d1[df["country"] == "India"][["title", "rating","release_year"]].head(10)


Boolean Series key will be reindexed to match DataFrame index.


Boolean Series key will be reindexed to match DataFrame index.



Unnamed: 0,title,rating,release_year
99,Bard of Blood,TV-MA,2019
648,Mighty Little Bheem: Diwali,TV-Y,2019
689,Devlok with Devdutt Pattanaik,TV-PG,2017
789,College Romance,TV-MA,2018
793,Engineering Girls,TV-14,2018
797,Girls Hostel,TV-MA,2018
804,Inmates,TV-MA,2017
1653,Thackeray,TV-MA,2019
1762,Classic Legends,TV-PG,2012
1838,Anjaan: Special Crimes Unit,TV-14,2018


In [None]:
ratings = df[col].value_counts().reset_index()
ratings['rating'] == "TV-14"


elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
Name: rating, dtype: bool

In [None]:
df['title'].value_counts().reset_index()


Unnamed: 0,index,title
0,Tunnel,3
1,Limitless,3
2,Love,3
3,The Silence,3
4,Oh My Ghost,3
...,...,...
6167,Tukaram,1
6168,Feo pero sabroso,1
6169,Little Sister,1
6170,In This Corner of the World,1
