# Import libraries

In [55]:
import pandas as pd
import numpy as np
import warnings
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

# 1.Data Loading

In [56]:
def load_data(Netflix_file):
    
    #importing amazon product dataset
    data = pd.read_csv(Netflix_file)
    
    #printing the shape of our data set 
    print('The data has {} data points and {} features \n'.format(data.shape[0],data.shape[1]))
    #printing a space for presentation
    
    
    #keeping just the pertinent features
    data = data[['title','country','director','cast','listed_in','description']]
    
    print('The data after removing irrelevant features has {} and it contains these {} features. The names of the features are {} \n \n'.format(data.shape[0],data.shape[1],list(data.columns)))
    
    return data

# 2.Data Analysis

In [63]:
def analysis(data):
    
    #Basic stats for directors
    print('The basic statistics for the directors of movies are as follows: \n{}\n\n'.format(data['director'].describe()))
    
    #Most common director names
    print('The directors with most movies are:\n{}\n\n'.format(Counter(list(data['director'])).most_common(10)))
    
    #% of movies with listed directors
    print('The percentage of total movies with listed directors:\n{}% \n\n'.format(data[~data['director'].isnull()].shape[0]/data.shape[0]*100))

    #Basic stats for cast
    print('The basic statistics for the cast are as follows: \n{}\n\n'.format(data['cast'].describe()))
    
    #% of movies with listed cast
    print('The percentage of total movies with listed cast:\n{}% \n\n'.format(data[~data['cast'].isnull()].shape[0]/data.shape[0]*100))
 
    #movies with missing name
    print('The number of movies with missing movie titles:\n{}\n\n'.format(data[data['title'].isnull()].shape[0]))
    
    #movies with missing description
    print('The number of movies with missing movie description:\n{}\n\n'.format(data[data['description'].isnull()].shape[0]))
    

# 5. Main Execution:

In [64]:
def main(Netflix_file):
    
        print('LOADING DATA...{}\n\n '.format(Netflix_file))
        data = load_data(Netflix_file)

        print('DATA ANALYSIS...\n\n')
        analysis(data)


In [65]:
main('netflix_titles.csv')

LOADING DATA...netflix_titles.csv

 
The data has 7787 data points and 12 features 

The data after removing irrelevant features has 7787 and it contains these 6 features. The names of the features are ['title', 'country', 'director', 'cast', 'listed_in', 'description'] 
 

DATA ANALYSIS...


The basic statistics for the directors of movies are as follows: 
count                       5398
unique                      4049
top       Raúl Campos, Jan Suter
freq                          18
Name: director, dtype: object


The directors with most movies are:
[(nan, 2389), ('Raúl Campos, Jan Suter', 18), ('Marcus Raboy', 16), ('Jay Karas', 14), ('Cathy Garcia-Molina', 13), ('Youssef Chahine', 12), ('Martin Scorsese', 12), ('Jay Chapman', 12), ('Steven Spielberg', 10), ('David Dhawan', 9)]


The percentage of total movies with listed directors:
69.32066264286631% 


The basic statistics for the cast are as follows: 
count                   7069
unique                  6831
top       David Att