# Import libraries

In [1]:
import pandas as pd
import numpy as np
import warnings
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

In [2]:
class ETL:
    """Extract-transform-load pipeline for the Netflix titles CSV.

    The pipeline reads the CSV, prints a few descriptive statistics, merges the
    country, director, cast, genre and description columns into one cleaned
    ``text`` column per title, and pickles both the cleaned frame and a small
    frame of original columns.

    Parameters
    ----------
    Netflix_file : path of the CSV file to read.
    Filepath : destination of the pickled, cleaned ``title``/``text`` frame.
    Filepath_df : destination of the pickled original
        ``title``/``description``/``listed_in`` frame.
    """

    #columns kept for the text feature, in the order they are merged
    FEATURES = ['title','country','director','cast','listed_in','description']

    #initializing the variables
    def __init__(self,Netflix_file,Filepath,Filepath_df):
        """Read the CSV and prepare the state used by the other methods."""

        #assigning csv file to Netflix_file variable
        self.Netflix_file = Netflix_file

        #assigning an address to save the cleaned data
        self.Filepath = Filepath

        #assigning an address to save the df data frame
        self.Filepath_df = Filepath_df

        #reading the Netflix dataset
        self.data = pd.read_csv(self.Netflix_file)

        #saving the original columns in a different data frame for better result visualization
        #(copied so it is independent of later changes to self.data)
        self.df = self.data[['title','description','listed_in']].copy()

        #defining stopwords used in data cleaning
        self.stopword = set(stopwords.words('english'))

        #kept only for backward compatibility; data_cleaning no longer accumulates into it
        self.strng = ""

    #Data Loading
    def load_data(self):
        """Print the data shape and keep only the columns listed in FEATURES."""

        #printing the shape of our data set
        print('The data has {} data points and {} features \n'.format(self.data.shape[0],self.data.shape[1]))

        #keeping just the pertinent features; copy so later edits do not touch a slice
        self.data = self.data[self.FEATURES].copy()

        print('The data after removing irrelevant features has {} and it contains these {} features. The names of the features are {} \n \n'.format(self.data.shape[0],self.data.shape[1],list(self.data.columns)))

    #Data Analysis
    def analysis(self):
        """Print descriptive statistics for the director, cast, title and description columns."""

        total = self.data.shape[0]

        #Basic stats for directors
        print('The basic statistics for the directors of movies are as follows: \n{}\n\n'.format(self.data['director'].describe()))

        #Most common director names (missing directors are counted too, under the NaN key)
        print('The directors with most movies are:\n{}\n\n'.format(Counter(list(self.data['director'])).most_common(10)))

        #% of movies with listed directors
        print('The percentage of total movies with listed directors:\n{}% \n\n'.format(self.data[~self.data['director'].isnull()].shape[0]/total*100))

        #Basic stats for cast
        print('The basic statistics for the cast are as follows: \n{}\n\n'.format(self.data['cast'].describe()))

        #% of movies with listed cast
        print('The percentage of total movies with listed cast:\n{}% \n\n'.format(self.data[~self.data['cast'].isnull()].shape[0]/total*100))

        #basic stats for movie titles
        print('The basic statisitcs for movie titles:\n{}\n\n'.format(self.data['title'].describe()))

        #movies with missing name
        print('The number of movies with missing movie titles:\n{}\n\n'.format(self.data[self.data['title'].isnull()].shape[0]))

        #movies with missing description
        print('The number of movies with missing movie description:\n{}\n\n'.format(self.data[self.data['description'].isnull()].shape[0]))

    def _clean_description(self,description):
        """Return the description as a list of lowercase alphanumeric tokens without stopwords."""
        tokens = []
        for raw_word in description.split():
            #removing special characters and lowering the word
            word = "".join(ch for ch in raw_word if ch.isalnum()).lower()

            #removing stopwords (and words that were made only of special characters)
            if word and word not in self.stopword:
                tokens.append(word)
        return tokens

    #Data cleaning
    def data_cleaning(self):
        """Replace ``self.data`` with a ``title``/``text`` frame.

        ``text`` holds, separated by single spaces: the countries, the
        directors, the first three cast members (each with inner spaces
        removed), the genres and the cleaned description words, all lowercase.
        """

        #Replacing the null values (e.g. missing cast and director) with a blank string
        #so every column can be processed as text
        self.data = self.data.fillna(" ")
        print('Replacing the null values in director and cast feature with a blank string to create a text feature contaning all relevant data in the form  of a string.\n\n')

        #removing stopwords, terms which are not alphanumeric and lowering text;
        #each row is cleaned independently, so nothing leaks from one row to the next
        self.data['description'] = self.data['description'].map(self._clean_description)

        #keeping just the top three cast names and joining their first names and surnames
        self.data['cast'] = self.data['cast'].map(lambda x : x.replace(' ','').lower().split(',')[:3])

        #separating the listed_in category
        self.data['listed_in'] = self.data['listed_in'].map(lambda x : x.lower().split(','))

        #joining the director surnames and first names
        self.data['director'] = self.data['director'].map(lambda x: x.replace(' ','').lower().split(','))

        #country: split on commas (the spaces are already removed, so splitting on ' ' never split anything)
        self.data['country'] = self.data['country'].map(lambda x : x.replace(' ','').lower().split(','))

        #Form a column text such that it contains all the columns merged in string format
        feature_columns = [col for col in self.data.columns if col != 'title']

        #str.split() with no argument drops empty pieces, so joining its result
        #collapses every run of spaces (from empty cast/director/country values) into one
        self.data['text'] = [
            ' '.join(' '.join(' '.join(tokens) for tokens in row).split())
            for row in zip(*(self.data[col] for col in feature_columns))
        ]
        self.data = self.data[['title','text']]

        print('The remaning data is as follows:\n{}\n\n'.format(self.data.head()))

    @staticmethod
    def _ensure_parent_dir(target):
        """Create the parent directory of ``target`` when ``target`` is a filesystem path."""
        import os

        if isinstance(target,(str,os.PathLike)):
            parent = os.path.dirname(os.fspath(target))
            if parent:
                os.makedirs(parent,exist_ok=True)

    #Save Data method
    def save_data(self):
        """Pickle ``self.data`` to ``Filepath`` and ``self.df`` to ``Filepath_df``."""

        #saving the cleaned data in a pickle file
        self._ensure_parent_dir(self.Filepath)
        self.data.to_pickle(self.Filepath)

        #saving the original columns in a pickle file
        self._ensure_parent_dir(self.Filepath_df)
        self.df.to_pickle(self.Filepath_df)

    #Final method
    def processed(self):
        """Run the whole pipeline: load, analyse, clean and save."""

        print('LOADING DATA...{}\n\n '.format(self.Netflix_file))
        self.load_data()

        print('DATA ANALYSIS...\n\n')
        self.analysis()

        print('CLEANING DATA...\n\n')
        self.data_cleaning()

        print('SAVING DATA IN PICKLE FILE PREPROCESSED {}...\n\n'.format(self.Filepath))
        self.save_data()

        print('\n\n Cleaned data saved to pickle file preprocessed in Pickle folder')

In [3]:
#extract_transform_load = ETL('netflix_titles.csv','Pickle/preprocessed_data','Pickle/original_data')
#extract_transform_load.processed()

LOADING DATA...netflix_titles.csv

 
The data has 7787 data points and 12 features 

The data after removing irrelevant features has 7787 and it contains these 6 features. The names of the features are ['title', 'country', 'director', 'cast', 'listed_in', 'description'] 
 

DATA ANALYSIS...


The basic statistics for the directors of movies are as follows: 
count                       5398
unique                      4049
top       Raúl Campos, Jan Suter
freq                          18
Name: director, dtype: object


The directors with most movies are:
[(nan, 2389), ('Raúl Campos, Jan Suter', 18), ('Marcus Raboy', 16), ('Jay Karas', 14), ('Cathy Garcia-Molina', 13), ('Youssef Chahine', 12), ('Martin Scorsese', 12), ('Jay Chapman', 12), ('Steven Spielberg', 10), ('David Dhawan', 9)]


The percentage of total movies with listed directors:
69.32066264286631% 


The basic statistics for the cast are as follows: 
count                   7069
unique                  6831
top       David Att