In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pyodbc
from dateutil.parser import parse
import warnings

# Silence every warning category for the rest of the session
warnings.simplefilter("ignore")

# Notebook-wide settings: fixed NumPy seed and untruncated DataFrame display
np.random.seed(1234)
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [None]:
# SQL Server connection settings used to build the ODBC connection string
server = "localhost"
driver = "ODBC Driver 17 for SQL Server"
trst_conn = "Yes"  # value sent as TRUSTED_CONNECTION

# Source and destination databases (currently the same staging database)
src_database = dest_database = "Netflix_STG"

In [None]:
# Open the connection to the source DB; the trusted connection means Windows Authentication
trusted_conn_src = pyodbc.connect(
    ";".join((
        f"DRIVER={driver}",
        f"SERVER={server}",
        f"DATABASE={src_database}",
        f"TRUSTED_CONNECTION={trst_conn}",
    ))
)

In [None]:
# Pull every row and column of tblNetflix into a DataFrame over the source connection.
# NOTE(review): the three-part name targets database `Netflix`, while the connection
# was opened with DATABASE=src_database ("Netflix_STG") - confirm which one is intended.
read_query='''
SELECT *  FROM Netflix.dbo.tblNetflix
'''
df = pd.read_sql(sql=read_query, con=trusted_conn_src)

In [None]:
# Preview the first 10 rows of the loaded data
df.head(10)

In [None]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

In [None]:
# Summary of the frame: columns, dtypes and non-null counts
df.info()

In [None]:
# Drop rows, then columns, in which every value is missing; then re-inspect the frame
df.dropna(axis="index", how="all", inplace=True)
df.dropna(axis="columns", how="all", inplace=True)
df.info()

In [None]:
# Drop exact duplicate rows in place, keeping the first occurrence (the pandas default)
df.drop_duplicates(inplace=True)
df.shape

In [None]:
# Remove leading and trailing whitespace from the values of 'object' columns.
# An object column is not guaranteed to hold only strings (a database driver can
# return dates, decimals, ... as Python objects); the `.str` accessor would either
# raise on such a column or replace the non-string values with NaN. Stripping only
# real `str` values leaves everything else, including missing values, untouched.
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].map(lambda value: value.strip() if isinstance(value, str) else value)

In [None]:
# Count distinct values per column (missing values are not counted by default)
df.nunique()

In [None]:
# Percentage of missing values in each column, shown as a one-column table
percent_missing = df.isna().sum().mul(100).div(len(df))
missing_value_df = pd.DataFrame({'percent_missing': percent_missing})
missing_value_df

In [None]:
# Preview the first five rows of the current frame
df.head()

In [None]:
# Store the text before the first comma of 'listed_in' in a new column 'listed_in_2'
first_listed = df['listed_in'].str.split(',', n=1).str.get(0)
df['listed_in_2'] = first_listed
df.head()

In [None]:
# Drop the original 'listed_in' column in place
df.drop(columns=['listed_in'], inplace=True)
df.head()

In [None]:
# Give the split result 'listed_in_2' the name of the original column, 'listed_in'
df = df.rename({'listed_in_2': 'listed_in'}, axis='columns')
df.head()

In [None]:
# Keep only records whose 'director' does not contain unrecognized characters,
# i.e. anything other than ASCII letters, digits, commas and whitespace.
# `na=False` makes a missing director count as "no unrecognized character", so those
# rows are kept. Without it the match result is NaN, `NaN == False` is False, and
# every row with a missing director was silently dropped.
# Pass `na=True` instead if such rows should be excluded on purpose.
has_unrecognized = df['director'].str.contains(r'[^0-9a-zA-Z,\s]', na=False)
filtered_df = df[~has_unrecognized]
filtered_df.shape

In [None]:
# Save the filtered data to 'netflix_cleaned.csv' (relative path), without the index column
filtered_df.to_csv('netflix_cleaned.csv', index=False)