# Amazon Dataset Cleaning

In [None]:
# imports
import pandas as pd
import numpy as np
from sqlalchemy import create_engine, text

## Import SQL Data Using SQLAlchemy

In [None]:
# Database credentials
# Every setting can be overridden with the matching PG* environment variable
# (the names libpq uses), so the real password does not have to live in the
# notebook. The literals are only fallbacks for a local development database.
import os
from urllib.parse import quote

username = os.environ.get('PGUSER', 'postgres')
password = os.environ.get('PGPASSWORD', 'hellosql')
host = os.environ.get('PGHOST', 'localhost')
port = os.environ.get('PGPORT', '5433')
database = os.environ.get('PGDATABASE', 'da_project_streaming')

# Connection String
# The user name and password are percent-encoded so that characters such as
# '@', ':' or '/' cannot be mistaken for URL delimiters. With the default
# values the resulting string is unchanged.
connection_string = (
    f'postgresql+psycopg2://{quote(username, safe="")}:{quote(password, safe="")}'
    f'@{host}:{port}/{database}'
)

# Engine
engine = create_engine(connection_string)

In [None]:
# select all from amazon database
query = "SELECT * FROM amazon_prime_titles"
# Open the connection in a context manager so it is released as soon as the
# result has been read into the DataFrame; calling engine.connect() inline
# never closed it.
with engine.connect() as connection:
    df_amazon = pd.DataFrame(connection.execute(text(query)))

In [None]:
# Preview the first rows of the loaded table to sanity-check the import
df_amazon.head()

## Check for Duplicates / NULL Values

In [None]:
# Count rows that repeat an earlier row across every column
df_amazon.duplicated().sum() # No duplicated values

In [None]:
# Number of missing values in each column
df_amazon.isna().sum()

In [None]:
# Print the column dtypes and non-null counts
df_amazon.info()

## Clean Data

In [None]:
# drop some null values
# Rows without a rating or a country are removed (not the columns themselves).
# The labels are passed as a list because older pandas releases reject a bare
# string for `subset`.
df_amazon.dropna(subset=['rating', 'country'], inplace=True)
# date_added is mostly null, so the whole column is dropped
df_amazon.drop(columns='date_added', inplace=True)
# fill unknown director and cast with empty str
df_amazon['director'] = df_amazon['director'].fillna('')
df_amazon['cast'] = df_amazon['cast'].fillna('')
# re-check the non-null counts after cleaning
df_amazon.info()

In [None]:
# List the distinct rating labels present in the data
df_amazon['rating'].unique()

In [None]:
# Collapse rating labels that mean the same thing into a single spelling
def fix_ratings(string):
    """Return the canonical rating label for a raw rating value.

    Each group of equivalent raw labels maps to one canonical label.
    A value that matches none of the known aliases is returned unchanged.
    """
    # canonical label -> raw spellings that should collapse into it
    aliases_by_label = {
        'G': ('ALL_AGES', 'ALL'),
        'NR': ('NOT_RATE',),
        'UR': ('UNRATED',),
        '16+': ('16', 'AGES_16_'),
        '18+': ('AGES_18_',),
    }
    for label, aliases in aliases_by_label.items():
        if string in aliases:
            return label
    return string

In [None]:
# run every rating through fix_ratings, then list the labels that remain
df_amazon['rating'] = df_amazon['rating'].map(fix_ratings)
df_amazon['rating'].unique()

In [None]:
# Re-count missing values per column after the cleaning steps
df_amazon.isna().sum()

In [None]:
def new_duration(string: str) -> int:
    """Return the leading number of a duration label as an int.

    The label is split on whitespace and only its first token is
    converted, e.g. ``'113 min'`` -> ``113`` and ``'2 Seasons'`` -> ``2``.
    """
    first_token = string.split(maxsplit=1)[0]
    return int(first_token)

In [None]:
# numeric version of the duration label, parsed by new_duration
df_amazon['int_duration'] = df_amazon['duration'].map(new_duration)
df_amazon['int_duration'].head()

In [None]:
# keep only the rows whose type is 'TV Show'
df_shows = df_amazon.loc[df_amazon['type'].eq('TV Show')]
df_shows.head()

In [None]:
# keep only the rows whose type is 'Movie'
df_movies = df_amazon.loc[df_amazon['type'].eq('Movie')]
df_movies.head()

In [None]:
# export data to csv
from pathlib import Path

# to_csv does not create missing parent folders, so make sure the target
# directory exists before writing
output_path = Path('cleaned_data') / 'amazon_data_clean.csv'
output_path.parent.mkdir(parents=True, exist_ok=True)
# NOTE(review): the DataFrame index is written as the first, unnamed column;
# pass index=False if downstream readers should not see it.
df_amazon.to_csv(output_path)

## Visualizations

In [None]:
# correlation between release year and runtime for movies, then summary statistics
movie_year_vs_duration = df_movies['release_year'].corr(df_movies['int_duration'])
print('Correlation between release year and duration (Movies):', movie_year_vs_duration)
df_movies.describe()

In [None]:
# correlation between release year and duration for TV shows, then summary statistics
# (the label previously said "Movies" although the value comes from df_shows)
print('Correlation between release year and duration (Shows):', df_shows['release_year'].corr(df_shows['int_duration']))
df_shows.describe()

In [None]:
# initial distribution visualizations
# plotting libraries are imported here because they are first needed in this cell
import seaborn as sns
import matplotlib.pyplot as plt

# pairwise plot grid for df_shows, with KDE curves on the diagonal (diag_kind='kde')
sns.pairplot(data=df_shows, diag_kind='kde', height=4, aspect=1)

In [None]:
# pairwise plot grid for df_movies, with KDE curves on the diagonal (diag_kind='kde')
sns.pairplot(data=df_movies, diag_kind='kde', height=4, aspect=1)

In [None]:
# TV show duration plotted against release year
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=df_shows, x='release_year', y='int_duration', ax=ax)
ax.set(
    title='TV Show Duration Over Time',
    xlabel='Release Year',
    ylabel='Duration (Seasons)',
)
plt.show()

In [None]:
# Movie duration plotted against release year
# The title and y-axis label previously repeated the TV-show wording
# ("TV Show ...", "Seasons") although the data plotted is df_movies.
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=df_movies, x='release_year', y='int_duration', ax=ax)
ax.set(
    title='Movie Duration Over Time',
    xlabel='Release Year',
    ylabel='Duration (Minutes)',
)
plt.show()

In [None]:
# TV show duration grouped by rating
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=df_shows, x='rating', y='int_duration', ax=ax)
ax.set(
    title='Duration by Rating (Shows)',
    xlabel='Rating',
    ylabel='Duration (Seasons)',
)
plt.show()

In [None]:
# Movie duration grouped by rating
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=df_movies, x='rating', y='int_duration', ax=ax)
ax.set(
    title='Duration by Rating (Movies)',
    xlabel='Rating',
    ylabel='Duration (Minutes)',
)
plt.show()