In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as widgets
from ipywidgets import interact
import json
import warnings
# Ignore every warning of category FutureWarning from this point on.
# NOTE(review): presumably meant to hide deprecation notices emitted by the
# plotting libraries imported above — confirm which ones before relying on it.
warnings.filterwarnings('ignore', category=FutureWarning)

In [None]:
# Load the US videos CSV into the working frame `df`, then display how many
# missing values each column contains.
csv_path = "files/USvideos.csv"
df = pd.read_csv(csv_path)
df.isna().sum()

In [None]:
# Replace missing descriptions with a fixed placeholder string.
df["description"] = df["description"].fillna("No Description")

# Count fully identical rows, drop them, and confirm that none remain.
duplicates = df.duplicated().sum()
df_cleaned = df.drop_duplicates()
duplicates_after_cleaning = df_cleaned.duplicated().sum()
print(f'Number of Duplicates Before cleaning: {duplicates}\nNumber of Duplicates After cleaning: {duplicates_after_cleaning}')

# Every later cell reads `df`, not `df_cleaned`, so the de-duplicated rows must
# be bound back to `df`; otherwise the duplicates counted above stay in the data.
# Re-running this cell afterwards reports 0 duplicates before cleaning, because
# `df` is then already de-duplicated.
df = df_cleaned.reset_index(drop=True)
df.isnull().sum()

In [None]:
# Read the category lookup. The encoding is given explicitly so the result does
# not depend on the platform's default locale encoding.
with open('files/US_category_id.json', 'r', encoding='utf-8') as file:
    data = json.load(file)
items = data['items']

# Flatten the nested records into columns and give the two columns used below
# their working names; ids are cast to int before being used as the join key.
json_data = pd.json_normalize(items)
json_data = json_data.rename(columns={'snippet.title': 'Genre', 'id': 'category_id'})
json_data['category_id'] = json_data['category_id'].astype('int')

# Attach the Genre label to every video. Only the id -> Genre pair is merged, so
# the other lookup columns never enter `df` and do not have to be dropped by
# name afterwards.
# pd.merge defaults to an inner join: rows of `df` whose category_id has no
# entry in the JSON are dropped here.
df = pd.merge(df, json_data[['category_id', 'Genre']], on='category_id')
df = df.drop(columns=['category_id'])
print(df.isnull().sum())
df.dtypes

In [None]:
# Convert the two date columns to datetime. `trending_date` is parsed with the
# explicit pattern '%y.%d.%m'; `publish_time` is given no pattern (format=None
# is the default), so pandas infers it.
datetime_formats = {'trending_date': '%y.%d.%m', 'publish_time': None}
for column_name, fmt in datetime_formats.items():
    df[column_name] = pd.to_datetime(df[column_name], format=fmt)
df.dtypes

In [None]:
def plot_dist(column):
    """Show a 50-bin histogram with a KDE overlay for one column of ``df``.

    Args:
        column: Label of the column in the notebook-global ``df`` to plot.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    values = df[column]
    sns.histplot(values, bins=50, kde=True, ax=ax)
    ax.set_title(f'Distribution of {column}')
    plt.show()

def plot_joint(x, y, kind='scatter'):
    """Show a seaborn joint plot of two columns of ``df``.

    Args:
        x: Label of the column placed on the x axis.
        y: Label of the column placed on the y axis.
        kind: Plot kind forwarded to ``sns.jointplot`` (default ``'scatter'``).
    """
    joint_options = {'data': df, 'x': x, 'y': y, 'kind': kind}
    sns.jointplot(**joint_options)
    plt.show()

def plot_pair(columns):
    """Draw a seaborn pair plot for the selected columns of the global ``df``.

    Args:
        columns: Iterable of column labels, e.g. the tuple produced by a
            ``SelectMultiple`` widget.

    Returns:
        None. Prints a message and returns early, without plotting, when the
        selection is empty or when a label is not a column of ``df``.
    """
    selected = list(columns)

    # Deselecting every entry in the widget yields an empty selection, which
    # would hand pairplot a frame with no columns; stop with a hint instead.
    if not selected:
        print("Select at least one column to draw a pair plot.")
        return

    # Report the first unknown label and stop, rather than failing on lookup.
    for col in selected:
        if col not in df.columns:
            print(f"Column '{col}' not found in the DataFrame.")
            return

    sns.pairplot(df[selected])
    plt.show()

def plot_rug(column):
    """Show a rug plot of one column of ``df``.

    Args:
        column: Label of the column in the notebook-global ``df`` to plot.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    sns.rugplot(df[column], ax=ax)
    ax.set_title(f'Rug Plot of {column}')
    plt.show()

# The int64/float64 columns of `df`, computed once and offered by every
# selector in this cell.
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns

print("Distribution Plots")
interact(plot_dist, column=numeric_columns);

print("Joint Plots")
interact(plot_joint, x=numeric_columns, y=numeric_columns, kind=['scatter', 'hex']);

print("Pair Plots")
# Multi-select list that starts with 'views' and 'likes' selected.
pair_selector = widgets.SelectMultiple(options=numeric_columns, value=['views', 'likes'])
interact(plot_pair, columns=pair_selector);

print("Rug Plots")
interact(plot_rug, column=numeric_columns);


In [None]:
def plot_bar(y):
    """Show a seaborn bar plot of one column of ``df`` grouped by ``Genre``.

    Args:
        y: Label of the column placed on the y axis.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(data=df, x='Genre', y=y, ax=ax)
    ax.set_title(f'Bar Plot of Genre vs {y}')
    plt.show()

def plot_count(column):
    """Show a seaborn count plot of one column of ``df``.

    Args:
        column: Label of the column whose values are counted along the x axis.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(data=df, x=column, ax=ax)
    ax.set_title(f'Count Plot of {column}')
    plt.show()

def plot_box(y):
    """Show a seaborn box plot of one column of ``df`` grouped by ``Genre``.

    Args:
        y: Label of the column placed on the y axis.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(data=df, x='Genre', y=y, ax=ax)
    ax.set_title(f'Box Plot of Genre vs {y}')
    plt.show()

def plot_violin(y):
    """Show a seaborn violin plot of one column of ``df`` grouped by ``Genre``.

    Args:
        y: Label of the column placed on the y axis.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    sns.violinplot(data=df, x='Genre', y=y, ax=ax)
    ax.set_title(f'Violin Plot of Genre vs {y}')
    plt.show()

# The int64/float64 columns of `df`, offered by every numeric selector in this
# cell. The box plot previously offered int64 columns only; it now matches the
# bar and violin plots.
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns

# Last object/category column of `df`.
# NOTE(review): presumably 'Genre', which the category merge appends last —
# confirm against the merged frame.
count_column = df.select_dtypes(include=['object', 'category']).columns[-1]

print("Bar Plots")
interact(plot_bar, y=numeric_columns);

print("Count Plots")
# Passing a bare string makes interact build a free-text box, so any edit calls
# plot_count with an invalid column name. A one-element list produces a
# dropdown restricted to that column.
interact(plot_count, column=[count_column]);

print("Box Plots")
interact(plot_box, y=numeric_columns);

print("Violin Plots")
interact(plot_violin, y=numeric_columns);


In [None]:
def plot_strip(y):
    """Show a seaborn strip plot of one column of ``df`` grouped by ``Genre``.

    Args:
        y: Label of the column placed on the y axis.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    sns.stripplot(data=df, x='Genre', y=y, ax=ax)
    ax.set_title(f'Strip Plot of Genre vs {y}')
    plt.show()



print("Strip Plots")
# Offer the int64/float64 columns of `df` as the y-axis choices.
strip_columns = df.select_dtypes(include=['int64', 'float64']).columns
interact(plot_strip, y=strip_columns);

In [None]:
def plot_facetgrid(y):
    """Show one histogram of a ``df`` column per ``Genre``, four panels per row.

    Args:
        y: Label of the column whose values are histogrammed in each panel.
    """
    # FacetGrid.map returns the grid itself, so the two calls can be chained.
    sns.FacetGrid(df, col='Genre', col_wrap=4).map(plt.hist, y)
    plt.show()

print("Facet Grid")
# Offer the int64/float64 columns of `df` as the histogram variable.
facet_columns = df.select_dtypes(include=['int64', 'float64']).columns
interact(plot_facetgrid, y=facet_columns);

In [None]:
def plot_reg(x, y):
    """Show a seaborn regression plot of two columns of ``df``.

    Args:
        x: Label of the column placed on the x axis.
        y: Label of the column placed on the y axis.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    sns.regplot(data=df, x=x, y=y, ax=ax)
    ax.set_title(f'Regression Plot of {x} vs {y}')
    plt.show()
    
print("Regression Plot")
# Both axis selectors offer the same int64/float64 columns of `df`.
reg_columns = df.select_dtypes(include=['int64', 'float64']).columns
interact(plot_reg, x=reg_columns, y=reg_columns);
