Group 4 Members

* Shyam Akhil Nekkanti - 8982123
* Jun He (Helena) - 8903073
* Zheming Li (Brendan) - 8914152

### Dataset Description

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import json
from scipy.stats import shapiro

# Show every column when a DataFrame is displayed instead of eliding the middle.
pd.set_option('display.max_columns', None)
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases dropped the old names, so prefer the new name and only
# fall back to the legacy one on older installations.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

def load_category_mapping(json_file):
    """Build a ``{category id: title}`` lookup from a category JSON file.

    Args:
        json_file: Path to a JSON document with a top-level ``items`` list
            whose entries each carry an ``id`` and a ``snippet.title``.

    Returns:
        dict mapping every item's ``id`` (converted to ``int``) to its title.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If an expected key is missing from the document.
        ValueError: If the file is not valid JSON or an ``id`` is not
            convertible to ``int``.
    """
    # Read as UTF-8 explicitly: without it, open() falls back to the platform
    # default encoding, which can garble or reject non-ASCII category titles.
    with open(json_file, 'r', encoding='utf-8') as f:
        category_data = json.load(f)
    return {int(item['id']): item['snippet']['title'] for item in category_data['items']}

def load_and_preprocess_data(csv_file, category_mapping):
    """Read the trending-videos CSV and attach derived columns.

    Args:
        csv_file: Path of the CSV file handed to ``pd.read_csv``.
        category_mapping: Lookup applied to ``category_id`` to produce
            ``category_name``; ids absent from the lookup become NaN.

    Returns:
        The loaded DataFrame with ``trending_date`` and ``publish_time``
        parsed as datetimes, plus ``category_name``, ``likes_ratio``,
        ``dislikes_ratio`` and ``comment_ratio`` columns.
    """
    frame = pd.read_csv(csv_file)

    # trending_date is stored as two-digit year, then day, then month.
    frame['trending_date'] = pd.to_datetime(frame['trending_date'], format='%y.%d.%m')
    frame['publish_time'] = pd.to_datetime(frame['publish_time'])

    frame['category_name'] = frame['category_id'].map(category_mapping)

    # Engagement counts relative to views; the +1 keeps the denominator
    # non-zero for rows with zero views.
    views_plus_one = frame['views'] + 1
    ratio_sources = (
        ('likes_ratio', 'likes'),
        ('dislikes_ratio', 'dislikes'),
        ('comment_ratio', 'comment_count'),
    )
    for ratio_column, count_column in ratio_sources:
        frame[ratio_column] = frame[count_column] / views_plus_one

    return frame

# Build the id -> title lookup first; the loader uses it to label each row.
category_mapping = load_category_mapping(json_file='category_id.json')
df = load_and_preprocess_data(
    csv_file='US_youtube_trending_data.csv',
    category_mapping=category_mapping,
)

## Exploratory Data Analysis and Distribution Analysis

def plot_distribution_analysis(data, column, title):
    """Draw a histogram, a box plot and a normal Q-Q plot for one column.

    Args:
        data: Table indexed by column name (``data[column]`` is plotted).
        column: Name of the column to visualise.
        title: Human-readable label used in the three panel titles.

    The three panels share one 15x5 figure, which is shown immediately.
    """
    series = data[column]
    plt.figure(figsize=(15, 5))

    # Left panel: histogram with a kernel density overlay.
    plt.subplot(1, 3, 1)
    sns.histplot(series, kde=True)
    plt.title(f'Histogram of {title}')

    # Middle panel: vertical box plot.
    plt.subplot(1, 3, 2)
    sns.boxplot(y=series)
    plt.title(f'Box Plot of {title}')

    # Right panel: quantiles of the data against a normal distribution.
    plt.subplot(1, 3, 3)
    stats.probplot(series, dist="norm", plot=plt)
    plt.title(f'Q-Q Plot of {title}')

    plt.tight_layout()
    plt.show()

# Analyze distributions for key metrics
metrics = ['views', 'likes', 'dislikes', 'comment_count']

# SciPy documents that the Shapiro-Wilk p-value may not be accurate for
# N > 5000, so larger columns are tested on a random subsample of this size.
SHAPIRO_MAX_N = 5000

for metric in metrics:
    plot_distribution_analysis(df, metric, metric.replace('_', ' ').title())

    # Calculate and display basic statistics
    print(f"\nStatistics for {metric}:")
    print(df[metric].describe())

    # Z-score analysis: count observations more than 3 standard deviations
    # away from the mean.
    z_scores = stats.zscore(df[metric])
    outliers = int((np.abs(z_scores) > 3).sum())
    print(f"Number of outliers (|Z-score| > 3): {outliers}")

    # T-score analysis
    # NOTE(review): every observation is divided by the standard error of the
    # mean, so by construction the printed mean is ~0 and the printed std is
    # sqrt(n) for any input. Confirm whether a one-sample t statistic or a
    # standardised score was intended.
    t_scores = (df[metric] - df[metric].mean()) / (df[metric].std() / np.sqrt(len(df[metric])))
    print(f"T-score mean: {t_scores.mean():.2f}")
    print(f"T-score std: {t_scores.std():.2f}")

    # Shapiro-Wilk test on at most SHAPIRO_MAX_N rows; the fixed random_state
    # keeps the reported p-value reproducible between runs.
    shapiro_sample = df[metric]
    if len(shapiro_sample) > SHAPIRO_MAX_N:
        shapiro_sample = shapiro_sample.sample(n=SHAPIRO_MAX_N, random_state=0)
    statistic, p_value = shapiro(shapiro_sample)
    print(f"Shapiro-Wilk test p-value: {p_value}")
    print("\n" + "="*50 + "\n")

## Analysis Summaries