In [1]:
#Import libraries for data cleaning
import pandas as pd
import numpy as np
import warnings
from datetime import datetime

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

print("Libraries imported successfully!")

Libraries imported successfully!


In [2]:
# Cell 2: Load raw data
df = pd.read_csv('../data/raw/New_Anime_list.csv', index_col=0)
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')

print(f"Raw dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

Raw dataset shape: (2500, 8)
Columns: ['title', 'genre', 'studio', 'number_of_episodes', 'release_date', 'content_type', 'viewer_reviews', 'source']


In [3]:
# Cell 3: Quick data quality check
print("DATA QUALITY SUMMARY")
print("-" * 30)
print(f"Missing values: {df.isnull().sum().sum()}")
print(f"Zero episodes: {(df['number_of_episodes'] == 0).sum()}")
print(f"Zero reviews: {(df['viewer_reviews'] == 0).sum()}")
print(f"Missing dates: {df['release_date'].isna().sum()}")

DATA QUALITY SUMMARY
------------------------------
Missing values: 17
Zero episodes: 33
Zero reviews: 34
Missing dates: 17


In [4]:
# Dataset segmentation analysis
print("DATASET SEGMENTATION")
print("-" * 30)

# Content type distribution
print("Content types:")
print(df['content_type'].value_counts())

# Temporal analysis
current_year = 2025
df['year'] = df['release_date'].dt.year
print(f"\nTemporal distribution:")
print(f"Past/current ({current_year} or before): {(df['year'] <= current_year).sum()}")
print(f"Future (after {current_year}): {(df['year'] > current_year).sum()}")

# Movie vs Series classification
movies_mask = (df['content_type'] == 'Movie') | \
              ((df['number_of_episodes'] == 1) & (df['content_type'].isin(['Special', 'Ova'])))
series_mask = df['number_of_episodes'] > 1
ambiguous_mask = ~movies_mask & ~series_mask

print(f"\nClassification:")
print(f"Movies: {movies_mask.sum()}")
print(f"Series: {series_mask.sum()}")
print(f"Ambiguous: {ambiguous_mask.sum()}")

DATASET SEGMENTATION
------------------------------
Content types:
content_type
Tv          1755
Movie        316
Ova          181
Ona          127
Special       62
Tv Short      56
Music          2
Unknown        1
Name: count, dtype: int64

Temporal distribution:
Past/current (2025 or before): 2481
Future (after 2025): 2

Classification:
Movies: 455
Series: 2010
Ambiguous: 41


In [5]:
#Create clean datasets

# Step 1: Remove problematic data
df_clean = df[
    (df['number_of_episodes'] > 0) &      # Remove 0 episodes
    (df['release_date'].notna()) &        # Remove missing dates
    (df['viewer_reviews'] > 0)             # Remove 0 reviews
].copy()

# Step 2: Identify movies (including special formats)
is_movie = df_clean['content_type'] == 'Movie'
is_single_episode_special = (
    (df_clean['number_of_episodes'] == 1) & 
    (df_clean['content_type'].isin(['Special', 'Ova']))
)
df_movies = df_clean[is_movie | is_single_episode_special].copy()

# Step 3: Identify series (more than 1 episode)
df_series = df_clean[df_clean['number_of_episodes'] > 1].copy()

# Summary
print("CLEAN DATASETS CREATED")
print("-" * 30)
print(f"Original: {len(df)} rows")
print(f"Clean: {len(df_clean)} rows ({len(df_clean)/len(df)*100:.1f}%)")
print(f"Movies: {len(df_movies)} ({len(df_movies)/len(df_clean)*100:.1f}% of clean)")
print(f"Series: {len(df_series)} ({len(df_series)/len(df_clean)*100:.1f}% of clean)")

# Check overlap
overlap = set(df_movies.index) & set(df_series.index)
if overlap:
    print(f"\nWarning: {len(overlap)} entries appear in both movies and series")

# Save datasets
df_clean.to_csv('../data/processed/anime_clean.csv')
df_movies.to_csv('../data/processed/anime_movies.csv')
df_series.to_csv('../data/processed/anime_series.csv')
print("\nDatasets saved to processed folder")

CLEAN DATASETS CREATED
------------------------------
Original: 2500 rows
Clean: 2460 rows (98.4%)
Movies: 448 (18.2% of clean)
Series: 2009 (81.7% of clean)


Datasets saved to processed folder
