In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Plot styling shared by every figure in the notebook.
# `sns.set_theme` is the preferred interface; `sns.set` is only an alias for it.
sns.set_theme(style="whitegrid", palette="muted", font_scale=1.1)
plt.rcParams["figure.figsize"] = (10, 5)


# Directory holding the raw TMDB CSV exports (relative path, resolved against
# the working directory). Kept in one place so both reads stay in sync.
RAW_DATA_DIR = "../data/raw"

movies = pd.read_csv(f"{RAW_DATA_DIR}/tmdb_5000_movies.csv")
credits = pd.read_csv(f"{RAW_DATA_DIR}/tmdb_5000_credits.csv")

# Quick sanity check of what was loaded: (rows, columns) of each frame.
print(f"Movies shape: {movies.shape}")
print(f"Credits shape: {credits.shape}")


Movies shape: (4803, 20)
Credits shape: (4803, 4)


In [None]:

# High-level overview of the `movies` frame: each entry pairs a printed
# heading with the summary table rendered right below it.
overview_sections = (
    # how many columns there are of each dtype
    ("\nTypy zmiennych w zbiorze:", movies.dtypes.value_counts()),
    # descriptive statistics of the numeric columns, one row per column
    ("\nStatystyki opisowe (numeryczne kolumny):", movies.describe().T),
    # summary of the object-typed columns (first five only)
    ("\nKolumny nienumeryczne:", movies.describe(include='object').T.head()),
)

for heading, table in overview_sections:
    print(heading)
    display(table)



Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond‚Äôs past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466


Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [None]:

# Number of missing values per column, largest first; columns without any
# gaps are dropped so the chart only shows the problematic ones.
missing = movies.isnull().sum().sort_values(ascending=False)
missing = missing[missing > 0]

# Explicit Figure/Axes instead of the implicit pyplot state machine, so every
# label below is attached to this specific plot.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=missing.index, y=missing.values, ax=ax)
ax.set_title("Braki danych w kolumnach (filmy)")
# The label text was mis-encoded (UTF-8 bytes decoded with the wrong charset).
ax.set_ylabel("Liczba brakujących wartości")
ax.tick_params(axis="x", rotation=45)
plt.show()

print("\nKolumny z brakami danych:")
display(missing)


In [None]:
# All rows whose `title` occurs more than once; keep=False marks every member
# of a duplicated group, not just the repeats after the first occurrence.
duplicates = movies[movies.duplicated(subset=["title"], keep=False)]

# The messages below were mis-encoded (UTF-8 bytes decoded with the wrong
# charset); the intended emoji and Polish diacritics are restored.
if duplicates.empty:
    print("✅ Brak duplikatów po kolumnie 'title'")
else:
    # len(duplicates) counts every row involved, i.e. all copies of each title.
    print(f"⚠️ Znaleziono {len(duplicates)} duplikatów tytułów filmów:")
    # Sort only the displayed view so rows sharing a title sit next to each
    # other; `duplicates` itself keeps the original row order.
    display(
        duplicates[["title", "id", "release_date"]]
        .sort_values(["title", "release_date"])
    )


In [None]:
# Numeric columns inspected for outliers.
numeric_cols = ["budget", "revenue", "popularity", "runtime"]

for col in numeric_cols:
    # One boxplot per column, drawn on its own explicit Figure/Axes.
    fig, ax = plt.subplots()
    sns.boxplot(x=movies[col], ax=ax)
    ax.set_title(f"Boxplot: {col}")
    plt.show()
    # Release the figure so repeated loop iterations do not accumulate open figures.
    plt.close(fig)

    # IQR rule: values more than 1.5 * IQR below Q1 or above Q3 are flagged.
    Q1 = movies[col].quantile(0.25)
    Q3 = movies[col].quantile(0.75)
    IQR = Q3 - Q1
    lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    # Both comparisons are False for NaN, so missing values are never counted
    # as outliers.
    outliers = movies[(movies[col] < lower) | (movies[col] > upper)]
    # The message text was mis-encoded; Polish diacritics restored.
    print(f"{col}: {len(outliers)} potencjalnych wartości odstających")


In [None]:
def _is_blank(value):
    """Return True for a string that is empty or consists only of whitespace."""
    return isinstance(value, str) and not value.strip()


# Summary counters describing the overall quality of the `movies` frame.
quality_issues = {
    # fully identical rows
    "duplikaty": movies.duplicated().sum(),
    # total number of missing cells across all columns
    "braki_danych": movies.isnull().sum().sum(),
    # cells equal to 0 anywhere in the frame
    "wartosci_zerowe": (movies == 0).sum().sum(),
    # Blank text cells. The previous check matched only the exact string " ",
    # so empty strings and cells holding several spaces/tabs went unnoticed;
    # a single space is still counted, so earlier hits remain included.
    "naniesione_wartosci": movies.apply(lambda column: column.map(_is_blank)).sum().sum(),
}

# The header text was mis-encoded; the intended emoji and diacritics are restored.
print("📋 Podsumowanie jakości danych:")
for k, v in quality_issues.items():
    print(f"- {k}: {v}")


In [None]:
Wnioski końcowe:
Braki danych: głównie w opisowych kolumnach.
Duplikaty: brak.
Wartości odstające: naturalne, głównie w budget i revenue.
Dane wymagają dalszego czyszczenia i przetworzenia kolumn JSON (np. genres, cast).