# Initial file upload: TMDB_tv_dataset_v3.csv

In [None]:
import pandas as pd

df = pd.read_csv(r"TMDB_tv_dataset_v3.csv")

In [None]:
import matplotlib as plt
%matplotlib inline

import numpy as np
import seaborn as sb
import warnings

# Step 1: Data Preparation



In [None]:
# To have an overview of the dataset
df.describe()

In [None]:
# To have an overview of missing data and types of data

summary = pd.DataFrame({
    'non_null_count': df.notnull().sum(),
    'null_percent': df.isnull().mean() * 100,
    'num_unique': df.nunique(),
    'data_type': df.dtypes,
}).sort_values(by='null_percent', ascending=False)

print(summary)

In [None]:
df.head()

In [None]:
# Show all columns
pd.set_option('display.max_columns', None)
print(df.head(10))

## Look at rows and columns - check for duplicated rows and columns, nulls and unknown values, in order to remove them

In [None]:

# To see columns with duplicate content - Didn't find any!

# Compare every unordered pair of columns exactly once and collect the pairs
# whose contents are identical according to Series.equals
column_names = list(df.columns)
duplicate_content = [
    (left, right)
    for pos, left in enumerate(column_names)
    for right in column_names[pos + 1:]
    if df[left].equals(df[right])
]

print("Columns with duplicate content:", duplicate_content)

In [None]:
# Screen for duplication whole row - by three parameters: id, name, original_name

# ----------------------------
# Part 1 – Checking and displaying duplicates
# ----------------------------

# 1️⃣ Finding all duplicate rows by specific columns
duplicate_rows = df[df.duplicated(subset=['id', 'name', 'original_name'], keep=False)]

# 2️⃣ Sorting the results to make it easy to see the duplicates
duplicate_rows = duplicate_rows.sort_values(by=['id', 'name', 'original_name'])

# 3️⃣ Displaying the first 10 examples of duplicates
print("דוגמאות של כפילויות:")
print(duplicate_rows.head(10))

# 4️⃣ Printing the number of rows Total that are identified as duplicates
num_duplicates = len(duplicate_rows)
print(f"\nנמצאו {num_duplicates} שורות כפולות (כולל כל העותקים).")

In [None]:
# Part 2 – Remove all duplicates

# Combine repeated rows into one - New df copy for this process: df_unique

# Original number of rows
original_rows = len(df)

# ----------------------------
# Remove all duplicates, keeping one instance for each combination of ['id', 'name', 'original_name']
# ----------------------------
df_unique = df.drop_duplicates(subset=['id', 'name', 'original_name']).reset_index(drop=True)

# Calculate the number of non-unique rows removed
removed_rows = original_rows - len(df_unique)

print("\nתוצאות לאחר הסרת כפילויות:")
print(df_unique.head())
print(f"סה\"כ שורות ייחודיות: {len(df_unique)}")
print(f"סה\"כ שורות לא ייחודיות שנמחקו: {removed_rows}")

# New data file name: df_unique

In [None]:
print(f"מספר העמודות: {len(df_unique.columns)}")
print("שמות העמודות:")
print(df_unique.columns)

# Explore target - popularity

In [None]:
df_unique['popularity'].describe()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Plot the de-duplicated frame (df_unique) so the figure describes the same rows
# as the describe() summary of the target above; log1p compresses the long right tail
plt.figure(figsize=(10, 6))
sns.boxplot(x=np.log1p(df_unique['popularity']))
plt.title('Boxplot of log(Popularity) - Horizontal', fontsize=14, weight='bold')
plt.xlabel('log(Popularity + 1)')
plt.show()


In [None]:
# Raw-scale distribution of the target, drawn from the de-duplicated frame
# (df_unique) rather than the raw upload so duplicated shows are not counted twice
plt.figure(figsize=(10, 6))
sns.histplot(df_unique['popularity'], bins=30, kde=True, color='skyblue')
plt.title('Histogram of Popularity', fontsize=14, weight='bold')
plt.xlabel('Popularity')
plt.ylabel('Count')
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Same histogram on a log1p scale, drawn from the de-duplicated frame (df_unique)
# so duplicated shows are not counted twice
plt.figure(figsize=(10, 6))
sns.histplot(np.log1p(df_unique['popularity']), bins=30, kde=True, color='skyblue')
plt.title('Histogram of log(Popularity + 1)', fontsize=14, weight='bold')
plt.xlabel('log(Popularity + 1)')
plt.ylabel('Count')
plt.show()


In [None]:
# Cut the histogram at the 99th percentile so the extreme tail does not flatten the plot.
# Both the cut-off and the plotted values come from the de-duplicated frame (df_unique).
pop_max = df_unique['popularity'].quantile(0.99)
plt.figure(figsize=(10, 6))
sns.histplot(
    df_unique.loc[df_unique['popularity'] <= pop_max, 'popularity'],
    bins=30, kde=True, color='skyblue'
)
plt.title('Histogram of Popularity (99th percentile)', fontsize=14, weight='bold')
plt.xlabel('Popularity')
plt.ylabel('Count')
plt.show()


In [None]:
# Check overall nulls
import missingno as msno
msno.matrix(df_unique)

In [None]:
# Checking nulls by percentage
missing_percent = df_unique.isnull().mean() * 100
print(missing_percent.sort_values(ascending=False))

# Repeated columns with the same information
## Column handling - unify or remove duplicates

In [None]:
# Repeated columns with the same information
# Second - TV show name and original name

print(df_unique[['name', 'original_name']].head(20))

different_rows = df_unique[df_unique['name'] != df_unique['original_name']]
print(different_rows[['name', 'original_name']].head(20))

In [None]:

# Coulmns 'name' and 'original_name' - are mostly the same.
# Check for missing values in 'name' column - without the name the row is useless


missing_name_rows = df_unique[df_unique['name'].isnull()]
missing_name_count = len(missing_name_rows)

print(f"מספר השורות שבהן 'name' חסר: {missing_name_count}")
missing_name_rows[['id', 'name', 'original_name']].head(20)


In [None]:
import numpy as np

# Define a condition for true null in name:
# 1. name is NaN
# 2. name is empty ("")
# 3. name contains only periods, such as "..." or "....."

missing_name_condition = (
    df_unique['name'].isna() |
    (df_unique['name'].astype(str).str.strip() == "") |
    (df_unique['name'].astype(str).str.match(r'^\.*$'))
)

# Retrieve the missing rows according to the condition
missing_name_rows = df_unique[missing_name_condition]

print(f"מספר השורות שבהן name חסר או בעייתי: {len(missing_name_rows)}")
missing_name_rows[['id', 'name', 'original_name']].head(20)


In [None]:
# Number of rows before deletion
rows_before = len(df_unique)
print(f"סה\"כ שורות לפני המחיקה: {rows_before}")

# Condition for detecting invalid name: NaN, empty, or only dots
missing_name_condition = (
    df_unique['name'].isna() |
    (df_unique['name'].astype(str).str.strip() == "") |
    (df_unique['name'].astype(str).str.match(r'^\.*$'))
)

# Saving rows for deletion (for viewing or saving)
missing_name_rows = df_unique[missing_name_condition]
deleted_ids = missing_name_rows['id'].tolist()

# Deleting invalid rows
df_unique = df_unique[~missing_name_condition].reset_index(drop=True)

# Number of rows after Delete
rows_after = len(df_unique)
print(f"סה\"כ שורות אחרי המחיקה: {rows_after}")

# How many rows were deleted
removed_rows = rows_before - rows_after
print(f"סה\"כ שורות שנמחקו: {removed_rows}")

# Display the deleted IDs
print("ID של השורות שנמחקו בגלל name לא תקין:")
print(deleted_ids)



In [None]:
# Merge columns 'name' and 'original_name' into a new column called 'final_name' and delete the two original columns

import re

# --- Function to merge names ---
def merge_names(row):
    """Combine a row's 'name' and 'original_name' into one title.

    Returns None when both are missing, the single available value when only
    one is present, the shared value when both are equal, and otherwise
    "<name> / <original_name>".
    """
    primary, native = row['name'], row['original_name']
    primary_missing = pd.isna(primary)
    native_missing = pd.isna(native)

    if primary_missing:
        # Fall back to the original title, or to None when nothing is available
        return None if native_missing else native
    if native_missing or primary == native:
        return primary
    return f"{primary} / {native}"

# --- Create the merged column ---
# Row-wise apply: merge_names receives each row and returns the combined title
df_unique['final_name'] = df_unique.apply(merge_names, axis=1)

# --- Clean up and rebuild final_name ---
df_unique['final_name'] = df_unique['final_name'].str.strip()                             # strip leading/trailing whitespace
df_unique['final_name'] = df_unique['final_name'].str.lower()                             # convert to lowercase
# NOTE(review): the pattern below also removes the "/" that merge_names inserts
# between two differing titles, leaving only whitespace between them - confirm this is intended
df_unique['final_name'] = df_unique['final_name'].str.replace(r'[^\w\s]', '', regex=True)  # remove every character that is not a word character or whitespace

# --- Convert the column to pandas string type ---
df_unique['final_name'] = df_unique['final_name'].astype('string')

# --- Delete the original columns ---
# final_name now carries the title information, so both source columns are dropped
df_unique = df_unique.drop(columns=['name', 'original_name'])

# --- Check the first 20 rows ---
print(df_unique[['final_name']].head(20))

# --- Check the data type ---
print("\nData type of final_name:", df_unique['final_name'].dtype)


# Repeated columns with the same information


In [None]:
# Repeated columns with the same information
# First - take a look at language

print(df_unique[['original_language', 'languages', 'origin_country',
                 'spoken_languages', 'production_countries']].head())

In [None]:
# Check language uniques
language_columns = ['original_language', 'languages', 'origin_country',
                    'spoken_languages', 'production_countries']

for col in language_columns:
    print(f"\n--- {col} ---")
    print(df_unique[col].unique())

In [None]:
### Remove column!!
# Leave only one language column - "original_language", and remove the others.... Dont' remove the 'production_countries'

cols_to_drop = ['languages', 'origin_country', 'spoken_languages']
df_unique = df_unique.drop(columns=cols_to_drop)
print(df_unique.columns)

print(df_unique.shape)


# Columns with more than 50% missing values

In [None]:
# Columns with more then 50% missing data - nulls.

# Calculate the percentage of missing rows in each column
missing_percent = df_unique.isnull().mean() * 100

# Select columns with 50% or more missing
columns_50pct_or_more_missing = missing_percent[missing_percent >= 50].index.tolist()

# Display the result
print("עמודות עם יותר או שווה ל-50% ערכים חסרים:")
print(columns_50pct_or_more_missing)


In [None]:
# Show only the first 10 rows of columns with more than 50% missing
# Calculate the percentage of missing rows in each column

missing_percent = df_unique[columns_50pct_or_more_missing].isnull().mean() * 100

# Create a new DataFrame with custom headers: column name + percentage of NULLs
df_to_show = df_unique[columns_50pct_or_more_missing].head(10).copy()
df_to_show.columns = [f"{col} ({missing_percent[col]:.1f}% NULLs)" for col in df_to_show.columns]

# Show the first 10 rows with the new headers
print("\n10 השורות הראשונות מהעמודות עם יותר או שווה ל-50% ערכים חסרים (כולל אחוז NULLs):")
print(df_to_show)



In [None]:
# Remove columns with more then 50% nulls, leave the 'production_countries' for furter analysis .


# Calculate the percentage of missing rows in each column
missing_percent = df_unique.isnull().mean() * 100

# Select columns with 50% or more missing
columns_50pct_or_more_missing = missing_percent[missing_percent >= 50].index.tolist()

# Remove 'production_countries' from the list (do not delete it)
columns_to_drop = [col for col in columns_50pct_or_more_missing if col != 'production_countries']

# Drop columns
df_unique = df_unique.drop(columns=columns_to_drop)

# Display remaining columns
print("העמודות אחרי המחיקה (כולל production_countries):")
print(df_unique.columns.tolist())


In [None]:
# Check overall nulls
import missingno as msno
msno.matrix(df_unique)

print(df_unique.columns)

print(df_unique.shape)

# Deal with "dates".

In [None]:
# Deal with "dates".

# --- Convert date columns to datetime ---
df_unique['first_air_date'] = pd.to_datetime(df_unique['first_air_date'], errors='coerce')
df_unique['last_air_date'] = pd.to_datetime(df_unique['last_air_date'], errors='coerce')

# --- Extract useful features ---

# Year, month, day of the first date
df_unique['first_year'] = df_unique['first_air_date'].dt.year
df_unique['first_month'] = df_unique['first_air_date'].dt.month
df_unique['first_day'] = df_unique['first_air_date'].dt.day

# Year, month, day of the last date
df_unique['last_year'] = df_unique['last_air_date'].dt.year
df_unique['last_month'] = df_unique['last_air_date'].dt.month
df_unique['last_day'] = df_unique['last_air_date'].dt.day


In [None]:
# For dates - make a new column
# production_length
# and leave only first_year and last_year


import numpy as np

# Missing dates were coerced to NaT above, so the extracted years are NaN for them.
# production_length is the difference in calendar years; it is NaN whenever
# either year is missing.
df_unique['production_length'] = df_unique['last_year'] - df_unique['first_year']

# Replacing all NaNs in production_length with -1 (placeholder for "unknown")
df_unique['production_length'] = df_unique['production_length'].fillna(-1)

# Likewise use -1 as the placeholder for missing values in the two year columns
df_unique['first_year'] = df_unique['first_year'].fillna(-1)
df_unique['last_year']  = df_unique['last_year'].fillna(-1)

# Deleting unnecessary columns: only first_year, last_year and production_length are kept.
# The membership filter lets this cell be re-run after the columns are already gone.
cols_to_drop = ['first_month', 'first_day', 'last_month', 'last_day', 'first_air_date', 'last_air_date']
df_unique = df_unique.drop(columns=[col for col in cols_to_drop if col in df_unique.columns])

# check
df_unique[['first_year', 'last_year', 'production_length']].head(10)




In [None]:
msno.matrix(df_unique)
df_unique.head(3)

# Remove uninformative column 'poster_path'

In [None]:

# A column that remains and is not informative, we will remove it
# Deleting the column 'poster_path'
df_unique = df_unique.drop(columns=['poster_path'])

# Displaying the list of columns after deletion
print("רשימת העמודות אחרי מחיקה:")
print(df_unique.columns.tolist())


In [None]:

summary = pd.DataFrame({
    'non_null_count': df_unique.notnull().sum(),
    'null_percent': df_unique.isnull().mean() * 100,
    'num_unique': df_unique.nunique(),
    'data_type': df_unique.dtypes,
}).sort_values(by='null_percent', ascending=False)

print(summary)

# Try to narrow down the unique values

In [None]:
# Unique values ​​in each column
# List of columns

columns_to_check = ['type', 'in_production', 'genres', 'status', 'production_countries', 'networks', 'overview']

# Ordered printing of unique values ​​with column name
for col in columns_to_check:
    print(f"ערכים ייחודיים בעמודה '{col}':")
    print(df_unique[col].unique())
    print("\n" + "-"*50 + "\n")

# Number of unique values
print("Number of unique values in 'type':", df_unique['type'].nunique())
print("Number of unique values in 'in_production':", df_unique['in_production'].nunique())
print("Number of unique values in 'genres':", df_unique['genres'].nunique())
print("Number of unique values in 'status':", df_unique['status'].nunique())
print("Number of unique values in 'production_countries':", df_unique['production_countries'].nunique())
print("Number of unique values in 'networks':", df_unique['networks'].nunique())

In [None]:
# Show all columns
pd.set_option('display.max_columns', None)
print(df_unique.head(10))

In [None]:
# clean text for editing: remove spaces, capital letters...
# Select all text columns
text_cols = df_unique.select_dtypes(include=['object', 'string']).columns.tolist()
print("עמודות טקסטואליות לניקוי:", text_cols)

# String cleaning function
def clean_text(val):
    """Normalise one text cell for later comparison.

    Missing values are returned untouched. Anything else is converted to a
    string, trimmed, lowercased, and re-joined on commas with the whitespace
    around each comma-separated item removed.
    """
    if pd.isnull(val):
        return val
    lowered = str(val).strip().lower()
    # Trim every comma-separated item so "a , b" and "a,b" become the same value
    return ','.join(map(str.strip, lowered.split(',')))

# Remove extra commas and spaces and convert to lowercase for all text columns
for col in text_cols:
    df_unique[col] = df_unique[col].apply(clean_text)

# Example display of the first 10 rows after cleaning
print(df_unique[text_cols].head(10))

In [None]:
# Look for uniques in columns: 'production_countries' and 'networks'
# Unique values ​​in each column

print(df_unique['production_countries'].unique())
print(df_unique['networks'].unique())

# Number of unique values

print("Number of unique values in 'production_countries':", df_unique['production_countries'].nunique())
print("Number of unique values in 'networks':", df_unique['networks'].nunique())


In [None]:
# Look for uniques in column: 'episode_run_time'

df_unique['episode_run_time'].unique()

In [None]:
# Max and Min uniques values in 'episode_run_time'

unique_times = df_unique['episode_run_time'].dropna().unique()
min_time = unique_times.min() if hasattr(unique_times, 'min') else min(unique_times)
max_time = unique_times.max() if hasattr(unique_times, 'max') else max(unique_times)

print(f"Minimum episode run time: {min_time} minutes")
print(f"Maximum episode run time: {max_time} minutes")

# zero values count
num_zeros = (df_unique['episode_run_time'] == 0).sum()
print(f"Number of 0 values in episode_run_time: {num_zeros}")

In [None]:
# Check df
summary = pd.DataFrame({
    'null_percent': df_unique.isnull().mean() * 100,
    'num_unique': df_unique.nunique(),
    'data_type': df_unique.dtypes
}).sort_values(by='null_percent', ascending=False)

print(summary)

## Changing strings/objects into category

In [None]:
# Lets start with the easy columns that have short limitied uniques
categorical_cols = ['original_language', 'type', 'status', 'production_countries', 'networks', 'genres']
for col in categorical_cols:
    df_unique[col] = df_unique[col].astype('category')


In [None]:
summary = pd.DataFrame({
    'null_percent': df_unique.isnull().mean() * 100,
    'num_unique': df_unique.nunique(),
    'data_type': df_unique.dtypes
}).sort_values(by='null_percent', ascending=False)

print(summary)

In [None]:
df_unique['production_countries'].value_counts()

In [None]:
# A better overview of 'production_countries' uniques

# Allows printing all values ​​in columns without truncation
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Count the number of occurrences of each value in the production_countries column
value_counts = df_unique['production_countries'].value_counts()

# Turns this into a DataFrame for neat display
value_counts_df = value_counts.reset_index()
value_counts_df.columns = ['production_countries', 'count']

# Displays all values ​​with the number of occurrences
print(value_counts_df)

# When finished, you can revert to the default settings
pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')


In [None]:
# Uniques in 'type'
df_unique['type'].value_counts()


In [None]:
# Uniques in 'status'

df_unique['status'].value_counts()

In [None]:
# A better overview of 'original_language' uniques

# Allows printing all values ​​in columns without truncation
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Displays all values ​​and frequencies
print(df_unique['original_language'].value_counts())

# When finished, you can return to the default settings
# pd.reset_option('display.max_rows')
# pd.reset_option('display.max_columns')

In [None]:
# Unit 'original_language' uniques that are under 10 counts

# Languages ​​under 10 became OTHER
# For original_language


# 1️⃣ Clean up spaces and extra characters in the language column
# Normalise only the non-missing entries. Casting the whole column with astype(str)
# would turn NaN into the literal string 'nan', which would then be counted
# (and possibly relabelled) as if it were a real language.
languages = df_unique['original_language'].astype(object)
languages = languages.where(
    languages.isna(),                               # keep missing values as they are
    languages.astype(str).str.strip().str.lower()   # trimmed, lowercase text elsewhere
)

# Step 2: count all values in the column (value_counts skips NaN by default)
language_counts = languages.value_counts()

# Step 3: identify languages that appear less than 10 times
rare_languages = language_counts[language_counts < 10].index

# Step 4: replace rare values with 'other' directly in the existing column.
# Vectorised isin/where instead of a per-row lambda; NaN is never in the rare set,
# so missing values stay missing.
df_unique['original_language'] = languages.where(~languages.isin(rare_languages), 'other')

# Step 5: display the results
print("התפלגות השפות לאחר איחוד ערכים נדירים:")
print(df_unique['original_language'].value_counts().sort_values(ascending=False))


In [None]:
# A better overview of 'overview' uniques

df_unique['overview'].head()

In [None]:
# Clean text - spaces and other - in 'overview. For further editing
df_unique['overview'] = df_unique['overview'].apply(
    lambda x: x.strip().lower() if isinstance(x, str) else x
)
print(df_unique['overview'].head(20))

In [None]:
# Check results
df_unique['networks'].value_counts()
print(df_unique['networks'].unique())

In [None]:
# # A better overview of 'networks' uniques

# Allows printing all values ​​in columns without truncation

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Count the number of occurrences of each value in the networks column
value_counts = df_unique['networks'].value_counts()

# Convert to a DataFrame with columns: network name and number of occurrences
value_counts_df = value_counts.reset_index()
value_counts_df.columns = ['network', 'count']

# Display all values ​​with number of occurrences
print(value_counts_df)

# When finished, you can revert to the default settings
pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')


In [None]:
# Unit 'networks' uniques that are under 10 counts


# 1️⃣ Count the number of occurrences of each network
network_counts = df_unique['networks'].value_counts()

# 2️⃣ Identify the values ​​that appear less than 10 times
rare_networks = network_counts[network_counts < 10].index.tolist()

# 3️⃣ Replace the rare values ​​with 'OTHER'
df_unique['networks'] = df_unique['networks'].apply(lambda x: 'OTHER' if x in rare_networks else x)

# 4️⃣ Check: Frequencies after merging
print("שכיחויות הערכים לאחר איחוד נדירים ל-OTHER:")
print(df_unique['networks'].value_counts())


In [None]:
# Explore 'genres' uniques

df_unique['genres'].value_counts()
print(df_unique['genres'].unique())

In [None]:
# explore 'genres' unique and clean spaces... for futher editing
# Clean and merge genres and uniform description of words
# Function to clean and sort genres

def clean_genres(val):
    """Put a comma-separated genre string into a canonical form.

    Each item is trimmed and lowercased, and the items are sorted
    alphabetically, so the same set of genres always yields the same string
    regardless of the original order. Missing values and blank strings are
    returned unchanged.
    """
    if pd.isna(val) or not val.strip():
        return val
    return ','.join(sorted(part.strip().lower() for part in val.split(',')))

# Apply the function to the entire existing column
df_unique['genres'] = df_unique['genres'].apply(clean_genres)

# Show the frequencies of the values ​​after cleaning
print(df_unique['genres'].value_counts().head(100))

# Summary: How many unique values ​​are there now In column
num_unique_genres = df_unique['genres'].nunique()
print(f"\nסה\"כ ערכים ייחודיים בעמודה 'genres' לאחר הניקוי: {num_unique_genres}")


In [None]:
# Unit 'genres' uniques that are under 10 counts

# 1️⃣ Count all values ​​in the genres column
counts = df_unique['genres'].value_counts()

# 2️⃣ Identify values ​​that appear less than 10 times
to_replace = counts[counts < 10].index.tolist()

# 3️⃣ Replace rare values ​​with 'other'
df_unique['genres'] = df_unique['genres'].apply(
    lambda x: 'other' if x in to_replace else x
)

# 4️⃣ Test: Show the distribution after the union
value_counts = df_unique['genres'].value_counts()
print(value_counts)

# 5️⃣ Show the summary of the number of unique values ​​after the union
print(f"\nסה\"כ ערכים ייחודיים בעמודה 'genres' לאחר האיחוד: {df_unique['genres'].nunique()}")


In [None]:
# To have an overview of missing data and types of data

summary = pd.DataFrame({
    'non_null_count': df_unique.notnull().sum(),
    'null_percent': df_unique.isnull().mean() * 100,
    'num_unique': df_unique.nunique(),
    'data_type': df_unique.dtypes,
}).sort_values(by='null_percent', ascending=False)

print(summary)

In [None]:
# List of columns to keep as object

keep_object = ['overview', 'final_name']

# Convert all other object columns to category
for col in df_unique.select_dtypes(include=['object']).columns:
    if col not in keep_object:
        df_unique[col] = df_unique[col].astype('category')

# Test
# Create a comprehensive summary table for each column
summary = pd.DataFrame({
    'non_null_count': df_unique.notnull().sum(),
    'null_percent': df_unique.isnull().mean() * 100,
    'num_unique': df_unique.nunique(),
    'data_type': df_unique.dtypes,
}).sort_values(by='null_percent', ascending=False)

print(summary)


# EDA

In [None]:
!pip install autoviz

In [None]:
!pip install textblob
!python -m textblob.download_corpora


### EDA reports

In [None]:
# AutoViz report - Popularity as Target

%matplotlib inline
from autoviz.AutoViz_Class import AutoViz_Class

AV = AutoViz_Class()

df_auto = AV.AutoViz(
    filename="",
    dfte=df_unique,
    depVar="popularity",
    sep=",",
    chart_format="png",
    max_rows_analyzed=30000,
    verbose=2
)


In [None]:
# ydata-profiling report

!pip install ydata-profiling --quiet

from ydata_profiling import ProfileReport
import pandas as pd

In [None]:
# Create a report
profile = ProfileReport(df_unique, title="EDA Report - Dataset Overview", explorative=True)

# For direct display in notebook (Jupyter / Colab)
profile.to_notebook_iframe()

# Or save to HTML file
profile.to_file("EDA_report.html")

print("✅ דו\"ח EDA נוצר ונשמר כ-'EDA_report.html'")


In [None]:
profile.to_file("EDA_report.html")

In [None]:

from google.colab import files
files.download("EDA_report.html")

# Data cleansing

In [None]:
# Dealing with missing values
summary = pd.DataFrame({
    'non_null_count': df_unique.notnull().sum(),
    'null_percent': df_unique.isnull().mean() * 100,
    'num_unique': df_unique.nunique(),
    'data_type': df_unique.dtypes,
}).sort_values(by='null_percent', ascending=False)

print(summary)

In [None]:
!pip install fancyimpute

In [None]:
# MICE - impute the remaining missing numeric values.
# 'overview' itself is filled with the placeholder "unknown"; its TF-IDF features
# are only used as extra predictors for the imputer and are dropped afterwards.

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from fancyimpute import IterativeImputer

# -----------------------------
# 0. Filling missing values in overview
# -----------------------------
# fill NaN
df_unique['overview'] = df_unique['overview'].fillna("unknown")
# Remove spaces and convert to lowercase
df_unique['overview'] = df_unique['overview'].apply(lambda x: x.strip().lower() if isinstance(x, str) else x)
# Fill empty strings with "unknown"
df_unique.loc[df_unique['overview'] == "", 'overview'] = "unknown"

# -----------------------------
# 1. Creating TF-IDF from overview (for MICE purpose only)
# -----------------------------
vectorizer = TfidfVectorizer(max_features=100)
overview_tfidf = vectorizer.fit_transform(df_unique['overview']).toarray()
# Share df_unique's index so the column-wise concat below lines rows up explicitly
overview_df = pd.DataFrame(
    overview_tfidf,
    columns=[f"word_{i}" for i in range(overview_tfidf.shape[1])],
    index=df_unique.index,
)

# -----------------------------
# 2. Convert categories to numeric
# -----------------------------
# Encode on a working copy: df_unique keeps its original labels, so re-running this
# cell starts from the same state instead of label-encoding already-encoded integers.
df_encoded = df_unique.copy()
categorical_cols = ['genres', 'networks', 'production_countries', 'type', 'status', 'adult']
le_dict = {}

for col in categorical_cols:
    # isinstance check replaces the deprecated pd.api.types.is_categorical_dtype
    if isinstance(df_encoded[col].dtype, pd.CategoricalDtype):
        # add_categories raises if the category already exists, so add it only once
        if "Unknown" not in df_encoded[col].cat.categories:
            df_encoded[col] = df_encoded[col].cat.add_categories(["Unknown"])
    df_encoded[col] = df_encoded[col].fillna("Unknown")

    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])
    le_dict[col] = le  # kept so the integer codes can be mapped back to labels in step 5

# -----------------------------
# 3. Preparing data for MICE (numeric only + TF-IDF)
#    id is excluded so the identifier is not used as a predictor
# -----------------------------
numeric_cols = df_encoded.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_cols = [col for col in numeric_cols if col != 'id']

df_for_mice = pd.concat([df_encoded[numeric_cols], overview_df], axis=1)

# -----------------------------
# 4. Running MICE
# -----------------------------
imp = IterativeImputer(max_iter=10, random_state=0)
df_filled_array = imp.fit_transform(df_for_mice)
df_filled = pd.DataFrame(df_filled_array, columns=df_for_mice.columns, index=df_for_mice.index)

# -----------------------------
# 5. Return categories to text mode
# -----------------------------
for col in categorical_cols:
    le = le_dict[col]
    # The imputer returns floats; round back to the integer codes before decoding
    df_filled[col] = df_filled[col].round().astype(int)
    df_filled[col] = le.inverse_transform(df_filled[col])

# -----------------------------
# 6. Return the original overview (already filled with completions)
# -----------------------------
df_filled['overview'] = df_unique['overview']

# -----------------------------
# 7. Delete TF-IDF columns
# -----------------------------
tfidf_cols = [col for col in df_filled.columns if col.startswith("word_")]
df_filled = df_filled.drop(columns=tfidf_cols)

# -----------------------------
# 8. Return all original non-numeric columns that did not pass through MICE
#    (taken from the encoded copy, where the label-encoded columns are numeric
#    and therefore not overwritten here)
# -----------------------------
non_numeric_cols = df_encoded.select_dtypes(exclude=['int64', 'float64']).columns.tolist()
for col in non_numeric_cols:
    if col != 'overview':
        df_filled[col] = df_encoded[col]

# -----------------------------
# Return id column as is
# -----------------------------
df_filled['id'] = df_encoded['id']

# -----------------------------
# 9. check
# -----------------------------
print("מספר ערכים חסרים אחרי כל התהליך:")
print(df_filled.isnull().sum())


# new dataframe - df_filled

In [None]:
# File check
df_filled.info()

In [None]:


from google.colab import files

df_filled.to_csv('df_new_GitHub.csv', index=False)
files.download('df_new_GitHub.csv')



# Continue in second file

In [None]:
# For GitHub upload
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import nbformat

# Path to the current notebook (change it to match your own location)
path = '/content/drive/MyDrive/Project TV show popularity/advance project/More advanced project/For GitHub/Upload to GitHub/GitHub_1_TV_show_popularity_part_one_upload.ipynb'

# Read the notebook as nbformat version 4 so it can be edited
nb = nbformat.read(path, as_version=4)

# Remove the notebook-level metadata entries treated as problematic, if present
for problematic_key in ("widgets", "colab", "celltoolbar"):
    nb.metadata.pop(problematic_key, None)

# Save the cleaned notebook back to the same file
nbformat.write(nb, path)
print("✅ Notebook cleaned and ready for GitHub!")