In [1]:
import pandas as pd

# Load the raw churn dataset.
df = pd.read_csv("ott_churn_dataset.csv")

# Rows whose gender is the "UNKNOWN" placeholder are treated as missing.
missing_before = int((df['gender_filled'] == "UNKNOWN").sum())

# Rule-based imputation. Rules run top to bottom, and each rule only touches
# rows that are still "UNKNOWN" when it runs, so an earlier rule always wins
# over a later one. The conditions read columns that are never modified in
# this cell, so they can be evaluated once up front.
gender_rules = [
    (df['no_of_days_subscribed'] > 365, "Male"),
    (df['no_of_days_subscribed'] < 200, "Female"),
    (df['weekly_mins_watched'] > 600, "Male"),
    (df['weekly_mins_watched'] < 400, "Female"),
    (df['churn_filled'] == "Yes", "Female"),
    (df['churn_filled'] == "No", "Male"),
    (df['mail_subscribed'] == "Yes", "Female"),
]
for rule_mask, gender_label in gender_rules:
    # Recompute on every pass: the previous rule may have filled some rows.
    still_unknown = df['gender_filled'] == "UNKNOWN"
    df.loc[still_unknown & rule_mask, 'gender_filled'] = gender_label

# Fill whatever is left with the most frequent known gender. The guard avoids
# an exception from indexing an empty mode() result when there are no known
# gender values at all (e.g. an empty frame).
known_gender_modes = df.loc[df['gender_filled'] != "UNKNOWN", 'gender_filled'].mode()
if not known_gender_modes.empty:
    mode_gender = known_gender_modes.iloc[0]
    df.loc[df['gender_filled'] == "UNKNOWN", 'gender_filled'] = mode_gender

# Count placeholder rows remaining after imputation.
missing_after = int((df['gender_filled'] == "UNKNOWN").sum())

# Save the imputed dataset.
df.to_csv("ott_churn_cleaned_dataset.csv", index=False)

# Report how many values were filled. The f prefix is required; without it the
# "{missing_before - missing_after}" placeholder is printed literally.
print(f"Gender imputation completed! {missing_before - missing_after} missing values were filled.")

Gender imputation completed! {missing_before - missing_after} missing values were filled.


In [2]:
# Distribution of gender labels after imputation.
gender_distribution = df['gender_filled'].value_counts()
print(gender_distribution)

# Per-column count of null cells, to spot any other missing values.
nulls_per_column = df.isnull().sum()
print(nulls_per_column)

gender_filled
Male      1053
Female     947
Name: count, dtype: int64
year                      0
customer_id               0
gender_filled             0
age                       0
no_of_days_subscribed     0
multi_screen              0
mail_subscribed           0
weekly_mins_watched       0
minimum_daily_mins        0
maximum_daily_mins        0
weekly_max_night_mins     0
videos_watched            0
maximum days_filled       0
customer_support_calls    0
churn_filled              0
dtype: int64


In [9]:
import pandas as pd

# Reload the dataset written by the gender-imputation cell.
df = pd.read_csv("ott_churn_cleaned_dataset.csv")  # adjust the path if the file lives elsewhere

# Map the intermediate "*_filled" column names to their final names.
column_renames = {
    'gender_filled': 'gender',
    'churn_filled': 'churn',
    'maximum days_filled': 'maximum_days',
}
df = df.rename(columns=column_renames)

# Write the renamed frame out in both CSV and Excel form, without the index.
df.to_csv("ott_complete_cleaned_dataset.csv", index=False)
df.to_excel("ott_complete_cleaned_dataset.xlsx", index=False)

print("Columns renamed successfully. Dataset updated!")


Columns renamed successfully. Dataset updated!
