In [26]:
import pandas as pd
import numpy as np

df= pd.read_csv("ott_complete_cleaned_dataset.csv")

# Handling Missing Values 
df.fillna({
    'maximum_days': df['no_of_days_subscribed'],  # If missing, assume full subscription days
    'customer_support_calls': 0,  # If missing, assume no calls made
}, inplace=True)

# Feature: Churn Risk Score
# Higher churn risk score indicates higher probability of churning
df['churn_risk_score'] = (
    ((df['maximum_days'] - df['no_of_days_subscribed']) * 0.3) + 
    (df['customer_support_calls'] * 0.5) + 
    ((df['weekly_mins_watched'] / (df['no_of_days_subscribed'] + 1)) * -0.2)
)

# Feature:Engagement Score
# Measures overall customer engagement in the platform
df['engagement_score'] = (
    (df['weekly_mins_watched'] * 0.4) + 
    (df['videos_watched'] * 0.3) + 
    ((df['minimum_daily_mins'] + df['maximum_daily_mins']) / 2) * 0.3
)

# Feature:Viewing Consistency Score
# Higher means the user is watching at a more consistent time each day
df['consistency_score'] = df['minimum_daily_mins'] / df['maximum_daily_mins']

# Feature:Night Owl Score
# Measures the tendency of watching content late at night
df['night_owl_score'] = df['weekly_max_night_mins'] / df['weekly_mins_watched']

# Feature:Subscription Tenure Category
# Categorizing users based on their subscription length
df['subscription_tenure'] = pd.cut(df['no_of_days_subscribed'], bins=[0, 90, 365, 1095, np.inf], 
                                   labels=['Short-term', 'Mid-term', 'Long-term', 'Very Long-term'])

# Feature:Multi-Screen User Score
# If a user accesses multiple screens, they are likely engaged
df['multi_screen_score'] = df['multi_screen'].map({'Yes': 1, 'No': 0})

# Feature:Email Marketing Impact Score
# Measures how email subscriptions impact engagement
df['email_marketing_score'] = df['mail_subscribed'].map({'Yes': 1, 'No': 0}) * df['weekly_mins_watched']

# Encoding Categorical Variables 
df['gender'] = df['gender'].map({'Male': 1, 'Female': 0})

# Drop unnecessary columns (if needed)
df.drop(columns=['year', 'customer_id'], inplace=True)

# Save the processed dataset
df.to_csv("processed_ott_data.csv", index=False)

print("Advanced Feature Engineering Completed Successfully!")


Advanced Feature Engineering Completed Successfully!


In [32]:
import pandas as pd
import numpy as np

# Load your processed data
df = pd.read_csv("ott_complete_cleaned_dataset.csv")  

# Ensure the columns exist before applying transformations
if 'multi_screen' in df.columns and 'mail_subscribed' in df.columns:
    
    # Multi-Screen Score: Higher weight for users who use multiple screens
    df['multi_screen_score'] = df['multi_screen'].apply(lambda x: 2 if x == 'Yes' else 1)

    # Email Marketing Score: Higher weight for users subscribed to marketing emails
    df['email_marketing_score'] = df['mail_subscribed'].apply(lambda x: 2 if x == 'Yes' else 1)

else:
    print("Columns 'multi_screen' or 'mail_subscribed' not found in the dataset!")

# Verify the new columns
print(df[['multi_screen_score', 'email_marketing_score']].head())

# Save the updated dataset
df.to_csv("processed_ott_data.csv", index=False)  # Uncomment if you want to save




   multi_screen_score  email_marketing_score
0                   1                      1
1                   1                      1
2                   1                      1
3                   1                      1
4                   1                      1


In [34]:
import pandas as pd
import numpy as np

df= pd.read_csv("ott_complete_cleaned_dataset.csv")

# Handling Missing Values 
df.fillna({
    'maximum_days': df['no_of_days_subscribed'],  # If missing, assume full subscription days
    'customer_support_calls': 0,  # If missing, assume no calls made
}, inplace=True)

# Feature: Churn Risk Score
# Higher churn risk score indicates higher probability of churning
df['churn_risk_score'] = (
    ((df['maximum_days'] - df['no_of_days_subscribed']) * 0.3) + 
    (df['customer_support_calls'] * 0.5) + 
    ((df['weekly_mins_watched'] / (df['no_of_days_subscribed'] + 1)) * -0.2)
)

# Feature:Engagement Score
# Measures overall customer engagement in the platform
df['engagement_score'] = (
    (df['weekly_mins_watched'] * 0.4) + 
    (df['videos_watched'] * 0.3) + 
    ((df['minimum_daily_mins'] + df['maximum_daily_mins']) / 2) * 0.3
)

# Feature:Viewing Consistency Score
# Higher means the user is watching at a more consistent time each day
df['consistency_score'] = df['minimum_daily_mins'] / df['maximum_daily_mins']

# Feature:Night Owl Score
# Measures the tendency of watching content late at night
df['night_owl_score'] = df['weekly_max_night_mins'] / df['weekly_mins_watched']

# Feature:Subscription Tenure Category
# Categorizing users based on their subscription length
df['subscription_tenure'] = pd.cut(df['no_of_days_subscribed'], bins=[0, 90, 365, 1095, np.inf], 
                                   labels=['Short-term', 'Mid-term', 'Long-term', 'Very Long-term'])

# Feature:Multi-Screen User Score
# If a user accesses multiple screens, they are likely engaged
df['multi_screen_score'] = df['multi_screen'].map({'Yes': 1, 'No': 0})

# Feature:Email Marketing Impact Score
# Measures how email subscriptions impact engagement
df['email_marketing_score'] = df['mail_subscribed'].map({'Yes': 1, 'No': 0}) * df['weekly_mins_watched']

# Encoding Categorical Variables 
df['gender'] = df['gender'].map({'Male': 1, 'Female': 0})

# Drop unnecessary columns (if needed)
df.drop(columns=['year', 'customer_id'], inplace=True)

# Save the processed dataset
df.to_csv("processed_ott_data.csv", index=False)

print("Advanced Feature Engineering Completed Successfully!")


Advanced Feature Engineering Completed Successfully!


In [36]:
import pandas as pd
import numpy as np

# Load the cleaned dataset (ensure this contains all relevant columns)
df = pd.read_csv("ott_complete_cleaned_dataset.csv")  # Change the filename accordingly

# Load original dataset to restore missing columns (if needed)
#original_df = pd.read_csv("raw_data.csv")  # Change filename if needed

# Restore 'customer_id' and 'year' if they were dropped
if 'customer_id' not in df.columns or 'year' not in df.columns:
    df = df.merge(original_df[['customer_id', 'year']], on='customer_id', how='left')

# Convert text columns to lowercase to ensure uniformity
df['multi_screen'] = df['multi_screen'].astype(str).str.lower()
df['mail_subscribed'] = df['mail_subscribed'].astype(str).str.lower()

# 🔹 Multi-Screen Score (2 = "Yes", 1 = "No")
df['multi_screen_score'] = df['multi_screen'].apply(lambda x: 2 if x == 'yes' else (1 if x == 'no' else np.nan))
df['multi_screen_score'].fillna(1, inplace=True)  # Default to 1 if missing

# 🔹 Email Marketing Score (2 = "Yes", 1 = "No")
df['email_marketing_score'] = df['mail_subscribed'].apply(lambda x: 2 if x == 'yes' else (1 if x == 'no' else np.nan))
df['email_marketing_score'].fillna(1, inplace=True)  # Default to 1 if missing

# 🔹 Subscription Tenure Score (0 to 1 scale)
df['subscription_tenure'] = df['no_of_days_subscribed'] / df['no_of_days_subscribed'].max()

# 🔹 Engagement Score (Higher engagement = Higher score)
df['engagement_score'] = df['weekly_mins_watched'] / df['weekly_mins_watched'].max()

# 🔹 Consistency Score (Measures stability of daily usage)
df['consistency_score'] = 1 - (df['maximum_daily_mins'] - df['minimum_daily_mins']) / df['maximum_daily_mins']
df['consistency_score'].fillna(0.5, inplace=True)  # Default to mid value

# 🔹 Night Owl Score (More night-time watching = Higher score)
df['night_owl_score'] = df['weekly_max_night_mins'] / df['weekly_mins_watched']
df['night_owl_score'].fillna(0, inplace=True)  # Default to 0 if no night-time watching

# 🔹 Churn Risk Score (Based on user activity and support calls)
df['churn_risk_score'] = (df['customer_support_calls'] / df['customer_support_calls'].max()) + (1 - df['engagement_score'])
df['churn_risk_score'] = df['churn_risk_score'] / df['churn_risk_score'].max()  # Normalize

# 🔹 Final Cleaning: Remove any unintended NaN values
df.fillna(0, inplace=True)

# 🔹 Save the final processed dataset
df.to_csv("processed_data.csv", index=False)

# 🔹 Print a quick summary
print(df.head())
print("Advanced Feature Engineering Completed Successfully!")


   year  customer_id  gender  age  no_of_days_subscribed multi_screen  \
0  2020       100198  Female   36                     62           no   
1  2020       100643  Female   39                    149           no   
2  2020       100756  Female   65                    126           no   
3  2020       101595  Female   24                    131           no   
4  2020       101653  Female   40                    191           no   

  mail_subscribed  weekly_mins_watched  minimum_daily_mins  \
0              no               148.35                12.2   
1              no               294.45                 7.7   
2              no                87.30                11.9   
3             yes               321.30                 9.5   
4              no               243.00                10.9   

   maximum_daily_mins  ...  maximum_days  customer_support_calls  churn  \
0               16.81  ...             4                       1      0   
1               33.37  ...            

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['multi_screen_score'].fillna(1, inplace=True)  # Default to 1 if missing
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['email_marketing_score'].fillna(1, inplace=True)  # Default to 1 if missing
The behavior will change in pandas 3.0. This inplace method will never work b