# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from docx import Document

# Read the script
def read_docx(file_path):
    """Return the text of the .docx at *file_path*, one paragraph per line.

    Only the document's paragraphs are read; their texts are joined with
    newline characters in document order.
    """
    paragraphs = Document(file_path).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)

# Each entry of `data` is (video id, retention DataFrame, script text,
# video length in seconds). Files are named "VIDEO <id>.csv" / "VIDEO <id>.docx".
video_lengths = {"B": 765, "C": 833, "D": 822, "E": 654, "F": 713}
data = [
    (
        video_id,
        pd.read_csv(f"VIDEO {video_id}.csv"),
        read_docx(f"VIDEO {video_id}.docx"),
        length_seconds,
    )
    for video_id, length_seconds in video_lengths.items()
]

# Lay the per-video plots out on a grid that is `ncols` wide.
ncols = 2  # Number of columns; adjust to taste
# Ceiling division gives the rows needed; keep at least one row so that
# plt.subplots() still gets a valid grid when `data` is empty.
nrows = max(1, (len(data) + ncols - 1) // ncols)

# squeeze=False makes subplots() always return a 2-D array of Axes, even for
# a 1x1 or single-row grid, so flatten() below works for any ncols/len(data).
fig, axes = plt.subplots(nrows, ncols, figsize=(6*ncols, 3*nrows), squeeze=False)
axes = axes.flatten()

# Rows whose absolute audience retention (%) falls below this value are
# reported as drop-off points.
drop_off_threshold = 40.00

# Per-video frames are collected here and concatenated once after the loop;
# concatenating inside the loop re-copies all earlier rows on every pass.
retention_frames = []
for i, (vdo, df_retention, script_text, video_length) in enumerate(data):
    n_segments = len(df_retention)
    if n_segments == 0:
        # No retention rows: nothing to align or plot, and the per-row
        # duration below would divide by zero.
        print(f"Skipping Video {vdo}: retention data is empty")
        axes[i].axis('off')
        continue

    # Tag every row with the video it belongs to
    df_retention["video_id"] = vdo
    # Seconds of video covered by one retention row
    vdo_retention_position = video_length / n_segments

    # Compute speaking rate from the script
    words = script_text.split()  # Split script into words
    total_words = len(words)
    words_per_second = total_words / video_length
    # Average number of words spoken during one retention row (may be fractional)
    words_per_segment = total_words / n_segments

    print(f"Total video length: {video_length}")
    print(n_segments)
    print("Video retention position is ", vdo_retention_position)
    print(f"Total Words: {total_words}")
    print(f"Words per second: {words_per_second:.2f}")
    print(f"Words per {vdo_retention_position:.2f}s segment: {words_per_segment:.2f}")

    # Split the script into one segment per retention row. Boundaries are
    # computed proportionally with integer arithmetic so that every word is
    # assigned exactly once and the rounding remainder is spread across the
    # script instead of piling up in the final segment.
    boundaries = [k * total_words // n_segments for k in range(n_segments + 1)]
    script_segments = [
        " ".join(words[start:end])
        for start, end in zip(boundaries, boundaries[1:])
    ]

    # Assign segments to retention data
    df_retention["script_segment"] = script_segments
    # NOTE(review): this equals position% * video_length / 100 only when the
    # CSV has exactly 100 rows and the column is on a 0-100 scale; confirm
    # against the export format.
    df_retention["timestamp"] = df_retention["Video position (%)"] * vdo_retention_position

    # Find sections where retention is below the drop-off threshold
    drop_points = df_retention[df_retention['Absolute audience retention (%)'] < drop_off_threshold]
    drop_points_segments = drop_points[['Video position (%)', 'timestamp', 'script_segment', 'Absolute audience retention (%)']]

    print(f"Identified Drop-Off Points for Video {vdo}:")
    print(drop_points_segments)
    print(df_retention.head())  # View first few rows

    # Plot retention against video position for this video
    ax = axes[i]
    ax.plot(df_retention['Video position (%)'], df_retention['Absolute audience retention (%)'], marker='o', linestyle='-', color='b')
    ax.set_title(f"Retention Rate vs Script Segments - Video {vdo}")
    # The x data is the position column, so label it as such
    ax.set_xlabel("Video position (%)")
    ax.set_ylabel("Retention Rate")
    ax.grid(True)

    # Save this video's annotated dataset
    df_retention.to_csv(f"script_{vdo}_with_retention.csv", index=False)

    # Keep this video's rows for the combined dataset
    retention_frames.append(df_retention)

# Build the combined dataset in one pass; fall back to an empty frame when
# no video contributed rows (pd.concat raises on an empty list).
if retention_frames:
    full_retention_data = pd.concat(retention_frames, ignore_index=True)
else:
    full_retention_data = pd.DataFrame()

# Blank out the grid cells that have no video assigned to them
for spare_ax in axes[len(data):]:
    spare_ax.axis('off')

plt.tight_layout()
plt.show()

# Write the combined dataset for all videos (includes the video_id column)
full_retention_data.to_csv("full_retention_data_with_video_id.csv", index=False)
# print(df_retention.head())  # View first few rows
