In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import logging
import os
from PIL import Image
from moviepy.editor import ImageSequenceClip, AudioFileClip, concatenate_videoclips
import numpy as np
import io
from matplotlib.animation import FuncAnimation

def _figure_to_array(fig):
    """Render a Matplotlib figure to an in-memory PNG and return it as a NumPy array."""
    buf = io.BytesIO()
    fig.savefig(buf, format='png')
    buf.seek(0)
    # np.array() forces PIL to decode the image while the buffer is still alive.
    return np.array(Image.open(buf))


def generate_animated_frames(data):
    """Render the dataset as a sequence of image frames for video assembly.

    Depending on which columns are present, the result contains, in order:
    a progressively drawn time-series plot (one frame per row), a bar chart
    stepping through each category's values, a correlation heatmap, and one
    histogram per numeric column.

    Args:
        data: pandas DataFrame to visualise. Column labels are matched
            case-insensitively against 'date'/'time', so they must be strings.

    Returns:
        list[numpy.ndarray]: Decoded PNG renderings, one array per frame.
    """
    logging.info("Generating animated frames from data")
    frames = []
    numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns

    # Generate a basic time series plot if a date column is present
    date_column = next(
        (col for col in data.columns if 'date' in col.lower() or 'time' in col.lower()),
        None,
    )

    if date_column is not None and len(numeric_columns) > 0 and len(data) > 0:
        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            lines = [ax.plot([], [], label=col)[0] for col in numeric_columns]
            ax.set_xlim(data[date_column].min(), data[date_column].max())
            ax.set_ylim(data[numeric_columns].min().min(), data[numeric_columns].max().max())
            ax.set_title(f"Time Series Plot for {', '.join(numeric_columns)}")
            ax.set_xlabel('Time')
            ax.set_ylabel('Values')
            ax.legend()

            # Reveal one more row per frame; slicing up to and including the
            # current row means the final frame shows the complete series.
            for end in range(1, len(data) + 1):
                for line, col in zip(lines, numeric_columns):
                    line.set_data(data[date_column].iloc[:end], data[col].iloc[:end])
                frames.append(_figure_to_array(fig))
        finally:
            plt.close(fig)

    # Generate a moving bar chart
    if 'category' in data.columns and 'value' in data.columns:
        categories = data['category'].dropna().unique()
        values_by_category = [
            data.loc[data['category'] == category, 'value'] for category in categories
        ]
        # Categories need not have the same number of rows: step as many times
        # as the longest one and hold shorter categories at their last value.
        step_count = max((len(values) for values in values_by_category), default=0)
        if step_count:
            fig, ax = plt.subplots(figsize=(10, 6))
            try:
                bars = ax.bar(categories, np.zeros(len(categories)))
                ax.set_ylim(0, data['value'].max())
                ax.set_title('Moving Bar Chart')
                ax.set_xlabel('Category')
                ax.set_ylabel('Value')

                for step in range(step_count):
                    for bar, values in zip(bars, values_by_category):
                        bar.set_height(values.iloc[min(step, len(values) - 1)])
                    frames.append(_figure_to_array(fig))
            finally:
                plt.close(fig)

    # Generate a correlation matrix plot
    if len(numeric_columns) > 1:
        # Restrict to the numeric columns: correlating a frame that still
        # contains string columns raises on recent pandas versions.
        corr_matrix = data[numeric_columns].corr()
        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, ax=ax)
            ax.set_title('Correlation Matrix')
            frames.append(_figure_to_array(fig))
        finally:
            plt.close(fig)

    # Generate distribution plots for numeric columns
    for col in numeric_columns:
        fig, ax = plt.subplots(figsize=(8, 6))
        try:
            sns.histplot(data[col], bins=10, kde=True, ax=ax)
            ax.set_title(f"Distribution of {col}")
            frames.append(_figure_to_array(fig))
        finally:
            plt.close(fig)

    logging.info(f"Generated {len(frames)} animated frames")
    return frames

def create_video_from_frames(frames, audio_file=None, video_file="super_final_video.mp4"):
    """Assemble still frames into an MP4 file, optionally with an audio track.

    Args:
        frames: Sequence of images (anything ``np.array`` can convert).
        audio_file: Optional path to an audio file; attached only when the
            path points to an existing file.
        video_file: Output path of the encoded video.

    Raises:
        ValueError: If ``frames`` is empty.
    """
    if not frames:
        raise ValueError("No frames to create video from.")

    logging.info("Creating video from frames")

    # Build one single-image clip per frame (fps=1 keeps the slideshow slow).
    stills = []
    for still in frames:
        stills.append(ImageSequenceClip([np.array(still)], fps=1))

    movie = concatenate_videoclips(stills, method="compose")

    has_audio = bool(audio_file) and os.path.isfile(audio_file)
    if has_audio:
        movie = movie.set_audio(AudioFileClip(audio_file))

    movie.write_videofile(video_file, codec="libx264", fps=24)
    logging.info(f"Video saved as {video_file}")

def generate_infographic_video(data, insights, audio_file=None, title_image="title_screen.png"):
    """Build the infographic video: data frames, optional title card, then encode.

    Args:
        data: Dataset passed to ``generate_animated_frames``.
        insights: Not used by this function.
        audio_file: Optional audio path forwarded to ``create_video_from_frames``.
        title_image: Path of an image shown first when the file exists.

    Raises:
        ValueError: If no frames could be generated from ``data``.
    """
    frames = generate_animated_frames(data)
    if not frames:
        raise ValueError("No frames generated from data.")

    # Prepend the title card (as an RGBA array) when one is available on disk.
    if os.path.exists(title_image):
        title_frame = np.array(Image.open(title_image).convert("RGBA"))
        frames.insert(0, title_frame)

    create_video_from_frames(frames, audio_file)
    print("Video successfully generated!")

def data_storytelling_pipeline(file_path, prompt, audio_file=None):
    """Run the end-to-end flow: load data, run EDA, analyse the prompt, build the video.

    Args:
        file_path: Dataset location handed to ``load_and_preprocess_data``.
        prompt: User prompt handed to ``analyze_prompt_for_insights``.
        audio_file: Optional audio track forwarded to ``generate_infographic_video``.

    Raises:
        Exception: Any error raised by a pipeline step is logged and re-raised
            unchanged.
    """
    # Imported locally because the surrounding cell does not import these
    # modules, yet the timing code and the ParserError handler depend on them.
    import time

    import pandas as pd

    try:
        start_time = time.time()

        logging.info("Loading and preprocessing data...")
        data = load_and_preprocess_data(file_path)
        logging.debug(f"Loaded Data: {data.head()}")

        logging.info("Performing EDA...")
        eda_summary = perform_eda(data)
        logging.debug(f"EDA Summary: {eda_summary}")

        logging.info("Analyzing the user's prompt...")
        insights = analyze_prompt_for_insights(prompt)
        logging.debug(f"Extracted insights: {insights}")

        logging.info("Creating the infographic video...")
        generate_infographic_video(data, insights, audio_file=audio_file)

        end_time = time.time()
        logging.info(f"Pipeline completed successfully in {end_time - start_time:.2f} seconds")

    # Handlers run from most to least specific; each one only logs and re-raises.
    except FileNotFoundError as fnf_error:
        logging.error(f"File not found: {fnf_error}")
        raise
    except pd.errors.ParserError as parser_error:
        logging.error(f"Error parsing the file: {parser_error}")
        raise
    except TypeError as type_error:
        logging.error(f"Type error: {type_error}")
        raise
    except ValueError as value_error:
        logging.error(f"Value error: {value_error}")
        raise
    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}")
        raise

# Example usage:
file_path = '/kaggle/input/model-dataset-new-1/2015.csv'
prompt = "Analyze the country and region data"
audio_file = "/kaggle/working/file.mp3"
title_image = "/kaggle/working/title_screen.png"

# Fail fast if any required input path is missing (checked in this order).
for _label, _path in (
    ("Data file", file_path),
    ("Audio file", audio_file),
    ("Title image", title_image),
):
    if not os.path.exists(_path):
        raise FileNotFoundError(f"{_label} not found: {_path}")

data_storytelling_pipeline(file_path, prompt, audio_file=audio_file)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import logging
import os
from PIL import Image
from moviepy.editor import ImageSequenceClip, AudioFileClip, concatenate_videoclips, CompositeVideoClip, TextClip
import numpy as np
import io
import pandas as pd
import time
from matplotlib.animation import FuncAnimation
from mpl_toolkits.mplot3d import Axes3D
from gtts import gTTS

def _figure_to_image(fig):
    """Render a Matplotlib figure to an in-memory PNG and return it as a PIL image."""
    buf = io.BytesIO()
    fig.savefig(buf, format='png')
    buf.seek(0)
    img = Image.open(buf)
    # Image.open is lazy; decode now so the image no longer depends on ``buf``.
    img.load()
    return img


def generate_default_frames(data):
    """Render the default set of charts for a dataset as still frames.

    Depending on the columns present, the result contains, in order: a
    time-series plot, a correlation heatmap, one histogram per numeric
    column, and a 3D scatter plot of the first three numeric columns.

    Args:
        data: pandas DataFrame to visualise. Column labels are matched
            case-insensitively against 'date'/'time', so they must be strings.

    Returns:
        list[PIL.Image.Image]: One fully decoded image per chart.
    """
    logging.info("Generating default frames from data")
    frames = []
    numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns

    # Generate a basic time series plot if a date column is present
    date_column = next(
        (col for col in data.columns if 'date' in col.lower() or 'time' in col.lower()),
        None,
    )

    if date_column is not None and len(numeric_columns) > 0:
        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            for col in numeric_columns:
                ax.plot(data[date_column], data[col], label=col)
            ax.set_title(f"Time Series Plot for {', '.join(numeric_columns)}")
            ax.set_xlabel('Time')
            ax.set_ylabel('Values')
            ax.legend()
            frames.append(_figure_to_image(fig))
        finally:
            plt.close(fig)

    # Generate a correlation matrix plot
    if len(numeric_columns) > 1:
        # Restrict to the numeric columns: correlating a frame that still
        # contains string columns raises on recent pandas versions.
        corr_matrix = data[numeric_columns].corr()
        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, ax=ax)
            ax.set_title('Correlation Matrix')
            frames.append(_figure_to_image(fig))
        finally:
            plt.close(fig)

    # Generate a distribution plot for each numeric column.
    # Each entry of ``frames`` is a single still, so the histogram is rendered
    # once from the full column. Reading back a saved GIF animation would only
    # yield its first frame, in palette mode and drawn from an empty slice.
    for col in numeric_columns:
        fig, ax = plt.subplots(figsize=(8, 6))
        try:
            sns.histplot(data[col], bins=10, kde=True, ax=ax)
            ax.set_title(f"Distribution of {col}")
            frames.append(_figure_to_image(fig))
        finally:
            plt.close(fig)

    # Generate 3D scatter plot if there are at least 3 numeric columns
    if len(numeric_columns) >= 3:
        x_col, y_col, z_col = numeric_columns[:3]
        fig = plt.figure(figsize=(10, 6))
        try:
            ax = fig.add_subplot(111, projection='3d')
            ax.scatter(data[x_col], data[y_col], data[z_col])
            ax.set_title('3D Scatter Plot')
            ax.set_xlabel(x_col)
            ax.set_ylabel(y_col)
            ax.set_zlabel(z_col)
            frames.append(_figure_to_image(fig))
        finally:
            plt.close(fig)

    logging.info(f"Generated {len(frames)} default frames")
    return frames

def create_video_from_frames(frames, audio_file=None, video_file="final_video.mp4"):
    """Turn still frames into a slideshow MP4, optionally with an audio track.

    Args:
        frames: Sequence of images (anything ``np.array`` can convert).
        audio_file: Optional path to an audio file; attached only when the
            path points to an existing file.
        video_file: Output path of the encoded video.

    Raises:
        ValueError: If ``frames`` is empty.
    """
    if not frames:
        raise ValueError("No frames to create video from.")

    logging.info("Creating video from frames")

    # One single-image clip per frame, each stretched to last 5 seconds.
    stills = [
        ImageSequenceClip([np.array(still)], fps=1).set_duration(5)
        for still in frames
    ]
    movie = concatenate_videoclips(stills, method="compose")

    has_audio = bool(audio_file) and os.path.isfile(audio_file)
    if has_audio:
        movie = movie.set_audio(AudioFileClip(audio_file))

    movie.write_videofile(video_file, codec="libx264", fps=24)
    logging.info(f"Video saved as {video_file}")

def generate_infographic_video(data, insights, audio_file=None, title_image="title_screen.png"):
    """Build the infographic video: default charts, optional title card, then encode.

    Args:
        data: Dataset passed to ``generate_default_frames``.
        insights: Not used by this function.
        audio_file: Optional audio path forwarded to ``create_video_from_frames``.
        title_image: Path of an image shown first when the file exists.

    Raises:
        ValueError: If no frames could be generated from ``data``.
    """
    frames = generate_default_frames(data)
    if not frames:
        raise ValueError("No frames generated from data.")

    # Prepend the title card (as an RGBA array) when one is available on disk.
    if os.path.exists(title_image):
        title_frame = np.array(Image.open(title_image).convert("RGBA"))
        frames.insert(0, title_frame)

    create_video_from_frames(frames, audio_file)
    print("Video successfully generated!")

def generate_narration(text, output_file="narration.mp3"):
    """Convert ``text`` to English speech with gTTS and save it to ``output_file``.

    Args:
        text: Narration text to synthesise.
        output_file: Path the audio is saved to.

    Returns:
        The ``output_file`` path that was written.
    """
    gTTS(text=text, lang='en').save(output_file)
    return output_file

def data_storytelling_pipeline(file_path, prompt, audio_file=None):
    """Run the full flow: load data, EDA, prompt analysis, narration, video.

    Args:
        file_path: Dataset location handed to ``load_and_preprocess_data``.
        prompt: User prompt; analysed for insights and spoken in the narration.
        audio_file: Accepted but not used here; the generated narration file
            is always the audio track.

    Raises:
        Exception: Any error raised by a pipeline step is logged and re-raised
            unchanged.
    """
    try:
        started = time.time()

        logging.info("Loading and preprocessing data...")
        dataset = load_and_preprocess_data(file_path)
        logging.debug(f"Loaded Data: {dataset.head()}")

        logging.info("Performing EDA...")
        summary = perform_eda(dataset)
        logging.debug(f"EDA Summary: {summary}")

        logging.info("Analyzing the user's prompt...")
        insights = analyze_prompt_for_insights(prompt)
        logging.debug(f"Extracted insights: {insights}")

        logging.info("Generating narration...")
        narration_path = generate_narration(
            f"Here is the analysis based on the prompt: {prompt}. {insights}"
        )

        logging.info("Creating the infographic video...")
        generate_infographic_video(dataset, insights, audio_file=narration_path)

        elapsed = time.time() - started
        logging.info(f"Pipeline completed successfully in {elapsed:.2f} seconds")

    # Handlers run from most to least specific; each one only logs and re-raises.
    except FileNotFoundError as exc:
        logging.error(f"File not found: {exc}")
        raise
    except pd.errors.ParserError as exc:
        logging.error(f"Error parsing the file: {exc}")
        raise
    except TypeError as exc:
        logging.error(f"Type error: {exc}")
        raise
    except ValueError as exc:
        logging.error(f"Value error: {exc}")
        raise
    except Exception as exc:
        logging.error(f"An unexpected error occurred: {exc}")
        raise

# Example usage:
file_path = '/kaggle/input/model-dataset-new-1/2015.csv'
prompt = "Analyze the country and region data"
audio_file = "/kaggle/working/"
title_image = "/kaggle/working/"

# Fail fast if any required input path is missing (checked in this order).
for _label, _path in (
    ("Data file", file_path),
    ("Audio file", audio_file),
    ("Title image", title_image),
):
    if not os.path.exists(_path):
        raise FileNotFoundError(f"{_label} not found: {_path}")

data_storytelling_pipeline(file_path, prompt, audio_file=audio_file)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import logging
import os
from PIL import Image
from moviepy.editor import ImageSequenceClip, AudioFileClip, concatenate_videoclips, CompositeVideoClip, TextClip
import numpy as np
import io
import pandas as pd
import time
from matplotlib.animation import FuncAnimation
from mpl_toolkits.mplot3d import Axes3D
from gtts import gTTS

def _figure_to_array(fig):
    """Render a Matplotlib figure to an in-memory PNG and return it as a NumPy array."""
    buf = io.BytesIO()
    fig.savefig(buf, format='png')
    buf.seek(0)
    # np.array() forces PIL to decode the image while the buffer is still alive.
    return np.array(Image.open(buf))


def generate_default_frames(data):
    """Render the default set of charts for a dataset as still frames.

    Depending on the columns present, the result contains, in order: a
    time-series plot, a correlation heatmap, one histogram per numeric
    column, and a 3D scatter plot of the first three numeric columns.

    Args:
        data: pandas DataFrame to visualise. Column labels are matched
            case-insensitively against 'date'/'time', so they must be strings.

    Returns:
        list[numpy.ndarray]: Decoded PNG renderings, one array per chart.
    """
    logging.info("Generating default frames from data")
    frames = []
    numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns

    # Generate a basic time series plot if a date column is present
    date_column = next(
        (col for col in data.columns if 'date' in col.lower() or 'time' in col.lower()),
        None,
    )

    if date_column is not None and len(numeric_columns) > 0:
        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            for col in numeric_columns:
                ax.plot(data[date_column], data[col], label=col)
            ax.set_title(f"Time Series Plot for {', '.join(numeric_columns)}")
            ax.set_xlabel('Time')
            ax.set_ylabel('Values')
            ax.legend()
            frames.append(_figure_to_array(fig))
        finally:
            plt.close(fig)

    # Generate a correlation matrix plot
    if len(numeric_columns) > 1:
        # Restrict to the numeric columns: correlating a frame that still
        # contains string columns raises on recent pandas versions.
        corr_matrix = data[numeric_columns].corr()
        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, ax=ax)
            ax.set_title('Correlation Matrix')
            frames.append(_figure_to_array(fig))
        finally:
            plt.close(fig)

    # Generate a distribution plot for each numeric column.
    # Each entry of ``frames`` is a single still, so the histogram is rendered
    # once from the full column. Reading back a saved GIF animation would only
    # yield its first frame, in palette mode and drawn from an empty slice.
    for col in numeric_columns:
        fig, ax = plt.subplots(figsize=(8, 6))
        try:
            sns.histplot(data[col], bins=10, kde=True, ax=ax)
            ax.set_title(f"Distribution of {col}")
            frames.append(_figure_to_array(fig))
        finally:
            plt.close(fig)

    # Generate 3D scatter plot if there are at least 3 numeric columns
    if len(numeric_columns) >= 3:
        x_col, y_col, z_col = numeric_columns[:3]
        fig = plt.figure(figsize=(10, 6))
        try:
            ax = fig.add_subplot(111, projection='3d')
            ax.scatter(data[x_col], data[y_col], data[z_col])
            ax.set_title('3D Scatter Plot')
            ax.set_xlabel(x_col)
            ax.set_ylabel(y_col)
            ax.set_zlabel(z_col)
            frames.append(_figure_to_array(fig))
        finally:
            plt.close(fig)

    logging.info(f"Generated {len(frames)} default frames")
    return frames

def create_video_from_frames(frames, audio_file=None, video_file="final_video.mp4"):
    """Turn still frames into a slideshow MP4, optionally with an audio track.

    Args:
        frames: Sequence of image frames, passed to ``ImageSequenceClip`` as-is.
        audio_file: Optional path to an audio file; attached only when the
            path points to an existing file.
        video_file: Output path of the encoded video.

    Raises:
        ValueError: If ``frames`` is empty.
    """
    if not frames:
        raise ValueError("No frames to create video from.")

    logging.info("Creating video from frames")

    # One single-image clip per frame, each stretched to last 5 seconds.
    stills = [
        ImageSequenceClip([still], fps=1).set_duration(5)
        for still in frames
    ]
    movie = concatenate_videoclips(stills, method="compose")

    has_audio = bool(audio_file) and os.path.isfile(audio_file)
    if has_audio:
        movie = movie.set_audio(AudioFileClip(audio_file))

    movie.write_videofile(video_file, codec="libx264", fps=24)
    logging.info(f"Video saved as {video_file}")

def generate_infographic_video(data, insights, audio_file=None, title_image="title_screen.png"):
    """Build the infographic video: default charts, optional title card, then encode.

    Args:
        data: Dataset passed to ``generate_default_frames``.
        insights: Not used by this function.
        audio_file: Optional audio path forwarded to ``create_video_from_frames``.
        title_image: Path of an image shown first when the file exists.

    Raises:
        ValueError: If no frames could be generated from ``data``.
    """
    frames = generate_default_frames(data)
    if not frames:
        raise ValueError("No frames generated from data.")

    # Prepend the title card (as an RGBA array) when one is available on disk.
    if os.path.exists(title_image):
        title_frame = np.array(Image.open(title_image).convert("RGBA"))
        frames.insert(0, title_frame)

    create_video_from_frames(frames, audio_file)
    print("Video successfully generated!")

def generate_narration(text, output_file="narration.mp3"):
    """Convert ``text`` to English speech with gTTS and save it to ``output_file``.

    Args:
        text: Narration text to synthesise.
        output_file: Path the audio is saved to.

    Returns:
        The ``output_file`` path that was written.
    """
    gTTS(text=text, lang='en').save(output_file)
    return output_file

def data_storytelling_pipeline(file_path, prompt, audio_file=None):
    """Run the full flow: load data, EDA, prompt analysis, narration, video.

    Args:
        file_path: Dataset location handed to ``load_and_preprocess_data``.
        prompt: User prompt; analysed for insights and spoken in the narration.
        audio_file: Accepted but not used here; the generated narration file
            is always the audio track.

    Raises:
        Exception: Any error raised by a pipeline step is logged and re-raised
            unchanged.
    """
    try:
        started = time.time()

        logging.info("Loading and preprocessing data...")
        dataset = load_and_preprocess_data(file_path)
        logging.debug(f"Loaded Data: {dataset.head()}")

        logging.info("Performing EDA...")
        summary = perform_eda(dataset)
        logging.debug(f"EDA Summary: {summary}")

        logging.info("Analyzing the user's prompt...")
        insights = analyze_prompt_for_insights(prompt)
        logging.debug(f"Extracted insights: {insights}")

        logging.info("Generating narration...")
        narration_path = generate_narration(
            f"Here is the analysis based on the prompt: {prompt}. {insights}"
        )

        logging.info("Creating the infographic video...")
        generate_infographic_video(dataset, insights, audio_file=narration_path)

        elapsed = time.time() - started
        logging.info(f"Pipeline completed successfully in {elapsed:.2f} seconds")

    # Handlers run from most to least specific; each one only logs and re-raises.
    except FileNotFoundError as exc:
        logging.error(f"File not found: {exc}")
        raise
    except pd.errors.ParserError as exc:
        logging.error(f"Error parsing the file: {exc}")
        raise
    except TypeError as exc:
        logging.error(f"Type error: {exc}")
        raise
    except ValueError as exc:
        logging.error(f"Value error: {exc}")
        raise
    except Exception as exc:
        logging.error(f"An unexpected error occurred: {exc}")
        raise

# Example usage:
file_path = '/kaggle/input/model-dataset-new-1/2015.csv'
prompt = "Analyze the country and region data"
audio_file = "/kaggle/working/file.mp3"
title_image = "/kaggle/working/title_screen.png"

# Fail fast if any required input path is missing (checked in this order).
for _label, _path in (
    ("Data file", file_path),
    ("Audio file", audio_file),
    ("Title image", title_image),
):
    if not os.path.exists(_path):
        raise FileNotFoundError(f"{_label} not found: {_path}")

data_storytelling_pipeline(file_path, prompt, audio_file=audio_file)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import logging
import os
from PIL import Image
from moviepy.editor import ImageSequenceClip, AudioFileClip, concatenate_videoclips, CompositeVideoClip, TextClip
import numpy as np
import io
import pandas as pd
import time
from matplotlib.animation import FuncAnimation
from mpl_toolkits.mplot3d import Axes3D
from gtts import gTTS

def _figure_to_array(fig):
    """Render a Matplotlib figure to an in-memory PNG and return it as a NumPy array."""
    buf = io.BytesIO()
    fig.savefig(buf, format='png')
    buf.seek(0)
    # np.array() forces PIL to decode the image while the buffer is still alive.
    return np.array(Image.open(buf))


def generate_default_frames(data):
    """Render the default set of charts for a dataset as still frames.

    Depending on the columns present, the result contains, in order: a
    time-series plot, a correlation heatmap, one histogram per numeric
    column, and a 3D scatter plot of the first three numeric columns.

    Args:
        data: pandas DataFrame to visualise. Column labels are matched
            case-insensitively against 'date'/'time', so they must be strings.

    Returns:
        list[numpy.ndarray]: Decoded PNG renderings, one array per chart.
    """
    logging.info("Generating default frames from data")
    frames = []
    numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns

    # Generate a basic time series plot if a date column is present
    date_column = next(
        (col for col in data.columns if 'date' in col.lower() or 'time' in col.lower()),
        None,
    )

    if date_column is not None and len(numeric_columns) > 0:
        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            for col in numeric_columns:
                ax.plot(data[date_column], data[col], label=col)
            ax.set_title(f"Time Series Plot for {', '.join(numeric_columns)}")
            ax.set_xlabel('Time')
            ax.set_ylabel('Values')
            ax.legend()
            frames.append(_figure_to_array(fig))
        finally:
            plt.close(fig)

    # Generate a correlation matrix plot
    if len(numeric_columns) > 1:
        # Restrict to the numeric columns: correlating a frame that still
        # contains string columns raises on recent pandas versions.
        corr_matrix = data[numeric_columns].corr()
        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, ax=ax)
            ax.set_title('Correlation Matrix')
            frames.append(_figure_to_array(fig))
        finally:
            plt.close(fig)

    # Generate a distribution plot for each numeric column.
    # Each entry of ``frames`` is a single still, so the histogram is rendered
    # once from the full column. Reading back a saved GIF animation would only
    # yield its first frame, in palette mode and drawn from an empty slice.
    for col in numeric_columns:
        fig, ax = plt.subplots(figsize=(8, 6))
        try:
            sns.histplot(data[col], bins=10, kde=True, ax=ax)
            ax.set_title(f"Distribution of {col}")
            frames.append(_figure_to_array(fig))
        finally:
            plt.close(fig)

    # Generate 3D scatter plot if there are at least 3 numeric columns
    if len(numeric_columns) >= 3:
        x_col, y_col, z_col = numeric_columns[:3]
        fig = plt.figure(figsize=(10, 6))
        try:
            ax = fig.add_subplot(111, projection='3d')
            ax.scatter(data[x_col], data[y_col], data[z_col])
            ax.set_title('3D Scatter Plot')
            ax.set_xlabel(x_col)
            ax.set_ylabel(y_col)
            ax.set_zlabel(z_col)
            frames.append(_figure_to_array(fig))
        finally:
            plt.close(fig)

    logging.info(f"Generated {len(frames)} default frames")
    return frames

def create_video_from_frames(frames, audio_file=None, video_file="final_video.mp4"):
    """Turn still frames into a slideshow MP4, optionally with an audio track.

    Args:
        frames: Sequence of image frames exposing a ``.shape`` attribute
            (logged at debug level) and accepted by ``ImageSequenceClip``.
        audio_file: Optional path to an audio file; attached only when the
            path points to an existing file.
        video_file: Output path of the encoded video.

    Raises:
        ValueError: If ``frames`` is empty.
    """
    if not frames:
        raise ValueError("No frames to create video from.")

    logging.info(f"Creating video from {len(frames)} frames")
    # Record every frame's dimensions to help diagnose size mismatches.
    for position, still in enumerate(frames):
        logging.debug(f"Frame {position} shape: {still.shape}")

    # One single-image clip per frame, each stretched to last 5 seconds.
    stills = [
        ImageSequenceClip([still], fps=1).set_duration(5)
        for still in frames
    ]
    movie = concatenate_videoclips(stills, method="compose")

    has_audio = bool(audio_file) and os.path.isfile(audio_file)
    if has_audio:
        movie = movie.set_audio(AudioFileClip(audio_file))

    movie.write_videofile(video_file, codec="libx264", fps=24)
    logging.info(f"Video saved as {video_file}")

def generate_infographic_video(data, insights, audio_file=None, title_image="title_screen.png"):
    """Build the infographic video: default charts, optional title card, then encode.

    Args:
        data: Dataset passed to ``generate_default_frames``.
        insights: Not used by this function.
        audio_file: Optional audio path forwarded to ``create_video_from_frames``.
        title_image: Path of an image shown first when the file exists.

    Raises:
        ValueError: If no frames could be generated from ``data``.
    """
    frames = generate_default_frames(data)
    if not frames:
        raise ValueError("No frames generated from data.")

    # Prepend the title card (as an RGBA array) when one is available on disk.
    if os.path.exists(title_image):
        title_frame = np.array(Image.open(title_image).convert("RGBA"))
        frames.insert(0, title_frame)

    create_video_from_frames(frames, audio_file)
    print("Video successfully generated!")

def generate_narration(text, output_file="narration.mp3"):
    """Convert ``text`` to English speech with gTTS and save it to ``output_file``.

    Args:
        text: Narration text to synthesise.
        output_file: Path the audio is saved to.

    Returns:
        The ``output_file`` path that was written.
    """
    gTTS(text=text, lang='en').save(output_file)
    return output_file

def data_storytelling_pipeline(file_path, prompt, audio_file=None):
    """Run the full flow: load data, EDA, prompt analysis, narration, video.

    Args:
        file_path: Dataset location handed to ``load_and_preprocess_data``.
        prompt: User prompt; analysed for insights and spoken in the narration.
        audio_file: Accepted but not used here; the generated narration file
            is always the audio track.

    Raises:
        Exception: Any error raised by a pipeline step is logged and re-raised
            unchanged.
    """
    try:
        started = time.time()

        logging.info("Loading and preprocessing data...")
        dataset = load_and_preprocess_data(file_path)
        logging.debug(f"Loaded Data: {dataset.head()}")

        logging.info("Performing EDA...")
        summary = perform_eda(dataset)
        logging.debug(f"EDA Summary: {summary}")

        logging.info("Analyzing the user's prompt...")
        insights = analyze_prompt_for_insights(prompt)
        logging.debug(f"Extracted insights: {insights}")

        logging.info("Generating narration...")
        narration_path = generate_narration(
            f"Here is the analysis based on the prompt: {prompt}. {insights}"
        )

        logging.info("Creating the infographic video...")
        generate_infographic_video(dataset, insights, audio_file=narration_path)

        elapsed = time.time() - started
        logging.info(f"Pipeline completed successfully in {elapsed:.2f} seconds")

    # Handlers run from most to least specific; each one only logs and re-raises.
    except FileNotFoundError as exc:
        logging.error(f"File not found: {exc}")
        raise
    except pd.errors.ParserError as exc:
        logging.error(f"Error parsing the file: {exc}")
        raise
    except TypeError as exc:
        logging.error(f"Type error: {exc}")
        raise
    except ValueError as exc:
        logging.error(f"Value error: {exc}")
        raise
    except Exception as exc:
        logging.error(f"An unexpected error occurred: {exc}")
        raise

# Example usage:
file_path = '/kaggle/input/model-dataset-new-1/2015.csv'
prompt = "Analyze the country and region data"
audio_file = "/kaggle/working/file.mp3"
title_image = "/kaggle/working/title_screen.png"

# Fail fast if any required input path is missing (checked in this order).
for _label, _path in (
    ("Data file", file_path),
    ("Audio file", audio_file),
    ("Title image", title_image),
):
    if not os.path.exists(_path):
        raise FileNotFoundError(f"{_label} not found: {_path}")

data_storytelling_pipeline(file_path, prompt, audio_file=audio_file)

In [None]:
%pip install plotly moviepy gtts

import plotly.express as px
import pandas as pd
import logging
import os
import time
from moviepy.editor import ImageSequenceClip, concatenate_videoclips, AudioFileClip
from gtts import gTTS

def generate_frames(data, chart_type, output_dir="frames"):
    """Render one Plotly chart per distinct date and save each as a PNG file.

    Args:
        data: DataFrame with a ``date`` column plus the columns the chosen
            chart reads: ``category``/``value`` for bar and pie charts,
            ``x``/``y``/``value``/``category`` for scatter plots.
        chart_type: One of ``'bar'``, ``'pie'`` or ``'scatter'``.
        output_dir: Directory the PNG files are written to; created if missing.

    Returns:
        list[str]: Paths of the written images, one per distinct date in
        order of first appearance.

    Raises:
        ValueError: If ``chart_type`` is not a supported kind.
    """
    # Validate up front so an unsupported type is reported even when ``data``
    # has no rows and before anything is created on disk.
    if chart_type not in ('bar', 'pie', 'scatter'):
        raise ValueError("Invalid chart type")

    os.makedirs(output_dir, exist_ok=True)

    frames = []
    for date in data['date'].unique():
        filtered_data = data[data['date'] == date]
        if chart_type == 'bar':
            fig = px.bar(filtered_data, x='category', y='value', title=f"Bar Chart - {date}")
        elif chart_type == 'pie':
            fig = px.pie(filtered_data, values='value', names='category', title=f"Pie Chart - {date}")
        else:
            fig = px.scatter(filtered_data, x='x', y='y', size='value', color='category', title=f"Scatter Plot - {date}")

        # Date strings such as "2015/01/31" contain path separators that would
        # otherwise be interpreted as sub-directories of ``output_dir``.
        safe_date = str(date).replace(os.sep, "_")
        if os.altsep:
            safe_date = safe_date.replace(os.altsep, "_")

        frame_path = os.path.join(output_dir, f"{chart_type}_{safe_date}.png")
        # NOTE(review): Plotly static image export presumably needs the
        # ``kaleido`` package installed - confirm for this environment.
        fig.write_image(frame_path)
        frames.append(frame_path)

    return frames

def generate_video_from_frames(frames, output_file="animation.mp4", fps=1):
    """Encode a sequence of image frames into a video file.

    Args:
        frames: Image frames (here, a list of image file paths) passed to
            ``ImageSequenceClip``.
        output_file: Path of the video to write.
        fps: Frames per second of the resulting clip.

    Returns:
        The ``output_file`` path that was written.

    Raises:
        ValueError: If ``frames`` is empty.
    """
    # Report an empty input explicitly instead of failing inside moviepy.
    if not frames:
        raise ValueError("No frames to create video from.")

    clip = ImageSequenceClip(frames, fps=fps)
    clip.write_videofile(output_file, codec="libx264")
    return output_file

def generate_infographic_video(data, insights, audio_file=None, title_image="title_screen.png"):
    """Combine the per-chart animations into ``final_video.mp4``.

    Args:
        data: Dataset passed to ``generate_videos_if_needed``.
        insights: Not used by this function.
        audio_file: Optional audio path; attached only when the path points
            to an existing file.
        title_image: Path of a still image shown for 5 seconds at the start
            when the file exists.

    Raises:
        ValueError: If no chart video could be generated from ``data``.
    """
    # Imported locally: the surrounding cell's moviepy import does not bring
    # in either of these names.
    from moviepy.editor import ImageClip, VideoFileClip

    logging.info("Checking data columns for required visualizations")

    # Check if required columns are present for each visualization
    video_files = generate_videos_if_needed(data)

    if not video_files:
        logging.error("No video files generated from data. Check if the data has the required columns.")
        raise ValueError("No video files generated from data.")

    logging.info("Combining video files into a single video")
    video_clips = [VideoFileClip(video_file) for video_file in video_files]

    if os.path.exists(title_image):
        # The title card is a still image, so it is wrapped in ImageClip with
        # an explicit duration rather than opened as a video file.
        title_clip = ImageClip(title_image).set_duration(5)
        video_clips.insert(0, title_clip)

    final_video = concatenate_videoclips(video_clips, method="compose")

    if audio_file and os.path.isfile(audio_file):
        audio = AudioFileClip(audio_file)
        final_video = final_video.set_audio(audio)

    final_video.write_videofile("final_video.mp4", codec="libx264", fps=24)
    logging.info("Final video saved as final_video.mp4")

def generate_videos_if_needed(data):
    """Create every chart animation the dataset's columns allow.

    Bar and pie animations need ``category``, ``value`` and ``date``; the
    scatter animation needs ``x``, ``y``, ``value`` and ``date``. A failure
    while building one animation is logged and does not stop the others.

    Args:
        data: Object exposing a ``columns`` collection (e.g. a DataFrame).

    Returns:
        list: Whatever ``generate_video_from_frames`` returned for each
        animation that succeeded, in bar, pie, scatter order.
    """
    rendered = []
    available = set(data.columns)

    def _attempt(chart_type, target, label):
        # Best effort: record the result on success, log and carry on otherwise.
        try:
            chart_frames = generate_frames(data, chart_type)
            rendered.append(generate_video_from_frames(chart_frames, target))
        except Exception as exc:
            logging.error(f"Failed to generate animated {label}: {exc}")

    if available >= {'category', 'value', 'date'}:
        logging.info("Data contains required columns for bar and pie charts")
        _attempt('bar', "animated_bar_chart.mp4", "bar chart")
        _attempt('pie', "animated_pie_chart.mp4", "pie chart")

    if available >= {'x', 'y', 'value', 'date'}:
        logging.info("Data contains required columns for scatter plot")
        _attempt('scatter', "animated_scatter_plot.mp4", "scatter plot")

    return rendered

def generate_narration(text, output_file="narration.mp3"):
    """Synthesize ``text`` as English speech with gTTS and save it.

    Args:
        text: Text to be spoken.
        output_file: Destination path handed to ``gTTS.save``.

    Returns:
        ``output_file``, unchanged.
    """
    gTTS(text=text, lang='en').save(output_file)
    return output_file

def data_storytelling_pipeline(file_path, prompt, audio_file=None):
    """Run the storytelling pipeline end to end for one CSV file.

    Steps: read the CSV, compute ``describe()`` statistics, build a
    placeholder insight string from ``prompt``, synthesize the narration and
    render the infographic video with that narration as its audio.

    Args:
        file_path: Path of the CSV file to read.
        prompt: User request; embedded verbatim in the insight and narration text.
        audio_file: Accepted but never read in this function.
            NOTE(review): the freshly generated narration is always used —
            confirm whether a caller-supplied file should take precedence.

    Raises:
        Exception: Any error raised by a step is logged with a category
            prefix and re-raised unchanged.
    """
    try:
        started = time.time()

        logging.info("Loading and preprocessing data...")
        data = pd.read_csv(file_path)
        logging.debug(f"Loaded Data: {data.head()}")

        logging.info("Performing EDA...")
        eda_summary = data.describe()
        logging.debug(f"EDA Summary: {eda_summary}")

        logging.info("Analyzing the user's prompt...")
        insights = f"Insights based on the prompt: {prompt}"
        logging.debug(f"Extracted insights: {insights}")

        logging.info("Generating narration...")
        narration_file = generate_narration(
            f"Here is the analysis based on the prompt: {prompt}. {insights}"
        )

        logging.info("Creating the infographic video...")
        generate_infographic_video(data, insights, audio_file=narration_file)

        elapsed = time.time() - started
        logging.info(f"Pipeline completed successfully in {elapsed:.2f} seconds")

    except Exception as error:
        # Order matters: the first matching type supplies the log prefix.
        prefixes = (
            (FileNotFoundError, "File not found"),
            (pd.errors.ParserError, "Error parsing the file"),
            (TypeError, "Type error"),
            (ValueError, "Value error"),
        )
        prefix = next(
            (text for exc_type, text in prefixes if isinstance(error, exc_type)),
            "An unexpected error occurred",
        )
        logging.error(f"{prefix}: {error}")
        raise

# Example usage:
# Inputs are hard-coded Kaggle paths; adjust them when running elsewhere.
file_path = '/kaggle/input/model-dataset-new-1/2015.csv'
prompt = "Analyze the country and region data"
audio_file = "/kaggle/working/file.mp3"
title_image = "/kaggle/working/title_screen.png"

# Fail fast, before any processing, if one of the input files is missing.
# NOTE(review): title_image is validated here but is not passed to the
# pipeline call at the end of this cell.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"Data file not found: {file_path}")
if not os.path.exists(audio_file):
    raise FileNotFoundError(f"Audio file not found: {audio_file}")
if not os.path.exists(title_image):
    raise FileNotFoundError(f"Title image not found: {title_image}")

# Loaded only for the column check below; the pipeline reads the file again.
data = pd.read_csv(file_path)

# At least one complete column set is needed: the first for bar/pie charts,
# the second for the scatter plot.
required_columns = [{'category', 'value', 'date'}, {'x', 'y', 'value', 'date'}]
if not any(columns.issubset(data.columns) for columns in required_columns):
    raise ValueError("Data does not contain the required columns for generating visualizations.")

data_storytelling_pipeline(file_path, prompt, audio_file=audio_file)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def generate_visualizations(data, insights):
    """Draw a standard set of exploratory plots for ``data``.

    Depending on the columns present this draws: a time-series line plot
    (when a ``Date`` or ``Datetime`` column exists), a correlation heatmap and
    a pair plot (when there are at least two float64/int64 columns) and one
    count plot per object/category column.

    Args:
        data: DataFrame to visualise; must not be empty.
        insights: Must be truthy; otherwise no plots are drawn.

    Returns:
        A list with one entry per plot drawn, or ``None`` when no plot
        applied.  NOTE(review): every entry is the ``matplotlib.pyplot``
        module itself, not the individual figure, so the entries are all the
        same object — kept for compatibility with existing callers; confirm
        whether figures should be returned instead.

    Raises:
        ValueError: If ``data`` is empty or ``insights`` is falsy.
        Exception: Any plotting error is logged and re-raised.
    """
    try:
        logging.info("Generating visualizations")
        if data.empty:
            raise ValueError("Data is empty.")
        if not insights:
            raise ValueError("No insights available to generate visualizations.")

        visuals = []
        numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns

        if 'Date' in data.columns or 'Datetime' in data.columns:
            time_column = 'Date' if 'Date' in data.columns else 'Datetime'
            if len(numeric_columns) > 0:
                plt.figure(figsize=(10, 6))
                for col in numeric_columns:
                    plt.plot(data[time_column], data[col], label=col)
                plt.title(f"Time Series Plot for {', '.join(numeric_columns)}")
                plt.xlabel('Time')
                plt.ylabel('Values')
                plt.xticks(rotation=45)
                plt.legend()
                visuals.append(plt)

        if len(numeric_columns) > 1:
            # Correlate only the numeric columns: calling corr() on the whole
            # frame raises on text columns in recent pandas versions.
            corr_matrix = data[numeric_columns].corr()
            plt.figure(figsize=(10, 6))
            sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
            plt.title('Correlation Matrix')
            visuals.append(plt)

        categorical_columns = data.select_dtypes(include=['object', 'category']).columns
        for col in categorical_columns:
            plt.figure(figsize=(8, 6))
            sns.countplot(x=col, data=data)
            plt.title(f"Distribution of {col}")
            visuals.append(plt)

        if len(numeric_columns) > 1:
            sns.pairplot(data[numeric_columns])
            plt.suptitle('Pairwise Relationships')
            visuals.append(plt)

        if not visuals:
            return None

        logging.info(f"Generated {len(visuals)} visualizations")
        return visuals
    except Exception as e:
        logging.error(f"Error generating visualizations: {e}")
        raise

In [None]:
%pip install plotly moviepy gtts transformers spacy

import plotly.express as px
import pandas as pd
import logging
import os
import time
from moviepy.editor import ImageSequenceClip, concatenate_videoclips, AudioFileClip, VideoFileClip
from gtts import gTTS
from transformers import BartTokenizer, BartForConditionalGeneration
import spacy
import re
from sklearn.preprocessing import StandardScaler

# Setting up logging
# INFO level: the logging.debug(...) calls in this cell are not emitted.
logging.basicConfig(level=logging.INFO)

# Load spaCy model for NER
# Bound to the module-level name `nlp`, which extract_insights_from_text reads.
# NOTE(review): presumably the "en_core_web_sm" model must already be
# installed in the environment — confirm, no download step is visible here.
nlp = spacy.load("en_core_web_sm")

def load_and_preprocess_data(file_path):
    """Load a tabular file and return a cleaned, scaled DataFrame.

    Processing steps, in order:
      1. Read ``.csv``, ``.xlsx`` or tab-separated ``.txt``.
      2. Convert a column to numeric only when all of its values parse.
      3. Fill missing numeric values with the column mean.
      4. One-hot encode the remaining object columns (first level dropped).
      5. Standardise float64/int64 columns with ``StandardScaler``.

    Args:
        file_path: Path to the input file; the extension selects the reader.

    Returns:
        The preprocessed DataFrame.

    Raises:
        ValueError: If the file extension is not supported.
        Exception: Any loading or processing error is logged and re-raised.
    """
    try:
        logging.info(f"Loading data from {file_path}")
        if file_path.endswith('.csv'):
            data = pd.read_csv(file_path)
        elif file_path.endswith('.xlsx'):
            data = pd.read_excel(file_path)
        elif file_path.endswith('.txt'):
            data = pd.read_csv(file_path, delimiter='\t')
        else:
            raise ValueError("Unsupported file format.")

        logging.info("Data loaded successfully")

        # Convert a column only if every value is numeric.  Coercing
        # unconditionally would replace whole text columns with NaN and leave
        # nothing for the one-hot encoding step below.
        for col in data.columns:
            try:
                data[col] = pd.to_numeric(data[col])
            except (ValueError, TypeError):
                continue

        # numeric_only keeps the mean from failing on the text columns.
        data.fillna(data.mean(numeric_only=True), inplace=True)

        categorical_cols = data.select_dtypes(include=['object']).columns
        data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

        numeric_cols = data.select_dtypes(include=['float64', 'int64']).columns
        if len(numeric_cols) > 0:
            # The scaler rejects an input with zero columns, so skip it then.
            scaler = StandardScaler()
            data[numeric_cols] = scaler.fit_transform(data[numeric_cols])

        logging.info("Data preprocessing completed")
        return data
    except Exception as e:
        logging.error(f"Error loading and preprocessing data: {e}")
        raise

def generate_frames(data, chart_type, output_dir="frames"):
    """Write one Plotly chart image per distinct ``date`` value.

    Args:
        data: DataFrame with a ``date`` column plus the columns the chart
            needs (``category``/``value`` for bar and pie; ``x``/``y``/``value``
            for scatter, with ``category`` used for colour when present).
        chart_type: ``'bar'``, ``'pie'`` or ``'scatter'``.
        output_dir: Directory for the PNG frames; created if missing.

    Returns:
        List of frame image paths, in the order the dates first appear.

    Raises:
        ValueError: If ``chart_type`` is not one of the supported values.
    """
    if chart_type not in ('bar', 'pie', 'scatter'):
        raise ValueError("Invalid chart type")

    os.makedirs(output_dir, exist_ok=True)

    frames = []
    for index, date in enumerate(data['date'].unique()):
        filtered_data = data[data['date'] == date]
        if chart_type == 'bar':
            fig = px.bar(filtered_data, x='category', y='value', title=f"Bar Chart - {date}")
        elif chart_type == 'pie':
            fig = px.pie(filtered_data, values='value', names='category', title=f"Pie Chart - {date}")
        else:
            # The scatter plot is requested for data that has x/y/value/date
            # only, so colour by category just when that column exists.
            color = 'category' if 'category' in filtered_data.columns else None
            fig = px.scatter(filtered_data, x='x', y='y', size='value', color=color, title=f"Scatter Plot - {date}")

        # Date values can contain "/" or ":" which are not safe in file
        # names; the index keeps names unique and ordered after sanitising.
        safe_date = "".join(ch if ch.isalnum() or ch in "-_." else "_" for ch in str(date))
        frame_path = os.path.join(output_dir, f"{chart_type}_{index:05d}_{safe_date}.png")
        fig.write_image(frame_path)
        frames.append(frame_path)

    return frames

def generate_video_from_frames(frames, output_file="animation.mp4", fps=1):
    """Encode a sequence of frames as an H.264 video file.

    Args:
        frames: Frame sequence handed to ``ImageSequenceClip``.
        output_file: Path of the video to write.
        fps: Frames per second of the resulting clip.

    Returns:
        ``output_file``.

    Raises:
        ValueError: If ``frames`` is empty.
    """
    if len(frames) == 0:
        # Fail with a clear message instead of an opaque error from moviepy.
        raise ValueError("No frames were provided to build the video.")

    clip = ImageSequenceClip(frames, fps=fps)
    try:
        clip.write_videofile(output_file, codec="libx264")
    finally:
        # Release the clip's resources even if encoding fails.
        clip.close()
    return output_file

def generate_infographic_video(data, insights, audio_file=None, title_image="title_screen.png"):
    """Assemble the chart animations into ``final_video.mp4``.

    Args:
        data: DataFrame passed to ``generate_videos_if_needed``.
        insights: Not used by this function; kept for interface compatibility.
        audio_file: Optional audio track; attached only if the file exists.
        title_image: Optional still image shown for 5 seconds before the
            charts; skipped if the path does not exist.

    Raises:
        ValueError: If no chart video could be generated from ``data``.
    """
    logging.info("Checking data columns for required visualizations")
    video_files = generate_videos_if_needed(data)

    if not video_files:
        logging.error("No video files generated from data. Check if the data has the required columns.")
        raise ValueError("No video files generated from data.")

    logging.info("Combining video files into a single video")
    opened = []  # every clip that must be closed afterwards
    try:
        video_clips = []
        for video_file in video_files:
            clip = VideoFileClip(video_file)
            opened.append(clip)
            video_clips.append(clip)

        if os.path.exists(title_image):
            # A still image is an ImageClip; VideoFileClip is for video files.
            from moviepy.editor import ImageClip

            title_clip = ImageClip(title_image).set_duration(5)
            opened.append(title_clip)
            video_clips.insert(0, title_clip)

        final_video = concatenate_videoclips(video_clips, method="compose")

        if audio_file and os.path.isfile(audio_file):
            audio = AudioFileClip(audio_file)
            opened.append(audio)
            final_video = final_video.set_audio(audio)

        final_video.write_videofile("final_video.mp4", codec="libx264", fps=24)
        logging.info("Final video saved as final_video.mp4")
    finally:
        for clip in opened:
            clip.close()

def generate_videos_if_needed(data):
    """Produce the animated chart videos supported by the columns of ``data``.

    Each chart family lists the columns it needs; families whose columns are
    missing are skipped.  A chart that raises is logged and the next chart is
    attempted.

    Args:
        data: Table whose ``columns`` are checked for the required names.

    Returns:
        Paths of the videos written (bar, pie, scatter order); may be empty.
    """
    chart_plan = (
        (
            {'category', 'value', 'date'},
            "Data contains required columns for bar and pie charts",
            (
                ('bar', 'bar chart', "animated_bar_chart.mp4"),
                ('pie', 'pie chart', "animated_pie_chart.mp4"),
            ),
        ),
        (
            {'x', 'y', 'value', 'date'},
            "Data contains required columns for scatter plot",
            (
                ('scatter', 'scatter plot', "animated_scatter_plot.mp4"),
            ),
        ),
    )

    video_files = []
    for needed, announcement, charts in chart_plan:
        if not needed.issubset(data.columns):
            continue
        logging.info(announcement)
        for chart_type, label, output_name in charts:
            # Best effort per chart: log the failure and carry on.
            try:
                frame_paths = generate_frames(data, chart_type)
                video_files.append(generate_video_from_frames(frame_paths, output_name))
            except Exception as e:
                logging.error(f"Failed to generate animated {label}: {e}")
    return video_files

def generate_narration(text, output_file="narration.mp3"):
    """Convert ``text`` to English speech via gTTS and write it to disk.

    Args:
        text: Narration text.
        output_file: Path passed to ``gTTS.save``.

    Returns:
        The path given as ``output_file``.
    """
    gTTS(text=text, lang='en').save(output_file)
    return output_file

def perform_eda(data):
    """Summarise the basic structure of ``data``.

    Args:
        data: DataFrame to describe.

    Returns:
        Dict with keys ``shape``, ``columns``, ``dtypes``, ``null_counts``
        (one count per column, in column order) and ``describe`` (the
        ``DataFrame.describe()`` table converted to a nested dict).

    Raises:
        Exception: Any error is logged and re-raised.
    """
    try:
        shape = data.shape
        column_names = data.columns.tolist()
        column_dtypes = data.dtypes.tolist()
        nulls_per_column = data.isnull().sum().tolist()
        statistics = data.describe().to_dict()
    except Exception as e:
        logging.error(f"Error performing EDA: {e}")
        raise
    return dict(
        shape=shape,
        columns=column_names,
        dtypes=column_dtypes,
        null_counts=nulls_per_column,
        describe=statistics,
    )

def analyze_prompt_for_insights(prompt, model_name="facebook/bart-large"):
    """Generate text from ``prompt`` with BART and list the insight types found.

    The tokenizer and model are loaded on every call.  The generated text is
    scanned by ``extract_insights_from_text`` and the keys with truthy values
    are returned.

    Args:
        prompt: User prompt fed to the model.
        model_name: Name passed to ``from_pretrained`` for tokenizer and model.

    Returns:
        List of insight keys, or ``[]`` if anything fails (including the case
        where no insight is found); the failure is logged, not raised.
    """
    try:
        logging.info("Analyzing prompt for insights")
        tokenizer = BartTokenizer.from_pretrained(model_name)
        model = BartForConditionalGeneration.from_pretrained(model_name)

        encoded = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)

        # NOTE(review): temperature/top_p/top_k are sampling settings but
        # do_sample is not set here — confirm they have the intended effect.
        generation_options = dict(
            attention_mask=encoded.attention_mask,
            max_length=200,
            num_return_sequences=1,
            temperature=0.7,
            top_p=0.9,
            top_k=50,
            no_repeat_ngram_size=2,
            pad_token_id=tokenizer.eos_token_id,
        )
        generated_ids = model.generate(encoded.input_ids, **generation_options)

        generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
        found = extract_insights_from_text(generated_text)

        insights_list = [name for name in found if found[name]]
        if not insights_list:
            raise ValueError("No insights could be extracted from the provided prompt.")

        logging.info(f"Insights extracted: {insights_list}")
        return insights_list
    except Exception as e:
        logging.error(f"Error in analyzing prompt for insights: {e}")
        return []

def extract_insights_from_text(text):
    """Flag which insight keywords occur in ``text`` and collect its entities.

    Args:
        text: Text to scan.

    Returns:
        Dict mapping each known insight keyword to ``True``/``False``
        (whole-word, case-insensitive match) plus an ``'entities'`` key
        holding a list of ``(entity_text, entity_label)`` tuples from the
        module-level spaCy pipeline ``nlp``.
    """
    possible_insights = ["trend", "comparison", "distribution", "correlation", "pattern", "anomaly", "outlier", "relationship", "performance", "growth"]
    lowered = text.lower()
    insights = {
        insight: re.search(r'\b' + re.escape(insight) + r'\b', lowered) is not None
        for insight in possible_insights
    }

    # Run NER on the original text, not the lower-cased copy: lower-casing
    # removes the capitalisation cues the entity recogniser relies on.
    doc = nlp(text)
    insights['entities'] = [(ent.text, ent.label_) for ent in doc.ents]

    return insights

def data_storytelling_pipeline(file_path, prompt, audio_file=None):
    """Run load → EDA → prompt analysis → narration → video for one file.

    Args:
        file_path: Path handed to ``load_and_preprocess_data``.
        prompt: User request; analysed for insights and spoken in the narration.
        audio_file: Accepted but never read in this function.
            NOTE(review): the freshly generated narration is always used —
            confirm whether a caller-supplied file should take precedence.

    Raises:
        Exception: Any error raised by a step is logged with a category
            prefix and re-raised unchanged.
    """
    try:
        started = time.time()

        logging.info("Loading and preprocessing data...")
        data = load_and_preprocess_data(file_path)
        logging.debug(f"Loaded Data: {data.head()}")

        logging.info("Performing EDA...")
        eda_summary = perform_eda(data)
        logging.debug(f"EDA Summary: {eda_summary}")

        logging.info("Analyzing the user's prompt...")
        insights = analyze_prompt_for_insights(prompt)
        logging.debug(f"Extracted insights: {insights}")

        logging.info("Generating narration...")
        narration_file = generate_narration(
            f"Here is the analysis based on the prompt: {prompt}. {insights}"
        )

        logging.info("Creating the infographic video...")
        generate_infographic_video(data, insights, audio_file=narration_file)

        elapsed = time.time() - started
        logging.info(f"Pipeline completed successfully in {elapsed:.2f} seconds")

    except Exception as error:
        # Order matters: the first matching type supplies the log prefix.
        prefixes = (
            (FileNotFoundError, "File not found"),
            (pd.errors.ParserError, "Error parsing the file"),
            (TypeError, "Type error"),
            (ValueError, "Value error"),
        )
        prefix = next(
            (text for exc_type, text in prefixes if isinstance(error, exc_type)),
            "An unexpected error occurred",
        )
        logging.error(f"{prefix}: {error}")
        raise


# Example usage:
# Inputs are hard-coded Kaggle paths; adjust them when running elsewhere.
file_path = '/kaggle/input/model-dataset-new-1/2015.csv'
prompt = "Analyze the country and region data"
audio_file = "/kaggle/working/narration.mp3"
title_image = "/kaggle/working/title_screen.png"

# Fail fast, before any processing, if one of the input files is missing.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"Data file not found: {file_path}")
if not os.path.exists(audio_file):
    raise FileNotFoundError(f"Audio file not found: {audio_file}")
if not os.path.exists(title_image):
    raise FileNotFoundError(f"Title image not found: {title_image}")

data = pd.read_csv(file_path)

# Print the first few rows and columns in the data for debugging purposes
print("First few rows of the data:\n", data.head())
print("Columns in the data:", data.columns)

# One complete column set is enough (bar/pie OR scatter), so only fail when
# every set has something missing.  For each set, record exactly which
# columns are absent so the error message is actionable.
required_columns = [{'category', 'value', 'date'}, {'x', 'y', 'value', 'date'}]
missing_columns = [columns - set(data.columns) for columns in required_columns]

if all(missing_columns):
    raise ValueError(f"Data does not contain the required columns for generating visualizations. Missing columns: {missing_columns}")

data_storytelling_pipeline(file_path, prompt, audio_file=audio_file)

In [None]:
import plotly.express as px
import pandas as pd
import logging
import os
import time
from moviepy.editor import ImageSequenceClip, concatenate_videoclips, AudioFileClip, VideoFileClip
from gtts import gTTS
from transformers import BartTokenizer, BartForConditionalGeneration
import spacy
import re
from sklearn.preprocessing import StandardScaler

# Setting up logging
# INFO level: the logging.debug(...) calls in this cell are not emitted.
logging.basicConfig(level=logging.INFO)

# Load spaCy model for NER
# Bound to the module-level name `nlp`, which extract_insights_from_text reads.
# NOTE(review): presumably the "en_core_web_sm" model must already be
# installed in the environment — confirm, no download step is visible here.
nlp = spacy.load("en_core_web_sm")

def load_and_preprocess_data(file_path):
    """Load a tabular file and return a cleaned, scaled DataFrame.

    Processing steps, in order:
      1. Read ``.csv``, ``.xlsx`` or tab-separated ``.txt``.
      2. Convert a column to numeric only when all of its values parse.
      3. Fill missing numeric values with the column mean.
      4. One-hot encode the remaining object columns (first level dropped).
      5. Standardise float64/int64 columns with ``StandardScaler``.

    Args:
        file_path: Path to the input file; the extension selects the reader.

    Returns:
        The preprocessed DataFrame.

    Raises:
        ValueError: If the file extension is not supported.
        Exception: Any loading or processing error is logged and re-raised.
    """
    try:
        logging.info(f"Loading data from {file_path}")
        if file_path.endswith('.csv'):
            data = pd.read_csv(file_path)
        elif file_path.endswith('.xlsx'):
            data = pd.read_excel(file_path)
        elif file_path.endswith('.txt'):
            data = pd.read_csv(file_path, delimiter='\t')
        else:
            raise ValueError("Unsupported file format.")

        logging.info("Data loaded successfully")

        # Convert a column only if every value is numeric.  Coercing
        # unconditionally would replace whole text columns with NaN and leave
        # nothing for the one-hot encoding step below.
        for col in data.columns:
            try:
                data[col] = pd.to_numeric(data[col])
            except (ValueError, TypeError):
                continue

        # numeric_only keeps the mean from failing on the text columns.
        data.fillna(data.mean(numeric_only=True), inplace=True)

        categorical_cols = data.select_dtypes(include=['object']).columns
        data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

        numeric_cols = data.select_dtypes(include=['float64', 'int64']).columns
        if len(numeric_cols) > 0:
            # The scaler rejects an input with zero columns, so skip it then.
            scaler = StandardScaler()
            data[numeric_cols] = scaler.fit_transform(data[numeric_cols])

        logging.info("Data preprocessing completed")
        return data
    except Exception as e:
        logging.error(f"Error loading and preprocessing data: {e}")
        raise

def infer_columns(data):
    """Guess which columns of ``data`` play the chart roles.

    For each role (``category``, ``value``, ``date``, ``x``, ``y``) the name
    patterns are tried in order and the first column matching the first
    successful pattern is chosen.  Multi-character patterns match anywhere in
    the column name (case-insensitive); single-letter patterns must equal the
    whole name, otherwise ``y`` would match e.g. "Country".

    Fallbacks when a role is still unmapped:
      * ``date``: a synthetic daily ``date`` column is ADDED to ``data``
        in place (one date per row, starting 2020-01-01).
      * ``value``: the first float64/int64 column.
      * ``category``: the first object column, else the first non-numeric one.

    Args:
        data: DataFrame to inspect; may be mutated (see ``date`` fallback).

    Returns:
        Dict mapping role name to column label.  ``x`` and ``y`` are present
        only when a matching column was found.

    Raises:
        ValueError: If no ``value`` or ``category`` column can be inferred.
    """
    column_mapping = {}
    possible_columns = {
        'category': ['category', 'type', 'class', 'label', 'country', 'region'],
        'value': ['value', 'amount', 'score', 'total', 'family', 'generosity'],
        'date': ['date', 'time', 'year', 'month', 'day'],
        'x': ['x', 'longitude', 'lat', 'latitude'],
        'y': ['y', 'latitude', 'long', 'longitude']
    }

    logging.info(f"Data columns: {data.columns.tolist()}")
    for key, patterns in possible_columns.items():
        for pattern in patterns:
            # A one-letter pattern as a substring matches almost any name,
            # so require the whole column name to equal it.
            matcher = re.fullmatch if len(pattern) == 1 else re.search
            # str(col) keeps non-string labels (e.g. integers) from raising.
            potential_cols = [col for col in data.columns if matcher(pattern, str(col), re.IGNORECASE)]
            if potential_cols:
                column_mapping[key] = potential_cols[0]
                break

    # If any required column is missing, attempt to infer it
    if 'date' not in column_mapping:
        data['date'] = pd.date_range(start='1/1/2020', periods=len(data), freq='D')
        column_mapping['date'] = 'date'

    if 'value' not in column_mapping:
        numeric_cols = data.select_dtypes(include=['float64', 'int64']).columns
        if len(numeric_cols) == 0:
            raise ValueError("Cannot infer 'value' column.")
        column_mapping['value'] = numeric_cols[0]

    if 'category' not in column_mapping:
        candidates = data.select_dtypes(include=['object']).columns
        if len(candidates) == 0:
            candidates = data.select_dtypes(exclude=['float64', 'int64']).columns
        if len(candidates) == 0:
            raise ValueError("Cannot infer 'category' column.")
        column_mapping['category'] = candidates[0]

    return column_mapping

def generate_frames(data, chart_type, column_mapping, output_dir="frames"):
    """Write one Plotly chart image per distinct value of the date column.

    Args:
        data: DataFrame holding the columns named in ``column_mapping``.
        chart_type: ``'bar'``, ``'pie'`` or ``'scatter'``.
        column_mapping: Dict from role (``date``, ``category``, ``value``,
            and for scatter ``x``/``y``) to the column label in ``data``.
        output_dir: Directory for the PNG frames; created if missing.

    Returns:
        List of frame image paths, in the order the dates first appear.

    Raises:
        ValueError: If ``chart_type`` is not one of the supported values.
    """
    if chart_type not in ('bar', 'pie', 'scatter'):
        raise ValueError("Invalid chart type")

    os.makedirs(output_dir, exist_ok=True)

    date_column = column_mapping['date']
    frames = []
    for index, date in enumerate(data[date_column].unique()):
        filtered_data = data[data[date_column] == date]
        if chart_type == 'bar':
            fig = px.bar(filtered_data, x=column_mapping['category'], y=column_mapping['value'], title=f"Bar Chart - {date}")
        elif chart_type == 'pie':
            fig = px.pie(filtered_data, values=column_mapping['value'], names=column_mapping['category'], title=f"Pie Chart - {date}")
        else:
            # Colour is optional for a scatter plot; use it only when a
            # category column has been mapped.
            fig = px.scatter(filtered_data, x=column_mapping['x'], y=column_mapping['y'], size=column_mapping['value'], color=column_mapping.get('category'), title=f"Scatter Plot - {date}")

        # Date values can contain "/" or ":" which are not safe in file
        # names; the index keeps names unique and ordered after sanitising.
        safe_date = "".join(ch if ch.isalnum() or ch in "-_." else "_" for ch in str(date))
        frame_path = os.path.join(output_dir, f"{chart_type}_{index:05d}_{safe_date}.png")
        fig.write_image(frame_path)
        frames.append(frame_path)

    return frames

def generate_video_from_frames(frames, output_file="animation.mp4", fps=1):
    """Encode a sequence of frames as an H.264 video file.

    Args:
        frames: Frame sequence handed to ``ImageSequenceClip``.
        output_file: Path of the video to write.
        fps: Frames per second of the resulting clip.

    Returns:
        ``output_file``.

    Raises:
        ValueError: If ``frames`` is empty.
    """
    if len(frames) == 0:
        # Fail with a clear message instead of an opaque error from moviepy.
        raise ValueError("No frames were provided to build the video.")

    clip = ImageSequenceClip(frames, fps=fps)
    try:
        clip.write_videofile(output_file, codec="libx264")
    finally:
        # Release the clip's resources even if encoding fails.
        clip.close()
    return output_file

def generate_infographic_video(data, insights, audio_file=None, title_image="title_screen.png"):
    """Assemble the chart animations into ``final_video.mp4``.

    Args:
        data: DataFrame passed to ``generate_videos_if_needed``.
        insights: Not used by this function; kept for interface compatibility.
        audio_file: Optional audio track; attached only if the file exists.
        title_image: Optional still image shown for 5 seconds before the
            charts; skipped if the path does not exist.

    Raises:
        ValueError: If no chart video could be generated from ``data``.
    """
    logging.info("Checking data columns for required visualizations")
    video_files = generate_videos_if_needed(data)

    if not video_files:
        logging.error("No video files generated from data. Check if the data has the required columns.")
        # The column mapping is local to generate_videos_if_needed, so report
        # the columns that are actually available here instead.
        logging.error(f"Available columns: {data.columns.tolist()}")
        raise ValueError("No video files generated from data.")

    logging.info("Combining video files into a single video")
    opened = []  # every clip that must be closed afterwards
    try:
        video_clips = []
        for video_file in video_files:
            clip = VideoFileClip(video_file)
            opened.append(clip)
            video_clips.append(clip)

        if os.path.exists(title_image):
            # A still image is an ImageClip; VideoFileClip is for video files.
            from moviepy.editor import ImageClip

            title_clip = ImageClip(title_image).set_duration(5)
            opened.append(title_clip)
            video_clips.insert(0, title_clip)

        final_video = concatenate_videoclips(video_clips, method="compose")

        if audio_file and os.path.isfile(audio_file):
            audio = AudioFileClip(audio_file)
            opened.append(audio)
            final_video = final_video.set_audio(audio)

        final_video.write_videofile("final_video.mp4", codec="libx264", fps=24)
        logging.info("Final video saved as final_video.mp4")
    finally:
        for clip in opened:
            clip.close()

def generate_videos_if_needed(data):
    """Infer the chart columns of ``data`` and render the supported videos.

    Bar and pie charts are always attempted once ``category``, ``value`` and
    ``date`` have been mapped; the scatter plot is attempted only when ``x``
    and ``y`` were mapped as well.  A chart that fails is logged and skipped.

    Args:
        data: DataFrame passed to ``infer_columns`` and ``generate_frames``.

    Returns:
        Paths of the videos written (bar, pie, scatter order); may be empty.

    Raises:
        ValueError: If ``category``, ``value`` or ``date`` is not mapped.
    """
    column_mapping = infer_columns(data)
    mapped_roles = column_mapping.keys()

    required_columns = {'category', 'value', 'date'}
    missing = required_columns - mapped_roles
    if missing:
        logging.error(f"Missing required columns: {missing}")
        raise ValueError(f"Missing required columns: {missing}")

    video_files = []

    def render(chart_type, label, output_name):
        # Best effort: one broken chart must not abort the others.
        try:
            frame_paths = generate_frames(data, chart_type, column_mapping)
            video_files.append(generate_video_from_frames(frame_paths, output_name))
        except Exception as e:
            logging.error(f"Failed to generate animated {label}: {e}")

    logging.info("Data contains required columns for bar and pie charts")
    render('bar', 'bar chart', "animated_bar_chart.mp4")
    render('pie', 'pie chart', "animated_pie_chart.mp4")

    if {'x', 'y', 'value', 'date'}.issubset(mapped_roles):
        logging.info("Data contains required columns for scatter plot")
        render('scatter', 'scatter plot', "animated_scatter_plot.mp4")

    return video_files

def generate_narration(text, output_file="narration.mp3"):
    """Speak ``text`` in English with gTTS and store the audio.

    Args:
        text: Text to narrate.
        output_file: Target path given to ``gTTS.save``.

    Returns:
        ``output_file`` as passed in.
    """
    gTTS(text=text, lang='en').save(output_file)
    return output_file

def perform_eda(data):
    """Collect a structural summary of ``data``.

    Args:
        data: DataFrame to summarise.

    Returns:
        Dict with keys ``shape``, ``columns``, ``dtypes``, ``null_counts``
        (one count per column, in column order) and ``describe`` (the
        ``DataFrame.describe()`` table as a nested dict).

    Raises:
        Exception: Any error is logged and re-raised.
    """
    try:
        shape = data.shape
        column_names = data.columns.tolist()
        column_dtypes = data.dtypes.tolist()
        nulls_per_column = data.isnull().sum().tolist()
        statistics = data.describe().to_dict()
    except Exception as e:
        logging.error(f"Error performing EDA: {e}")
        raise
    return dict(
        shape=shape,
        columns=column_names,
        dtypes=column_dtypes,
        null_counts=nulls_per_column,
        describe=statistics,
    )

def analyze_prompt_for_insights(prompt, model_name="facebook/bart-large"):
    """Generate text from ``prompt`` with BART and list the insight types found.

    The tokenizer and model are loaded on every call.  The generated text is
    scanned by ``extract_insights_from_text`` and the keys with truthy values
    are returned.

    Args:
        prompt: User prompt fed to the model.
        model_name: Name passed to ``from_pretrained`` for tokenizer and model.

    Returns:
        List of insight keys, or ``[]`` if anything fails (including the case
        where no insight is found); the failure is logged, not raised.
    """
    try:
        logging.info("Analyzing prompt for insights")
        tokenizer = BartTokenizer.from_pretrained(model_name)
        model = BartForConditionalGeneration.from_pretrained(model_name)

        encoded = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)

        # NOTE(review): temperature/top_p/top_k are sampling settings but
        # do_sample is not set here — confirm they have the intended effect.
        generation_options = dict(
            attention_mask=encoded.attention_mask,
            max_length=200,
            num_return_sequences=1,
            temperature=0.7,
            top_p=0.9,
            top_k=50,
            no_repeat_ngram_size=2,
            pad_token_id=tokenizer.eos_token_id,
        )
        generated_ids = model.generate(encoded.input_ids, **generation_options)

        generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
        found = extract_insights_from_text(generated_text)

        insights_list = [name for name in found if found[name]]
        if not insights_list:
            raise ValueError("No insights could be extracted from the provided prompt.")

        logging.info(f"Insights extracted: {insights_list}")
        return insights_list
    except Exception as e:
        logging.error(f"Error in analyzing prompt for insights: {e}")
        return []

def extract_insights_from_text(text):
    """Flag which insight keywords occur in ``text`` and collect its entities.

    Args:
        text: Text to scan.

    Returns:
        Dict mapping each known insight keyword to ``True``/``False``
        (whole-word, case-insensitive match) plus an ``'entities'`` key
        holding a list of ``(entity_text, entity_label)`` tuples from the
        module-level spaCy pipeline ``nlp``.
    """
    possible_insights = ["trend", "comparison", "distribution", "correlation", "pattern", "anomaly", "outlier", "relationship", "performance", "growth"]
    lowered = text.lower()
    insights = {
        insight: re.search(r'\b' + re.escape(insight) + r'\b', lowered) is not None
        for insight in possible_insights
    }

    # Run NER on the original text, not the lower-cased copy: lower-casing
    # removes the capitalisation cues the entity recogniser relies on.
    doc = nlp(text)
    insights['entities'] = [(ent.text, ent.label_) for ent in doc.ents]

    return insights

def data_storytelling_pipeline(file_path, prompt, audio_file=None):
    """Run load → EDA → prompt analysis → narration → video for one file.

    Args:
        file_path: Path handed to ``load_and_preprocess_data``.
        prompt: User request; analysed for insights and spoken in the narration.
        audio_file: Accepted but never read in this function.
            NOTE(review): the freshly generated narration is always used —
            confirm whether a caller-supplied file should take precedence.

    Raises:
        Exception: Any error raised by a step is logged with a category
            prefix and re-raised unchanged.
    """
    try:
        started = time.time()

        logging.info("Loading and preprocessing data...")
        data = load_and_preprocess_data(file_path)
        logging.debug(f"Loaded Data: {data.head()}")

        logging.info("Performing EDA...")
        eda_summary = perform_eda(data)
        logging.debug(f"EDA Summary: {eda_summary}")

        logging.info("Analyzing the user's prompt...")
        insights = analyze_prompt_for_insights(prompt)
        logging.debug(f"Extracted insights: {insights}")

        logging.info("Generating narration...")
        narration_file = generate_narration(
            f"Here is the analysis based on the prompt: {prompt}. {insights}"
        )

        logging.info("Creating the infographic video...")
        generate_infographic_video(data, insights, audio_file=narration_file)

        elapsed = time.time() - started
        logging.info(f"Pipeline completed successfully in {elapsed:.2f} seconds")

    except Exception as error:
        # Order matters: the first matching type supplies the log prefix.
        prefixes = (
            (FileNotFoundError, "File not found"),
            (pd.errors.ParserError, "Error parsing the file"),
            (TypeError, "Type error"),
            (ValueError, "Value error"),
        )
        prefix = next(
            (text for exc_type, text in prefixes if isinstance(error, exc_type)),
            "An unexpected error occurred",
        )
        logging.error(f"{prefix}: {error}")
        raise

# Example usage:
# Inputs are hard-coded Kaggle paths; adjust them when running elsewhere.
file_path = '/kaggle/input/model-dataset-new-1/2017.csv'
prompt = "compare the family and generosity in an interactive video format"
audio_file = "/kaggle/working/narration.mp3"
title_image = "/kaggle/working/title_screen.png"

# Fail fast, before any processing, if one of the input files is missing.
# NOTE(review): title_image is validated here but is not passed to the
# pipeline call at the end of this cell.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"Data file not found: {file_path}")
if not os.path.exists(audio_file):
    raise FileNotFoundError(f"Audio file not found: {audio_file}")
if not os.path.exists(title_image):
    raise FileNotFoundError(f"Title image not found: {title_image}")

# Loaded only for the debug printout below; the pipeline is given file_path
# and loads the file itself.
data = pd.read_csv(file_path)

# Print the first few rows and columns in the data for debugging purposes
print("First few rows of the data:\n", data.head())
print("Columns in the data:", data.columns)

data_storytelling_pipeline(file_path, prompt, audio_file=audio_file)


In [None]:
import plotly.express as px
import pandas as pd
import logging
import os
import time
from moviepy.editor import ImageSequenceClip, concatenate_videoclips, AudioFileClip, VideoFileClip
from gtts import gTTS
from transformers import BartTokenizer, BartForConditionalGeneration
import spacy
import re
from sklearn.preprocessing import StandardScaler

# Ensure all required packages are installed
# NOTE(review): these installs come after the import statements at the top of
# this cell, so on a fresh environment the imports would fail before the
# installs run — consider moving the installs into an earlier cell.
%pip install plotly pandas moviepy gtts transformers spacy scikit-learn
%pip install torch  # Required for transformers
%pip install openpyxl  # Required for reading .xlsx files with pandas
%pip install kaleido  # Required for saving plotly figures as images

# Setting up logging
# INFO level: logging.debug(...) calls are not emitted.
logging.basicConfig(level=logging.INFO)

# Load spaCy model for NER
# Bound to the module-level name `nlp`.
# NOTE(review): presumably the "en_core_web_sm" model must already be
# installed — none of the %pip lines above installs it; confirm.
nlp = spacy.load("en_core_web_sm")

def load_and_preprocess_data(file_path):
    """Load a tabular file and return a cleaned, scaled DataFrame.

    Progress is reported both through ``logging`` and ``print``.

    Processing steps, in order:
      1. Read ``.csv``, ``.xlsx`` or tab-separated ``.txt``.
      2. Convert a column to numeric only when all of its values parse.
      3. Fill missing numeric values with the column mean.
      4. One-hot encode the remaining object columns (first level dropped).
      5. Standardise float64/int64 columns with ``StandardScaler``.

    Args:
        file_path: Path to the input file; the extension selects the reader.

    Returns:
        The preprocessed DataFrame.

    Raises:
        ValueError: If the file extension is not supported.
        Exception: Any loading or processing error is logged and re-raised.
    """
    try:
        logging.info(f"Loading data from {file_path}")
        print(f"Loading data from {file_path}")
        if file_path.endswith('.csv'):
            data = pd.read_csv(file_path)
        elif file_path.endswith('.xlsx'):
            data = pd.read_excel(file_path)
        elif file_path.endswith('.txt'):
            data = pd.read_csv(file_path, delimiter='\t')
        else:
            raise ValueError("Unsupported file format.")

        logging.info("Data loaded successfully")
        print("Data loaded successfully")

        # Convert a column only if every value is numeric.  Coercing
        # unconditionally would replace whole text columns with NaN and leave
        # nothing for the one-hot encoding step below.
        for col in data.columns:
            try:
                data[col] = pd.to_numeric(data[col])
            except (ValueError, TypeError):
                continue

        # numeric_only keeps the mean from failing on the text columns.
        data.fillna(data.mean(numeric_only=True), inplace=True)

        categorical_cols = data.select_dtypes(include=['object']).columns
        data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

        numeric_cols = data.select_dtypes(include=['float64', 'int64']).columns
        if len(numeric_cols) > 0:
            # The scaler rejects an input with zero columns, so skip it then.
            scaler = StandardScaler()
            data[numeric_cols] = scaler.fit_transform(data[numeric_cols])

        logging.info("Data preprocessing completed")
        print("Data preprocessing completed")
        return data
    except Exception as e:
        logging.error(f"Error loading and preprocessing data: {e}")
        print(f"Error loading and preprocessing data: {e}")
        raise

def infer_columns(data):
    """Guess which DataFrame columns play the category/value/date/x/y roles.

    Each role has an ordered list of name patterns; the first pattern that
    matches any column (case-insensitive regex search on the column label)
    wins, and the first matching column is used. Roles that match nothing are
    then filled in with fallbacks:

      * ``date``: a synthetic daily ``'date'`` column starting 2020-01-01 is
        ADDED TO ``data`` IN PLACE.
      * ``value``: the first ``float64``/``int64`` column.
      * ``category``: the first ``object`` column, else the first column that
        is not ``float64``/``int64``.

    ``x`` and ``y`` have no fallback and may be absent from the result.

    Args:
        data: The DataFrame to inspect (may be mutated, see above).

    Returns:
        dict mapping role name to column label.

    Raises:
        ValueError: If no ``value`` or ``category`` column can be inferred.
    """
    try:
        column_mapping = {}
        possible_columns = {
            'category': ['category', 'type', 'class', 'label', 'country', 'region'],
            'value': ['value', 'amount', 'score', 'total', 'family', 'generosity'],
            'date': ['date', 'time', 'year', 'month', 'day'],
            'x': ['x', 'longitude', 'lat', 'latitude'],
            'y': ['y', 'latitude', 'long', 'longitude']
        }

        logging.info(f"Data columns: {data.columns.tolist()}")
        print(f"Data columns: {data.columns.tolist()}")
        # NOTE(review): patterns are substring searches, so the single-letter
        # 'x' / 'y' patterns match any label containing that letter.
        for key, patterns in possible_columns.items():
            for pattern in patterns:
                # str(col): column labels are not guaranteed to be strings
                # (e.g. integer headers); re.search raises TypeError on those.
                potential_cols = [
                    col for col in data.columns
                    if re.search(pattern, str(col), re.IGNORECASE)
                ]
                if potential_cols:
                    column_mapping[key] = potential_cols[0]
                    break

        # If any required column is missing, attempt to infer it
        if 'date' not in column_mapping:
            data['date'] = pd.date_range(start='1/1/2020', periods=len(data), freq='D')
            column_mapping['date'] = 'date'

        if 'value' not in column_mapping:
            numeric_cols = data.select_dtypes(include=['float64', 'int64']).columns
            if len(numeric_cols) > 0:
                column_mapping['value'] = numeric_cols[0]
            else:
                raise ValueError("Cannot infer 'value' column.")

        if 'category' not in column_mapping:
            categorical_cols = data.select_dtypes(include=['object']).columns
            if len(categorical_cols) > 0:
                column_mapping['category'] = categorical_cols[0]
            else:
                non_numeric_cols = data.select_dtypes(exclude=['float64', 'int64']).columns
                if len(non_numeric_cols) > 0:
                    column_mapping['category'] = non_numeric_cols[0]
                else:
                    raise ValueError("Cannot infer 'category' column.")

        return column_mapping
    except Exception as e:
        logging.error(f"Error inferring columns: {e}")
        print(f"Error inferring columns: {e}")
        raise

def generate_frames(data, chart_type, column_mapping, output_dir="frames"):
    """Render one chart image per distinct date value and return the file paths.

    Args:
        data: DataFrame holding the columns named in ``column_mapping``.
        chart_type: ``'bar'``, ``'pie'`` or ``'scatter'``.
        column_mapping: dict with the ``'date'``, ``'category'`` and ``'value'``
            column names (plus ``'x'`` and ``'y'`` for scatter plots).
        output_dir: Directory the PNG frames are written to; created if absent.

    Returns:
        List of frame image paths, in the order the date values first appear.

    Raises:
        ValueError: If ``chart_type`` is not one of the supported kinds. This
            is checked up front, so it is raised even when ``data`` is empty.
        Exception: Rendering/writing errors are logged, printed and re-raised.
    """
    # Local import keeps this block self-contained; used to sanitize file names.
    import re

    try:
        # Fail fast: previously an invalid type was only noticed inside the
        # loop, so empty data let it slip through silently.
        if chart_type not in ('bar', 'pie', 'scatter'):
            raise ValueError("Invalid chart type")

        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(output_dir, exist_ok=True)

        date_column = column_mapping['date']
        frames = []
        for date in data[date_column].unique():
            filtered_data = data[data[date_column] == date]
            if chart_type == 'bar':
                fig = px.bar(filtered_data, x=column_mapping['category'], y=column_mapping['value'], title=f"Bar Chart - {date}")
            elif chart_type == 'pie':
                fig = px.pie(filtered_data, values=column_mapping['value'], names=column_mapping['category'], title=f"Pie Chart - {date}")
            else:  # 'scatter' (validated above)
                fig = px.scatter(filtered_data, x=column_mapping['x'], y=column_mapping['y'], size=column_mapping['value'], color=column_mapping['category'], title=f"Scatter Plot - {date}")

            # Date values such as timestamps contain ':' , '/' or spaces, which
            # are invalid or path-splitting in file names; replace them.
            safe_label = re.sub(r'[^\w.\-]+', '_', str(date))
            frame_path = os.path.join(output_dir, f"{chart_type}_{safe_label}.png")
            fig.write_image(frame_path)
            frames.append(frame_path)

        return frames
    except Exception as e:
        logging.error(f"Error generating frames: {e}")
        print(f"Error generating frames: {e}")
        raise

def generate_video_from_frames(frames, output_file="animation.mp4", fps=1):
    """Encode a sequence of frame images into an H.264 video file.

    Args:
        frames: Sequence of frames (image paths as produced by the frame
            generator) passed to ``ImageSequenceClip``.
        output_file: Path of the video to write.
        fps: Frames per second of the resulting clip.

    Returns:
        ``output_file``, once the video has been written.

    Raises:
        ValueError: If ``frames`` is empty.
        Exception: Encoding errors are logged, printed and re-raised.
    """
    try:
        # An empty sequence cannot be encoded; say so explicitly instead of
        # surfacing an opaque error from inside the video library.
        if len(frames) == 0:
            raise ValueError("No frames provided to build the video.")

        clip = ImageSequenceClip(frames, fps=fps)
        try:
            clip.write_videofile(output_file, codec="libx264")
        finally:
            # Release any resources held by the clip even if encoding fails.
            clip.close()
        return output_file
    except Exception as e:
        logging.error(f"Error generating video from frames: {e}")
        print(f"Error generating video from frames: {e}")
        raise

def generate_infographic_video(data, insights, audio_file=None, title_image="title_screen.png"):
    """Build ``final_video.mp4`` from the per-chart animations.

    The chart videos produced by ``generate_videos_if_needed`` are
    concatenated; a 5-second title card is prepended when ``title_image``
    exists, and ``audio_file`` is attached as the soundtrack when it exists.

    Args:
        data: DataFrame to visualize.
        insights: Accepted for interface compatibility; not used here.
        audio_file: Optional path of an audio track for the final video.
        title_image: Optional path of a still image used as the title card.

    Raises:
        ValueError: If no chart video could be generated.
        Exception: Any other failure is logged, printed and re-raised.
    """
    opened_clips = []  # file-backed clips that must be closed afterwards
    try:
        logging.info("Checking data columns for required visualizations")
        print("Checking data columns for required visualizations")
        column_mapping = infer_columns(data)
        logging.debug(f"Column mapping: {column_mapping}")
        print(f"Column mapping: {column_mapping}")
        video_files = generate_videos_if_needed(data)

        if not video_files:
            logging.error("No video files generated from data. Check if the data has the required columns.")
            print("No video files generated from data. Check if the data has the required columns.")
            raise ValueError("No video files generated from data.")

        logging.info("Combining video files into a single video")
        print("Combining video files into a single video")
        video_clips = []
        for video_file in video_files:
            chart_clip = VideoFileClip(video_file)
            opened_clips.append(chart_clip)
            video_clips.append(chart_clip)

        # `title_image and ...` guards against None, which os.path.exists rejects.
        if title_image and os.path.exists(title_image):
            # A still image must be wrapped in ImageClip; VideoFileClip is a
            # video reader and cannot hold a single picture for 5 seconds.
            from moviepy.editor import ImageClip
            title_clip = ImageClip(title_image).set_duration(5)
            video_clips.insert(0, title_clip)

        final_video = concatenate_videoclips(video_clips, method="compose")

        if audio_file and os.path.isfile(audio_file):
            audio = AudioFileClip(audio_file)
            opened_clips.append(audio)
            final_video = final_video.set_audio(audio)

        final_video.write_videofile("final_video.mp4", codec="libx264", fps=24)
        logging.info("Final video saved as final_video.mp4")
        print("Final video saved as final_video.mp4")
    except Exception as e:
        logging.error(f"Error generating infographic video: {e}")
        print(f"Error generating infographic video: {e}")
        raise
    finally:
        # Release the underlying file readers whether or not the export succeeded.
        for clip in opened_clips:
            clip.close()

def generate_videos_if_needed(data):
    """Create every chart animation the data supports and return their paths.

    Bar and pie animations need the ``category``, ``value`` and ``date``
    roles; a scatter animation additionally needs ``x`` and ``y``. A failure
    while building one chart is logged and printed, and the remaining charts
    are still attempted.

    Args:
        data: DataFrame to visualize.

    Returns:
        List of the video file paths that were produced (possibly empty).

    Raises:
        ValueError: If any of ``category``, ``value`` or ``date`` is unmapped.
        Exception: Other errors are logged, printed and re-raised.
    """
    try:
        produced = []
        mapping = infer_columns(data)
        available = mapping.keys()

        base_roles = {'category', 'value', 'date'}
        if not base_roles.issubset(available):
            problem = f"Missing required columns: {base_roles - available}"
            logging.error(problem)
            print(problem)
            raise ValueError(problem)

        # Each plan: (announcement, roles it needs, [(chart type, label, file)]).
        plans = (
            (
                "Data contains required columns for bar and pie charts",
                base_roles,
                (
                    ('bar', 'bar chart', "animated_bar_chart.mp4"),
                    ('pie', 'pie chart', "animated_pie_chart.mp4"),
                ),
            ),
            (
                "Data contains required columns for scatter plot",
                {'x', 'y', 'value', 'date'},
                (
                    ('scatter', 'scatter plot', "animated_scatter_plot.mp4"),
                ),
            ),
        )

        for announcement, needed_roles, charts in plans:
            if not needed_roles.issubset(available):
                continue
            logging.info(announcement)
            print(announcement)
            for chart_type, label, target_file in charts:
                # Best effort: one broken chart must not stop the others.
                try:
                    chart_frames = generate_frames(data, chart_type, mapping)
                    produced.append(generate_video_from_frames(chart_frames, target_file))
                except Exception as e:
                    failure = f"Failed to generate animated {label}: {e}"
                    logging.error(failure)
                    print(failure)
        return produced
    except Exception as e:
        logging.error(f"Error generating videos if needed: {e}")
        print(f"Error generating videos if needed: {e}")
        raise

def generate_narration(text, output_file="narration.mp3"):
    """Synthesize English speech for ``text`` with gTTS and save it.

    Args:
        text: The text to speak.
        output_file: Destination path of the audio file.

    Returns:
        ``output_file`` after the audio has been saved.

    Raises:
        Exception: Any synthesis/saving error is logged, printed and re-raised.
    """
    try:
        gTTS(text=text, lang='en').save(output_file)
    except Exception as e:
        failure = f"Error generating narration: {e}"
        logging.error(failure)
        print(failure)
        raise
    return output_file

def perform_eda(data):
    """Collect a small exploratory summary of a DataFrame.

    Args:
        data: The DataFrame to summarize.

    Returns:
        dict with the keys ``shape``, ``columns``, ``dtypes``,
        ``null_counts`` (per-column missing-value counts) and ``describe``
        (``DataFrame.describe()`` as a nested dict).

    Raises:
        Exception: Any error is logged, printed and re-raised.
    """
    try:
        missing_per_column = data.isnull().sum()
        statistics = data.describe()

        summary = {}
        summary["shape"] = data.shape
        summary["columns"] = data.columns.tolist()
        summary["dtypes"] = data.dtypes.tolist()
        summary["null_counts"] = missing_per_column.tolist()
        summary["describe"] = statistics.to_dict()
        return summary
    except Exception as e:
        failure = f"Error performing EDA: {e}"
        logging.error(failure)
        print(failure)
        raise

def analyze_prompt_for_insights(prompt, model_name="facebook/bart-large"):
    """Run the prompt through a BART model and list the insight flags found.

    The generated text is scanned by ``extract_insights_from_text``; every key
    of its result whose value is truthy is returned.

    Args:
        prompt: The user's free-text request.
        model_name: Name of the pretrained BART checkpoint to load.

    Returns:
        List of insight keys detected in the generated text. This function is
        best-effort: on ANY failure (including "nothing found") the error is
        logged and printed, and an empty list is returned instead of raising.
    """
    try:
        logging.info("Analyzing prompt for insights")
        print("Analyzing prompt for insights")
        bart_tokenizer = BartTokenizer.from_pretrained(model_name)
        bart_model = BartForConditionalGeneration.from_pretrained(model_name)

        encoded = bart_tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)

        # NOTE(review): temperature/top_p/top_k are sampling controls, but
        # do_sample is not set here -- confirm whether sampling was intended.
        generation_options = {
            "attention_mask": encoded.attention_mask,
            "max_length": 200,
            "num_return_sequences": 1,
            "temperature": 0.7,
            "top_p": 0.9,
            "top_k": 50,
            "no_repeat_ngram_size": 2,
            "pad_token_id": bart_tokenizer.eos_token_id,
        }
        generated_ids = bart_model.generate(encoded.input_ids, **generation_options)

        decoded_texts = bart_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        detected = extract_insights_from_text(decoded_texts[0])

        insights_list = [name for name, present in detected.items() if present]
        if len(insights_list) == 0:
            raise ValueError("No insights could be extracted from the provided prompt.")

        summary_line = f"Insights extracted: {insights_list}"
        logging.info(summary_line)
        print(summary_line)
        return insights_list
    except Exception as e:
        failure = f"Error in analyzing prompt for insights: {e}"
        logging.error(failure)
        print(failure)
        return []

def extract_insights_from_text(text):
    """Flag which analysis keywords occur in ``text`` and list its entities.

    Args:
        text: The text to scan.

    Returns:
        dict mapping each known insight keyword to ``True``/``False``
        (whole-word, case-insensitive match), plus an ``'entities'`` key
        holding a list of ``(entity_text, entity_label)`` tuples from the
        module-level spaCy pipeline ``nlp``.

    Raises:
        Exception: Any error is logged, printed and re-raised.
    """
    try:
        possible_insights = ["trend", "comparison", "distribution", "correlation", "pattern", "anomaly", "outlier", "relationship", "performance", "growth"]

        # Lower-case only for keyword matching.
        lowered = text.lower()
        insights = {
            insight: bool(re.search(r'\b' + re.escape(insight) + r'\b', lowered))
            for insight in possible_insights
        }

        # Run NER on the ORIGINAL text: lower-casing first strips the
        # capitalization cues the entity recognizer relies on.
        doc = nlp(text)
        insights['entities'] = [(ent.text, ent.label_) for ent in doc.ents]

        return insights
    except Exception as e:
        logging.error(f"Error extracting insights from text: {e}")
        print(f"Error extracting insights from text: {e}")
        raise

def data_storytelling_pipeline(file_path, prompt, audio_file=None):
    """Run the full pipeline: load data, analyze the prompt, render the video.

    Stages: load and preprocess the data file, compute an EDA summary, extract
    insights from ``prompt``, obtain a soundtrack, and build the infographic
    video.

    Args:
        file_path: Path of the data file to load.
        prompt: The user's free-text request.
        audio_file: Optional soundtrack. When given it is used as-is; when
            omitted, a narration is synthesized from the prompt and insights.
            (Previously this argument was accepted but silently ignored.)

    Raises:
        FileNotFoundError, pandas.errors.ParserError, TypeError, ValueError,
        Exception: Each is logged and printed with a category-specific
        message, then re-raised unchanged.
    """
    try:
        start_time = time.time()

        logging.info("Loading and preprocessing data...")
        print("Loading and preprocessing data...")
        data = load_and_preprocess_data(file_path)
        logging.debug(f"Loaded Data: {data.head()}")
        print(f"Loaded Data: {data.head()}")

        logging.info("Performing EDA...")
        print("Performing EDA...")
        eda_summary = perform_eda(data)
        logging.debug(f"EDA Summary: {eda_summary}")
        print(f"EDA Summary: {eda_summary}")

        logging.info("Analyzing the user's prompt...")
        print("Analyzing the user's prompt...")
        insights = analyze_prompt_for_insights(prompt)
        logging.debug(f"Extracted insights: {insights}")
        print(f"Extracted insights: {insights}")

        # Honor a caller-supplied soundtrack; only synthesize one otherwise.
        if audio_file:
            soundtrack = audio_file
        else:
            logging.info("Generating narration...")
            print("Generating narration...")
            narration_text = f"Here is the analysis based on the prompt: {prompt}. {insights}"
            soundtrack = generate_narration(narration_text)

        logging.info("Creating the infographic video...")
        print("Creating the infographic video...")
        generate_infographic_video(data, insights, audio_file=soundtrack)

        end_time = time.time()
        logging.info(f"Pipeline completed successfully in {end_time - start_time:.2f} seconds")
        print(f"Pipeline completed successfully in {end_time - start_time:.2f} seconds")

    except FileNotFoundError as fnf_error:
        logging.error(f"File not found: {fnf_error}")
        print(f"File not found: {fnf_error}")
        raise
    except pd.errors.ParserError as parser_error:
        logging.error(f"Error parsing the file: {parser_error}")
        print(f"Error parsing the file: {parser_error}")
        raise
    except TypeError as type_error:
        logging.error(f"Type error: {type_error}")
        print(f"Type error: {type_error}")
        raise
    except ValueError as value_error:
        logging.error(f"Value error: {value_error}")
        print(f"Value error: {value_error}")
        raise
    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}")
        print(f"An unexpected error occurred: {e}")
        raise

# Validate the inputs before starting the (slow) pipeline.
# The data file is mandatory; the audio track and title image are optional and
# are only checked when a path was actually supplied. (A second, unconditional
# copy of these checks used to follow and crashed with TypeError on None.)
if not os.path.exists(file_path):
    raise FileNotFoundError(f"Data file not found: {file_path}")
if audio_file and not os.path.exists(audio_file):
    raise FileNotFoundError(f"Audio file not found: {audio_file}")
if title_image and not os.path.exists(title_image):
    raise FileNotFoundError(f"Title image not found: {title_image}")

try:
    data_storytelling_pipeline(file_path, prompt, audio_file=audio_file)
except ValueError as e:
    if "Missing required columns" in str(e):
        logging.info("Adding missing columns to the data")
        print("Adding missing columns to the data")
        data = load_and_preprocess_data(file_path)
        column_mapping = infer_columns(data)

        if 'date' not in column_mapping:
            data['date'] = pd.date_range(start='1/1/2020', periods=len(data), freq='D')

        if 'value' not in column_mapping:
            data['value'] = data.select_dtypes(include=['float64', 'int64']).iloc[:, 0]

        if 'category' not in column_mapping:
            data['category'] = 'default_category'

        # NOTE(review): the retry below reloads the data from file_path, so the
        # columns added to `data` above are not seen by the pipeline. Passing
        # the patched frame through would need a pipeline entry point that
        # accepts a DataFrame -- confirm the intended behavior.
        data_storytelling_pipeline(file_path, prompt, audio_file=audio_file)
    else:
        raise

In [1]:
# will work tomorrow