In [None]:
%pip install matplotlib 
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import random
import os

In [None]:
train_df = pd.read_csv('Set3_Sentiment/train/train_prop_sent_csv3_final.csv', encoding='ISO-8859-1')
# Define path to video clips
video_dir = 'Set3_Sentiment/train/train_prop_sent_data3'


# Function to get video file path from IDs
def get_video_clip_path(row):
    """Return the path of the video clip that belongs to one utterance row.

    The file name is ``dia<Dialogue_ID>_utt<Utterance_ID>.mp4`` and it is
    joined onto the module-level ``video_dir``.
    """
    clip_name = f"dia{row['Dialogue_ID']}_utt{row['Utterance_ID']}.mp4"
    return os.path.join(video_dir, clip_name)

# Apply the function to get file paths for each sampled clip
train_df['video_clip_path'] = train_df.apply(get_video_clip_path, axis=1)

# Check sample paths
print(train_df[['Dialogue_ID', 'Utterance_ID', 'video_clip_path']].head())

In [None]:
train_df.shape

Code for text preprocessing

In [None]:
# Import required libraries
import re
import string

def clean_text(text):
    """
    Normalise an utterance while keeping the punctuation that signals emotion.

    Steps, in order: lowercase; collapse runs of '!' and of '?' to a single
    character; collapse runs of two or more '.' to '...'; remove every other
    ``string.punctuation`` character ('!', '?' and '.' are all kept, including
    single full stops); collapse whitespace.

    Args:
        text: Raw utterance. ``None`` and float NaN (what pandas yields for an
            empty CSV cell) are treated as an empty utterance; any other
            non-string value is converted with ``str``.

    Returns:
        The cleaned string ('' for missing input).
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    text = text.lower()

    # Keep one marker per run so the emotional cue survives without the noise.
    text = re.sub(r'!+', '!', text)
    text = re.sub(r'\?+', '?', text)
    text = re.sub(r'\.{2,}', '...', text)

    # Strip all punctuation except the three emotional markers in one pass.
    removable = ''.join(ch for ch in string.punctuation if ch not in '!?.')
    text = text.translate(str.maketrans('', '', removable))

    # Collapse any run of whitespace into a single space and trim the ends.
    return ' '.join(text.split())

# Add cleaned text as new column
train_df['cleaned_text'] = train_df['Utterance'].apply(clean_text)

# Display some examples
print("Original vs Cleaned Text Examples:")
for i in range(5):
    print(f"\nOriginal: {train_df['Utterance'].iloc[i]}")
    print(f"Cleaned : {train_df['cleaned_text'].iloc[i]}")

# Add emotion markers count as features
train_df['exclamation_count'] = train_df['Utterance'].str.count(r'!')
train_df['question_count'] = train_df['Utterance'].str.count(r'\?')
train_df['ellipsis_count'] = train_df['Utterance'].str.count(r'\.\.\.')

# Show distribution of emotion markers
print("\nEmotion Markers Statistics:")
print("\nExclamation marks:")
print(train_df['exclamation_count'].value_counts().head())
print("\nQuestion marks:")
print(train_df['question_count'].value_counts().head())
print("\nEllipsis:")
print(train_df['ellipsis_count'].value_counts().head())


In [None]:
# Function to get previous utterances in same dialogue
def get_context_window(df, current_row, window_size=2):
    """
    Return the utterances that precede ``current_row`` in its dialogue.

    Rows are selected when they share the row's ``Dialogue_ID`` and have a
    strictly lower ``Utterance_ID``. The result is ordered from the most
    recent utterance backwards and holds at most ``window_size`` rows.
    """
    same_dialogue = df['Dialogue_ID'] == current_row['Dialogue_ID']
    spoken_earlier = df['Utterance_ID'] < current_row['Utterance_ID']

    earlier_utterances = df[same_dialogue & spoken_earlier]
    most_recent_first = earlier_utterances.sort_values('Utterance_ID', ascending=False)

    return most_recent_first.head(window_size)

# Add context features
def add_context_features(df, time_format='%H:%M:%S,%f'):
    """
    Add contextual (previous-utterance and position) features to ``df``.

    Columns written (``df`` is modified in place and also returned):
        prev_sentiment / prev_speaker: ``Sentiment`` / ``Speaker`` of the
            preceding utterance in the same dialogue, ``None`` for the first
            utterance of a dialogue.
        time_gap: seconds between the previous utterance's ``EndTime`` and this
            utterance's ``StartTime``; 0.0 when there is no previous utterance.
        dialogue_length: number of rows sharing the ``Dialogue_ID``.
        utterance_position: rank of ``Utterance_ID`` inside the dialogue.
        relative_position: ``utterance_position / dialogue_length``.

    Args:
        df: Frame with ``Dialogue_ID``, ``Utterance_ID``, ``Sentiment``,
            ``Speaker``, ``StartTime`` and ``EndTime`` columns and a unique
            index. ``Utterance_ID`` is assumed to be unique within a dialogue.
        time_format: ``strptime`` format of the two time columns; the default
            matches the format used by ``add_temporal_features``.

    Returns:
        The same ``df`` object with the columns above added.
    """
    # Sort once and shift inside each dialogue instead of filtering and
    # sorting the whole frame for every row (which was quadratic).
    ordered = df.sort_values(['Dialogue_ID', 'Utterance_ID'])
    by_dialogue = ordered.groupby('Dialogue_ID', sort=False)
    has_prev = by_dialogue.cumcount() > 0

    def _previous_or_none(column):
        # Keep ``None`` (not NaN) for dialogue openers, as before.
        shifted = by_dialogue[column].shift(1)
        values = [value if found else None for value, found in zip(shifted, has_prev)]
        return pd.Series(values, index=ordered.index, dtype=object)

    # Assigning a Series aligns on the index, so the sorted order is undone.
    df['prev_sentiment'] = _previous_or_none('Sentiment')
    df['prev_speaker'] = _previous_or_none('Speaker')

    # Parse with an explicit format so both columns are read the same way
    # everywhere in the notebook rather than by per-value inference.
    start = pd.to_datetime(ordered['StartTime'], format=time_format)
    end = pd.to_datetime(ordered['EndTime'], format=time_format)
    prev_end = end.groupby(ordered['Dialogue_ID'], sort=False).shift(1)
    gap_seconds = (start - prev_end).dt.total_seconds()
    # Only dialogue openers default to 0.0; a missing timestamp stays NaN.
    df['time_gap'] = gap_seconds.where(has_prev, 0.0)

    # Position of the utterance inside its dialogue.
    dialogue_lengths = df.groupby('Dialogue_ID').size()
    df['dialogue_length'] = df['Dialogue_ID'].map(dialogue_lengths)
    df['utterance_position'] = df.groupby('Dialogue_ID')['Utterance_ID'].rank()
    df['relative_position'] = df['utterance_position'] / df['dialogue_length']

    return df

# Apply context features
train_df = add_context_features(train_df)

# Display example of context features
print("\nExample of context features:")
print(train_df[['Utterance', 'Speaker', 'Sentiment', 
                'prev_sentiment', 'prev_speaker', 
                'time_gap', 'relative_position']].head())

# Analyze speaker interactions
speaker_pairs = train_df[['Speaker', 'prev_speaker']].value_counts()
print("\nMost common speaker interactions:")
print(speaker_pairs.head())

# Analyze sentiment transitions
sentiment_transitions = train_df[['Sentiment', 'prev_sentiment']].value_counts()
print("\nMost common sentiment transitions:")
print(sentiment_transitions.head())

In [None]:
# Create speaker profiles and analyze patterns
def create_speaker_profiles(df):
    """Build one profile row per speaker.

    The profile holds the speaker's sentiment shares (each row of the
    crosstab is normalised to sum to 1), the mean of the three emotion-marker
    counts, and ``total_utterances`` (count of non-null ``Utterance`` values).
    """
    marker_columns = ['exclamation_count', 'question_count', 'ellipsis_count']

    # Mean of every marker column plus a plain count of utterances.
    aggregations = {column: 'mean' for column in marker_columns}
    aggregations['Utterance'] = 'count'
    speaker_stats = (
        df.groupby('Speaker')
        .agg(aggregations)
        .rename(columns={'Utterance': 'total_utterances'})
    )

    # Share of each sentiment label within a speaker's utterances.
    sentiment_share = pd.crosstab(df['Speaker'], df['Sentiment'], normalize='index')

    return pd.concat([sentiment_share, speaker_stats], axis=1)

# Analyze character interactions
def analyze_character_interactions(df):
    """Summarise sentiment and timing for every pair of consecutive speakers.

    Adds a ``speaker_pair`` column to ``df`` in place: the alphabetically
    sorted tuple of ``Speaker`` and ``prev_speaker``, or ``None`` when there is
    no previous speaker.

    Returns:
        (pair_sentiments, pair_time_gaps): sentiment value counts per pair and
        mean ``time_gap`` per pair, both computed over rows that have a pair.
    """
    def _ordered_pair(row):
        if pd.notna(row['prev_speaker']):
            return tuple(sorted([row['Speaker'], row['prev_speaker']]))
        return None

    df['speaker_pair'] = df.apply(_ordered_pair, axis=1)

    # Only rows that actually follow another utterance take part.
    has_pair = df['speaker_pair'].notna()

    pair_sentiments = df[has_pair].groupby('speaker_pair')['Sentiment'].value_counts()
    pair_time_gaps = df[has_pair].groupby('speaker_pair')['time_gap'].mean()

    return pair_sentiments, pair_time_gaps

# Create speaker profiles
speaker_profiles = create_speaker_profiles(train_df)
print("Speaker Profiles:")
print(speaker_profiles.round(3))

# Analyze main character interactions
pair_sentiments, pair_time_gaps = analyze_character_interactions(train_df)

# Display specific character interactions
main_characters = ['Ross', 'Rachel', 'Monica', 'Chandler', 'Joey', 'Phoebe']
print("\nMain Character Interactions:")
for pair in pair_sentiments.index.levels[0]:
    if all(char in pair for char in ['Ross', 'Rachel']):
        print("\nRoss-Rachel Interactions:")
        print(pair_sentiments[pair])
    elif all(char in pair for char in ['Monica', 'Chandler']):
        print("\nMonica-Chandler Interactions:")
        print(pair_sentiments[pair])

# Add speaker-based features to dataframe
def add_speaker_features(df, speaker_profiles):
    """Attach each speaker's profile values to that speaker's rows.

    For every row the speaker's sentiment shares are copied into
    ``speaker_<sentiment>_ratio`` and the speaker's average marker counts into
    ``speaker_avg_<marker>``. ``df`` is modified in place and returned.
    Speakers missing from ``speaker_profiles`` receive NaN.
    """
    # new column name -> column of ``speaker_profiles`` to look up
    column_sources = {}
    for sentiment in ['positive', 'negative', 'neutral']:
        column_sources[f'speaker_{sentiment}_ratio'] = sentiment
    for marker in ['exclamation_count', 'question_count', 'ellipsis_count']:
        column_sources[f'speaker_avg_{marker}'] = marker

    speakers = df['Speaker']
    for new_column, profile_column in column_sources.items():
        df[new_column] = speakers.map(speaker_profiles[profile_column])

    return df

# Apply speaker features
train_df = add_speaker_features(train_df, speaker_profiles)

# Display example of new features
print("\nExample of speaker-based features:")
print(train_df[['Speaker', 'Sentiment', 
                'speaker_positive_ratio', 'speaker_negative_ratio', 
                'speaker_avg_exclamation_count']].head())

In [None]:
# Function to extract temporal features
def add_temporal_features(df):
    """
    Extract temporal features from the ``StartTime`` and ``EndTime`` columns.

    Columns written (``df`` is modified in place and also returned):
        start_time / end_time: the two time strings parsed with the format
            ``%H:%M:%S,%f``.
        utterance_duration: ``end_time - start_time`` in seconds.
        word_count: number of whitespace-separated tokens in ``Utterance``.
        speaking_rate: words per second; NaN when the duration is exactly 0,
            because dividing by it would give an infinite rate that distorts
            every mean and plot built on this column.
        pause_after: seconds from this utterance's end to the start of the next
            utterance in the same dialogue (by ``Utterance_ID``); NaN for the
            last utterance of a dialogue.
        is_rapid_exchange: True when both the duration and the following pause
            are below their respective medians over the whole frame.

    Args:
        df: Frame with ``Dialogue_ID``, ``Utterance_ID``, ``Utterance``,
            ``StartTime`` and ``EndTime`` columns and a unique index.

    Returns:
        The same ``df`` object with the columns above added.
    """
    time_format = '%H:%M:%S,%f'
    df['start_time'] = pd.to_datetime(df['StartTime'], format=time_format)
    df['end_time'] = pd.to_datetime(df['EndTime'], format=time_format)

    df['utterance_duration'] = (df['end_time'] - df['start_time']).dt.total_seconds()

    # Speaking rate (words per second); zero-length clips have no defined rate.
    df['word_count'] = df['Utterance'].str.split().str.len()
    measurable_duration = df['utterance_duration'].where(df['utterance_duration'] != 0)
    df['speaking_rate'] = df['word_count'] / measurable_duration

    # One sort plus a grouped shift replaces a full-frame mask per dialogue.
    ordered = df.sort_values(['Dialogue_ID', 'Utterance_ID'])
    next_start = ordered.groupby('Dialogue_ID', sort=False)['start_time'].shift(-1)
    # Assignment aligns on the index, restoring the original row order.
    df['pause_after'] = (next_start - ordered['end_time']).dt.total_seconds()

    # Rapid exchange: short utterance followed by a short pause.
    df['is_rapid_exchange'] = (
        (df['utterance_duration'] < df['utterance_duration'].median()) &
        (df['pause_after'] < df['pause_after'].median())
    )

    return df

# Apply temporal features
train_df = add_temporal_features(train_df)

# Display summary statistics
print("Temporal Features Summary:")
print("\nUtterance Duration Statistics (seconds):")
print(train_df['utterance_duration'].describe())

print("\nSpeaking Rate Statistics (words/second):")
print(train_df['speaking_rate'].describe())

print("\nPause Duration Statistics (seconds):")
print(train_df['pause_after'].describe())

# Analyze relationship between temporal features and sentiment
print("\nAverage Duration by Sentiment:")
print(train_df.groupby('Sentiment')['utterance_duration'].mean())

print("\nAverage Speaking Rate by Sentiment:")
print(train_df.groupby('Sentiment')['speaking_rate'].mean())

print("\nPercentage of Rapid Exchanges by Sentiment:")
print(train_df.groupby('Sentiment')['is_rapid_exchange'].mean() * 100)

# Visualize temporal patterns: one boxplot panel per temporal feature,
# with one box per sentiment label.
sentiment_order = ['positive', 'neutral', 'negative']
temporal_panels = [
    ('utterance_duration', 'Duration by Sentiment', 'Duration (seconds)'),
    ('speaking_rate', 'Speaking Rate by Sentiment', 'Words per Second'),
    ('pause_after', 'Pause Duration by Sentiment', 'Pause (seconds)'),
]

fig, axes = plt.subplots(1, len(temporal_panels), figsize=(12, 4))

for ax, (column, title, ylabel) in zip(axes, temporal_panels):
    groups = []
    for sentiment in sentiment_order:
        values = train_df.loc[train_df['Sentiment'] == sentiment, column]
        # matplotlib does not drop missing values: a single NaN (every last
        # utterance has a NaN pause_after) turns the box statistics into NaN
        # and the box is not drawn. Infinite rates are excluded as well.
        groups.append(values[values.notna() & (values.abs() != float('inf'))])

    ax.boxplot(groups)
    ax.set_xticklabels(['Positive', 'Neutral', 'Negative'])
    ax.set_title(title)
    ax.set_ylabel(ylabel)

plt.tight_layout()
plt.show()

In [None]:
# Function to add episode context features
def add_episode_features(df):
    """
    Add features related to episode context and character development.

    Reads the ``Season``, ``Episode``, ``Speaker``, ``Sentiment``,
    ``Dialogue_ID`` and ``prev_speaker`` columns. The ``episode_id`` column is
    written onto the frame that was passed in; the two ``merge`` calls then
    produce new frames, so callers must use the returned frame.

    Columns added: ``episode_id``, ``appearances_in_episode``,
    ``cumulative_appearances``, ``prev_episode_sentiment``,
    ``episode_dominant_sentiment``, ``unique_speakers``, ``dialogue_count``
    and ``interaction_count``.

    Returns:
        The merged DataFrame carrying the columns listed above.
    """
    # Create episode identifier of the form "<Season>_<Episode>"
    df['episode_id'] = df['Season'].astype(str) + '_' + df['Episode'].astype(str)
    
    # Count rows per (episode, speaker) and attach the count to every row.
    # merge() is called without `how`, so pandas' default join type applies.
    episode_chars = df.groupby(['episode_id', 'Speaker']).size().reset_index()
    episode_chars.columns = ['episode_id', 'Speaker', 'appearances_in_episode']
    df = df.merge(episode_chars, on=['episode_id', 'Speaker'])
    
    # Running 1-based count of each speaker's rows, in the frame's current row order
    df['cumulative_appearances'] = df.groupby('Speaker').cumcount() + 1
    
    # For each speaker, sort that speaker's rows by (Season, Episode) and take the
    # Sentiment of the preceding row. NOTE(review): the preceding row can belong to
    # the same episode, so despite the column name this is the speaker's previous
    # utterance in that ordering, not necessarily a previous episode — confirm intent.
    df['prev_episode_sentiment'] = None
    for speaker in df['Speaker'].unique():
        speaker_mask = df['Speaker'] == speaker
        # The shifted Series keeps the original index labels, so the assignment
        # below lines values up by index rather than by sorted position.
        df.loc[speaker_mask, 'prev_episode_sentiment'] = (
            df[speaker_mask].sort_values(['Season', 'Episode'])['Sentiment'].shift(1)
        )
    
    # Calculate episode-level statistics
    episode_stats = df.groupby('episode_id').agg({
        'Sentiment': lambda x: x.value_counts().index[0],  # First entry of value_counts(), i.e. the most frequent sentiment
        'Speaker': 'nunique',  # Number of unique speakers
        'Dialogue_ID': 'nunique'  # Number of dialogues
    }).reset_index()
    episode_stats.columns = ['episode_id', 'episode_dominant_sentiment', 
                           'unique_speakers', 'dialogue_count']
    df = df.merge(episode_stats, on='episode_id')
    
    # Running 1-based count of rows sharing (episode, speaker, previous speaker).
    # NOTE(review): rows whose prev_speaker is missing form no regular group here;
    # check what value they receive before relying on this column.
    df['interaction_count'] = df.groupby(['episode_id', 'Speaker', 'prev_speaker']).cumcount() + 1
    
    return df

# Apply episode features
train_df = add_episode_features(train_df)

# Display summary of episode features
print("\nEpisode Context Analysis:")
print("\nCharacter Appearances by Episode:")
print(train_df.groupby('episode_id')['Speaker'].value_counts().head())

print("\nDominant Sentiment by Episode:")
print(train_df.groupby('episode_id')['episode_dominant_sentiment'].first())

print("\nCharacter Development (Sample):")
for speaker in ['Ross', 'Rachel', 'Monica', 'Chandler', 'Joey', 'Phoebe']:
    speaker_data = train_df[train_df['Speaker'] == speaker]
    if not speaker_data.empty:
        print(f"\n{speaker}:")
        print(f"Total appearances: {len(speaker_data)}")
        print("Sentiment distribution:")
        print(speaker_data['Sentiment'].value_counts(normalize=True))

In [None]:
# Function to detect text enhancement patterns
def add_text_enhancement_features(df):
    """
    Add features for text patterns such as shouting, repetition, stuttering
    and laughter.

    All detectors are heuristics that look at whitespace-separated tokens of
    ``Utterance`` (missing utterances are treated as empty text):

        has_shouting / shouting_word_count: tokens longer than one character
            whose cased letters are all upper case.
        has_repetition / repetition_count: tokens containing at least two
            hyphens, e.g. "no-no-no".
        has_stuttering / stutter_count: tokens in which a fragment of one or
            two letters is followed by a hyphen and then repeated, e.g. "I-I"
            or "w-what". A token such as "no-no-no" can count as both a
            repetition and a stutter.
        has_laughter / laughter_count: how many of the substrings 'haha',
            'hehe', 'lol', 'lmao' occur in the lower-cased text (each pattern
            counts once, however often it occurs).

    Args:
        df: Frame with an ``Utterance`` column. It is modified in place.

    Returns:
        The same ``df`` object with the eight columns above added.
    """
    import re  # local import so the function does not depend on another cell

    laughter_patterns = ['haha', 'hehe', 'lol', 'lmao']
    # The previous check (`len(w) <= 2 and '-' in w`) could never match its own
    # example "I-I" (three characters); look for the repeated fragment instead.
    stutter_pattern = re.compile(r'\b([a-z]{1,2})-\1')

    shouting_counts = []
    repetition_counts = []
    stutter_counts = []
    laughter_counts = []

    # Collect plain lists and assign whole columns once; this avoids one
    # DataFrame cell write per feature per row.
    for utterance in df['Utterance']:
        text = utterance if isinstance(utterance, str) else ''
        words = text.split()
        lowered = text.lower()

        shouting_counts.append(sum(1 for w in words if len(w) > 1 and w.isupper()))
        repetition_counts.append(sum(1 for w in words if w.count('-') >= 2))
        stutter_counts.append(sum(1 for w in words if stutter_pattern.search(w.lower())))
        laughter_counts.append(sum(1 for p in laughter_patterns if p in lowered))

    feature_columns = [
        ('has_shouting', 'shouting_word_count', shouting_counts),
        ('has_repetition', 'repetition_count', repetition_counts),
        ('has_stuttering', 'stutter_count', stutter_counts),
        ('has_laughter', 'laughter_count', laughter_counts),
    ]
    for flag_column, count_column, counts in feature_columns:
        df[flag_column] = [count > 0 for count in counts]
        df[count_column] = counts

    return df

# Apply text enhancement features
train_df = add_text_enhancement_features(train_df)

# Display examples and statistics
print("Text Enhancement Examples:")
print("\nShouting Examples:")
print(train_df[train_df['has_shouting']][['Utterance', 'shouting_word_count']].head())

print("\nRepetition Examples:")
print(train_df[train_df['has_repetition']][['Utterance', 'repetition_count']].head())

print("\nStuttering Examples:")
print(train_df[train_df['has_stuttering']][['Utterance', 'stutter_count']].head())

print("\nLaughter Examples:")
print(train_df[train_df['has_laughter']][['Utterance', 'laughter_count']].head())

# Analyze relationship with sentiment
print("\nFeature Distribution by Sentiment:")
for feature in ['has_shouting', 'has_repetition', 'has_stuttering', 'has_laughter']:
    print(f"\n{feature} distribution:")
    print(train_df.groupby('Sentiment')[feature].mean())

# Visualize patterns
plt.figure(figsize=(12, 4))

# Plot 1: Text enhancement features by sentiment
features = ['has_shouting', 'has_repetition', 'has_stuttering', 'has_laughter']
sentiments = ['positive', 'neutral', 'negative']

plt.subplot(131)
for i, feature in enumerate(features):
    feature_means = [train_df[train_df['Sentiment'] == s][feature].mean() for s in sentiments]
    plt.bar([x + i*0.25 for x in range(len(sentiments))], feature_means, width=0.25, label=feature)

plt.xticks([x + 0.375 for x in range(len(sentiments))], sentiments)
plt.title('Text Enhancement Features by Sentiment')
plt.legend(bbox_to_anchor=(1.05, 1))

plt.tight_layout()
plt.show()

# Additional statistics
print("\nOverall Statistics:")
print("\nPercentage of utterances with:")
for feature in ['has_shouting', 'has_repetition', 'has_stuttering', 'has_laughter']:
    percentage = (train_df[feature].sum() / len(train_df)) * 100
    print(f"{feature}: {percentage:.2f}%")

In [None]:
# Function to analyze dialogue structure
def add_dialogue_structure_features(df, interruption_gap=0.5, time_format='%H:%M:%S,%f'):
    """
    Add features related to dialogue structure and patterns.

    Columns written (``df`` is modified in place and also returned):
        is_question: the utterance contains '?' (missing text counts as False).
        is_answer: the previous utterance of the same dialogue is a question.
        is_interruption: the utterance starts less than ``interruption_gap``
            seconds after the previous utterance of the dialogue ended. A
            negative gap (overlapping timestamps) counts as an interruption.
        conversation_size: number of distinct speakers in the dialogue.
        is_opening / is_closing: first / last utterance of the dialogue by
            ``Utterance_ID``.
        is_one_on_one / is_group_chat: exactly two / more than two speakers.

    Args:
        df: Frame with ``Dialogue_ID``, ``Utterance_ID``, ``Utterance``,
            ``Speaker``, ``StartTime`` and ``EndTime`` columns and a unique
            index.
        interruption_gap: Upper bound, in seconds, on the gap that still
            counts as an interruption.
        time_format: ``strptime`` format of the two time columns; the default
            matches the format used by ``add_temporal_features``.

    Returns:
        The same ``df`` object with the columns above added.
    """
    # na=False keeps the column strictly boolean even when a cell is empty.
    df['is_question'] = df['Utterance'].str.contains(r'\?', na=False).astype(bool)

    # One sort plus grouped operations replaces a full-frame mask per dialogue.
    ordered = df.sort_values(['Dialogue_ID', 'Utterance_ID'])
    by_dialogue = ordered.groupby('Dialogue_ID', sort=False)
    has_prev = by_dialogue.cumcount() > 0  # False on each dialogue's first row

    # Answers: utterances that directly follow a question in the same dialogue.
    # `has_prev` stops the shift from leaking across dialogue boundaries.
    follows_question = ordered['is_question'].shift(1, fill_value=False)
    df['is_answer'] = follows_question & has_prev

    # Interruptions: short or negative gap to the previous utterance. The old
    # mask also required gap > 0, which excluded the overlapping timestamps it
    # was documented to detect.
    start = pd.to_datetime(ordered['StartTime'], format=time_format)
    prev_end = pd.to_datetime(ordered['EndTime'], format=time_format).shift(1)
    gap_seconds = (start - prev_end).dt.total_seconds()
    df['is_interruption'] = has_prev & (gap_seconds < interruption_gap)

    # Conversation size (number of unique speakers in the dialogue).
    df['conversation_size'] = df.groupby('Dialogue_ID')['Speaker'].transform('nunique')

    # Opening / closing utterances of each dialogue.
    df['is_opening'] = ~has_prev
    df['is_closing'] = by_dialogue.cumcount(ascending=False) == 0

    # Conversation type.
    df['is_one_on_one'] = df['conversation_size'] == 2
    df['is_group_chat'] = df['conversation_size'] > 2

    return df

# Apply dialogue structure features
train_df = add_dialogue_structure_features(train_df)

# Display examples and statistics
print("Dialogue Structure Analysis:")
print("\nQuestion-Answer Examples:")
qa_pairs = train_df[train_df['is_question'] | train_df['is_answer']][
    ['Utterance', 'Speaker', 'is_question', 'is_answer']
].head()
print(qa_pairs)

print("\nInterruption Examples:")
interruptions = train_df[train_df['is_interruption']][
    ['Utterance', 'Speaker', 'StartTime', 'EndTime']
].head()
print(interruptions)

print("\nConversation Size Distribution:")
print(train_df['conversation_size'].value_counts())

print("\nConversation Type Statistics:")
print("One-on-one conversations:", (train_df['is_one_on_one']).sum())
print("Group conversations:", (train_df['is_group_chat']).sum())

# Analyze relationship with sentiment
print("\nSentiment Distribution by Dialogue Structure:")
for feature in ['is_question', 'is_answer', 'is_interruption', 'is_opening', 'is_closing']:
    print(f"\n{feature}:")
    print(train_df.groupby([feature, 'Sentiment']).size().unstack(fill_value=0))

# Visualize patterns: three panels on one figure. Every plot is given its
# axes explicitly; DataFrame.plot() without `ax=` opens a new figure, which
# left the third panel of this figure empty.
fig, (ax_sizes, ax_features, ax_edges) = plt.subplots(1, 3, figsize=(15, 5))

# Plot 1: Conversation sizes
train_df['conversation_size'].value_counts().plot(kind='bar', ax=ax_sizes)
ax_sizes.set_title('Distribution of Conversation Sizes')
ax_sizes.set_xlabel('Number of Participants')
ax_sizes.set_ylabel('Count')

# Plot 2: Dialogue structure features by sentiment (one bar per sentiment)
features = ['is_question', 'is_answer', 'is_interruption']
bar_width = 0.25
for offset, sentiment in enumerate(['positive', 'neutral', 'negative']):
    sentiment_rows = train_df[train_df['Sentiment'] == sentiment]
    feature_means = [sentiment_rows[f].mean() for f in features]
    ax_features.bar([x + offset * bar_width for x in range(len(features))],
                    feature_means, width=bar_width, label=sentiment)
# Put each tick under the middle bar of its three-bar group.
ax_features.set_xticks([x + bar_width for x in range(len(features))])
ax_features.set_xticklabels(features, rotation=45)
ax_features.set_title('Dialogue Features by Sentiment')
ax_features.legend()

# Plot 3: Opening/Closing sentiment distribution
opening_closing = pd.DataFrame({
    'Opening': train_df[train_df['is_opening']]['Sentiment'].value_counts(),
    'Closing': train_df[train_df['is_closing']]['Sentiment'].value_counts()
})
opening_closing.plot(kind='bar', ax=ax_edges)
ax_edges.set_title('Sentiment in Opening/Closing Utterances')
ax_edges.set_xlabel('Sentiment')
ax_edges.set_ylabel('Count')

plt.tight_layout()
plt.show()

# Additional analysis: Speaker transitions
def analyze_speaker_transitions(df):
    """
    List every hand-over between consecutive utterances of a dialogue.

    Dialogues are visited in order of first appearance; inside a dialogue the
    utterances are ordered by ``Utterance_ID``. Each adjacent pair yields one
    row, including pairs where the same speaker continues.

    Returns:
        DataFrame with the columns ``from_speaker`` and ``to_speaker``.
    """
    transitions = []
    for _, dialogue in df.groupby('Dialogue_ID', sort=False):
        speakers = dialogue.sort_values('Utterance_ID')['Speaker'].tolist()
        # Pair every speaker with the one who talks next.
        transitions.extend(zip(speakers, speakers[1:]))

    return pd.DataFrame(transitions, columns=['from_speaker', 'to_speaker'])

speaker_transitions = analyze_speaker_transitions(train_df)
print("\nMost Common Speaker Transitions:")
print(speaker_transitions.value_counts().head())

In [None]:
# Function to detect emotion intensifiers
def add_emotion_intensifier_features(df):
    """
    Add features for emotion intensifiers: repeated punctuation, emphasis
    words and emotion words.

    Columns written (``df`` is modified in place and also returned):
        has_repeated_punct / repeated_punct_count: runs of two or more
            '!' / '?' characters in the utterance.
        has_emphasis_words / emphasis_word_count: how many distinct words of
            the emphasis lexicon occur in the utterance.
        has_emotion_words / emotion_word_count: the same for the combined
            positive and negative emotion lexicons.
        positive_emotion_count / negative_emotion_count: the same for each
            emotion lexicon separately.

    Words are matched against alphabetic tokens, so a lexicon word followed by
    punctuation ("happy!", "so,") is recognised. Missing utterances are
    treated as empty text.

    Args:
        df: Frame with an ``Utterance`` column.

    Returns:
        The same ``df`` object with the eight columns above added.
    """
    emphasis_words = {'very', 'so', 'really', 'extremely', 'totally', 'absolutely',
                      'completely', 'literally', 'definitely', 'seriously'}

    positive_emotions = {'love', 'happy', 'excited', 'glad', 'wonderful', 'great',
                         'amazing', 'fantastic', 'awesome', 'excellent', 'perfect'}

    negative_emotions = {'hate', 'angry', 'sad', 'upset', 'terrible', 'horrible',
                         'awful', 'furious', 'annoyed', 'disappointed', 'worried'}

    repeated_counts = []
    emphasis_counts = []
    positive_counts = []
    negative_counts = []

    for utterance in df['Utterance']:
        text = utterance.lower() if isinstance(utterance, str) else ''

        # Tokenise once per utterance; splitting on whitespace alone left
        # punctuation glued to the words and made them miss the lexicons.
        tokens = set(re.findall(r"[a-z]+(?:'[a-z]+)?", text))

        repeated_counts.append(len(re.findall(r'[!?]{2,}', text)))
        emphasis_counts.append(len(tokens & emphasis_words))
        positive_counts.append(len(tokens & positive_emotions))
        negative_counts.append(len(tokens & negative_emotions))

    # The two emotion lexicons do not overlap, so their sum is the total.
    emotion_counts = [p + n for p, n in zip(positive_counts, negative_counts)]

    df['has_repeated_punct'] = [count > 0 for count in repeated_counts]
    df['repeated_punct_count'] = repeated_counts
    df['has_emphasis_words'] = [count > 0 for count in emphasis_counts]
    df['emphasis_word_count'] = emphasis_counts
    df['has_emotion_words'] = [count > 0 for count in emotion_counts]
    df['emotion_word_count'] = emotion_counts
    df['positive_emotion_count'] = positive_counts
    df['negative_emotion_count'] = negative_counts

    return df

# Apply emotion intensifier features
train_df = add_emotion_intensifier_features(train_df)

# Display examples and statistics
print("Emotion Intensifier Examples:")
print("\nRepeated Punctuation Examples:")
print(train_df[train_df['has_repeated_punct']][['Utterance', 'repeated_punct_count']].head())

print("\nEmphasis Word Examples:")
print(train_df[train_df['has_emphasis_words']][['Utterance', 'emphasis_word_count']].head())

print("\nEmotion Word Examples:")
print(train_df[train_df['has_emotion_words']][['Utterance', 'emotion_word_count', 
                                              'positive_emotion_count', 
                                              'negative_emotion_count']].head())

# Analyze relationship with sentiment
print("\nFeature Distribution by Sentiment:")
for feature in ['has_repeated_punct', 'has_emphasis_words', 'has_emotion_words']:
    print(f"\n{feature} distribution:")
    print(train_df.groupby('Sentiment')[feature].mean())

# Visualize patterns
plt.figure(figsize=(15, 5))

# Plot 1: Emotion intensifier features by sentiment
features = ['has_repeated_punct', 'has_emphasis_words', 'has_emotion_words']
sentiments = ['positive', 'neutral', 'negative']

plt.subplot(131)
for i, feature in enumerate(features):
    feature_means = [train_df[train_df['Sentiment'] == s][feature].mean() 
                    for s in sentiments]
    plt.bar([x + i*0.25 for x in range(len(sentiments))], feature_means, 
            width=0.25, label=feature)

plt.xticks([x + 0.25 for x in range(len(sentiments))], sentiments)
plt.title('Emotion Intensifiers by Sentiment')
plt.legend(bbox_to_anchor=(1.05, 1))

# Plot 2: Positive vs Negative emotion words by sentiment
plt.subplot(132)
pos_means = [train_df[train_df['Sentiment'] == s]['positive_emotion_count'].mean() 
             for s in sentiments]
neg_means = [train_df[train_df['Sentiment'] == s]['negative_emotion_count'].mean() 
             for s in sentiments]

x = range(len(sentiments))
width = 0.35
plt.bar([i - width/2 for i in x], pos_means, width, label='Positive Emotions')
plt.bar([i + width/2 for i in x], neg_means, width, label='Negative Emotions')
plt.xticks(x, sentiments)
plt.title('Emotion Words by Sentiment')
plt.legend()

plt.tight_layout()
plt.show()

# Additional statistics
print("\nOverall Statistics:")
print("\nPercentage of utterances with:")
for feature in ['has_repeated_punct', 'has_emphasis_words', 'has_emotion_words']:
    percentage = (train_df[feature].sum() / len(train_df)) * 100
    print(f"{feature}: {percentage:.2f}%")

print("\nAverage counts per utterance:")
for feature in ['repeated_punct_count', 'emphasis_word_count', 'emotion_word_count']:
    mean = train_df[feature].mean()
    print(f"{feature}: {mean:.2f}")

3. Video Preprocessing

In [None]:
def extract_advanced_features(df):
    """Add length, marker, dialogue-position and speaker features in place.

    New columns: ``word_count``, ``char_count``, ``avg_word_length``,
    ``has_caps``, ``has_repeated_chars``, ``has_emphasis`` (0/1 flags),
    ``is_first_utterance``, ``is_last_utterance``, ``speaker_turn_length``
    (how many earlier rows the speaker has in the dialogue) and
    ``speaker_change`` (1 when the speaker differs from the row above).
    Existing columns are kept; the same frame is returned.
    """
    utterances = df['Utterance']

    # Text length features
    df['word_count'] = utterances.str.split().str.len()
    df['char_count'] = utterances.str.len()
    df['avg_word_length'] = df['char_count'] / df['word_count']

    # Emotional markers, each stored as a 0/1 flag
    marker_patterns = {
        'has_caps': r'[A-Z]{2,}',
        'has_repeated_chars': r'(.)\1{2,}',
        'has_emphasis': r'!+|\?+|\.{2,}',
    }
    for column, pattern in marker_patterns.items():
        df[column] = utterances.str.contains(pattern).astype(int)

    # Dialogue context: compare each ID with its dialogue's smallest / largest
    ids_per_dialogue = df.groupby('Dialogue_ID')['Utterance_ID']
    df['is_first_utterance'] = ids_per_dialogue.transform('min') == df['Utterance_ID']
    df['is_last_utterance'] = ids_per_dialogue.transform('max') == df['Utterance_ID']

    # Speaker patterns
    df['speaker_turn_length'] = df.groupby(['Dialogue_ID', 'Speaker']).cumcount()
    previous_row_speaker = df['Speaker'].shift()
    df['speaker_change'] = (df['Speaker'] != previous_row_speaker).astype(int)

    return df

def add_interaction_features(df):
    """Add speaker-interaction features in place and return the frame.

    New or overwritten columns: ``prev_speaker_sentiment`` (sentiment of the
    row above within the dialogue), ``speaker_pair`` (the two names sorted and
    joined with '_', the previous speaker converted with ``str``),
    ``interaction_count`` (how many earlier rows share the pair) and the
    speaker's share of positive / negative utterances.
    """
    # Sentiment of the preceding row in the same dialogue
    df['prev_speaker_sentiment'] = df.groupby('Dialogue_ID')['Sentiment'].shift(1)

    # Order-independent key for the two speakers involved
    def _pair_key(row):
        names = sorted([row['Speaker'], str(row['prev_speaker'])])
        return '_'.join(names)

    df['speaker_pair'] = df.apply(_pair_key, axis=1)
    df['interaction_count'] = df.groupby('speaker_pair').cumcount()

    # Speaker's emotional tendency: share of each sentiment per speaker
    sentiment_share = (
        df.groupby('Speaker')['Sentiment']
        .value_counts(normalize=True)
        .unstack()
    )
    for label in ('positive', 'negative'):
        df[f'speaker_{label}_ratio'] = df['Speaker'].map(sentiment_share[label].fillna(0))

    return df

In [None]:
%pip install scikit-learn
from sklearn.utils import resample

def balance_dataset(df):
    """Balance the sentiment classes by upsampling the two minority classes.

    The 'neutral' rows are kept as they are; the 'positive' and 'negative'
    rows are each resampled with replacement (``random_state=42``) until they
    match the number of 'neutral' rows. The result is the concatenation
    neutral, positive, negative.
    """
    neutral_rows = df[df['Sentiment'] == 'neutral']
    target_size = len(neutral_rows)

    # Draw with replacement so each minority class reaches the target size.
    upsampled = [
        resample(df[df['Sentiment'] == label],
                 replace=True,
                 n_samples=target_size,
                 random_state=42)
        for label in ('positive', 'negative')
    ]

    return pd.concat([neutral_rows] + upsampled)

In [None]:
# # Install required packages
# ! pip install opencv-python mtcnn tensorflow numpy

# # Import required libraries
# import cv2
# import numpy as np
# from mtcnn import MTCNN
# import tensorflow as tf

# # Initialize MTCNN detector
# detector = MTCNN()

# # Function to extract frames from video
# def extract_frames(video_path, sample_rate=0.5):
#     """
#     Extract frames from video at given sample rate
#     Args:
#         video_path: Path to video file
#         sample_rate: Extract frame every sample_rate seconds
#     Returns:
#         frames: List of extracted frames
#     """
#     frames = []
#     cap = cv2.VideoCapture(video_path)
#     fps = cap.get(cv2.CAP_PROP_FPS)
#     frame_interval = int(fps * sample_rate)
    
#     frame_count = 0
#     while cap.isOpened():
#         ret, frame = cap.read()
#         if not ret:
#             break
            
#         if frame_count % frame_interval == 0:
#             # Convert BGR to RGB
#             frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
#             frames.append(frame_rgb)
            
#         frame_count += 1
        
#     cap.release()
#     return frames

# # Function to detect faces and extract features
# def extract_face_features(frame):
#     """
#     Detect faces and extract features using MTCNN
#     Args:
#         frame: Input image frame
#     Returns:
#         features: Dictionary containing facial features
#     """
#     # Detect faces
#     faces = detector.detect_faces(frame)
    
#     if len(faces) == 0:
#         return None
        
#     # Get the largest face
#     face = max(faces, key=lambda x: x['box'][2] * x['box'][3])
    
#     features = {
#         'confidence': face['confidence'],
#         'box': face['box'],
#         'keypoints': face['keypoints'],
#         'emotions': {
#             'left_eye': face['keypoints']['left_eye'],
#             'right_eye': face['keypoints']['right_eye'],
#             'nose': face['keypoints']['nose'],
#             'mouth_left': face['keypoints']['mouth_left'], 
#             'mouth_right': face['keypoints']['mouth_right']
#         }
#     }
    
#     return features

# # Function to process video and extract features
# def process_video(video_path):
#     """
#     Process video file and extract facial features
#     Args:
#         video_path: Path to video file
#     Returns:
#         video_features: Dictionary containing features for each frame
#     """
#     # Extract frames
#     frames = extract_frames(video_path)
    
#     # Extract features from each frame
#     video_features = []
#     for frame in frames:
#         features = extract_face_features(frame)
#         if features is not None:
#             video_features.append(features)
            
#     return video_features

# # Test on a sample video
# video_path = train_df['video_clip_path'][0]
# features = process_video(video_path)

# print(f"Extracted features from {len(features)} frames")
# print("\nSample features from first frame:")
# print(f"Face confidence: {features[0]['confidence']:.2f}")
# print(f"Face box: {features[0]['box']}")
# print(f"Facial keypoints: {features[0]['keypoints']}")

In [None]:
%pip install tensorflow
%pip install opencv-python
%pip install mtcnn

import cv2
import numpy as np
from mtcnn import MTCNN
import tensorflow as tf

# Initialize MTCNN detector
detector = MTCNN()

In [None]:
def extract_frames(video_path, sample_rate=0.5):
    """
    Extract frames from video at given sample rate
    Args:
        video_path: Path to video file
        sample_rate: Extract frame every sample_rate seconds
    Returns:
        frames: List of extracted frames converted from BGR to RGB
            (empty when the capture cannot be opened or yields no frames)
    """
    frames = []
    cap = cv2.VideoCapture(video_path)
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        # int(fps * sample_rate) is 0 when no FPS is reported or when
        # sample_rate is shorter than one frame; a zero step would make the
        # modulo below raise ZeroDivisionError, so fall back to every frame.
        frame_interval = max(1, int(fps * sample_rate)) if fps > 0 else 1

        frame_count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count % frame_interval == 0:
                # Convert BGR to RGB
                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

            frame_count += 1
    finally:
        # Release the capture handle even if reading/conversion raises
        cap.release()
    return frames

In [None]:
def extract_frames(video_path, sample_rate=0.5):
    """
    Extract frames from video at given sample rate
    Args:
        video_path: Path to video file
        sample_rate: Extract frame every sample_rate seconds
    Returns:
        frames: List of extracted frames converted from BGR to RGB
            (empty when the capture cannot be opened or yields no frames)
    """
    frames = []
    cap = cv2.VideoCapture(video_path)
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        # int(fps * sample_rate) is 0 when no FPS is reported or when
        # sample_rate is shorter than one frame; a zero step would make the
        # modulo below raise ZeroDivisionError, so fall back to every frame.
        frame_interval = max(1, int(fps * sample_rate)) if fps > 0 else 1

        frame_count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count % frame_interval == 0:
                # Convert BGR to RGB
                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

            frame_count += 1
    finally:
        # Release the capture handle even if reading/conversion raises
        cap.release()
    return frames

In [None]:
def extract_face_features(frame):
    """
    Run the MTCNN detector on one frame and describe its largest face
    Args:
        frame: Input image frame
    Returns:
        features: Dictionary with the detection confidence, bounding box,
            keypoints and derived geometric measurements, or None when the
            detector reports no face
    """
    def _distance(point_a, point_b):
        # Euclidean distance between two (x, y) keypoints
        return np.sqrt((point_a[0] - point_b[0])**2 +
                       (point_a[1] - point_b[1])**2)

    detections = detector.detect_faces(frame)
    if len(detections) == 0:
        return None

    # Keep the detection with the largest box area (box[2] * box[3])
    largest = max(detections, key=lambda det: det['box'][2] * det['box'][3])
    bounding_box = largest['box']
    landmarks = largest['keypoints']

    geometry = {
        # Distance between the two eye keypoints (not an aspect ratio)
        'eye_distance': _distance(landmarks['left_eye'], landmarks['right_eye']),
        # Distance between the mouth corners (width, not openness)
        'mouth_width': _distance(landmarks['mouth_left'], landmarks['mouth_right']),
        'face_height': bounding_box[3],
        'face_width': bounding_box[2],
    }

    return {
        'confidence': largest['confidence'],
        'box': bounding_box,
        'keypoints': landmarks,
        'geometric_features': geometry,
    }

In [None]:
def process_video(video_path):
    """
    Sample a video, collect face features per frame and summarise them over time
    Args:
        video_path: Path to video file
    Returns:
        video_features: Dictionary with 'frame_features' (frames where a face
            was found), 'temporal_features' (None unless at least two such
            frames exist) and 'num_frames' (count of frames with a face)
    """
    # Frames are extracted up front, then each one is run through face detection
    detections = (extract_face_features(frame) for frame in extract_frames(video_path))
    frame_features = [found for found in detections if found is not None]

    # Cross-frame statistics need at least one pair of frames
    temporal_features = None
    if len(frame_features) > 1:
        temporal_features = {
            'confidence_mean': np.mean([f['confidence'] for f in frame_features]),
            'face_movement': calculate_movement(frame_features),
            'expression_change': calculate_expression_change(frame_features)
        }

    return {
        'frame_features': frame_features,
        'temporal_features': temporal_features,
        'num_frames': len(frame_features)
    }

def calculate_movement(frame_features):
    """Mean distance travelled by the face-box centre between consecutive frames (0 if no pairs)"""
    def _center(box):
        # box[0], box[1] are offset by half of box[2], box[3] to get the centre point
        return [box[0] + box[2]/2, box[1] + box[3]/2]

    steps = []
    for earlier, later in zip(frame_features, frame_features[1:]):
        before = _center(earlier['box'])
        after = _center(later['box'])
        steps.append(np.sqrt((before[0] - after[0])**2 +
                             (before[1] - after[1])**2))

    return np.mean(steps) if steps else 0

def calculate_expression_change(frame_features):
    """Mean absolute change in mouth width plus eye distance between consecutive frames (0 if no pairs)"""
    tracked = ('mouth_width', 'eye_distance')
    deltas = []
    for earlier, later in zip(frame_features, frame_features[1:]):
        before = earlier['geometric_features']
        after = later['geometric_features']
        # Sum the absolute per-measurement differences for this frame pair
        deltas.append(sum(abs(before[name] - after[name]) for name in tracked))

    return np.mean(deltas) if deltas else 0

In [None]:
# Function to organize features from video processing
def organize_video_features(video_features):
    """
    Organize raw video features into structured format
    Args:
        video_features: Dictionary containing raw features from video
            processing ('frame_features' and 'temporal_features'; the latter
            may be None)
    Returns:
        organized_features: Dictionary with 'per_frame' lists and
            'video_level' aggregates
    """
    # Extract per-frame features
    frame_features = video_features['frame_features']

    # Calculate per-frame metrics
    per_frame_features = {
        'confidence_scores': [f['confidence'] for f in frame_features],
        'facial_landmarks': [f['keypoints'] for f in frame_features],
        'geometric_features': [f['geometric_features'] for f in frame_features]
    }

    # process_video() leaves temporal_features as None when fewer than two
    # face frames were found; subscripting None would raise TypeError. Use 0
    # instead, the same value the movement/change helpers return when there
    # are no frame pairs.
    temporal_features = video_features['temporal_features'] or {}

    # Calculate video-level features
    video_level_features = {
        'avg_confidence': np.mean(per_frame_features['confidence_scores']),
        'expression_intensity': calculate_expression_intensity(frame_features),
        'movement_amount': temporal_features.get('face_movement', 0),
        'expression_changes': temporal_features.get('expression_change', 0)
    }

    return {
        'per_frame': per_frame_features,
        'video_level': video_level_features
    }

def calculate_expression_intensity(frame_features):
    """Average, over frames, of the mean of mouth-width and eye-distance ratios to face width"""
    def _frame_intensity(geom):
        # Both measurements are scaled by face width, then averaged
        return (geom['mouth_width'] / geom['face_width'] +
                geom['eye_distance'] / geom['face_width']) / 2

    return np.mean([_frame_intensity(entry['geometric_features'])
                    for entry in frame_features])

# Function to combine video and text features
def combine_features(video_features, text_features):
    """
    Merge organized video features with text features into one flat record
    Args:
        video_features: Dictionary containing video features (its
            'video_level' entry is read)
        text_features: Dictionary containing text features
    Returns:
        combined_features: Dictionary with video, text and temporal features;
            movement and expression change are divided by utterance duration
    """
    duration = text_features['utterance_duration']
    video_level = video_features['video_level']

    # Expression intensity is passed through; the two change measures are
    # expressed per unit of utterance duration
    intensity = video_level['expression_intensity']
    movement_rate = video_level['movement_amount'] / duration
    change_rate = video_level['expression_changes'] / duration

    # Video features
    combined_features = {
        'visual_confidence': video_level['avg_confidence'],
        'expression_intensity': intensity,
        'movement_amount': movement_rate,
        'expression_changes': change_rate,
    }

    # Text features
    combined_features['text_sentiment'] = text_features['sentiment']
    for name in ('speaking_rate', 'utterance_duration'):
        combined_features[name] = text_features[name]

    # Temporal features
    for name in ('pause_after', 'is_rapid_exchange'):
        combined_features[name] = text_features[name]

    return combined_features

# Process videos and combine features
def process_all_videos(df):
    """
    Build one combined feature record per row whose video yields a face
    Args:
        df: DataFrame containing video paths and text features
    Returns:
        all_features: List of combined features; rows with no detected face
            frames are left out
    """
    # Keys expected by combine_features mapped to the DataFrame columns they come from
    text_columns = {
        'sentiment': 'Sentiment',
        'speaking_rate': 'speaking_rate',
        'utterance_duration': 'utterance_duration',
        'pause_after': 'pause_after',
        'is_rapid_exchange': 'is_rapid_exchange'
    }
    all_features = []

    for idx, row in df.iterrows():
        video_features = process_video(row['video_clip_path'])

        # Only rows with at least one detected face frame are processed further
        if video_features['num_frames'] != 0:
            organized_video_features = organize_video_features(video_features)
            text_features = {name: row[column] for name, column in text_columns.items()}
            all_features.append(combine_features(organized_video_features, text_features))

            # Progress is reported on the row's index label, for processed rows only
            if idx % 100 == 0:
                print(f"Processed {idx} videos...")

    return all_features

# Smoke test: run the pipeline on the first five training rows
sample_df = train_df.head(5)
sample_features = process_all_videos(sample_df)

# Show the first combined record, or a hint when nothing was extracted
print("\nExample of combined features:")
if not sample_features:
    print("No features extracted. Please check your video processing pipeline.")
else:
    for key, value in sample_features[0].items():
        print(f"{key}: {value}")

In [None]:
class VideoProcessor:
    """
    Sample frames from a video, measure the largest face in each frame with
    MTCNN, and summarise how those measurements change across frames.

    The whole class lives in a single cell: a cell that starts with an
    indented ``def`` is a syntax error on its own and would never attach its
    methods to the class.
    """

    def __init__(self, sample_rate=0.5):
        """
        Initialize video processor
        Args:
            sample_rate: Extract frame every sample_rate seconds
        """
        self.sample_rate = sample_rate
        self.detector = MTCNN()  # Initialize MTCNN detector

    def load_video(self, video_path):
        """Load video and return the sampled frames converted from BGR to RGB"""
        frames = []
        cap = cv2.VideoCapture(video_path)
        try:
            fps = cap.get(cv2.CAP_PROP_FPS)
            # A zero interval (no FPS reported, or sample_rate shorter than
            # one frame) would make the modulo below raise ZeroDivisionError;
            # fall back to keeping every frame.
            frame_interval = max(1, int(fps * self.sample_rate)) if fps > 0 else 1

            frame_count = 0
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                if frame_count % frame_interval == 0:
                    frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

                frame_count += 1
        finally:
            # Release the capture handle even if reading/conversion raises
            cap.release()
        return frames

    def detect_faces(self, frame):
        """Detect faces and return features of the largest one (None if no face)"""
        faces = self.detector.detect_faces(frame)

        if len(faces) == 0:
            return None

        # Get largest face by box area (box[2] * box[3])
        face = max(faces, key=lambda x: x['box'][2] * x['box'][3])

        # Extract geometric features
        box = face['box']
        keypoints = face['keypoints']

        # Calculate facial measurements
        eye_distance = self.calculate_eye_distance(keypoints)
        mouth_width = self.calculate_mouth_width(keypoints)

        features = {
            'confidence': face['confidence'],
            'box': box,
            'keypoints': keypoints,
            'measurements': {
                'eye_distance': eye_distance,
                'mouth_width': mouth_width,
                'face_height': box[3],
                'face_width': box[2]
            }
        }

        return features

    def calculate_eye_distance(self, keypoints):
        """Calculate distance between eyes"""
        left_eye = keypoints['left_eye']
        right_eye = keypoints['right_eye']
        return np.sqrt((left_eye[0] - right_eye[0])**2 +
                      (left_eye[1] - right_eye[1])**2)

    def calculate_mouth_width(self, keypoints):
        """Calculate mouth width"""
        mouth_left = keypoints['mouth_left']
        mouth_right = keypoints['mouth_right']
        return np.sqrt((mouth_left[0] - mouth_right[0])**2 +
                      (mouth_left[1] - mouth_right[1])**2)

    def extract_temporal_features(self, frame_features):
        """Extract temporal features across frames (None with fewer than two frames)"""
        if len(frame_features) < 2:
            return None

        temporal_features = {
            'confidence_mean': np.mean([f['confidence'] for f in frame_features]),
            'movement': self.calculate_movement(frame_features),
            'expression_change': self.calculate_expression_change(frame_features)
        }

        return temporal_features

    def calculate_movement(self, frame_features):
        """Calculate mean movement of the face-box centre between frames"""
        movements = []
        for i in range(1, len(frame_features)):
            prev = frame_features[i-1]['box']
            curr = frame_features[i]['box']

            prev_center = [prev[0] + prev[2]/2, prev[1] + prev[3]/2]
            curr_center = [curr[0] + curr[2]/2, curr[1] + curr[3]/2]

            movement = np.sqrt((prev_center[0] - curr_center[0])**2 +
                             (prev_center[1] - curr_center[1])**2)
            movements.append(movement)

        return np.mean(movements) if movements else 0

    def calculate_expression_change(self, frame_features):
        """
        Calculate mean change in mouth width plus eye distance between frames

        extract_temporal_features() calls this method, but the class never
        defined it. It reads the 'measurements' entry that detect_faces()
        produces.
        """
        changes = []
        for i in range(1, len(frame_features)):
            prev = frame_features[i-1]['measurements']
            curr = frame_features[i]['measurements']

            mouth_change = abs(prev['mouth_width'] - curr['mouth_width'])
            eye_change = abs(prev['eye_distance'] - curr['eye_distance'])
            changes.append(mouth_change + eye_change)

        return np.mean(changes) if changes else 0

In [None]:
class FeatureIntegrator:
    """
    Run a VideoProcessor over a clip and merge the result with text features.

    NOTE(review): a later cell in this notebook defines another class with
    the same name; running that cell rebinds ``FeatureIntegrator``.
    """

    def __init__(self):
        self.video_processor = VideoProcessor()

    def process_video(self, video_path):
        """Process video and return per-frame and temporal face features"""
        # Load and process video
        frames = self.video_processor.load_video(video_path)

        # Extract features from each frame, keeping only frames with a face
        frame_features = []
        for frame in frames:
            features = self.video_processor.detect_faces(frame)
            if features is not None:
                frame_features.append(features)

        # Extract temporal features
        temporal_features = self.video_processor.extract_temporal_features(frame_features)

        return {
            'frame_features': frame_features,
            'temporal_features': temporal_features
        }

    def combine_features(self, video_features, text_features):
        """
        Combine video and text features into one flat dictionary

        Movement and expression change are divided by the utterance duration.
        When 'temporal_features' is None (as returned by process_video for
        fewer than two face frames) both are reported as 0, and with no face
        frames at all the visual confidence is 0.0, instead of raising.
        """
        # Normalize features by duration
        duration = text_features['utterance_duration']

        confidences = [f['confidence'] for f in video_features['frame_features']]
        # None here used to fail with "'NoneType' object is not subscriptable"
        temporal_features = video_features['temporal_features'] or {}

        combined_features = {
            # Video features
            'visual_confidence': np.mean(confidences) if confidences else 0.0,
            'movement_amount': temporal_features.get('movement', 0) / duration,
            'expression_change': temporal_features.get('expression_change', 0) / duration,

            # Text features
            'text_sentiment': text_features['sentiment'],
            'speaking_rate': text_features['speaking_rate'],

            # Temporal features
            'duration': duration,
            'pause_after': text_features['pause_after']
        }

        return combined_features

4. Audio Processing

In [None]:
%pip install setuptools==57.5.0
%pip install praat-parselmouth
%pip install librosa
%pip install soundfile  parselmouth tqdm pandas
%pip uninstall moviepy
%pip install moviepy==1.0.3


import librosa
import soundfile as sf
from moviepy.editor import VideoFileClip
import numpy as np
import parselmouth
from parselmouth.praat import call
import os
import warnings
from tqdm import tqdm
import pandas as pd

def load_data():
    """
    Read the train and test CSVs and attach each row's video clip path

    Returns:
        (train_df, test_df) on success. If anything raises, the error is
        printed and (None, None) is returned.
    """
    def _train_clip(x):
        # Clip file name is built from the row's dialogue and utterance IDs
        return f"train/train_prop_sent_data3/dia{x['Dialogue_ID']}_utt{x['Utterance_ID']}.mp4"

    def _test_clip(x):
        return f"test/test_prop_sent_data3/dia{x['Dialogue_ID']}_utt{x['Utterance_ID']}.mp4"

    try:
        train_df = pd.read_csv('train/train_prop_sent_csv3_final.csv', encoding='ISO-8859-1')
        test_df = pd.read_csv('test/test_prop_sent_csv3_final.csv', encoding='ISO-8859-1')

        # Add video paths
        train_df['video_clip_path'] = train_df.apply(_train_clip, axis=1)
        test_df['video_clip_path'] = test_df.apply(_test_clip, axis=1)
    except Exception as e:
        print(f"Error loading data: {str(e)}")
        return None, None

    return train_df, test_df

class AudioProcessor:
    """
    Extract a video's audio track to a temporary WAV file and compute pitch,
    energy and harmonics-to-noise features from it.
    """

    def __init__(self, sample_rate=16000, temp_dir='temp_audio'):
        """
        Args:
            sample_rate: Rate passed to librosa when loading the extracted audio
            temp_dir: Directory for the temporary WAV files (created if missing)
        """
        self.sample_rate = sample_rate
        self.temp_dir = temp_dir
        # exist_ok avoids the check-then-create race of exists() + makedirs()
        os.makedirs(temp_dir, exist_ok=True)
        # NOTE: this silences warnings for the whole process, not just this class
        warnings.filterwarnings('ignore')

    def extract_audio_from_video(self, video_path):
        """
        Extract audio from video file

        Returns:
            (y, sr) as loaded by librosa, or (None, None) when the clip has no
            audio track or any step raises (the error is printed).
        """
        try:
            temp_path = os.path.join(self.temp_dir, f"temp_{os.path.basename(video_path)}.wav")
            # An existing WAV from an earlier call is reused
            if not os.path.exists(temp_path):
                video = VideoFileClip(video_path)
                try:
                    audio = video.audio
                    if audio is None:
                        return None, None
                    audio.write_audiofile(temp_path, logger=None)
                finally:
                    # Close the clip on every path, including "no audio
                    # track" and write failures, so it is not leaked
                    video.close()
            y, sr = librosa.load(temp_path, sr=self.sample_rate)
            return y, sr
        except Exception as e:
            print(f"Error processing {video_path}: {str(e)}")
            return None, None

    def extract_features(self, y, sr):
        """
        Extract all audio features

        Returns:
            Dictionary with pitch_mean, pitch_std, energy_mean, energy_std and
            hnr. If extraction raises, the error is printed and every feature
            is 0.
        """
        try:
            # Basic features
            sound = parselmouth.Sound(y, sr)
            pitch = sound.to_pitch()
            pitch_values = pitch.selected_array['frequency']
            # Zero-frequency entries are dropped before computing statistics
            pitch_values = pitch_values[pitch_values != 0]

            # Energy features
            rms = librosa.feature.rms(y=y)[0]

            # Voice quality
            harmonicity = call(sound, "To Harmonicity (cc)", 0.01, 75, 0.1, 1.0)
            hnr = call(harmonicity, "Get mean", 0, 0)

            return {
                'pitch_mean': float(np.mean(pitch_values)) if len(pitch_values) > 0 else 0,
                'pitch_std': float(np.std(pitch_values)) if len(pitch_values) > 0 else 0,
                'energy_mean': float(np.mean(rms)),
                'energy_std': float(np.std(rms)),
                'hnr': float(hnr)
            }

        except Exception as e:
            # Still best-effort, but no longer swallows KeyboardInterrupt /
            # SystemExit and no longer hides why the features are all zero
            print(f"Error extracting audio features: {str(e)}")
            return {k:0 for k in ['pitch_mean', 'pitch_std', 'energy_mean', 'energy_std', 'hnr']}

    def process_audio(self, video_path):
        """Process single video and extract features (None if no audio could be loaded)"""
        y, sr = self.extract_audio_from_video(video_path)
        if y is None:
            return None
        return self.extract_features(y, sr)

    def cleanup(self):
        """Clean up temp files by removing the whole temp directory"""
        import shutil
        if os.path.exists(self.temp_dir):
            shutil.rmtree(self.temp_dir)

def process_dataset(df, batch_size=32):
    """
    Compute audio features for every row of df

    Rows are visited in slices of batch_size (one progress tick per slice).
    A row whose audio cannot be processed gets an all-zero feature record.
    The processor's temp directory is removed even if processing raises.
    Returns a DataFrame with one row of features per input row.
    """
    feature_names = ['pitch_mean', 'pitch_std', 'energy_mean', 'energy_std', 'hnr']
    audio_processor = AudioProcessor()
    collected = []

    try:
        for start in tqdm(range(0, len(df), batch_size)):
            chunk = df.iloc[start:start+batch_size]
            for _, row in chunk.iterrows():
                extracted = audio_processor.process_audio(row['video_clip_path'])
                if extracted is None:
                    extracted = {k:0 for k in feature_names}
                collected.append(extracted)
    finally:
        audio_processor.cleanup()

    return pd.DataFrame(collected)


Feature Engineering

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.ensemble import VotingClassifier
import xgboost as xgb

class FeatureIntegrator:
    """Derive text, speaker and dialogue-context features from the utterance table."""

    def __init__(self):
        """Initialize feature integrator"""
        self.text_features = None
        self.video_features = None
        self.audio_features = None

    def extract_all_features(self, df):
        """
        Extract text, speaker and context features

        Side effect: adds 'start_time', 'end_time', 'utterance_duration',
        'word_count' and 'speaking_rate' columns to df in place.

        Returns:
            Dictionary with 'text', 'speaker' and 'context' entries, each a
            dictionary of Series aligned with df's index.
        """
        # Calculate duration and speaking rate
        # NOTE(review): assumes StartTime/EndTime parse with pd.to_datetime
        # and that durations are non-zero - confirm against the CSV.
        df['start_time'] = pd.to_datetime(df['StartTime'])
        df['end_time'] = pd.to_datetime(df['EndTime'])
        df['utterance_duration'] = (df['end_time'] - df['start_time']).dt.total_seconds()
        df['word_count'] = df['Utterance'].str.split().str.len()
        df['speaking_rate'] = df['word_count'] / df['utterance_duration']

        # Text Features
        text_features = {
            'text_sentiment': df['Sentiment'],
            'exclamation_count': df['Utterance'].str.count('!'),
            'question_count': df['Utterance'].str.count(r'\?'),
            'ellipsis_count': df['Utterance'].str.count(r'\.\.\.'),
            'word_count': df['word_count'],
            'speaking_rate': df['speaking_rate']
        }

        # Speaker Features
        speaker_features = {
            'speaker_total_utterances': df.groupby('Speaker')['Utterance'].transform('count'),
            'speaker_sentiment_ratio': df.groupby('Speaker')['Sentiment'].transform(
                lambda x: (x == 'positive').mean()
            )
        }

        # Context Features
        dialogue_groups = df.groupby('Dialogue_ID')
        context_features = {
            'prev_sentiment': dialogue_groups['Sentiment'].shift(1),
            'prev_speaker': dialogue_groups['Speaker'].shift(1),
            # cumcount() is indexed like df, whereas groupby().size() is
            # indexed by Dialogue_ID; dividing those aligns on unrelated
            # labels and yields NaN/wrong values. transform('size') gives the
            # dialogue length on df's own index.
            'relative_position': dialogue_groups.cumcount() /
                               dialogue_groups['Utterance'].transform('size')
        }

        return {
            'text': text_features,
            'speaker': speaker_features,
            'context': context_features
        }

class SentimentClassifier:
    """Soft-voting ensemble of two XGBoost models over hand-built text and speaker features."""

    def __init__(self):
        self.scaler = StandardScaler()
        # Speaker names seen in the training frame; None until
        # train_with_cv() has run.
        self.speaker_categories_ = None

        # Create base models
        self.model = VotingClassifier(
            estimators=[
                ('xgb1', xgb.XGBClassifier(
                    n_estimators=200, max_depth=5, learning_rate=0.1,
                    subsample=0.8, colsample_bytree=0.8, random_state=42,
                    use_label_encoder=False, eval_metric='mlogloss'
                )),
                ('xgb2', xgb.XGBClassifier(
                    n_estimators=200, max_depth=7, learning_rate=0.05,
                    subsample=0.7, colsample_bytree=0.7, random_state=43,
                    use_label_encoder=False, eval_metric='mlogloss'
                ))
            ],
            weights=[0.6, 0.4],
            voting='soft'
        )

    def _fit_speaker_vocabulary(self, df):
        """Remember the speaker names in df so they map to stable integer codes"""
        self.speaker_categories_ = pd.Categorical(df['Speaker']).categories

    def prepare_features(self, df):
        """
        Prepare features for training/prediction

        Returns a 2-D array with, per row: counts of '!', '?' and '...',
        word count, character count, speaker code and the speaker's number
        of utterances within df.
        """
        features = []

        # Text features
        features.extend([
            df['Utterance'].str.count('!'),
            df['Utterance'].str.count(r'\?'),
            df['Utterance'].str.count(r'\.\.\.'),
            df['Utterance'].str.split().str.len(),
            df['Utterance'].str.len()
        ])

        # Encoding speakers separately per frame gives the same speaker a
        # different integer in train and test. Once a vocabulary has been
        # fitted, reuse it; speakers not seen in training get code -1.
        # Without a fitted vocabulary the codes are derived from df itself.
        categories = getattr(self, 'speaker_categories_', None)
        if categories is None:
            speaker_codes = pd.Categorical(df['Speaker']).codes
        else:
            speaker_codes = pd.Categorical(df['Speaker'], categories=categories).codes

        # Speaker features
        # NOTE(review): the utterance count is still computed within df, so
        # its scale differs between the train and test frames.
        features.extend([
            speaker_codes,
            df.groupby('Speaker')['Utterance'].transform('count')
        ])

        # Convert to numpy array
        X = np.column_stack(features)
        return X

    def train_with_cv(self, train_df, n_splits=5):
        """
        Train with cross-validation

        Prints a classification report per fold and the average weighted F1,
        then refits the scaler and the model on all of train_df.
        """
        self._fit_speaker_vocabulary(train_df)
        X = self.prepare_features(train_df)
        y = train_df['Sentiment'].values

        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
        val_scores = []

        for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
            X_train, X_val = X[train_idx], X[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]

            # The scaler is fitted on the training fold only
            X_train = self.scaler.fit_transform(X_train)
            X_val = self.scaler.transform(X_val)

            self.model.fit(X_train, y_train)
            val_pred = self.model.predict(X_val)

            print(f"\nFold {fold + 1} Validation Report:")
            print(classification_report(y_val, val_pred))
            val_scores.append(classification_report(y_val, val_pred, output_dict=True)['weighted avg']['f1-score'])

        print(f"\nAverage F1 Score: {np.mean(val_scores):.3f}")

        # Train final model
        X = self.scaler.fit_transform(X)
        self.model.fit(X, y)

    def predict(self, test_df):
        """Generate predictions for test data using the fitted scaler and model"""
        X_test = self.prepare_features(test_df)
        X_test = self.scaler.transform(X_test)
        return self.model.predict(X_test)

def main():
    """Train the classifier on the training CSV and write test-set predictions to submission.csv"""
    print("Loading data...")
    csv_options = {'encoding': 'ISO-8859-1'}
    train_df = pd.read_csv('train/train_prop_sent_csv3_final.csv', **csv_options)
    test_df = pd.read_csv('test/test_prop_sent_csv3_final.csv', **csv_options)

    # Cross-validated training followed by a final fit on all training rows
    print("\nTraining model...")
    sentiment_model = SentimentClassifier()
    sentiment_model.train_with_cv(train_df)

    print("\nGenerating predictions...")
    predicted = sentiment_model.predict(test_df)

    # One row per test utterance, keyed by its 'Sr No.'
    submission = pd.DataFrame({'Sr No.': test_df['Sr No.'], 'Sentiment': predicted})
    submission.to_csv('submission.csv', index=False)
    print("\nSubmission saved!")

    print("\nPredicted class distribution:")
    print(pd.Series(predicted).value_counts())


if __name__ == "__main__":
    main()

In [None]:
# Text Model
class TextModel:
    """Selects the text-modality feature columns from a DataFrame."""

    def __init__(self):
        punctuation_columns = ['exclamation_count', 'question_count', 'ellipsis_count']
        timing_columns = ['speaking_rate', 'utterance_duration', 'pause_after']
        speaker_columns = ['speaker_positive_ratio', 'speaker_negative_ratio']
        # Column order here is the column order of the returned array
        self.text_features = [*punctuation_columns, *timing_columns, *speaker_columns]

    def prepare_features(self, df):
        """Return the text feature columns of df as an array"""
        selected = df[self.text_features]
        return selected.values

# Video Model
class VideoModel:
    """Selects the video-modality feature columns from a DataFrame."""

    def __init__(self):
        # NOTE(review): the combine_features helpers in this notebook emit
        # 'visual_confidence', not 'face_confidence' - confirm which column
        # the input frame actually carries.
        confidence_columns = ['face_confidence']
        dynamics_columns = ['expression_intensity', 'movement_amount', 'expression_changes']
        self.video_features = [*confidence_columns, *dynamics_columns]

    def prepare_features(self, df):
        """Return the video feature columns of df as an array"""
        selected = df[self.video_features]
        return selected.values

# Audio Model
class AudioModel:
    """Selects the audio-modality feature columns from a DataFrame."""

    def __init__(self):
        # NOTE(review): 'speech_rate', 'jitter' and 'shimmer' are not among
        # the features AudioProcessor.extract_features returns - confirm
        # where these columns are produced.
        prosody_columns = ['pitch_mean', 'energy_mean', 'speech_rate']
        voice_quality_columns = ['hnr', 'jitter', 'shimmer']
        self.audio_features = [*prosody_columns, *voice_quality_columns]

    def prepare_features(self, df):
        """Return the audio feature columns of df as an array"""
        selected = df[self.audio_features]
        return selected.values

In [None]:
class EarlyFusionModel:
    """Feature-level fusion: concatenates text, video and audio features column-wise."""

    def __init__(self):
        self.text_model = TextModel()
        self.video_model = VideoModel()
        self.audio_model = AudioModel()

    def prepare_features(self, df):
        """Return one array with the text, video and audio feature columns side by side"""
        # Each modality is prepared in turn, in text -> video -> audio order
        per_modality = [
            getattr(self, attribute).prepare_features(df)
            for attribute in ('text_model', 'video_model', 'audio_model')
        ]
        return np.concatenate(per_modality, axis=1)

In [None]:
class LateFusionModel:
    """
    Decision-level fusion: one classifier per modality, combined as a
    weighted sum of their predicted class probabilities.
    """

    def __init__(self, text_classifier=None, video_classifier=None,
                 audio_classifier=None, weights=(0.4, 0.3, 0.3)):
        """
        Args:
            text_classifier, video_classifier, audio_classifier: Estimators
                exposing fit(X, y) and predict_proba(X). They default to
                None and can also be assigned as attributes later, but must
                be set before training or prediction.
            weights: (text, video, audio) weights for the probability sum
        """
        self.text_model = TextModel()
        self.video_model = VideoModel()
        self.audio_model = AudioModel()

        # Individual classifiers for each modality
        self.text_classifier = text_classifier
        self.video_classifier = video_classifier
        self.audio_classifier = audio_classifier
        self.weights = weights

    def _require_classifier(self, attribute):
        """Return the named classifier, failing with a clear message if unset"""
        classifier = getattr(self, attribute, None)
        if classifier is None:
            # AttributeError is what calling a method on the unset (None)
            # classifier raised before; only the message is new.
            raise AttributeError(
                f"LateFusionModel.{attribute} is not set; pass it to the "
                f"constructor or assign it before training/prediction"
            )
        return classifier

    def train_individual_models(self, df):
        """Fit each modality's classifier on its own features and df['Sentiment']"""
        # Train text model
        text_features = self.text_model.prepare_features(df)
        self._require_classifier('text_classifier').fit(text_features, df['Sentiment'])

        # Train video model
        video_features = self.video_model.prepare_features(df)
        self._require_classifier('video_classifier').fit(video_features, df['Sentiment'])

        # Train audio model
        audio_features = self.audio_model.prepare_features(df)
        self._require_classifier('audio_classifier').fit(audio_features, df['Sentiment'])

    def predict(self, df):
        """Return the weighted sum of per-modality class probabilities (not class labels)"""
        # Get predictions from each model
        text_pred = self._require_classifier('text_classifier').predict_proba(
            self.text_model.prepare_features(df)
        )
        video_pred = self._require_classifier('video_classifier').predict_proba(
            self.video_model.prepare_features(df)
        )
        audio_pred = self._require_classifier('audio_classifier').predict_proba(
            self.audio_model.prepare_features(df)
        )

        # Combine predictions as a weighted sum
        text_weight, video_weight, audio_weight = self.weights
        final_pred = (text_weight * text_pred +
                     video_weight * video_pred +
                     audio_weight * audio_pred)

        return final_pred

In [None]:
class HybridFusionModel:
    """
    Hybrid fusion: text and video features are concatenated and scored by an
    intermediate classifier, whose probabilities are then blended with an
    audio classifier's probabilities.
    """

    def __init__(self, intermediate_classifier=None, audio_classifier=None,
                 weights=(0.7, 0.3)):
        """
        Args:
            intermediate_classifier: Estimator with predict_proba over the
                concatenated text+video features
            audio_classifier: Estimator with predict_proba over audio features
            weights: (intermediate, audio) weights for the probability blend

        Both classifiers default to None and can also be assigned as
        attributes later; they are expected to be fitted by the caller,
        since this class only calls predict_proba on them.
        """
        self.text_model = TextModel()
        self.video_model = VideoModel()
        self.audio_model = AudioModel()

        # These attributes were read by prepare_features() but never defined
        self.intermediate_classifier = intermediate_classifier
        self.audio_classifier = audio_classifier
        self.weights = weights

    def _require_classifier(self, attribute):
        """Return the named classifier, failing with a clear message if unset"""
        classifier = getattr(self, attribute, None)
        if classifier is None:
            # Same exception type as the missing-attribute failure this replaces
            raise AttributeError(
                f"HybridFusionModel.{attribute} is not set; pass it to the "
                f"constructor or assign it before calling prepare_features"
            )
        return classifier

    def prepare_features(self, df):
        """Return the blended class probabilities for df (despite the name, not raw features)"""
        intermediate_classifier = self._require_classifier('intermediate_classifier')
        audio_classifier = self._require_classifier('audio_classifier')

        # Low-level feature fusion
        text_features = self.text_model.prepare_features(df)
        video_features = self.video_model.prepare_features(df)

        # Combine text and video features
        text_video_features = np.concatenate([
            text_features,
            video_features
        ], axis=1)

        # Score the fused features with the intermediate model (no training here)
        intermediate_pred = intermediate_classifier.predict_proba(
            text_video_features
        )

        # Get audio predictions
        audio_features = self.audio_model.prepare_features(df)
        audio_pred = audio_classifier.predict_proba(
            audio_features
        )

        # Late fusion of intermediate and audio predictions
        intermediate_weight, audio_weight = self.weights
        final_pred = intermediate_weight * intermediate_pred + audio_weight * audio_pred

        return final_pred

In [None]:
from sklearn.metrics import accuracy_score, f1_score

In [None]:
def train_model(model, train_df):
    """
    Fit model on its own prepared features and the 'Sentiment' column

    Returns the same model object after calling its fit().
    """
    features = model.prepare_features(train_df)
    targets = train_df['Sentiment']
    model.fit(features, targets)
    return model

def predict_test(model, test_df):
    """
    Predict on test data using the model's own feature preparation

    Returns whatever model.predict() returns for the prepared features.
    """
    features = model.prepare_features(test_df)
    return model.predict(features)

def evaluate_model(model, val_df):
    """
    Score model on a validation set

    Returns:
        (accuracy, f1) where f1 is the weighted-average F1 score
    """
    features = model.prepare_features(val_df)
    expected = val_df['Sentiment']
    predicted = model.predict(features)

    return (
        accuracy_score(expected, predicted),
        f1_score(expected, predicted, average='weighted'),
    )

In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.ensemble import VotingClassifier
from sklearn.utils import resample
import xgboost as xgb
import numpy as np
import pandas as pd

class SentimentClassifier:
    """Soft-voting ensemble of three XGBoost classifiers on surface text features.

    Features are simple statistics of the ``Utterance`` column (punctuation
    counts, word/character counts, mean word length). They are standardised
    with a ``StandardScaler`` and fed to a weighted ``VotingClassifier``.
    """

    # Column order of the matrix returned by ``prepare_features``.
    FEATURE_COLS = [
        'exclamation_count', 'question_count', 'ellipsis_count',
        'word_count', 'char_count', 'avg_word_length'
    ]

    def __init__(self):
        """Create the scaler, the three base XGBoost models and the ensemble."""
        self.scaler = StandardScaler()

        # Create base models: progressively deeper trees with smaller learning
        # rates and stronger row/column subsampling.
        # NOTE(review): `use_label_encoder` appears to be deprecated in recent
        # XGBoost releases -- confirm against the installed version.
        self.xgb1 = xgb.XGBClassifier(
            n_estimators=200, max_depth=5, learning_rate=0.1,
            subsample=0.8, colsample_bytree=0.8,
            random_state=42, use_label_encoder=False, eval_metric='mlogloss'
        )

        self.xgb2 = xgb.XGBClassifier(
            n_estimators=200, max_depth=7, learning_rate=0.05,
            subsample=0.7, colsample_bytree=0.7,
            random_state=43, use_label_encoder=False, eval_metric='mlogloss'
        )

        self.xgb3 = xgb.XGBClassifier(
            n_estimators=200, max_depth=9, learning_rate=0.01,
            subsample=0.6, colsample_bytree=0.6,
            random_state=44, use_label_encoder=False, eval_metric='mlogloss'
        )

        # Create weighted ensemble (soft voting averages predicted probabilities)
        self.model = VotingClassifier(
            estimators=[
                ('xgb1', self.xgb1),
                ('xgb2', self.xgb2),
                ('xgb3', self.xgb3)
            ],
            weights=[0.4, 0.3, 0.3],
            voting='soft'
        )

    @staticmethod
    def _avg_word_length(words):
        """Mean length of the tokens in ``words``; 0.0 when there are none."""
        # np.mean([]) would return NaN and emit a RuntimeWarning.
        return float(np.mean([len(w) for w in words])) if words else 0.0

    def prepare_features(self, df):
        """Build the numeric feature matrix for the utterances in ``df``.

        Args:
            df: DataFrame with an ``'Utterance'`` column. It is not modified;
                the features are computed in a separate frame.

        Returns:
            Float ndarray of shape ``(len(df), 6)`` with columns in
            ``FEATURE_COLS`` order. Missing or empty utterances produce an
            all-zero row rather than NaN.
        """
        # Treat missing utterances as empty text so every feature is finite.
        utterances = df['Utterance'].map(
            lambda u: '' if pd.isnull(u) else str(u)
        )
        words = utterances.str.split()

        features = pd.DataFrame({
            # Basic text features
            'exclamation_count': utterances.str.count('!'),
            'question_count': utterances.str.count(r'\?'),
            'ellipsis_count': utterances.str.count(r'\.\.\.'),
            # Word-level features
            'word_count': words.str.len(),
            'char_count': utterances.str.len(),
            'avg_word_length': words.apply(self._avg_word_length),
        })

        return features[self.FEATURE_COLS].to_numpy(dtype=float)

    def train_with_cv(self, train_df, n_splits=5):
        """Report stratified k-fold validation scores, then fit on all rows.

        For each fold the scaler and the ensemble are refit on the training
        part and a classification report for the held-out part is printed.
        Afterwards the scaler and ensemble are refit on the complete data, so
        the object is ready for ``predict``.

        Args:
            train_df: DataFrame with ``'Utterance'`` and ``'Sentiment'`` columns.
            n_splits: Number of cross-validation folds.
        """
        # Reset index to avoid indexing issues
        train_df = train_df.reset_index(drop=True)

        X = self.prepare_features(train_df)
        y = train_df['Sentiment'].values  # Convert to numpy array

        # Initialize stratified k-fold
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

        # Store validation scores (weighted F1 per fold)
        val_scores = []

        # Perform cross-validation
        for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
            # Split data
            X_train, X_val = X[train_idx], X[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]

            # Scale features; the scaler is fit on the training part only
            X_train = self.scaler.fit_transform(X_train)
            X_val = self.scaler.transform(X_val)

            # Train model
            self.model.fit(X_train, y_train)

            # Validate
            val_pred = self.model.predict(X_val)
            val_score = classification_report(y_val, val_pred, output_dict=True)
            val_scores.append(val_score['weighted avg']['f1-score'])

            print(f"\nFold {fold + 1} Validation Report:")
            print(classification_report(y_val, val_pred))

        print(f"\nAverage F1 Score across folds: {np.mean(val_scores):.3f}")

        # Train final model on full dataset
        X = self.scaler.fit_transform(X)
        self.model.fit(X, y)

    def predict(self, test_df):
        """Generate predictions for test data.

        Uses the scaler and ensemble fitted by ``train_with_cv``; ``test_df``
        needs an ``'Utterance'`` column.
        """
        X_test = self.prepare_features(test_df)
        X_test = self.scaler.transform(X_test)
        return self.model.predict(X_test)

# Train model with cross-validation.
# train_with_cv prints a report per fold and then refits on all of train_df.
print("Training model with cross-validation...")
classifier = SentimentClassifier()
classifier.train_with_cv(train_df)

# Make predictions on test data.
# NOTE(review): test_df is not defined in this cell; it must come from an
# earlier cell -- confirm it exists on a fresh Restart & Run All.
print("\nGenerating predictions for test data...")
test_predictions = classifier.predict(test_df)

# Create submission file: one row per test utterance, keyed by 'Sr No.'
submission_df = pd.DataFrame({
    'Sr No.': test_df['Sr No.'],
    'Sentiment': test_predictions
})

# Save submission (a later cell also writes to 'submission.csv')
submission_df.to_csv('submission.csv', index=False)
print("\nSubmission file created!")

# Print class distribution of the predicted labels
print("\nPredicted class distribution:")
print(pd.Series(test_predictions).value_counts())

# Print sample predictions
print("\nSample predictions:")
print(submission_df.head())

In [None]:
# Load the test-set metadata and define the path to the test video clips.
# Note: this rebinds the module-level `video_dir`, which the first cells of
# the notebook set to the training clip directory.
df = pd.read_csv('test/test_prop_sent_csv3_final.csv', encoding='ISO-8859-1')
video_dir = 'test/test_prop_sent_data3'


# Function to get video file path from IDs
def get_video_clip_path(row, clip_dir=None):
    """Build the path of the ``.mp4`` clip that belongs to one metadata row.

    Args:
        row: Mapping or Series providing ``'Dialogue_ID'`` and
            ``'Utterance_ID'``.
        clip_dir: Directory that holds the clips. When omitted, the
            module-level ``video_dir`` is used (looked up at call time), which
            keeps existing ``df.apply(get_video_clip_path, axis=1)`` calls
            working. Passing it explicitly avoids depending on whichever cell
            last assigned ``video_dir``.

    Returns:
        ``<clip_dir>/dia<Dialogue_ID>_utt<Utterance_ID>.mp4`` joined with
        ``os.path.join``.
    """
    if clip_dir is None:
        clip_dir = video_dir
    filename = f"dia{row['Dialogue_ID']}_utt{row['Utterance_ID']}.mp4"
    return os.path.join(clip_dir, filename)

# Apply the function row by row (axis=1) to store a clip path for every row
# of the test frame in a new 'video_clip_path' column.
df['video_clip_path'] = df.apply(get_video_clip_path, axis=1)

# Check sample paths (first five rows)
print(df[['Dialogue_ID', 'Utterance_ID', 'video_clip_path']].head())

In [None]:
# Use the classifier trained in the earlier cell rather than the template's
# placeholder strings: this cell writes to the same 'submission.csv', so a
# placeholder here would replace the real predictions on a top-to-bottom run.
all_preds = classifier.predict(df)
all_ids = df["Sr No."]
# NOTE(review): the header here is 'Emotion' while the earlier submission cell
# uses 'Sentiment' -- confirm which column name the expected format requires.
submission_df = pd.DataFrame({
        'Sr No.': all_ids,
        'Emotion': all_preds
    })
    
# Save the DataFrame to CSV
submission_df.to_csv("submission.csv", index=False)