# Generate the dataset

For each movie we have, we want to attach its list of genres together with the emotions found from the BERT labels (kept as an ordered per-movie emotion sequence). 

This generates the dataset that the ML models will actually train on. 

In [1]:
import pandas as pd

# Read the per-movie metadata table
metadata = pd.read_csv('BERT_annotations/movie_metadata/movie_meta_data.csv')

print(f"Loaded {len(metadata)} movies")
print(f"\nSample genres column:")
print(metadata['genres'].head(10))

# Parse every non-null genres cell once: "Comedy, Drama" -> ['Comedy', 'Drama'].
# Both the unique-genre list and the frequency table are derived from this.
genre_lists = [
    [label.strip() for label in str(raw_genres).split(',')]
    for raw_genres in metadata['genres'].dropna()
]

# Alphabetically sorted list of the distinct genre labels
all_genres = sorted({label for labels in genre_lists for label in labels})

print(f"\nTotal unique genres: {len(all_genres)}")
print(f"\nAll genres:")
for genre in all_genres:
    print(f"  - {genre}")

# Number of movies carrying each genre label (keys in first-seen order)
genre_counts = {}
for labels in genre_lists:
    for label in labels:
        genre_counts[label] = genre_counts.get(label, 0) + 1

print(f"\nGenre frequency (top 15):")
# Stable sort by descending count; ties keep first-seen order
sorted_genres = sorted(genre_counts.items(), key=lambda pair: pair[1], reverse=True)
for genre, count in sorted_genres[:15]:
    print(f"  {genre}: {count}")

Loaded 2858 movies

Sample genres column:
0                        Comedy, Music, Romance
1                                Drama, Romance
2           Action, Adventure, Sci-Fi, Thriller
3                          Comedy, Drama, Music
4                          Comedy, Crime, Sport
5                                 Comedy, Drama
6                Biography, Drama, History, War
7                                 Comedy, Drama
8    Action, Adventure, Comedy, Drama, Thriller
9                           Drama, Romance, War
Name: genres, dtype: object

Total unique genres: 27

All genres:
  - Action
  - Adventure
  - Animation
  - Biography
  - Comedy
  - Crime
  - Documentary
  - Drama
  - Family
  - Fantasy
  - Film-Noir
  - Game-Show
  - History
  - Horror
  - Music
  - Musical
  - Mystery
  - News
  - Reality-TV
  - Romance
  - Sci-Fi
  - Short
  - Sport
  - Talk-Show
  - Thriller
  - War
  - Western

Genre frequency (top 15):
  Drama: 1550
  Comedy: 880
  Thriller: 859
  Action: 622
  Rom

In [None]:
import pandas as pd

def create_movie_features_with_genres(labeled_sentences_file, metadata_file, output_file='movie_features.csv'):
    """
    Create movie-level features with emotion sequences and genres.

    Parameters
    ----------
    labeled_sentences_file : str or path-like
        CSV with one row per labeled sentence. Must contain the columns
        'movie' (annotation file name that embeds the IMDb ID as
        ``_<digits>_``) and 'emotion' (the label for that sentence).
    metadata_file : str or path-like
        CSV with at least the columns 'imdbid', 'title' and 'genres'.
    output_file : str or path-like, default 'movie_features.csv'
        Where the resulting table is written (CSV, no index column).

    Returns
    -------
    pandas.DataFrame
        One row per distinct 'movie' value, in order of first appearance,
        with columns 'movie', 'title', 'genres', 'emotion_sequence'
        (list of emotions in file order) and 'sequence_length'.
        'title' and 'genres' are missing for movies that have no match in
        the metadata or whose file name contains no IMDb ID.

    Notes
    -----
    'emotion_sequence' holds Python lists, so the CSV stores their string
    representation; it has to be parsed back (e.g. ``ast.literal_eval``)
    when the file is reloaded.
    """
    # Load labeled sentences
    df_labeled = pd.read_csv(labeled_sentences_file)
    print(f"Loaded {len(df_labeled)} labeled sentences")

    # Load metadata
    metadata = pd.read_csv(metadata_file)
    print(f"Loaded {len(metadata)} movies from metadata")

    # Collect the emotions of each movie into a list; sort=False keeps the
    # movies in the order they first appear in the labeled file.
    emotion_sequences = (
        df_labeled.groupby('movie', sort=False)['emotion']
        .agg(list)
        .reset_index(name='emotion_sequence')
    )
    print(f"\nCreated emotion sequences for {len(emotion_sequences)} movies")

    # Extract the IMDb ID from the file name. A nullable integer dtype is
    # used so that a file name without an ID becomes <NA> instead of making
    # the integer conversion raise.
    # NOTE(review): the first `_<digits>_` segment is taken — confirm no
    # title itself contains such a segment.
    extracted_ids = emotion_sequences['movie'].astype(str).str.extract(r'_(\d+)_', expand=False)
    emotion_sequences['imdbid'] = pd.to_numeric(extracted_ids, errors='coerce').astype('Int64')

    missing_ids = int(emotion_sequences['imdbid'].isna().sum())
    if missing_ids:
        print(f"Warning: no IMDb ID found in {missing_ids} movie file name(s)")

    # Build the lookup table on a copy of the needed columns:
    #  * same nullable integer key dtype as the left side,
    #  * rows without an ID removed (missing keys would otherwise match
    #    each other in the merge),
    #  * one row per ID so the left merge cannot duplicate movies.
    metadata_lookup = (
        metadata[['imdbid', 'title', 'genres']]
        .assign(imdbid=lambda meta: pd.to_numeric(meta['imdbid'], errors='coerce').astype('Int64'))
        .dropna(subset=['imdbid'])
        .drop_duplicates(subset='imdbid', keep='first')
    )

    # Merge with metadata on imdbid, keeping every movie from the labels
    merged = emotion_sequences.merge(metadata_lookup, on='imdbid', how='left')

    # Reorder columns: movie, title, genres, emotion_sequence.
    # .copy() makes the result an independent frame so adding a column
    # below does not raise SettingWithCopyWarning.
    cols = ['movie', 'title', 'genres', 'emotion_sequence']
    movie_features = merged[cols].copy()

    # Add sequence length for reference
    movie_features['sequence_length'] = movie_features['emotion_sequence'].apply(len)

    print(f"\nMerged data: {len(movie_features)} movies with genres")
    print(f"Movies without genre info: {movie_features['genres'].isna().sum()}")
    print(f"Average sequence length: {movie_features['sequence_length'].mean():.0f}")

    # Save to CSV
    movie_features.to_csv(output_file, index=False)
    print(f"\nSaved to {output_file}")

    return movie_features

# Build the movie-level feature table from the BERT-labeled sentences and
# the movie metadata; the result is also written to the default output file.
movie_features = create_movie_features_with_genres(
    labeled_sentences_file='partial_results_3940200.csv',
    metadata_file='BERT_annotations/movie_metadata/movie_meta_data.csv',
)

# Preview the first ten movies
print(movie_features.head(10))

Loaded 3940200 labeled sentences
Loaded 2858 movies from metadata

Aggregated emotions for 1643 movies

Merged data: 1643 movies with genres
Movies without genre info: 2

Saved to movie_features.csv
                                         movie                       title  \
0         10 Cloverfield Lane_1179933_anno.txt         10 Cloverfield Lane   
1  10 Things I Hate About You_0147800_anno.txt  10 Things I Hate About You   
2                12 Angry Men_0118528_anno.txt                12 Angry Men   
3                  12 Monkeys_0114746_anno.txt                  12 Monkeys   
4            12 Years a Slave_2024544_anno.txt            12 Years a Slave   
5                   127 Hours_1542344_anno.txt                   127 Hours   
6                    13 13 13_2991516_anno.txt                    13/13/13   
7                        1408_0450385_anno.txt                        1408   
8   1492 Conquest of Paradise_0103594_anno.txt  1492: Conquest of Paradise   
9                  15