In [1]:
import ast
import os
import os.path

import nltk
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

2023-10-08 13:05:16.945345: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Data Loading

In [2]:
# Corpus location and the field separator shared by every file; defined once so the four loaders stay in sync.
DATA_DIR = os.path.join('..', 'data')
FIELD_SEP = r" \+\+\+\$\+\+\+ "

# FIELD_SEP is a multi-character regex, which is why the python parser engine is requested.
# Every file is decoded as latin1 so all loaders agree (the conversations loader previously relied on the default encoding).
titles = pd.read_table(os.path.join(DATA_DIR, 'movie_titles_metadata.txt'), delimiter=FIELD_SEP, engine='python', encoding='latin1', names=['movie_id','movie_title','movie_year','imdb_rating','num_ratings','genre'])

characters = pd.read_table(os.path.join(DATA_DIR, 'movie_characters_metadata.txt'), delimiter=FIELD_SEP, engine='python', encoding='latin1', names=['character_id','character_name','movie_id','movie_title','gender','credits_position'])

lines = pd.read_table(os.path.join(DATA_DIR, 'movie_lines.txt'), delimiter=FIELD_SEP, engine='python', encoding='latin1', names=['line_id','character_id','movie_id','character_name','line'])

convos = pd.read_table(os.path.join(DATA_DIR, 'movie_conversations.txt'), delimiter=FIELD_SEP, engine='python', encoding='latin1', names=['character_id_first','character_id_second','movie_id','line_order'])

In [3]:
# Preview the first rows of the movie titles table to check the columns parsed as expected
titles.head()

Unnamed: 0,movie_id,movie_title,movie_year,imdb_rating,num_ratings,genre
0,m0,10 things i hate about you,1999,6.9,62847,"['comedy', 'romance']"
1,m1,1492: conquest of paradise,1992,6.2,10421,"['adventure', 'biography', 'drama', 'history']"
2,m2,15 minutes,2001,6.1,25854,"['action', 'crime', 'drama', 'thriller']"
3,m3,2001: a space odyssey,1968,8.4,163227,"['adventure', 'mystery', 'sci-fi']"
4,m4,48 hrs.,1982,6.9,22289,"['action', 'comedy', 'crime', 'drama', 'thrill..."


In [4]:
# Preview the first rows of the characters table
characters.head()

Unnamed: 0,character_id,character_name,movie_id,movie_title,gender,credits_position
0,u0,BIANCA,m0,10 things i hate about you,f,4
1,u1,BRUCE,m0,10 things i hate about you,?,?
2,u2,CAMERON,m0,10 things i hate about you,m,3
3,u3,CHASTITY,m0,10 things i hate about you,?,?
4,u4,JOEY,m0,10 things i hate about you,m,6


In [5]:
# Preview the first rows of the lines table (line_id -> spoken text)
lines.head()

Unnamed: 0,line_id,character_id,movie_id,character_name,line
0,L1045,u0,m0,BIANCA,They do not!
1,L1044,u2,m0,CAMERON,They do to!
2,L985,u0,m0,BIANCA,I hope so.
3,L984,u2,m0,CAMERON,She okay?
4,L925,u0,m0,BIANCA,Let's go.


In [6]:
# Preview the first rows of the conversations table; line_order holds the line IDs of each exchange
convos.head()

Unnamed: 0,character_id_first,character_id_second,movie_id,line_order
0,u0,u2,m0,"['L194', 'L195', 'L196', 'L197']"
1,u0,u2,m0,"['L198', 'L199']"
2,u0,u2,m0,"['L200', 'L201', 'L202', 'L203']"
3,u0,u2,m0,"['L204', 'L205', 'L206']"
4,u0,u2,m0,"['L207', 'L208']"


# Pre-processing

In [7]:
# Look up the spoken text that belongs to a line ID
def get_line(line_id):
    """Return the `line` value of the first row in `lines` whose `line_id` matches.

    The whole `line_id` column is compared on every call, and an
    IndexError is raised when no row matches.
    """
    is_match = lines['line_id'] == line_id
    matching_text = lines.loc[is_match, 'line']
    return matching_text.to_numpy()[0]

In [8]:
# Convert line order to array of actual lines
def get_line_array(line_order):
    """Resolve a conversation's line IDs to the text of each line.

    Parameters
    ----------
    line_order : str or iterable of str
        Either the raw value stored in ``convos['line_order']`` (a list
        literal such as ``"['L194', 'L195']"``) or an already-parsed
        sequence of line IDs.

    Returns
    -------
    list
        One entry per line ID, in the given order, as returned by
        ``get_line``. An empty list literal yields an empty list.

    Raises
    ------
    ValueError, SyntaxError
        If a string argument is not a valid Python literal.
    IndexError
        Propagated from ``get_line`` when a line ID is not found.
    """
    # Work from the argument itself; the previous version ignored it and
    # always resolved the first row of `convos`.
    if isinstance(line_order, str):
        # The column stores the repr of a Python list, so parse it as a literal
        # rather than stripping brackets and quotes by hand.
        line_ids = ast.literal_eval(line_order)
    else:
        line_ids = list(line_order)

    return [get_line(line_id) for line_id in line_ids]

## Sample conversion from conversation to lines

In [15]:
# Resolve the first conversation's line IDs to their text, then show the lines followed by their count
temp = get_line_array(convos['line_order'].iloc[0])
print(temp, len(temp), sep='\n')

['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.', "Well, I thought we'd start with pronunciation, if that's okay with you.", 'Not the hacking and gagging and spitting part.  Please.', "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?"]
4


## Convert all conversations to lines

In [12]:
# Keep a handle on the line_order column for every conversation; each entry is still the
# unparsed string listing that conversation's line IDs
all_line_orders = convos['line_order']
all_line_orders

0                    ['L194', 'L195', 'L196', 'L197']
1                                    ['L198', 'L199']
2                    ['L200', 'L201', 'L202', 'L203']
3                            ['L204', 'L205', 'L206']
4                                    ['L207', 'L208']
                             ...                     
83092    ['L666324', 'L666325', 'L666326', 'L666327']
83093                          ['L666575', 'L666576']
83094                          ['L666256', 'L666257']
83095    ['L666369', 'L666370', 'L666371', 'L666372']
83096               ['L666520', 'L666521', 'L666522']
Name: line_order, Length: 83097, dtype: object

# Neural Network

In [13]:
# Start from an empty Sequential model; no layers are added in this cell
model = Sequential()

2023-10-08 13:05:23.919021: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
