In [1]:
import sklearn.feature_extraction.text as sklearn_text
import pickle
import os
import json
import numpy as np
from scipy.sparse import csr_matrix
import pandas as pd
from collections import Counter
import re

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

### Load Data

In [3]:
scripts = []
for season in range(1,12):
    for x in os.listdir('data/site=kacl780/season={}'.format(season)):
        path = os.path.join('data/site=kacl780/season={}'.format(season), x)
        with open(path, 'r') as f:
            script = json.load(f)
            script['all_speech'] = ' '.join(list(map(lambda x: x['line'], script['dialogue'])))
            scripts.append(script)
titles = list(map(lambda x: x['title'], scripts))
print('Loaded {} episodes.'.format(len(titles)))

Loaded 263 episodes.


In [4]:
def all_dialogue(scripts):
    return sum(list(map(lambda x: x['dialogue'], scripts)),[])

def get_episode(scripts, season, episode):
    return list(filter(lambda x: (x['season'] == season) & (x['episode'] == episode), scripts))

In [5]:
def char_speech(character, scripts, join=False, inverse=False):
    speech = filter(lambda x: x['character'] == character, all_dialogue(scripts))
    speech = list(map(lambda x: x['line'], speech))
    if join:
        speech = ' '.join(speech)
        speech = re.sub(re.compile(' +'), ' ', speech).strip()
    return speech

In [6]:
line_counts = Counter(list(map(lambda y: y['character'], sum(list(map(lambda x: x['dialogue'], scripts)),[]))))
top_chars = sorted(line_counts.keys(), key = lambda x:line_counts[x], reverse = True)[:10]
top_char_speech = dict(zip(top_chars, list(map(lambda x: char_speech(x, scripts, True), top_chars))))

### Create Vectorizer, Fit and Transform

In [7]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]

In [8]:
stop_words = LemmaTokenizer()(' '.join(stopwords.words('english'))) + ['.',',','?','!','...', 'oh']
#stop_words='english'

In [9]:
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words,  tokenizer=LemmaTokenizer())
#tfidf  = tfidf_vectorizer.fit_transform(list(map(lambda x: x['all_speech'], scripts)))
tfidf  = tfidf_vectorizer.fit_transform(top_char_speech.values())
len(tfidf_vectorizer.get_feature_names_out())

22148

In [10]:
#df = pd.DataFrame(tfidf.todense(), index=titles, columns=tfidf_vectorizer.get_feature_names_out())
#df = pd.DataFrame.sparse.from_spmatrix(tfidf, index=top_char_speech.keys(), columns=tfidf_vectorizer.get_feature_names_out())
df = pd.DataFrame(tfidf.todense(), index=top_char_speech.keys(), columns=tfidf_vectorizer.get_feature_names_out())
df.to_csv('tfidf.csv')

### Process Output

In [11]:
avgs = df.mean(axis=0)
avgs.name = '#Average'
df.append(avgs).sort_values(by='#Average', ascending=False, axis=1).columns[:30]

Index([''m', 'frasier', 'well', 'know', 'get', 'nile', 'right', 'like', 'hey',
       'go', 'got', 'one', '``', '''', 'come', 'roz', 'look', 'daphne',
       'going', 'see', 'yes', 'yeah', 'think', 'want', 'back', 'time',
       'little', 'thing', 'good', 'crane'],
      dtype='object')

In [12]:
def top_n_by_char(character, n, df):
    return df.append(avgs).sort_values(by=character, ascending=False, axis=1).columns[:n]

In [13]:
df.append(avgs).sort_values(by='#Average', ascending=False, axis=1)

Unnamed: 0,'m,frasier,well,know,get,nile,right,like,hey,go,...,seafront,gomez,seabee—goes,concourse,lumbering,concurred,concurs,luminary,scurrilous,relive
Frasier,0.275568,0.130032,0.395302,0.262327,0.116904,0.298428,0.185372,0.111925,0.009647,0.121092,...,0.000306,0.000306,0.000306,0.000306,0.000306,0.000306,0.000306,0.000306,0.000306,0.000306
Niles,0.295821,0.351886,0.351323,0.185945,0.131852,0.134669,0.13805,0.109595,0.04296,0.123963,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Martin,0.229787,0.223594,0.342562,0.265641,0.190349,0.195564,0.165251,0.160688,0.194108,0.153843,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Daphne,0.281566,0.092282,0.28209,0.200819,0.159397,0.20344,0.150483,0.169359,0.027071,0.149434,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Roz,0.274061,0.411692,0.253626,0.245212,0.170086,0.111187,0.123207,0.146647,0.133499,0.155061,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bulldog,0.21157,0.137383,0.093421,0.145626,0.11815,0.024729,0.123645,0.134636,0.478362,0.131888,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Lilith,0.253786,0.456815,0.253786,0.126893,0.086287,0.126893,0.10659,0.096439,0.0,0.121817,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Kenny,0.280587,0.179392,0.20239,0.156393,0.147193,0.0092,0.105795,0.133394,0.342415,0.091996,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bebe,0.237251,0.424846,0.165524,0.143454,0.099315,0.049657,0.060692,0.154489,0.0,0.082762,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Donny,0.292431,0.164961,0.187456,0.277435,0.142467,0.17246,0.164961,0.104975,0.081026,0.097477,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
for char in top_chars:
    print(char + ': '+ ', '.join(list(top_n_by_char(char, 20, df))))

Frasier: well, nile, 'm, know, yes, dad, right, roz, frasier, go, see, '', ``, get, daphne, like, think, one, come, good
Niles: frasier, well, 'm, daphne, dad, know, yes, ``, '', mari, right, nile, get, one, go, going, like, think, see, back
Martin: well, know, 'm, frasier, yeah, nile, hey, get, right, like, got, go, come, guy, eddie, look, one, ', daphne, going
Daphne: crane, well, 'm, dr., nile, know, like, get, right, go, look, yes, one, think, daphne, ``, '', going, come, see
Roz: frasier, 'm, well, know, get, go, like, ``, '', hey, one, really, got, right, yeah, look, nile, going, guy, think
Bulldog: hey, doc, got, roz, 'm, bulldog, yeah, know, frasier, like, go, guy, right, get, '', ``, back, look, one, come
Lilith: frasier, well, 'm, lilith, yes, frederick, know, nile, go, u, thank, see, right, '', think, ``, brian, like, one, want
Kenny: hey, doc, 'm, well, frasier, got, know, yeah, kenny, get, roz, great, like, show, one, '', ``, right, look, guy
Bebe: frasier, 'm, bebe, darli

### Debug

In [15]:
#get_episode(scripts, 7, 19)
char_speech('Bebe', scripts)

['Frasier! Thank God.',
 "What kind of agent would I be if I weren't the first to tell you that you've been nominated for a 1994 SeaBea!",
 "Yes. You, Frasier Crane M.D., PhD, S-T-U-D, are the man of the hour. Bebe Glaser, Frasier's agent.",
 "You're not a psychiatrist too?",
 "Oh, please! If I'm ever to have a breakdown, let me have it now! Double-double decaf, to go.",
 "Oh, isn't he precious? You must be very proud of Frasier.",
 'I bet you two had wicked little hair-pulling fights when you were tots.',
 'Who are you?',
 "Oh, yes that's right, I've seen you bring him coffee. By the way, would you mind getting mine?",
 "Oh, not really, darling. I've already got your tux, rented you a limo and your tickets will be waiting for you at the door.",
 "Your subtlety floors me, I'd love to. I am thrilled, thrilled, thrilled for you both. I've got to run. Two of my other clients weren't nominated and I have to tell them what a worthless award this is. FADE TO: ROZ AND FRASIER HATCH A MERRY PL