In [1]:
import sklearn.feature_extraction.text as sklearn_text
import pickle
import os
import json
import numpy as np
from scipy.sparse import csr_matrix
import pandas as pd
from collections import Counter
import re

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

### Load Data

In [3]:
# Load every episode JSON for seasons 1-11 and attach the episode's joined dialogue.
scripts = []
for season in range(1, 12):
    season_dir = 'data/site=kacl780/season={}'.format(season)
    # os.listdir returns entries in arbitrary order; sorting makes the load
    # order (and therefore `titles`) reproducible across machines and runs.
    for filename in sorted(os.listdir(season_dir)):
        path = os.path.join(season_dir, filename)
        # Explicit encoding: do not depend on the platform default when reading JSON.
        with open(path, 'r', encoding='utf-8') as f:
            script = json.load(f)
        # All spoken lines of the episode as one space-separated string.
        script['all_speech'] = ' '.join(entry['line'] for entry in script['dialogue'])
        scripts.append(script)
titles = [episode['title'] for episode in scripts]
print('Loaded {} episodes.'.format(len(titles)))

Loaded 263 episodes.


In [4]:
def all_dialogue(scripts):
    """Return the dialogue entries of every script as one flat list.

    Args:
        scripts: iterable of dicts, each with a 'dialogue' list.

    Returns:
        A new list holding every dialogue entry, in script order.
    """
    # A single comprehension is linear; the previous sum(..., []) re-copied the
    # accumulated list for every script (quadratic in total dialogue size).
    return [entry for script in scripts for entry in script['dialogue']]

def get_episode(scripts, season, episode):
    """Return the scripts whose 'season' and 'episode' fields match.

    Args:
        scripts: iterable of script dicts with 'season' and 'episode' keys.
        season: season value to match.
        episode: episode value to match.

    Returns:
        A list of matching script dicts (empty when nothing matches).
    """
    # Logical `and` (not bitwise `&`) so the second comparison is skipped
    # when the season already differs.
    return [
        script for script in scripts
        if script['season'] == season and script['episode'] == episode
    ]

In [5]:
def char_speech(character, scripts, join=False):
    """Collect the lines spoken by one character across all scripts.

    Args:
        character: value compared against each dialogue entry's 'character'.
        scripts: iterable of script dicts, passed to all_dialogue().
        join: when True, return one string instead of a list.

    Returns:
        A list of line strings, or (join=True) a single string with the lines
        joined by spaces, runs of spaces collapsed and the ends stripped.
    """
    speech = [
        # "\\'" is a backslash followed by an apostrophe. The earlier "\'"
        # literal was just "'", so the replace never changed anything.
        entry['line'].replace("\\'", "'")
        for entry in all_dialogue(scripts)
        if entry['character'] == character
    ]
    if join:
        speech = re.sub(' +', ' ', ' '.join(speech)).strip()
    return speech

In [6]:
# Count lines per character in one linear pass over every episode's dialogue
# (the previous sum(..., []) flatten was quadratic).
line_counts = Counter(entry['character'] for script in scripts for entry in script['dialogue'])
# The ten characters with the most lines, highest count first.
top_chars = sorted(line_counts, key=line_counts.get, reverse=True)[:10]
# Map each of those characters to all of their lines joined into one string.
top_char_speech = {name: char_speech(name, scripts, True) for name in top_chars}

### Create Vectorizer, Fit and Transform

In [7]:
class LemmaTokenizer:
    """Callable that tokenizes text with word_tokenize and lemmatizes each token."""

    def __init__(self):
        # One lemmatizer instance is reused for every call.
        self.wnl = WordNetLemmatizer()

    def __call__(self, articles):
        """Return the list of lemmatized tokens for the string `articles`."""
        lemmas = []
        for token in word_tokenize(articles):
            lemmas.append(self.wnl.lemmatize(token))
        return lemmas

In [33]:
# Run NLTK's English stop words through LemmaTokenizer, then add a few
# punctuation tokens and the filler word 'oh'.
stop_words = [
    *LemmaTokenizer()(' '.join(stopwords.words('english'))),
    '.', ',', '?', '!', '...', 'oh',
]

In [34]:
# Fit TF-IDF with one document per entry of top_char_speech; tokenisation is
# delegated to LemmaTokenizer and the custom stop-word list is applied.
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=LemmaTokenizer(),
    stop_words=stop_words,
)
tfidf = tfidf_vectorizer.fit_transform(top_char_speech.values())
# Number of features in the fitted vocabulary.
len(tfidf_vectorizer.get_feature_names_out())

22270

In [35]:
# One row per key of top_char_speech, one column per vocabulary term.
# toarray() yields a plain ndarray; todense() returned the discouraged np.matrix type.
df = pd.DataFrame(
    tfidf.toarray(),
    index=list(top_char_speech.keys()),
    columns=tfidf_vectorizer.get_feature_names_out(),
)
df.to_csv('tfidf.csv')

### Process Output

In [36]:
# Column-wise mean TF-IDF score across all rows of df.
avgs = df.mean(axis=0)
avgs.name = '#Average'
# DataFrame.append was removed in pandas 2.0; pd.concat adds the same
# '#Average' row, which is then used to rank the columns.
pd.concat([df, avgs.to_frame().T]).sort_values(by='#Average', ascending=False, axis=1).columns[:30]

Index(['frasier', ''m', 'well', 'know', 'get', 'nile', 'right', 'like', 'hey',
       'go', 'got', 'one', '``', '''', 'come', 'daphne', 'roz', 'look', 'see',
       'going', 'yes', 'yeah', 'think', 'want', 'back', 'time', 'little',
       'thing', 'good', 'crane'],
      dtype='object')

In [37]:
def top_n_by_char(character, n, df):
    """Return the first `n` column labels of `df` ranked by one row's values.

    Args:
        character: index label of the row to rank by. '#Average' is also
            accepted and ranks by the column-wise mean of `df`.
        n: number of column labels to return.
        df: DataFrame with one row per character and one column per term.

    Returns:
        A pandas Index with the `n` highest-scoring column labels, best first.
    """
    table = df
    if character == '#Average' and character not in df.index:
        # Build the average row from `df` itself instead of relying on a
        # notebook-level global; pd.concat replaces the removed DataFrame.append.
        mean_row = df.mean(axis=0).rename('#Average').to_frame().T
        table = pd.concat([df, mean_row])
    ranked = table.sort_values(by=character, ascending=False, axis=1)
    return ranked.columns[:n]

In [38]:
# Full table with the '#Average' row added and columns ordered by it, highest first.
# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
pd.concat([df, avgs.to_frame().T]).sort_values(by='#Average', ascending=False, axis=1)

Unnamed: 0,frasier,'m,well,know,get,nile,right,like,hey,go,...,corruption,podiatrist,sandbag,sandalwood,fourteen-dollar-a-pound,sugary,mover,sugarcoating,movie-ending,batter-dipped
Frasier,0.133717,0.273039,0.391624,0.259813,0.117801,0.299379,0.183707,0.110852,0.009555,0.122845,...,0.000303,0.000303,0.000303,0.000303,0.000303,0.000303,0.000303,0.000303,0.000303,0.000303
Niles,0.358199,0.290155,0.344922,0.182557,0.131109,0.135811,0.135535,0.108704,0.042178,0.123088,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Martin,0.226298,0.227914,0.33977,0.263475,0.190737,0.197849,0.163904,0.160025,0.192137,0.153559,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Daphne,0.096808,0.278,0.279035,0.198276,0.158931,0.208629,0.148577,0.167214,0.026728,0.149095,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Roz,0.417029,0.271277,0.25105,0.242722,0.170738,0.113627,0.121956,0.145157,0.132143,0.154081,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bulldog,0.151273,0.205299,0.091844,0.143169,0.116156,0.029714,0.121559,0.132364,0.470289,0.129663,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Lilith,0.462776,0.251508,0.251508,0.125754,0.085513,0.130784,0.105634,0.095573,0.0,0.120724,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Kenny,0.182155,0.277786,0.200371,0.154832,0.145724,0.009108,0.104739,0.132062,0.338998,0.091078,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bebe,0.428323,0.227716,0.162654,0.140967,0.097593,0.048796,0.05964,0.151811,0.0,0.070484,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Donny,0.178606,0.290234,0.186047,0.27535,0.141396,0.178606,0.163722,0.104187,0.080417,0.096745,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
# Print the 20 highest-ranked terms for each character in top_chars.
for char in top_chars:
    print(f"{char}: {', '.join(top_n_by_char(char, 20, df))}")

Frasier: well, nile, 'm, know, yes, dad, right, roz, frasier, go, see, get, '', ``, daphne, like, think, one, come, good
Niles: frasier, well, 'm, daphne, dad, know, yes, ``, '', mari, nile, right, get, one, go, going, like, think, look, see
Martin: well, know, 'm, frasier, yeah, nile, hey, get, right, like, got, go, come, eddie, look, guy, one, ', daphne, going
Daphne: crane, well, 'm, dr., nile, know, like, get, go, right, look, one, yes, daphne, think, ``, '', going, come, frasier
Roz: frasier, 'm, well, know, get, go, like, ``, '', one, hey, got, really, right, look, yeah, nile, going, guy, think
Bulldog: hey, doc, got, roz, 'm, bulldog, frasier, yeah, know, like, go, guy, right, get, ``, '', back, look, come, one
Lilith: frasier, well, 'm, lilith, yes, frederick, nile, know, go, u, thank, see, right, think, ``, '', brian, like, one, time
Kenny: hey, doc, 'm, well, frasier, got, know, yeah, get, roz, kenny, like, great, show, one, ``, '', right, look, guy
Bebe: frasier, 'm, bebe, d

### Debug

In [15]:
#get_episode(scripts, 7, 19)
#char_speech('Bebe', scripts)