In [1]:
%matplotlib inline

import pandas
import numpy as np
import matplotlib.pyplot as plt
from nltk import word_tokenize, stem

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2, f_classif

from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, f1_score, accuracy_score, roc_curve, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split

from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier

### Загрузка данных

In [27]:
path = 'All-seasons.csv'
df = pandas.read_csv(path)
df.head()

Unnamed: 0,Season,Episode,Character,Line
0,10,1,Stan,"You guys, you guys! Chef is going away. \n"
1,10,1,Kyle,Going away? For how long?\n
2,10,1,Stan,Forever.\n
3,10,1,Chef,I'm sorry boys.\n
4,10,1,Stan,"Chef said he's been bored, so he joining a gro..."


Выделим героев с наибольшим числом реплик.

In [3]:
df.groupby('Character').count().sort_values(by='Season').tail(10)

Unnamed: 0_level_0,Season,Episode,Line
Character,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Mr. Mackey,633,633,633
Sharon,862,862,862
Kenny,881,881,881
Chef,917,917,917
Mr. Garrison,1002,1002,1002
Randy,2467,2467,2467
Butters,2602,2602,2602
Kyle,7099,7099,7099
Stan,7680,7680,7680
Cartman,9774,9774,9774


Оставим из них только детей.

In [205]:
main_characters=['Cartman', 'Stan', 'Kyle', 'Butters', 'Kenny']
df_main = df[df['Character'].isin(main_characters)]
df_main.describe()

Unnamed: 0,Season,Episode,Character,Line
count,28036,28036,28036,28036
unique,18,18,5,25502
top,2,1,Cartman,What?\n
freq,2422,2233,9774,194


Возьмем в качестве тестовой выборки последние четыре сезона.

In [128]:
df_test = df_main[df_main['Season'].astype(int) >= 15]
df_train = df_main[df_main['Season'].astype(int) < 15]

y = np.array(df_train['Character'])
y_test = np.array(df_test['Character'])

### Baseline

Построим dummy-решение: равновероятный выбор между героями.

In [13]:
cls_dummy = DummyClassifier('uniform').fit(np.zeros((len(y), 1)), y)
y_pred = cls_dummy.predict(np.zeros((len(y_test), 1)))
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

    Butters       0.14      0.20      0.16       631
    Cartman       0.39      0.19      0.26      1713
      Kenny       0.03      0.23      0.05       124
       Kyle       0.23      0.20      0.21      1092
       Stan       0.20      0.19      0.20       894

avg / total       0.27      0.20      0.22      4454



В качестве baseline решения возьмем NaiveBayes над ненормализованной (но токенизированной) таблицей bag-of-words.

RandomForest размера 100 на этих данных уже работает достаточно долго.

In [110]:
import re

class TokenTokenizer(object):
    def __call__(self, string):
        string = re.sub(r'(\W)', r' \1 ', string)
        tokens = string.strip().split()
        tokens = [token for token in tokens if token != '']
        return tokens
    
class WordTokenizer(object):
    def __call__(self, string):
        tokens = re.split(r'\W', string)
        tokens = [token.lower() for token in tokens if token != '']
        return tokens

In [238]:
vectorizer = CountVectorizer(
    tokenizer=TokenTokenizer())
X_bayes = vectorizer.fit_transform(df_train['Line'])
print('Train set size: {}, vocab size: {}'.format(*X.shape))
X_test_bayes = vectorizer.transform(df_test['Line'])
print('Test set size: {}'.format(*X_test.shape))

Train set size: 23582, vocab size: 11154
Test set size: 4454


In [239]:
cls_baseline = MultinomialNB()
cls_baseline.fit(X_bayes, y)
y_pred = cls_baseline.predict(X_test_bayes)
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

    Butters       0.62      0.10      0.16       631
    Cartman       0.51      0.64      0.57      1713
      Kenny       0.87      0.67      0.76       124
       Kyle       0.40      0.27      0.32      1092
       Stan       0.30      0.46      0.36       894

avg / total       0.47      0.44      0.41      4454



Дальше попробуем 2 различных пути: с помощью feature_selection, ngrams и max/min df будем подбирать оптимальные параметры векторизации для линейной регрессии, и на основе составленных вручную признаков будем обучать random forest.

Но сперва подумаем, как обработать данные.

### Обработка данных

Посмотрим на примеры реплик каждого из персонажей:

In [42]:
df_sample = []
for character in main_characters:
    df_sample.append(df_main[df_main['Character'] == character].sample(5))
pandas.concat(df_sample)

Unnamed: 0,Season,Episode,Character,Line
63787,8,1,Cartman,Why?! Why?! Why did you have to take them bot...
48732,4,14,Cartman,How good?\n
57741,6,12,Cartman,"Nah, I'll lose it for sure. You keep track of ..."
62083,7,9,Cartman,Our platinum album ceremony. I spared no expen...
35594,2,11,Cartman,"Let's see, where have I been, where have I bee..."
55404,6,4,Stan,"Cartman, wake up!\n"
15709,14,10,Stan,"Well, my friends are worried that I'm showing ..."
59040,6,16,Stan,Liver medicine?\n
22264,16,11,Stan,"""On the morrow""? What the fuck is wrong with K..."
47472,4,10,Stan,Have you confessed all your sins yet?\n


Хм, все реплики Кенни выделены в скобки!
(в мультсериале он говорит сквозь капюшон и его слова с трудом можно было разобрать).

Используем эту особенность -- оставим токенизацию по знакам препинания.

Еще из особенностей: короткие предложения с большой буквы: лучше привести к нижнему регистру и отдельно учесть длину/ количество слов в предложении.

Построим частотные списки по каждому персонажу:

In [75]:
from collections import Counter, defaultdict

popular_words = defaultdict(list)
for character in main_characters:
    char_lines = list(df_train[df_train['Character'] == character]['Line'])
    char_lines = [WordTokenizer()(line) for line in char_lines]
    char_words = [word for line in char_lines for word in line]
    char_cnt = Counter(char_words)
    popular_words[character] = [word for word, count in char_cnt.most_common(300)]
    
for character in popular_words:
    print('{}: {}\n'.format(character, ', '.join(popular_words[character][:50])))

Stan: you, i, to, we, the, s, it, t, a, that, what, and, dude, is, this, re, on, of, have, do, in, just, no, he, yeah, don, all, can, are, my, oh, cartman, they, be, for, get, go, not, but, come, gonna, know, me, m, so, dad, with, us, your, now

Cartman: you, i, the, to, s, a, it, and, that, t, we, of, is, my, what, me, in, this, m, on, have, all, oh, can, guys, for, re, just, no, kyle, your, do, are, don, get, be, with, so, right, now, not, out, here, go, gonna, he, know, they, like, but

Kenny: i, you, yeah, s, that, the, what, it, to, a, oh, no, hey, and, me, guys, t, m, fuck, uh, my, we, woohoo, this, do, get, is, don, dude, okay, huh, on, of, they, not, fucking, all, have, are, god, so, now, too, ve, gonna, can, in, go, re, got

Kyle: you, i, to, the, s, it, we, t, a, that, what, and, is, dude, of, cartman, have, re, this, can, do, on, in, don, he, yeah, just, no, all, my, are, for, get, be, not, they, me, but, your, with, m, oh, so, go, out, know, gonna, up, there, us

Butters: i

Слишком много повторяющихся стоп-слов.
Посмотрим на топ пересечений:

In [94]:
intersection_50 = set.intersection(*[set(words[:50]) for words in popular_words.values()])
intersection_100 = set.intersection(*[set(words[:100]) for words in popular_words.values()])
intersection_200 = set.intersection(*[set(words[:200]) for words in popular_words.values()])
intersection_200.difference_update(intersection_100)
intersection_100.difference_update(intersection_50)
print('First 50 intersection:\n' + ', '.join(sorted(intersection_50)))
print('\nFirst 100 intersection:\n' + ', '.join(sorted(intersection_100)))
print('\nFirst 200 intersection:\n' + ', '.join(sorted(intersection_200)))

First 50 intersection:
a, all, and, are, can, do, don, get, go, gonna, have, i, in, is, it, m, me, my, no, not, of, oh, on, re, s, t, that, the, they, this, to, we, what, you

First 100 intersection:
at, be, but, come, did, for, got, guys, he, here, hey, how, just, know, let, like, ll, look, now, okay, out, really, right, see, so, there, think, uh, ve, was, well, why, with, yeah, your

First 200 intersection:
about, again, as, aw, because, could, d, down, god, going, good, has, her, him, his, huh, if, jesus, kenny, little, mom, one, our, please, said, she, some, stupid, sure, take, then, time, too, up, wait, wanna, want, way, when, where, would


In [102]:
stop_words = intersection_50.difference('oh')
stop_words |= intersection_100.difference('guys hey okay really right uh yeah'.split())
stop_words = list(sorted(stop_words))
print('Stop words:\n' + ', '.join(stop_words))

Stop words:
a, all, and, are, at, be, but, can, come, did, do, don, for, get, go, gonna, got, have, he, here, how, i, in, is, it, just, know, let, like, ll, look, m, me, my, no, not, now, of, oh, on, out, re, s, see, so, t, that, the, there, they, think, this, to, ve, was, we, well, what, why, with, you, your


Теперь посмотрим на еще раз на частотные списки без стоп-слов:

In [106]:
for character in popular_words:
    print('{}: {}\n'.format(character, ', '.join([w for w in popular_words[character][:70] if w not in stop_words])))

Stan: dude, yeah, cartman, dad, us, guys, kyle, our, up, right, about, hey, okay, kenny

Cartman: guys, kyle, right, yeah, up, god, butters, if, kenny, about, okay, one

Kenny: yeah, hey, guys, fuck, uh, woohoo, dude, okay, huh, fucking, god, too, o, right, cartman, fuckin, gotta, take, hoo

Kyle: dude, cartman, yeah, up, us, stan, right, about, hey, if, him, kenny, really, people

Butters: uh, hey, yeah, huh, eric, sure, ah, dad, fellas, o, if, right, up



In [206]:
#import warnings
#warnings.filterwarnings('ignore')

def normalize(text):
    tokens = WordTokenizer()(text)
    tokens = [token for token in tokens if token not in stop_words]
    return tokens
    

df_main['Line_normalized'] = df_main['Line'].map(lambda text: normalize(text))

In [122]:
df_vocab = df_main[df_main['Season'].astype(int) < 15]['Line_normalized']
vocabulary = sorted(set(word for words in df_vocab for word in words))
print('Vocabulary size: {}'.format(len(vocabulary)))

Vocabulary size: 11054


### RandomForest

Начнем с самых простых признаков: ранее мы поняли, что реплики Кенни легко можно отличить по наличию скобок.

Добавим имена упоминаемых героев

In [207]:
df_main['talk_Kenny'] = df_main['Line'].map(lambda text: int(text[0] + text.strip()[-1] == '()'))

In [208]:
for character in main_characters:
    df_main['word_{}'.format(character)] = df_main['Line'].map(lambda text: int(character in text))

In [209]:
X = np.array(df_main[df_main['Season'].astype(int) < 15].ix[:, 5:])
y = np.array(df_main[df_main['Season'].astype(int) < 15]['Character'])

X_train, X_valid, y_train, y_valid = train_test_split(X, y)
cls_rf = RandomForestClassifier(n_estimators=100)
cls_rf.fit(X_train, y_train)
y_pred = cls_rf.predict(X_valid)
print(classification_report(y_valid, y_pred))

             precision    recall  f1-score   support

    Butters       0.00      0.00      0.00       486
    Cartman       0.36      0.98      0.53      2004
      Kenny       0.99      0.97      0.98       208
       Kyle       0.47      0.10      0.16      1488
       Stan       0.71      0.01      0.01      1710

avg / total       0.48      0.39      0.26      5896



С помощью всего 6 признаков мы получили неплохой precision, но f1-score хуже, чем у baseline.

Добавим синтаксические признаки и посмотрим на распределение по персонажам:

In [210]:
import string

df_main['count_symbols'] = df_main['Line'].map(lambda text: len(text))
df_main['count_words'] = df_main['Line'].map(lambda text: len(WordTokenizer()(text)))
df_main['count_nonstopwords'] = df_main['Line_normalized'].map(lambda tokens: len(tokens))
df_main['count_upper'] = df_main['Line'].map(lambda text: len([c for c in text if c in string.uppercase]))
for sign in ("!.,?'"):
    df_main['count_{}'.format(sign)] = df_main['Line'].map(lambda text: text.count(sign))
df_main['count_digits'] = df_main['Line'].map(lambda text: len([c for c in text if c in '0123456789']))

In [211]:
df_main[df_main['Season'].astype(int) < 15].groupby('Character').mean()

Unnamed: 0_level_0,talk_Kenny,word_Cartman,word_Stan,word_Kyle,word_Butters,word_Kenny,count_symbols,count_words,count_nonstopwords,count_upper,count_!,count_.,"count_,",count_?,count_',count_digits
Character,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Butters,0.0,0.008625,0.021816,0.015728,0.014713,0.006596,59.391172,11.995434,6.150685,3.170472,0.602232,1.312024,1.033486,0.358701,0.8138,0.016743
Cartman,0.000248,0.005955,0.020717,0.067361,0.038829,0.031634,69.318199,13.706736,7.035479,3.406029,0.865153,1.304181,1.098003,0.357152,0.769508,0.051482
Kenny,0.977543,0.015852,0.005284,0.009247,0.003963,0.005284,26.446499,4.826948,2.484808,1.766182,0.632761,0.561427,0.397622,0.214003,0.303831,0.005284
Kyle,0.000333,0.07691,0.029299,0.003496,0.015315,0.026969,46.884302,9.335109,4.634926,2.299484,0.560013,0.919594,0.64891,0.342101,0.562344,0.030797
Stan,0.000589,0.05305,0.003389,0.034483,0.016799,0.027704,47.041851,9.410109,4.681698,2.178308,0.442529,0.938403,0.761273,0.333923,0.598585,0.027409


In [212]:
X = np.array(df_main[df_main['Season'].astype(int) < 15].ix[:, 5:])
y = np.array(df_main[df_main['Season'].astype(int) < 15]['Character'])

X_train, X_valid, y_train, y_valid = train_test_split(X, y)
cls_rf = RandomForestClassifier(n_estimators=100)
cls_rf.fit(X_train, y_train)
y_pred = cls_rf.predict(X_valid)
print(classification_report(y_valid, y_pred))

             precision    recall  f1-score   support

    Butters       0.14      0.06      0.08       477
    Cartman       0.45      0.56      0.50      2028
      Kenny       0.98      0.97      0.97       186
       Kyle       0.33      0.30      0.31      1488
       Stan       0.37      0.35      0.36      1717

avg / total       0.39      0.41      0.39      5896



Незначительное улучшение есть.

Добавим частотные признаки:

In [219]:
popular_words_filtered = {char : [w for w in words if w not in stop_words] for char, words in popular_words.items()}

for words in popular_words_filtered:
    for word in words[:200]:
        df_main['word_{}'.format(word)] = df_main['Line_normalized'].map(lambda tokens: int(word in tokens))

In [230]:
X = np.array(df_main[df_main['Season'].astype(int) < 15].ix[:, 5:])
y = np.array(df_main[df_main['Season'].astype(int) < 15]['Character'])

X_train, X_valid, y_train, y_valid = train_test_split(X, y)
cls_rf = RandomForestClassifier(n_estimators=100)
cls_rf.fit(X_train, y_train)
y_pred = cls_rf.predict(X_valid)
print(classification_report(y_valid, y_pred))

             precision    recall  f1-score   support

    Butters       0.14      0.06      0.08       505
    Cartman       0.44      0.56      0.49      1980
      Kenny       0.99      0.97      0.98       181
       Kyle       0.35      0.30      0.32      1488
       Stan       0.37      0.37      0.37      1742

avg / total       0.39      0.41      0.39      5896



Увы, из hand-crafted признаков не удалось получить ничего стоящего.

### Логистическая регрессия

In [241]:
df_test = df_main[df_main['Season'].astype(int) >= 15]
df_train = df_main[df_main['Season'].astype(int) < 15]

y = np.array(df_train['Character'])
y_test = np.array(df_test['Character'])

Попробуем применить логистическую регрессию при базовой обработке текста.

In [253]:
vectorizer = CountVectorizer(
    tokenizer=TokenTokenizer())
X = vectorizer.fit_transform(df_train['Line'])
X_train, X_valid, y_train, y_valid = train_test_split(X, y)
X_test_log = vectorizer.transform(df_test['Line'])
cls_linear = LogisticRegression(random_state=42)
cls_linear.fit(X_train, y_train)
y_pred = cls_linear.predict(X_valid)
print(classification_report(y_valid, y_pred))

             precision    recall  f1-score   support

    Butters       0.58      0.33      0.42       464
    Cartman       0.57      0.63      0.60      2025
      Kenny       0.97      0.98      0.98       187
       Kyle       0.42      0.34      0.38      1531
       Stan       0.44      0.52      0.47      1689

avg / total       0.51      0.51      0.50      5896



Аналогично, вместо LogisticRegression можно использовать SGDClassifier.

Он немного быстрее на большом объеме данных и позволяет выбрать функцию ошибки.

Например, 'log' соотвествует логистической регрессии:

In [229]:
vectorizer = CountVectorizer(
    tokenizer=TokenTokenizer())
X = vectorizer.fit_transform(df_train['Line'])
X_train, X_valid, y_train, y_valid = train_test_split(X, y)
cls_sgd = SGDClassifier(
    loss='log',
    n_iter=50,                
    random_state=42,
    shuffle=True)
cls_sgd.fit(X_train, y_train)
y_pred = cls_sgd.predict(X_valid)
print(classification_report(y_valid, y_pred))

             precision    recall  f1-score   support

    Butters       0.62      0.34      0.44       500
    Cartman       0.54      0.65      0.59      1996
      Kenny       0.97      0.99      0.98       185
       Kyle       0.44      0.36      0.40      1476
       Stan       0.46      0.49      0.47      1739

avg / total       0.51      0.51      0.51      5896



Вот лучший результат, который удалось получить на валидации:

In [242]:
vectorizer = TfidfVectorizer(
    tokenizer=TokenTokenizer(),
    ngram_range=(1, 3),
    max_df=0.25)
X = vectorizer.fit_transform(df_train['Line'])
feature_selector = SelectKBest(f_classif, k=50000)
X = feature_selector.fit_transform(X, y)
print('Train set size: {}, vocab size: {}'.format(*X.shape))

X_train, X_valid, y_train, y_valid = train_test_split(X, y)
cls_sgd = SGDClassifier(
    loss='modified_huber',
    n_iter=50,                
    random_state=42,
    shuffle=True)
cls_sgd.fit(X_train, y_train)
y_pred = cls_sgd.predict(X_valid)
print(classification_report(y_valid, y_pred))

Train set size: 23582, vocab size: 50000
             precision    recall  f1-score   support

    Butters       0.74      0.21      0.32       498
    Cartman       0.58      0.78      0.67      2025
      Kenny       0.97      0.99      0.98       187
       Kyle       0.56      0.41      0.48      1482
       Stan       0.54      0.57      0.56      1704

avg / total       0.59      0.58      0.57      5896



In [247]:
vectorizer = TfidfVectorizer(
    tokenizer=TokenTokenizer(),
    ngram_range=(1, 3),
    max_df=0.25)
X_sgd = vectorizer.fit_transform(df_train['Line'])
X_test_sgd = vectorizer.transform(df_test['Line'])
feature_selector = SelectKBest(f_classif, k=50000)
X_sgd = feature_selector.fit_transform(X_sgd, y)
X_test_sgd = feature_selector.transform(X_test_sgd)
print('Train set size: {}, vocab size: {}'.format(*X_sgd.shape))

X_train, X_valid, y_train, y_valid = train_test_split(X_sgd, y)
cls_sgd = SGDClassifier(
    loss='modified_huber',
    n_iter=50,                
    random_state=42,
    shuffle=True)
cls_sgd.fit(X_train, y_train)
y_pred = cls_sgd.predict(X_valid)
print(classification_report(y_valid, y_pred))

Train set size: 23582, vocab size: 50000
             precision    recall  f1-score   support

    Butters       0.77      0.29      0.42       480
    Cartman       0.59      0.76      0.67      2044
      Kenny       0.98      0.98      0.98       193
       Kyle       0.53      0.44      0.48      1472
       Stan       0.55      0.54      0.55      1707

avg / total       0.59      0.59      0.58      5896



### Сравнение моделей

У всех наших моделей была разная предобработка, поэтому придется использовать три разные таблицы X_test:

In [232]:
X_test_rf = np.array(df_main[df_main['Season'].astype(int) >= 15].ix[:, 5:])

In [254]:
y_pred_bayes = cls_baseline.predict(X_test_bayes)
y_proba_bayes = cls_baseline.predict_proba(X_test_bayes)

y_pred_rf = cls_rf.predict(X_test_rf)
y_proba_rf = cls_rf.predict_proba(X_test_rf)

y_pred_sgd = cls_sgd.predict(X_test_sgd)
y_proba_sgd = cls_sgd.predict_proba(X_test_sgd)

y_pred_log = cls_linear.predict(X_test_log)
y_proba_log = cls_linear.predict_proba(X_test_log)

In [250]:
print('Bayes classification report')
print(classification_report(y_test, y_pred_bayes))

Bayes classification report
             precision    recall  f1-score   support

    Butters       0.62      0.10      0.16       631
    Cartman       0.51      0.64      0.57      1713
      Kenny       0.87      0.67      0.76       124
       Kyle       0.40      0.27      0.32      1092
       Stan       0.30      0.46      0.36       894

avg / total       0.47      0.44      0.41      4454



In [251]:
print('Random forest classification report')
print(classification_report(y_test, y_pred_rf))

Random forest classification report
             precision    recall  f1-score   support

    Butters       0.21      0.04      0.07       631
    Cartman       0.48      0.59      0.53      1713
      Kenny       0.86      0.77      0.81       124
       Kyle       0.35      0.30      0.32      1092
       Stan       0.26      0.34      0.30       894

avg / total       0.37      0.39      0.37      4454



In [252]:
print('SGD linear model classification report')
print(classification_report(y_test, y_pred_sgd))

Linear model classification report
             precision    recall  f1-score   support

    Butters       0.65      0.13      0.22       631
    Cartman       0.52      0.67      0.58      1713
      Kenny       0.88      0.81      0.85       124
       Kyle       0.40      0.31      0.35      1092
       Stan       0.30      0.38      0.34       894

avg / total       0.47      0.45      0.43      4454



In [255]:
print('Logistic regession classification report')
print(classification_report(y_test, y_pred_log))

Logistic regession classification report
             precision    recall  f1-score   support

    Butters       0.61      0.17      0.27       631
    Cartman       0.55      0.61      0.58      1713
      Kenny       0.88      0.81      0.85       124
       Kyle       0.41      0.35      0.38      1092
       Stan       0.30      0.45      0.36       894

avg / total       0.48      0.46      0.45      4454



Итак, SGDClassifier, показавший себя лучше всего на валидации, к сожалению, переобучился, и лучшим результатом на тесте стала логистическая регрессиия.