In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

In [2]:
df = pd.read_csv('utterances.csv')
df['speaker'] = df['speaker'].str.upper()
df['speaker'] = df['speaker'].str.split(', H', expand=True)[0]
df = df[df['speaker'] != '_NO_SPEAKER']
df['utterance_len'] = df['utterance'].str.len()
df = df.dropna()
host_id_df = pd.read_json('host_id.json', orient='index')
host_id_df = pd.read_json('host_id.json', orient='index')
host_id_df.reset_index(inplace=True)
host_id_df = host_id_df.rename(columns = {'index':'speaker'})
host_id_df['speaker'] = host_id_df['speaker'].astype('string')
host_id_df['speaker'] = host_id_df['speaker'].str.upper()
host_id_df.rename(columns={0:'host_id'}, inplace=True)
mergedf = df.merge(host_id_df, how='left', on='speaker')
mergedf['host_id'] = mergedf['host_id'].fillna(-1)
PreProcess_df = mergedf[mergedf['host_id'] != -1]
PreProcess_df = PreProcess_df.loc[PreProcess_df['utterance_len'] >= 100]
top_speakers = PreProcess_df.groupby(['speaker']).size().loc[PreProcess_df.groupby(['speaker']).size() > 20000]
df = pd.DataFrame(PreProcess_df.loc[PreProcess_df['speaker'].isin(top_speakers.index.values)])
df = df.reset_index(drop=True)
df['utterance'] = df['utterance'].str.lower()
df['unidentified'] = df['utterance'].str.startswith('unidentified')
df = df[df['unidentified'] == False]
df.count()
df['speaker'].value_counts()
df = df.groupby('speaker').apply(lambda x: x.sample(20000))
df['speaker'].value_counts()
def add_label(x):
    if x=='NEAL CONAN':
        return 1
    elif x=='STEVE INSKEEP':
        return 2
    elif x=='ROBERT SIEGEL':
        return 3
    elif x=='IRA FLATOW':
        return 4
    elif x=='FARAI CHIDEYA':
        return 5
    elif x=='MELISSA BLOCK':
        return 6
    else:
        return 7
 
df['label'] = df['speaker'].apply(add_label)
df = df.reset_index(drop=True)
df.head()
df = df.reset_index(drop=True)
df.head()
X_train, X_test, y_train, y_test = train_test_split(df['utterance'],df['label'], stratify=df['label'])

In [3]:
Tfid_Vector = TfidfVectorizer(analyzer='word', stop_words='english', max_features = None, ngram_range=[1, 1], 
                       binary=False, norm=None, smooth_idf=True, strip_accents=None,
                       sublinear_tf=True, use_idf=False)

In [4]:
Multi_tfid_pipeline = Pipeline([
    ('Tfid', Tfid_Vector),
    ('MultinomialNB', MultinomialNB())
])

In [5]:
MultinomialNB_parameters = {
    'Tfid__ngram_range': ([1,1],[1,2],[1,3], [1,4]),
    'MultinomialNB__alpha': [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100,1000],
    'MultinomialNB__fit_prior': (True, False)    
}

In [6]:
MultinomialNB_gs = GridSearchCV(Multi_tfid_pipeline, MultinomialNB_parameters, cv=3, verbose = 3, n_jobs = 1)

In [7]:
MultinomialNB_gs.fit(X_train, y_train)

Fitting 3 folds for each of 64 candidates, totalling 192 fits
[CV 1/3] END MultinomialNB__alpha=1e-05, MultinomialNB__fit_prior=True, Tfid__ngram_range=[1, 1];, score=0.499 total time=   2.4s
[CV 2/3] END MultinomialNB__alpha=1e-05, MultinomialNB__fit_prior=True, Tfid__ngram_range=[1, 1];, score=0.498 total time=   2.4s
[CV 3/3] END MultinomialNB__alpha=1e-05, MultinomialNB__fit_prior=True, Tfid__ngram_range=[1, 1];, score=0.501 total time=   2.4s
[CV 1/3] END MultinomialNB__alpha=1e-05, MultinomialNB__fit_prior=True, Tfid__ngram_range=[1, 2];, score=0.525 total time=   7.8s
[CV 2/3] END MultinomialNB__alpha=1e-05, MultinomialNB__fit_prior=True, Tfid__ngram_range=[1, 2];, score=0.522 total time=   7.7s
[CV 3/3] END MultinomialNB__alpha=1e-05, MultinomialNB__fit_prior=True, Tfid__ngram_range=[1, 2];, score=0.527 total time=   7.6s
[CV 1/3] END MultinomialNB__alpha=1e-05, MultinomialNB__fit_prior=True, Tfid__ngram_range=[1, 3];, score=0.528 total time=  15.1s
[CV 2/3] END MultinomialNB__

[CV 1/3] END MultinomialNB__alpha=0.001, MultinomialNB__fit_prior=False, Tfid__ngram_range=[1, 2];, score=0.537 total time=   7.8s
[CV 2/3] END MultinomialNB__alpha=0.001, MultinomialNB__fit_prior=False, Tfid__ngram_range=[1, 2];, score=0.534 total time=   7.7s
[CV 3/3] END MultinomialNB__alpha=0.001, MultinomialNB__fit_prior=False, Tfid__ngram_range=[1, 2];, score=0.540 total time=   7.8s
[CV 1/3] END MultinomialNB__alpha=0.001, MultinomialNB__fit_prior=False, Tfid__ngram_range=[1, 3];, score=0.540 total time=  15.0s
[CV 2/3] END MultinomialNB__alpha=0.001, MultinomialNB__fit_prior=False, Tfid__ngram_range=[1, 3];, score=0.537 total time=  14.9s
[CV 3/3] END MultinomialNB__alpha=0.001, MultinomialNB__fit_prior=False, Tfid__ngram_range=[1, 3];, score=0.540 total time=  14.6s
[CV 1/3] END MultinomialNB__alpha=0.001, MultinomialNB__fit_prior=False, Tfid__ngram_range=[1, 4];, score=0.538 total time=  22.1s
[CV 2/3] END MultinomialNB__alpha=0.001, MultinomialNB__fit_prior=False, Tfid__ngra

[CV 3/3] END MultinomialNB__alpha=10, MultinomialNB__fit_prior=True, Tfid__ngram_range=[1, 3];, score=0.519 total time=  14.5s
[CV 1/3] END MultinomialNB__alpha=10, MultinomialNB__fit_prior=True, Tfid__ngram_range=[1, 4];, score=0.523 total time=  21.9s
[CV 2/3] END MultinomialNB__alpha=10, MultinomialNB__fit_prior=True, Tfid__ngram_range=[1, 4];, score=0.523 total time=  21.8s
[CV 3/3] END MultinomialNB__alpha=10, MultinomialNB__fit_prior=True, Tfid__ngram_range=[1, 4];, score=0.520 total time=  21.3s
[CV 1/3] END MultinomialNB__alpha=10, MultinomialNB__fit_prior=False, Tfid__ngram_range=[1, 1];, score=0.506 total time=   2.3s
[CV 2/3] END MultinomialNB__alpha=10, MultinomialNB__fit_prior=False, Tfid__ngram_range=[1, 1];, score=0.502 total time=   2.3s
[CV 3/3] END MultinomialNB__alpha=10, MultinomialNB__fit_prior=False, Tfid__ngram_range=[1, 1];, score=0.503 total time=   2.3s
[CV 1/3] END MultinomialNB__alpha=10, MultinomialNB__fit_prior=False, Tfid__ngram_range=[1, 2];, score=0.520

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('Tfid',
                                        TfidfVectorizer(ngram_range=[1, 1],
                                                        norm=None,
                                                        stop_words='english',
                                                        sublinear_tf=True,
                                                        use_idf=False)),
                                       ('MultinomialNB', MultinomialNB())]),
             n_jobs=1,
             param_grid={'MultinomialNB__alpha': [1e-05, 0.0001, 0.001, 0.1, 1,
                                                  10, 100, 1000],
                         'MultinomialNB__fit_prior': (True, False),
                         'Tfid__ngram_range': ([1, 1], [1, 2], [1, 3], [1, 4])},
             verbose=3)

In [8]:
print(MultinomialNB_gs.best_params_)
print(MultinomialNB_gs.best_score_)

{'MultinomialNB__alpha': 0.1, 'MultinomialNB__fit_prior': True, 'Tfid__ngram_range': [1, 3]}
0.563152380952381


In [9]:
MultinomialNB_y_pred = MultinomialNB_gs.predict(X_train)

In [10]:
print(classification_report(y_train, MultinomialNB_y_pred))
print(accuracy_score(y_train, MultinomialNB_y_pred))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00     15000
           2       1.00      1.00      1.00     15000
           3       1.00      1.00      1.00     15000
           4       1.00      1.00      1.00     15000
           5       1.00      1.00      1.00     15000
           6       1.00      1.00      1.00     15000
           7       1.00      1.00      1.00     15000

    accuracy                           1.00    105000
   macro avg       1.00      1.00      1.00    105000
weighted avg       1.00      1.00      1.00    105000

0.9992190476190477


In [11]:
test_pred = MultinomialNB_gs.predict(X_test)

In [12]:
print(classification_report(y_test, test_pred))
print(accuracy_score(y_test, test_pred))

              precision    recall  f1-score   support

           1       0.72      0.60      0.65      5000
           2       0.50      0.50      0.50      5000
           3       0.48      0.41      0.44      5000
           4       0.77      0.86      0.81      5000
           5       0.67      0.82      0.74      5000
           6       0.49      0.43      0.45      5000
           7       0.44      0.48      0.46      5000

    accuracy                           0.58     35000
   macro avg       0.58      0.58      0.58     35000
weighted avg       0.58      0.58      0.58     35000

0.5838571428571429


# --------------------------------------------------------------------------