In [70]:
#!/usr/bin/env python

# system tools
import os
import sys
sys.path.append(os.path.join(".."))

# pandas, numpy
import pandas as pd
import numpy as np

# import my classifier utility functions
import utils.classifier_utils as clf
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

# Machine learning stuff from sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn import metrics


# tools from tensorflow
import tensorflow as tf
from tensorflow.random import set_seed
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Dense, 
                                     Dropout,
                                     BatchNormalization,
                                    )
from tensorflow.keras.optimizers import SGD
from tensorflow.keras import backend as K
from tensorflow.keras.utils import plot_model
from tensorflow.keras.regularizers import L2

# matplotlib
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use("Agg")

# One seed shared by every source of randomness used in this file
# (NumPy, TensorFlow and the scikit-learn train/test split).
random_state = 42

# Seed NumPy's legacy global generator and TensorFlow's global generator
# so that repeated runs start from the same random state.
np.random.seed(random_state)
set_seed(random_state)

def plot_history(H, epochs):
    """
    Plot training/validation loss and accuracy curves and save them as a PNG.

    The figure is written to ``../models/nn_training_history.png``; the
    target directory is created if it does not exist yet.

    H: model history object exposing a ``history`` dict with the keys
       "loss", "val_loss", "accuracy" and "val_accuracy"
    epochs: number of epochs for which the model was trained; used as an
            upper bound on how many recorded epochs are plotted

    Returns None.
    """
    # Plot only what was actually recorded: if training stopped early the
    # history is shorter than `epochs`, and plotting the full range would
    # raise a shape-mismatch error.
    n_epochs = min(epochs, len(H.history["loss"]))
    x_values = np.arange(0, n_epochs)

    out_path = os.path.join("..", "models", "nn_training_history.png")
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    curves = (
        ("loss", "train_loss"),
        ("val_loss", "val_loss"),
        ("accuracy", "train_acc"),
        ("val_accuracy", "val_acc"),
    )

    # Apply the style only for this figure instead of changing the global
    # matplotlib style for every plot created afterwards.
    with plt.style.context("fivethirtyeight"):
        fig, ax = plt.subplots()
        for key, label in curves:
            ax.plot(x_values, H.history[key][:n_epochs], label=label)
        ax.set_title("Training Loss and Accuracy")
        ax.set_xlabel("Epoch #")
        ax.set_ylabel("Loss/Accuracy")
        ax.legend()
        fig.tight_layout()
        fig.savefig(out_path)

    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)


def main():
    """
    Train and evaluate a small dense network that predicts the season of
    Game of Thrones dialogue.

    Pipeline:
      1. Read ``../data/raw/Game_of_Thrones_Script.csv``.
      2. Join all sentences of one character within one episode into a
         single text per (Season, Episode, Name).
      3. Split 90/10 into train and test sets (seeded with ``random_state``).
      4. One-hot encode the season labels and build TF-IDF uni/bigram
         features.
      5. Fit a one-hidden-layer network with SGD for 10 epochs, using the
         test split as validation data.
      6. Print a classification report for the test split.

    Takes no arguments and returns None; results are printed to stdout.
    """
    # loading data
    data = pd.read_csv(os.path.join("..", "data", "raw", "Game_of_Thrones_Script.csv"))

    # Gather all lines spoken by a character within one episode of one
    # season into a single document, so each sample carries more context
    # than an isolated sentence.
    data = (
        data.groupby(["Season", "Episode", "Name"])["Sentence"]
        .agg(" ".join)
        .to_frame()
        .reset_index()
        .rename(columns={"Sentence": "Text"})
    )

    # train and test split using sklearn
    X_train, X_test, y_train, y_test = train_test_split(
        data["Text"],
        data["Season"],
        test_size=0.1,
        random_state=random_state,
    )
    print("Data loaded and split")
    print(y_test[0:5])

    ### Building network ###

    # Season labels to one-hot vectors. The binarizer is fitted on the
    # training labels ONLY and then re-used for the test labels, so both
    # matrices share the same column order (lb.classes_). Re-fitting on the
    # test labels could silently reorder or drop columns.
    # NOTE(review): with exactly two classes LabelBinarizer emits a single
    # column, which would not match a softmax output layer - confirm this
    # never applies to the data set.
    lb = LabelBinarizer()
    y_train_bin = lb.fit_transform(y_train)
    y_test_bin = lb.transform(y_test)
    n_classes = len(lb.classes_)

    # the nn will have a vocabulary size of at most 15000
    maxlen = 15000

    vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=maxlen)
    X_train_feats = vectorizer.fit_transform(X_train).toarray()
    X_test_feats = vectorizer.transform(X_test).toarray()

    # max_features is only an upper bound; size the input layer from the
    # feature matrix that was actually produced.
    n_features = X_train_feats.shape[1]

    # l2 regularization
    l2 = L2(0.00001)

    # a new neural network: one hidden layer followed by a softmax output
    # with one unit per class seen during training
    model = Sequential()
    model.add(Dense(64, activation='relu', kernel_regularizer=l2, input_shape=(n_features,)))
    model.add(BatchNormalization())
    model.add(Dropout(0.3))
    model.add(Dense(n_classes, activation='softmax'))

    # compiler
    model.compile(loss='categorical_crossentropy',
                  optimizer=SGD(learning_rate=.01),
                  metrics=['accuracy'])

    epochs = 10

    print("fitting nn-model")

    # Fit the network. The object returned by fit() holds the per-epoch
    # metrics; it is not needed here and therefore not kept.
    model.fit(X_train_feats, y_train_bin,
              epochs=epochs,
              verbose=False,
              validation_data=(X_test_feats, y_test_bin))

    # predict the probability distribution over classes for each test sample
    predictions = model.predict(X_test_feats, verbose=1)

    # get the class with highest probability for each sample
    y_pred = np.argmax(predictions, axis=1)

    # Ground-truth class indices in the SAME index space as y_pred, i.e.
    # positions within lb.classes_.
    y_test_int = np.argmax(y_test_bin, axis=1)

    # get the classification report; passing `labels` keeps the rows aligned
    # with `target_names` even if a class is missing from the test split
    cr = metrics.classification_report(
        y_test_int,
        y_pred,
        labels=np.arange(n_classes),
        target_names=[str(name) for name in lb.classes_],
    )
    print(cr)




In [71]:
# Run the full pipeline: load the data, fit the network and print the
# classification report for the held-out split.
main()

Data loaded and split
1968    Season 6
124     Season 1
1127    Season 3
1615    Season 5
1359    Season 4
Name: Season, dtype: object
fitting nn-model
              precision    recall  f1-score   support

    Season 1       0.35      0.41      0.38        32
    Season 2       0.35      0.30      0.32        43
    Season 3       0.22      0.31      0.25        42
    Season 4       0.18      0.13      0.15        39
    Season 5       0.24      0.30      0.27        40
    Season 6       0.20      0.21      0.20        38
    Season 7       0.71      0.31      0.43        16
    Season 8       0.50      0.24      0.32        17

    accuracy                           0.27       267
   macro avg       0.34      0.28      0.29       267
weighted avg       0.30      0.27      0.28       267

