# SVM CLASSIFIER MODELS

In [2]:
import pandas as pd
import nltk
import re
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Restores every variable previously persisted with %store. Cells further
# down read wilde_characters, shaw_characters, jonson_characters,
# shakespeare_characters, schiller_characters and goethe_characters without
# defining them, so they are presumably expected to come from this restore.
%store -r

### Functions

In [4]:
def partition (char_sens, n):
    '''Distributes a list of sentences round-robin over n parts,
    so that the parts hold an equal, or nearly equal, number
    of sentences. Returns the list of the n parts.'''
    parts = []
    for offset in range(n):
        # Every n-th sentence, counted from this offset, goes into the same part
        parts.append(char_sens[offset::n])
    return parts

In [5]:
def corpus_split (play_characters, n):
    '''Applies partition(..., n) to the value stored under every
    key of the input dict (each value being a list of sentences)
    and returns a new dict mapping the same keys to the results'''
    return {
        char: partition(sentences, n)
        for char, sentences in play_characters.items()
    }

In [6]:
def data_modeling (author_characters_split):
    '''Builds a data frame from a dict of the form
    character : list-of-partitions.

    Every partition becomes one row: the 'Characters' column holds
    the dict key and the 'Corpuses' column holds that partition.
    Rows follow the dict's key order, and the index restarts at 0
    for each character (so it is not unique across characters).

    An empty dict gives an empty data frame with the same two columns.'''
    frames = [
        pd.DataFrame({'Characters': char, 'Corpuses': parts})
        for char, parts in author_characters_split.items()
    ]
    if not frames:
        # pd.concat raises on an empty list, so build the empty frame directly
        return pd.DataFrame({'Characters': [], 'Corpuses': []})
    # A single concat call instead of re-copying a growing frame inside a loop
    return pd.concat(frames)

In [7]:
def list_generator (file, encoding=None):
    '''Generates a list of words from a file.

    file: path of the text file to read.
    encoding: text encoding handed to open(). None (the default)
        keeps the platform-dependent default; pass e.g. 'utf-8'
        for files with non-ASCII letters to get the same result
        on every machine.

    Returns the words in order of appearance, where a word is a
    maximal run of word characters (letters, digits, underscore).'''
    # The with-block closes the file even when read() raises
    with open(file, "r", encoding=encoding) as f:
        contents = f.read()
    return re.findall(r"\w+", contents)

### Vectorizers

In [8]:
# Stop-word list handed to the German TF-IDF vectorizers, read from a text file
german_sw = list_generator(file='german_stop_words.txt')

In [9]:
# Default word tokens with stop words removed (English built-in list / German list)
wsw_vectorizer = TfidfVectorizer(stop_words='english')
wsw_g_vectorizer = TfidfVectorizer(stop_words=german_sw)

# Word bi-grams only, no stop-word filtering
bgram_vectorizer = TfidfVectorizer(ngram_range=(2, 2))

# Word 1- to 3-grams with stop words removed. Despite the `char_` prefix,
# analyzer='word' means these are word n-grams, not character n-grams.
char_ngram_vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    stop_words='english',
)
char_g_ngram_vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    stop_words=german_sw,
)

### Models

In [10]:
# Each model is a TF-IDF vectorizer followed by a linear-kernel SVM;
# only the vectorizer differs between them.

# Default TF-IDF settings
svm_clf = make_pipeline(
    TfidfVectorizer(),
    SVC(kernel='linear'),
)
# Stop words removed (English / German)
wsw_svm_clf = make_pipeline(
    wsw_vectorizer,
    SVC(kernel='linear'),
)
wsw_g_svm_clf = make_pipeline(
    wsw_g_vectorizer,
    SVC(kernel='linear'),
)
# Word bi-grams
bgram_svm_clf = make_pipeline(
    bgram_vectorizer,
    SVC(kernel='linear'),
)
# 1- to 3-grams with stop words removed (English / German)
char_ngram_svm_clf = make_pipeline(
    char_ngram_vectorizer,
    SVC(kernel='linear'),
)
char_g_ngram_svm_clf = make_pipeline(
    char_g_ngram_vectorizer,
    SVC(kernel='linear'),
)

## Oscar Wilde 

In [11]:
# Five partitions per character, one data-frame row per partition
wilde_split = corpus_split(wilde_characters, 5)
wilde_df = data_modeling(wilde_split)
# Labels: the character each partition belongs to
wilde_y = wilde_df['Characters']
# Documents: each partition's sentences merged into one string. Joining with
# a space (not '') keeps the last word of a sentence from fusing with the
# first word of the next one when nothing else separates them.
wilde_X = wilde_df['Corpuses'].apply(" ".join)

### Results

#### SVM classifier 

In [11]:
# 5-fold cross-validation of the default TF-IDF + linear-SVM pipeline on Wilde's characters
wilde_scores = cross_validate(
    estimator=svm_clf,
    X=wilde_X,
    y=wilde_y,
    scoring=['accuracy'],
    cv=5,
    return_train_score=False,
)
# Mean test accuracy over the folds
wilde_scores['test_accuracy'].mean()

0.9428571428571428

#### SVM classifier with word bi-grams

In [12]:
# 5-fold cross-validation of the word bi-gram pipeline on Wilde's characters
bgram_wilde_scores = cross_validate(
    estimator=bgram_svm_clf,
    X=wilde_X,
    y=wilde_y,
    scoring=['accuracy'],
    cv=5,
    return_train_score=False,
)
# Mean test accuracy over the folds
bgram_wilde_scores['test_accuracy'].mean()

0.9047619047619048

#### SVM classifier without stop words

In [13]:
# 5-fold cross-validation of the stop-word-filtered pipeline on Wilde's characters
wsw_wilde_scores = cross_validate(
    estimator=wsw_svm_clf,
    X=wilde_X,
    y=wilde_y,
    scoring=['accuracy'],
    cv=5,
    return_train_score=False,
)
# Mean test accuracy over the folds
wsw_wilde_scores['test_accuracy'].mean()

0.9333333333333333

#### SVM classifier without stop words and with word n-grams in range 1 to 3

In [49]:
# 5-fold cross-validation of the 1- to 3-gram, stop-word-filtered pipeline on Wilde's characters
char_ngram_wilde_scores = cross_validate(
    estimator=char_ngram_svm_clf,
    X=wilde_X,
    y=wilde_y,
    scoring=['accuracy'],
    cv=5,
    return_train_score=False,
)
# Mean test accuracy over the folds
char_ngram_wilde_scores['test_accuracy'].mean()

0.8857142857142858

## George Bernard Shaw 

In [14]:
# Five partitions per character, one data-frame row per partition
shaw_split = corpus_split(shaw_characters, 5)
shaw_df = data_modeling(shaw_split)
# Labels: the character each partition belongs to
shaw_y = shaw_df['Characters']
# Documents: each partition's sentences merged into one string. Joining with
# a space (not '') keeps the last word of a sentence from fusing with the
# first word of the next one when nothing else separates them.
shaw_X = shaw_df['Corpuses'].apply(" ".join)

### Results

#### SVM classifier 

In [15]:
# 5-fold cross-validation of the default TF-IDF + linear-SVM pipeline on Shaw's characters
shaw_scores = cross_validate(
    estimator=svm_clf,
    X=shaw_X,
    y=shaw_y,
    scoring=['accuracy'],
    cv=5,
    return_train_score=False,
)
# Mean test accuracy over the folds
shaw_scores['test_accuracy'].mean()

0.9826086956521738

#### SVM classifier with word bi-grams

In [16]:
# 5-fold cross-validation of the word bi-gram pipeline on Shaw's characters
bgram_shaw_scores = cross_validate(
    estimator=bgram_svm_clf,
    X=shaw_X,
    y=shaw_y,
    scoring=['accuracy'],
    cv=5,
    return_train_score=False,
)
# Mean test accuracy over the folds
bgram_shaw_scores['test_accuracy'].mean()

0.9217391304347826

#### SVM classifier without stop words

In [20]:
# 5-fold cross-validation of the stop-word-filtered pipeline on Shaw's characters
wsw_shaw_scores = cross_validate(
    estimator=wsw_svm_clf,
    X=shaw_X,
    y=shaw_y,
    scoring=['accuracy'],
    cv=5,
    return_train_score=False,
)
# Mean test accuracy over the folds
wsw_shaw_scores['test_accuracy'].mean()

1.0

#### SVM classifier without stop words and with word n-grams in range 1 to 3

In [50]:
# 5-fold cross-validation of the 1- to 3-gram, stop-word-filtered pipeline on Shaw's characters
char_ngram_shaw_scores = cross_validate(
    estimator=char_ngram_svm_clf,
    X=shaw_X,
    y=shaw_y,
    scoring=['accuracy'],
    cv=5,
    return_train_score=False,
)
# Mean test accuracy over the folds
char_ngram_shaw_scores['test_accuracy'].mean()

0.9739130434782608

## Ben Jonson

In [21]:
# Five partitions per character, one data-frame row per partition
jonson_split = corpus_split(jonson_characters, 5)
jonson_df = data_modeling(jonson_split)
# Labels: the character each partition belongs to
jonson_y = jonson_df['Characters']
# Documents: each partition's sentences merged into one string. Joining with
# a space (not '') keeps the last word of a sentence from fusing with the
# first word of the next one when nothing else separates them.
jonson_X = jonson_df['Corpuses'].apply(" ".join)

### Results

#### SVM classifier 

In [22]:
# 5-fold cross-validation of the default TF-IDF + linear-SVM pipeline on Jonson's characters
jonson_scores = cross_validate(
    estimator=svm_clf,
    X=jonson_X,
    y=jonson_y,
    scoring=['accuracy'],
    cv=5,
    return_train_score=False,
)
# Mean test accuracy over the folds
jonson_scores['test_accuracy'].mean()

0.9

#### SVM classifier with word bi-grams

In [23]:
# 5-fold cross-validation of the word bi-gram pipeline on Jonson's characters
bgram_jonson_scores = cross_validate(
    estimator=bgram_svm_clf,
    X=jonson_X,
    y=jonson_y,
    scoring=['accuracy'],
    cv=5,
    return_train_score=False,
)
# Mean test accuracy over the folds
bgram_jonson_scores['test_accuracy'].mean()

0.9

#### SVM classifier without stop words

In [24]:
# 5-fold cross-validation of the stop-word-filtered pipeline on Jonson's characters
wsw_jonson_scores = cross_validate(
    estimator=wsw_svm_clf,
    X=jonson_X,
    y=jonson_y,
    scoring=['accuracy'],
    cv=5,
    return_train_score=False,
)
# Mean test accuracy over the folds
wsw_jonson_scores['test_accuracy'].mean()

0.990909090909091

#### SVM classifier without stop words and with word n-grams in range 1 to 3

In [51]:
# 5-fold cross-validation of the 1- to 3-gram, stop-word-filtered pipeline on Jonson's characters
char_ngram_jonson_scores = cross_validate(
    estimator=char_ngram_svm_clf,
    X=jonson_X,
    y=jonson_y,
    scoring=['accuracy'],
    cv=5,
    return_train_score=False,
)
# Mean test accuracy over the folds
char_ngram_jonson_scores['test_accuracy'].mean()

0.9454545454545455

## William Shakespeare

In [25]:
# Five partitions per character, one data-frame row per partition
shakespeare_split = corpus_split(shakespeare_characters, 5)
shakespeare_df = data_modeling(shakespeare_split)
# Labels: the character each partition belongs to
shakespeare_y = shakespeare_df['Characters']
# Documents: each partition's sentences merged into one string. Joining with
# a space (not '') keeps the last word of a sentence from fusing with the
# first word of the next one when nothing else separates them.
shakespeare_X = shakespeare_df['Corpuses'].apply(" ".join)

### Results

#### SVM classifier 

In [26]:
# 5-fold cross-validation of the default TF-IDF + linear-SVM pipeline on Shakespeare's characters
shakespeare_scores = cross_validate(
    estimator=svm_clf,
    X=shakespeare_X,
    y=shakespeare_y,
    scoring=['accuracy'],
    cv=5,
    return_train_score=False,
)
# Mean test accuracy over the folds
shakespeare_scores['test_accuracy'].mean()

0.819047619047619

#### SVM classifier with word bi-grams

In [27]:
# 5-fold cross-validation of the word bi-gram pipeline on Shakespeare's characters
bgram_shakespeare_scores = cross_validate(
    estimator=bgram_svm_clf,
    X=shakespeare_X,
    y=shakespeare_y,
    scoring=['accuracy'],
    cv=5,
    return_train_score=False,
)
# Mean test accuracy over the folds
bgram_shakespeare_scores['test_accuracy'].mean()

0.8666666666666668

#### SVM classifier without stop words

In [28]:
# 5-fold cross-validation of the stop-word-filtered pipeline on Shakespeare's characters
wsw_shakespeare_scores = cross_validate(
    estimator=wsw_svm_clf,
    X=shakespeare_X,
    y=shakespeare_y,
    scoring=['accuracy'],
    cv=5,
    return_train_score=False,
)
# Mean test accuracy over the folds
wsw_shakespeare_scores['test_accuracy'].mean()

0.9428571428571428

#### SVM classifier without stop words and with word n-grams in range 1 to 3

In [52]:
# 5-fold cross-validation of the 1- to 3-gram, stop-word-filtered pipeline on Shakespeare's characters
char_ngram_shakespeare_scores = cross_validate(
    estimator=char_ngram_svm_clf,
    X=shakespeare_X,
    y=shakespeare_y,
    scoring=['accuracy'],
    cv=5,
    return_train_score=False,
)
# Mean test accuracy over the folds
char_ngram_shakespeare_scores['test_accuracy'].mean()

0.9142857142857144

## Friedrich Schiller

In [29]:
# Five partitions per character, one data-frame row per partition
schiller_split = corpus_split(schiller_characters, 5)
schiller_df = data_modeling(schiller_split)
# Labels: the character each partition belongs to
schiller_y = schiller_df['Characters']
# Documents: each partition's sentences merged into one string. Joining with
# a space (not '') keeps the last word of a sentence from fusing with the
# first word of the next one when nothing else separates them.
schiller_X = schiller_df['Corpuses'].apply(" ".join)

### Results

#### SVM classifier 

In [30]:
# 5-fold cross-validation of the default TF-IDF + linear-SVM pipeline on Schiller's characters
schiller_scores = cross_validate(
    estimator=svm_clf,
    X=schiller_X,
    y=schiller_y,
    scoring=['accuracy'],
    cv=5,
    return_train_score=False,
)
# Mean test accuracy over the folds
schiller_scores['test_accuracy'].mean()

0.8941176470588236

#### SVM classifier with word bi-grams

In [31]:
# 5-fold cross-validation of the word bi-gram pipeline on Schiller's characters
bgram_schiller_scores = cross_validate(
    estimator=bgram_svm_clf,
    X=schiller_X,
    y=schiller_y,
    scoring=['accuracy'],
    cv=5,
    return_train_score=False,
)
# Mean test accuracy over the folds
bgram_schiller_scores['test_accuracy'].mean()

0.9411764705882353

#### SVM classifier without stop words

In [32]:
# 5-fold cross-validation of the German stop-word-filtered pipeline on Schiller's characters
wsw_schiller_scores = cross_validate(
    estimator=wsw_g_svm_clf,
    X=schiller_X,
    y=schiller_y,
    scoring=['accuracy'],
    cv=5,
    return_train_score=False,
)
# Mean test accuracy over the folds
wsw_schiller_scores['test_accuracy'].mean()

0.9882352941176471

#### SVM classifier without stop words and with word n-grams in range 1 to 3

In [53]:
# 5-fold cross-validation of the 1- to 3-gram, German stop-word-filtered pipeline on Schiller's characters
char_ngram_schiller_scores = cross_validate(
    estimator=char_g_ngram_svm_clf,
    X=schiller_X,
    y=schiller_y,
    scoring=['accuracy'],
    cv=5,
    return_train_score=False,
)
# Mean test accuracy over the folds
char_ngram_schiller_scores['test_accuracy'].mean()

0.9411764705882353

## Johann Wolfgang von Goethe

In [33]:
# Five partitions per character, one data-frame row per partition
goethe_split = corpus_split(goethe_characters, 5)
goethe_df = data_modeling(goethe_split)
# Labels: the character each partition belongs to
goethe_y = goethe_df['Characters']
# Documents: each partition's sentences merged into one string. Joining with
# a space (not '') keeps the last word of a sentence from fusing with the
# first word of the next one when nothing else separates them.
goethe_X = goethe_df['Corpuses'].apply(" ".join)

### Results

#### SVM classifier 

In [34]:
# 5-fold cross-validation of the default TF-IDF + linear-SVM pipeline on Goethe's characters
goethe_scores = cross_validate(
    estimator=svm_clf,
    X=goethe_X,
    y=goethe_y,
    scoring=['accuracy'],
    cv=5,
    return_train_score=False,
)
# Mean test accuracy over the folds
goethe_scores['test_accuracy'].mean()

0.6933333333333334

#### SVM classifier with word bi-grams

In [35]:
# 5-fold cross-validation of the word bi-gram pipeline on Goethe's characters
bgram_goethe_scores = cross_validate(
    estimator=bgram_svm_clf,
    X=goethe_X,
    y=goethe_y,
    scoring=['accuracy'],
    cv=5,
    return_train_score=False,
)
# Mean test accuracy over the folds
bgram_goethe_scores['test_accuracy'].mean()

0.6799999999999999

#### SVM classifier without stop words

In [36]:
# 5-fold cross-validation of the German stop-word-filtered pipeline on Goethe's characters
wsw_goethe_scores = cross_validate(
    estimator=wsw_g_svm_clf,
    X=goethe_X,
    y=goethe_y,
    scoring=['accuracy'],
    cv=5,
    return_train_score=False,
)
# Mean test accuracy over the folds
wsw_goethe_scores['test_accuracy'].mean()

0.9199999999999999

#### SVM classifier without stop words and with word n-grams in range 1 to 3

In [54]:
# 5-fold cross-validation of the 1- to 3-gram, German stop-word-filtered pipeline on Goethe's characters
char_ngram_goethe_scores = cross_validate(
    estimator=char_g_ngram_svm_clf,
    X=goethe_X,
    y=goethe_y,
    scoring=['accuracy'],
    cv=5,
    return_train_score=False,
)
# Mean test accuracy over the folds
char_ngram_goethe_scores['test_accuracy'].mean()

0.6933333333333332