# 0.) Load libraries

In [1]:
# Load libraries
%matplotlib inline
import csv
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import re
import seaborn as sns
import socket

from itertools import product
from os.path import isfile
from random import randint

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.neural_network import MLPClassifier as nn



from nltk.corpus import gutenberg, stopwords

# Download the texts
nltk.download('gutenberg')
nltk.download('punkt')
nltk.download('stopwords')

sns.set(style='ticks', color_codes=True)

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\whm0004\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\whm0004\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\whm0004\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 1.)  Find interesting phrases in Alice In Wonderland.

Most code adapted from module 10 notebook.

In [2]:
# Load the text
sentences = gutenberg.sents('carroll-alice.txt') # returns as a list of lists of strings -_-

# Format the text
stop_list = stopwords.words('english')
term_list = []
for idx, terms in enumerate(sentences):
    # Remove stop words
    terms = [w for w in terms if w not in stop_list]
    
    # Remove 'CHAPTER'
    terms = [w for w in terms if re.search(r'CHAPTER', w) is None]
    # Remove non-words and short words
    terms = [w for w in terms if re.search(r'[A-Za-z]{3,}', w) is not None]
    
    # If we haven't emptied the list of terms,
    if len(terms) > 0:
        # make everything lowercase (reduces total size)
        terms = [w.lower() for w in terms]
    
        # add to the list
        term_list.append(terms)

display(term_list[:5])

[['alice', 'adventures', 'wonderland', 'lewis', 'carroll'],
 ['down', 'rabbit', 'hole'],
 ['alice',
  'beginning',
  'get',
  'tired',
  'sitting',
  'sister',
  'bank',
  'nothing',
  'twice',
  'peeped',
  'book',
  'sister',
  'reading',
  'pictures',
  'conversations',
  'use',
  'book',
  'thought',
  'alice',
  'without',
  'pictures',
  'conversation'],
 ['considering',
  'mind',
  'well',
  'could',
  'hot',
  'day',
  'made',
  'feel',
  'sleepy',
  'stupid',
  'whether',
  'pleasure',
  'making',
  'daisy',
  'chain',
  'would',
  'worth',
  'trouble',
  'getting',
  'picking',
  'daisies',
  'suddenly',
  'white',
  'rabbit',
  'pink',
  'eyes',
  'ran',
  'close'],
 ['there',
  'nothing',
  'very',
  'remarkable',
  'alice',
  'think',
  'very',
  'much',
  'way',
  'hear',
  'rabbit',
  'say',
  'dear']]

In [3]:
# Denormalize
trans_list = []
item_names = {}
item_ids = {}

id_counter = 0
for terms in term_list:
    transaction = []
    
    for term in terms:
        if term not in item_ids:
            item_ids[term] = id_counter
            item_names[id_counter] = term
            id_counter += 1
            
        transaction += [item_ids[term]]
    
    trans_list += [transaction]
    
len_ids, len_trans = len(item_ids), len(trans_list)
    
items = np.arange(0, len_ids)

# Information
print(f'len_ids={len_ids}, len_trans={len_trans}')

len_ids=2472, len_trans=1673


In [4]:
# Binarize
transactions = np.full((len_trans, len_ids), False, dtype=np.bool)

for idx, trans in enumerate(trans_list):
    for term in trans:
        transactions[idx][term] = True 
        
print(f'{transactions[:10].astype(int)}')

[[1 1 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 ...
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]]


In [5]:
# Make Weka csv
out_file = 'alice.csv'
with open('alice.csv', 'w', newline='') as file:
    csvwriter = csv.writer(file, delimiter=',', quoting=csv.QUOTE_ALL, quotechar="'", lineterminator='\n')
    csvwriter.writerow([item_names[i] for i in range(len_ids)])
    
    for idx in range(len_trans):
        csvwriter.writerow(list(map(lambda x: '' if not x else 'True', transactions[idx])))

# 2.) MNIST NeuralNetworkMLP

In [9]:
# Load MNIST
digits = load_digits()['images']
digits = digits.reshape((len(digits), -1))
labels = load_digits()['target']

# Make NN
hidden_layers = (64, 64)
NN = nn(hidden_layers)

# Cross-validate
kfold = KFold(n_splits=10)
scores = cross_val_score(NN, digits, labels, cv=kfold, scoring='accuracy', n_jobs=4)
scores_str = ''
for idx, score in enumerate(scores):
    this_score = f'{score:.3f}'
    
    scores_str += this_score
    
    if idx < len(scores) - 1:
        scores_str += ', '
    
print(f'MLPClassifier with 2 hidden layers had accuracies {scores_str} in\n10-fold cross-validation.')

MLPClassifier with 2 hidden layers had accuracies 0.900, 0.994, 0.906, 0.956, 0.972, 0.961, 0.978, 0.961, 0.939, 0.961 in
10-fold cross-validation.
