## Riscriviamo le funzioni per il data cleaning e il bag of words (BoW) con TF-IDF

In [1]:
import string
import spacy
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# English stop-word list from NLTK (the 'stopwords' corpus must be available
# locally); data_cleaner drops every word found in it.
english_stopwords = stopwords.words('english')
# spaCy English pipeline; data_cleaner reads token.lemma_ from its output.
nlp = spacy.load('en_core_web_sm')
# Set of the characters in string.punctuation.
# NOTE(review): not referenced in any visible cell (data_cleaner uses
# string.punctuation directly) — confirm whether it is still needed.
punctuation = set(string.punctuation)

def data_cleaner(dataset):
    """Normalize raw text documents before vectorization.

    Each document is lower-cased, has every ``string.punctuation`` character
    replaced by a space, is lemmatized with the module-level spaCy pipeline
    ``nlp``, has the words listed in ``english_stopwords`` removed and
    finally has all digit characters stripped.

    Args:
        dataset: Iterable of strings, one per document.

    Returns:
        list: The cleaned documents as strings, in input order.
    """
    # A single translation table replaces the per-character str.replace() loop.
    punctuation_to_space = str.maketrans(dict.fromkeys(string.punctuation, " "))
    # Set membership is O(1); the stop-word list would otherwise be scanned
    # linearly for every word of every document.
    stopword_set = set(english_stopwords)

    dataset_to_return = []
    for sentence in dataset:
        sentence = sentence.lower().translate(punctuation_to_space)
        lemmatized = ' '.join(token.lemma_ for token in nlp(sentence))
        kept_words = [word for word in lemmatized.split() if word not in stopword_set]
        # Raw string: '\d' inside a plain literal is an invalid escape sequence.
        # Digits are stripped after stop-word filtering, so a digit-only word
        # leaves an empty slot (two consecutive spaces) in the result.
        dataset_to_return.append(re.sub(r'\d', '', ' '.join(kept_words)))

    return dataset_to_return


def bow_tfidf(dataset, tfidf_vectorizer=None):
    """Turn text documents into a dense TF-IDF bag-of-words matrix.

    Args:
        dataset: Iterable of text documents.
        tfidf_vectorizer: Vectorizer to reuse; only its ``transform`` method
            is called. When ``None`` (the default) a new ``TfidfVectorizer``
            is created and fitted on ``dataset``.

    Returns:
        tuple: ``(matrix, tfidf_vectorizer)`` where ``matrix`` is the result
        of ``toarray()`` on the vectorizer output and ``tfidf_vectorizer`` is
        the vectorizer that produced it (newly fitted or the one passed in).
    """
    # Identity check: `== None` would be routed through a custom __eq__.
    if tfidf_vectorizer is None:
        tfidf_vectorizer = TfidfVectorizer()
        X = tfidf_vectorizer.fit_transform(dataset)
    else:
        X = tfidf_vectorizer.transform(dataset)

    # NOTE(review): toarray() materializes every entry of the matrix; with a
    # large vocabulary this can need several GB of RAM. Callers in this
    # notebook index the result as a dense array, so the return type is kept.
    return X.toarray(), tfidf_vectorizer

## Importiamo i dataset di training e test

In [2]:
from sklearn.datasets import fetch_20newsgroups
# Load the predefined 'train' and 'test' splits of the 20 Newsgroups corpus.
training_dataset = fetch_20newsgroups(subset='train')
test_dataset = fetch_20newsgroups(subset='test')

# For each split keep the raw documents ('data') and their labels ('target').
training_data, training_target = training_dataset['data'], training_dataset['target']
test_data, test_target = test_dataset['data'], test_dataset['target']

In [3]:
set(training_dataset['target_names'])

{'alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc'}

In [4]:
set(test_dataset['target_names'])

{'alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc'}

In [5]:
len(training_data)

11314

In [6]:
len(test_data)

7532

## Preprocessing dei due dataset

In [7]:
# Clean the training documents, then fit a new TF-IDF vectorizer on them
# (passing None makes bow_tfidf create and fit one).
# Despite its name, training_data_cleaned holds the dense TF-IDF matrix
# returned by bow_tfidf, not the cleaned text.
training_data_cleaned, tfidf_vectorizer = bow_tfidf(data_cleaner(training_data), None)

In [8]:
# Clean the test documents and vectorize them with the vectorizer fitted on
# the training set: bow_tfidf only calls transform() here, it does not refit.
# test_data_cleaned is the resulting dense TF-IDF matrix, not cleaned text.
test_data_cleaned, tfidf_vectorizer = bow_tfidf(data_cleaner(test_data), tfidf_vectorizer)

In [9]:
training_data_cleaned

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [10]:
len(training_data_cleaned[0])

85861

## Training del modello MLP

In [11]:
from sklearn.neural_network import MLPClassifier

# One hidden layer of 100 units with logistic (sigmoid) activation, trained
# with the Adam solver for at most 100 iterations.
# random_state pins weight initialization and batch sampling so that
# Restart & Run All reproduces the same loss curve and test score.
clf = MLPClassifier(activation='logistic',
                    hidden_layer_sizes=(100,),
                    max_iter=100,
                    solver='adam',
                    tol=0.005,
                    random_state=42,
                    verbose=True)

clf.fit(training_data_cleaned, training_target)


Iteration 1, loss = 2.96068117
Iteration 2, loss = 2.84575317
Iteration 3, loss = 2.70571062
Iteration 4, loss = 2.50874556
Iteration 5, loss = 2.24233564
Iteration 6, loss = 1.92176401
Iteration 7, loss = 1.58873002
Iteration 8, loss = 1.28436972
Iteration 9, loss = 1.03004214
Iteration 10, loss = 0.82739668
Iteration 11, loss = 0.66974385
Iteration 12, loss = 0.54951430
Iteration 13, loss = 0.45605936
Iteration 14, loss = 0.38382021
Iteration 15, loss = 0.32750966
Iteration 16, loss = 0.28284630
Iteration 17, loss = 0.24756801
Iteration 18, loss = 0.21853918
Iteration 19, loss = 0.19496381
Iteration 20, loss = 0.17553189
Iteration 21, loss = 0.15933475
Iteration 22, loss = 0.14560792
Iteration 23, loss = 0.13416534
Iteration 24, loss = 0.12439207
Iteration 25, loss = 0.11591883
Iteration 26, loss = 0.10869069
Iteration 27, loss = 0.10241793
Iteration 28, loss = 0.09691251
Iteration 29, loss = 0.09214376
Iteration 30, loss = 0.08789092
Iteration 31, loss = 0.08414793
Iteration 32, los

MLPClassifier(activation='logistic', max_iter=100, tol=0.005, verbose=True)

## Testiamo il modello 

In [12]:
clf.score(test_data_cleaned, test_target)

0.856744556558683

In [13]:
# Classify one hand-written sentence: clean it, vectorize it with the
# already-fitted vectorizer, and keep the single predicted label.
target = clf.predict(
    bow_tfidf(
        data_cleaner(["This is a mac book pro!!!"]),
        tfidf_vectorizer,
    )[0]  # element 0 of the returned tuple is the feature matrix
)[0]  # one input document -> take its only prediction

In [14]:
target

4

In [15]:
test_dataset['target_names'][target]

'comp.sys.mac.hardware'

In [20]:
test_data[0]

'From: v064mb9k@ubvmsd.cc.buffalo.edu (NEIL B. GANDLER)\nSubject: Need info on 88-89 Bonneville\nOrganization: University at Buffalo\nLines: 10\nNews-Software: VAX/VMS VNEWS 1.41\nNntp-Posting-Host: ubvmsd.cc.buffalo.edu\n\n\n I am a little confused on all of the models of the 88-89 bonnevilles.\nI have heard of the LE SE LSE SSE SSEI. Could someone tell me the\ndifferences are far as features or performance. I am also curious to\nknow what the book value is for prefereably the 89 model. And how much\nless than book value can you usually get them for. In other words how\nmuch are they in demand this time of year. I have heard that the mid-spring\nearly summer is the best time to buy.\n\n\t\t\tNeil Gandler\n'

In [18]:
test_dataset['target_names'][clf.predict([test_data_cleaned[0]])[0]]

'sci.electronics'

In [23]:
test_dataset['target_names'][test_target[0]]

'rec.autos'

In [24]:
test_data[1]

'From: Rick Miller <rick@ee.uwm.edu>\nSubject: X-Face?\nOrganization: Just me.\nLines: 17\nDistribution: world\nNNTP-Posting-Host: 129.89.2.33\nSummary: Go ahead... swamp me.  <EEP!>\n\nI\'m not familiar at all with the format of these "X-Face:" thingies, but\nafter seeing them in some folks\' headers, I\'ve *got* to *see* them (and\nmaybe make one of my own)!\n\nI\'ve got "dpg-view" on my Linux box (which displays "uncompressed X-Faces")\nand I\'ve managed to compile [un]compface too... but now that I\'m *looking*\nfor them, I can\'t seem to find any X-Face:\'s in anyones news headers!  :-(\n\nCould you, would you, please send me your "X-Face:" header?\n\nI *know* I\'ll probably get a little swamped, but I can handle it.\n\n\t...I hope.\n\nRick Miller  <rick@ee.uwm.edu> | <ricxjo@discus.mil.wi.us>   Ricxjo Muelisto\nSend a postcard, get one back! | Enposxtigu bildkarton kaj vi ricevos alion!\n          RICK MILLER // 16203 WOODS // MUSKEGO, WIS. 53150 // USA\n'

In [27]:
test_data[100]

'Subject: help\nFrom: C..Doelle@p26.f3333.n106.z1.fidonet.org (C. Doelle)\nLines: 13\n\nHello All!\n\n    It is my understanding that all True-Type fonts in Windows are loaded in\nprior to starting Windows - this makes getting into Windows quite slow if you\nhave hundreds of them as I do.  First off, am I correct in this thinking -\nsecondly, if that is the case - can you get Windows to ignore them on boot and\nmaybe make something like a PIF file to load them only when you enter the\napplications that need fonts?  Any ideas?\n\n\nChris\n\n * Origin: chris.doelle.@f3333.n106.z1.fidonet.org (1:106/3333.26)\n'

In [28]:
test_dataset['target_names'][clf.predict([test_data_cleaned[100]])[0]]

'comp.os.ms-windows.misc'

In [29]:
test_dataset['target_names'][test_target[100]]

'comp.os.ms-windows.misc'