In [2]:
# Numeric
import numpy as np
import pandas as pd


# Tools
import os, sys
import re
import string
import csv

from pattern.it import parse
from pattern.it import pprint

from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords as sw
from nltk.stem.snowball import SnowballStemmer

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler

# Classifiers
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC


# Metrics
from sklearn.metrics import f1_score



In [3]:
import spacy

nlp = spacy.load("it_core_news_sm")

In [4]:
directory = "/Users/lpdef/Desktop/Polito/Data Science lab process and methods/Final Exam/dataset_winter_2020/"
dataset = pd.read_csv(directory + "development.csv")
testset = pd.read_csv(directory + "evaluation.csv")

corpus = pd.concat([dataset.iloc[:,0],testset.iloc[:,0]])

dataset.head()
dataset.iloc[:,0].size

28754

In [5]:
X_train = dataset.iloc[:,0]
# doc = nlp(dataset.iloc[0,0])
# for token in doc:
#     print(token.text, token.lemma_, token.pos_, token.tag_)

In [6]:

class LemmaTokenizer(object):
    
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.Porter_Stemmer = SnowballStemmer("italian")
        
#     def __call__(self, document):
#         lemmas = []
#         tokens = nlp(document)
#         re_digit = re.compile("[0-9]")
        
#         for token in tokens:
#             if token.text not in string.punctuation and len(token.text) > 3 and len(token.text) < 16 and not re_digit.match(token.text):
#                 lemmas.append(token.lemma_)
            
            
#         return lemmas
        
    def __call__(self, document):
#         print(document)
        lemmas = []
        re_digit = re.compile("[0-9]")

        for t in word_tokenize(document):
            t = t.strip()
            t=t.translate(str.maketrans('', '', string.punctuation))
            lemma = self.lemmatizer.lemmatize(t)

            if lemma not in string.punctuation and len(lemma) > 3 and len(lemma) < 16 and not re_digit.match(lemma):
                lemma=self.Porter_Stemmer.stem(lemma)
                lemmas.append(lemma)
            
        return lemmas

    

stopWords = sw.words('italian')
lemmaTokenizer = LemmaTokenizer()
vectorizer = TfidfVectorizer(tokenizer=lemmaTokenizer, stop_words=stopWords, max_df=0.9, min_df=5, lowercase=True)

# X_train = vectorizer.fit_transform(X_train)


In [7]:
vectorizer.fit(corpus)

  'stop_words.' % sorted(inconsistent))


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.9, max_features=None,
                min_df=5, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True,
                stop_words=['ad', 'al', 'allo', 'ai', 'agli', 'all', 'agl',
                            'alla', 'alle', 'con', 'col', 'coi', 'da', 'dal',
                            'dallo', 'dai', 'dagli', 'dall', 'dagl', 'dalla',
                            'dalle', 'di', 'del', 'dello', 'dei', 'degli',
                            'dell', 'degl', 'della', 'delle', ...],
                strip_accents=None, sublinear_tf=False,
                token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<__main__.LemmaTokenizer object at 0x1a29edd590>,
                use_idf=True, vocabulary=None)

In [8]:
X_train = vectorizer.transform(X_train)

In [9]:
Y_test = vectorizer.transform(testset.iloc[:,0])

In [10]:
len(vectorizer.get_feature_names())


10409

In [11]:
y_train = dataset.iloc[:, 1]

# clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf = LogisticRegression(verbose=1, solver='liblinear',random_state=0, C=5, penalty='l2',max_iter=1000)
# clf = SVC()

# pipeline = make_pipeline(MaxAbsScaler(), clf)
# pipeline.fit(X_train, y_train)

f1_cv = cross_val_score(clf, X_train, y_train, cv=5, scoring='f1_weighted')

print(f"Macro-f1 for each iteration: {f1_cv}")
mean_macro_f1 = f1_cv.mean()
std_macro_f1 = f1_cv.std() * 2
print(f"Macro-f1 (statistics): {mean_macro_f1:.2f} (+/- {std_macro_f1:.2f})")

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]Macro-f1 for each iteration: [0.95960779 0.96351691 0.95626203 0.96098701 0.96008974]
Macro-f1 (statistics): 0.96 (+/- 0.00)


In [12]:
clf.fit(X_train, y_train)

y_pred = clf.predict(Y_test)


[LibLinear]

In [13]:
testset.shape

(12323, 1)

In [14]:
def dump_to_file(filename, labels):
    """Dump the evaluated labels to a CSV file."""
    with open(filename, mode='w', encoding='UTF-8') as f:
        fwriter = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        fwriter.writerow(["Id", "Predicted"])
        for Id, label in enumerate(labels):
            fwriter.writerow([Id, label])
            

dump_to_file("Labels.csv", y_pred)

In [15]:
print(testset.iloc[1000,0])
print()
print(y_pred[1000])

y_pred.shape[0] == testset.shape[0]


Il nostro primo soggiorno a Napoli e abbiamo scelto questo hotel dal British Airways" consigliato elenco - e siamo contenti così. Ottima vista sulla Baia verso Capri e Castell dell'Ovo con molti servizi locali - soprattutto ristoranti, ma anche ottimi collegamenti bus route, una vicina fermata del taxi e un traffico - gratis a piedi la città lungo il fronte mare o sulla collina lungo la Via Chiaia.
L'hotel stesso offre un'ottima colazione e camere eleganti, ma consigliamo pagare di più per la vista sul mare.

pos


True

In [16]:
'marco' in vectorizer.get_feature_names()

vectorizer.get_feature_names()

['aaron',
 'abat',
 'abatjour',
 'abbagl',
 'abbai',
 'abband',
 'abbandon',
 'abbass',
 'abbast',
 'abbatt',
 'abbaz',
 'abbell',
 'abbi',
 'abbiam',
 'abbigl',
 'abbin',
 'abbon',
 'abbond',
 'abbondant',
 'abbondantissim',
 'abbord',
 'abbracc',
 'abbuff',
 'abet',
 'abil',
 'abit',
 'abitu',
 'abitual',
 'abitudin',
 'abruzz',
 'abruzzes',
 'abus',
 'acac',
 'acant',
 'acarus',
 'acca',
 'accad',
 'accadem',
 'accadr',
 'accald',
 'accamp',
 'accant',
 'accaparr',
 'accapato',
 'accapatoi',
 'accappato',
 'accappatoi',
 'accarezz',
 'accatast',
 'accattiv',
 'acced',
 'acceler',
 'accend',
 'accenn',
 'accension',
 'accent',
 'accentu',
 'accert',
 'accertat',
 'acces',
 'access',
 'accessibil',
 'accessor',
 'accett',
 'acchit',
 'acciai',
 'accident',
 'accidental',
 'accing',
 'acciottol',
 'acciug',
 'accoccol',
 'accod',
 'accogl',
 'accoglient',
 'accoglit',
 'accolg',
 'accoll',
 'accolt',
 'accomod',
 'accompagn',
 'acconsent',
 'accont',
 'accontent',
 'accopp',
 'accor',
