### Import libraries

In [5]:
# Numeric
import numpy as np
import pandas as pd


# Tools
import os, sys
import re
import string
import csv
import itertools


# Natural language processing
import spacy

from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords as sw
from nltk.stem.snowball import SnowballStemmer


# Preprocessing and feature engineering
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
from sklearn.feature_selection import chi2



# Clustering
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering



# Classifiers
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import make_pipeline



# Metrics
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score


# Visualization
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline



### Some utility functions

In [23]:
def print_f1score(scores):
    """Print the per-iteration weighted F1 scores followed by summary statistics.

    Parameters
    ----------
    scores : object exposing ``.mean()`` and ``.std()``
        Weighted F1 score of each cross-validation iteration
        (e.g. the array returned by ``cross_val_score``).

    Notes
    -----
    The "+/-" figure printed on the second line is twice the standard
    deviation of ``scores``, not the standard deviation itself.
    """
    print(f"Weighted-f1 for each iteration: {scores}")
    average, spread = scores.mean(), 2 * scores.std()
    print(f"Weighted-f1 (statistics): {average:.3f} (+/- {spread:.3f})")

In [22]:
def write_to_file(filename, labels):
    """Write predicted labels to a two-column CSV file.

    The file starts with the header row ``Id,Predicted``; every following row
    holds the zero-based position of a label in ``labels`` and the label itself.

    Parameters
    ----------
    filename : str or path-like
        Destination path. An existing file is overwritten.
    labels : iterable
        Predicted labels, in the order in which they should be numbered.
    """
    # newline='' is what the csv module requires for files handed to a writer:
    # the writer emits its own line terminators, and without it platforms that
    # translate '\n' on write (e.g. Windows) produce '\r\r\n', i.e. a blank
    # line after every row.
    with open(filename, mode='w', encoding='UTF-8', newline='') as f:
        fwriter = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        fwriter.writerow(["Id", "Predicted"])
        # enumerate yields (Id, label) pairs, one CSV row each.
        fwriter.writerows(enumerate(labels))

### Read data

In [19]:
# Folder containing development.csv and evaluation.csv.
# Set the DATASET_DIR environment variable to run the notebook on another
# machine; when it is unset, the original author's local path is used.
directory = os.environ.get(
    "DATASET_DIR",
    "/Users/lpdef/Desktop/Polito/Data Science lab process and methods/Final Exam/dataset_winter_2020/",
)
# os.path.join works whether or not `directory` ends with a path separator.
dataset = pd.read_csv(os.path.join(directory, "development.csv"))
testset = pd.read_csv(os.path.join(directory, "evaluation.csv"))

# Number of rows in the development set, then a preview of its first rows.
print(len(dataset))
dataset.head()

28754


Unnamed: 0,text,class
0,Non è l'hotel più lussuoso in cui abbia mai so...,pos
1,Siamo stati qui per 1 notte prima della nostra...,pos
2,Hotel è ben posizionato per visitare Torino. A...,pos
3,All'arrivo la cordialità e disponibilità dello...,pos
4,Abbiamo soggiornato per due notti alla fine de...,pos


In [20]:
# Positional split of the development frame: column 0 -> model inputs,
# column 1 -> targets (the preview above shows these as `text` and `class`).
X_train, y_train = dataset.iloc[:, 0], dataset.iloc[:, 1]

### Feature engineering

In [8]:
# Load the spaCy pipeline `it_core_news_sm` with the listed components
# disabled; the LemmaTokenizer defined below only reads each token's `lemma_`.
nlp = spacy.load("it_core_news_sm", disable=["ner", "parser", "tagger", "textcat"])
# Unused alternative stop-word source (NLTK's Italian list is used instead):
# spacy_stopWords = spacy.lang.it.stop_words.STOP_WORDS

In [10]:
class LemmaTokenizer(object):
    """Callable tokenizer turning a document into cleaned, stemmed lemmas.

    An instance is passed as ``tokenizer=`` to ``TfidfVectorizer`` in this
    notebook. For every token produced by the lemmatizer, the lemma is
    stripped of punctuation, filtered, and stemmed.

    Parameters
    ----------
    lemmatizer : callable, optional
        Maps a string to an iterable of tokens exposing a ``lemma_`` string.
        Defaults to the module-level spaCy pipeline ``nlp``.
    stemmer : object, optional
        Object with a ``stem(str) -> str`` method.
        Defaults to ``SnowballStemmer("italian")``.
    """

    # Built once for the class instead of on every call / for every token.
    _LEADING_DIGIT = re.compile("[0-9]")
    _STRIP_PUNCTUATION = str.maketrans('', '', string.punctuation)

    def __init__(self, lemmatizer=None, stemmer=None):
        self.lemmatizer = nlp if lemmatizer is None else lemmatizer
        # Attribute name kept for backward compatibility; by default it holds
        # a Snowball stemmer, not a Porter stemmer.
        self.Porter_Stemmer = SnowballStemmer("italian") if stemmer is None else stemmer

    def __call__(self, document):
        """Return the list of stemmed tokens extracted from ``document``.

        A lemma is kept only if it does not start with an ASCII digit and,
        after trimming whitespace and removing ASCII punctuation, is purely
        alphabetic with a length between 2 and 19 characters.
        """
        lemmas = []
        # Apostrophes become spaces so that text on either side of an
        # apostrophe (e.g. "l'hotel") is handed to the lemmatizer separately.
        document = document.replace("'", " ")

        for token in self.lemmatizer(document):
            # Skip lemmas beginning with a digit.
            if self._LEADING_DIGIT.match(token.lemma_):
                continue

            word = token.lemma_.strip().translate(self._STRIP_PUNCTUATION)

            # Single characters and very long strings are discarded.
            if word.isalpha() and 1 < len(word) < 20:
                lemmas.append(self.Porter_Stemmer.stem(word))

        return lemmas
    

# Base list: NLTK's Italian stop words, minus 'ma' and 'non', which are
# therefore kept as features.
stopWords = sw.words('italian')
stopWords.remove('ma')
stopWords.remove('non')

# Hand-picked additions. The stop list is compared against the tokens emitted
# by LemmaTokenizer, so entries are written in stemmed / accent-free form.
# Group labels below are the reviewer's reading of the entries — TODO confirm.
stopWords += [
    # stems of common function words and auxiliary verbs
    'avra', 'avro', 'sar', 'aver', 'com', 'contr', 'dar', 'esser', 'far', 'fec', 'foss',
    'lor', 'nostr', 'perc', 'qual', 'quant', 'quell', 'quest', 'star', 'stemm',
    'stess', 'stetter', 'tutt', 'vostr',

    # miscellaneous short words
    'dio', 'neo', 'piu',

    # weekday stems
    'luned', 'marted', 'mercoled', 'gioved', 'venerd', 'sabat', 'domenic',

    # lemma-like variants of stop words
    'aglo', 'avere', 'avutare', 'avutere', 'dallare', 'ebbo', 'essere',
    'facciata', 'facere', 'fara', 'faro', 'farsene', 'fossa', 'fosso', 'mieo', 'perca',
    'sara', 'saro', 'stara', 'stare', 'stessa', 'staro', 'stesso', 'suoo', 'tuoo', 'aver',
    'avut', 'fac', 'facc', 'farse', 'nostr',

    'fars', 'farsen',

    # first-name stems
    'lorenz', 'anna', 'marc', 'francesc',

    # domain words
    'hotel', 'camer',
]

# TF-IDF over 1- to 3-grams of the tokens produced by LemmaTokenizer.
lemmaTokenizer = LemmaTokenizer()
vectorizer = TfidfVectorizer(
    tokenizer=lemmaTokenizer,
    stop_words=stopWords,
    strip_accents='unicode',
    lowercase=True,
    ngram_range=(1, 3),
    min_df=0.0005,       # float -> minimum share of documents a term must appear in
    sublinear_tf=True,   # use 1 + log(tf) instead of raw term frequency
)


In [11]:
# Fit the TF-IDF vectorizer on the development texts.
# X_train must still be the raw text column at this point: a later cell
# rebinds the name to the TF-IDF matrix, so cells have to be run in order.
vectorizer.fit(X_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=0.0005, ngram_range=(1, 3), norm='l2', preprocessor=None,
                smooth_idf=True,
                stop_words=['ad', 'al', 'allo', 'ai', 'agli', 'all', 'agl',
                            'alla', 'alle', 'con', 'col', 'coi', 'da', 'dal',
                            'dallo', 'dai', 'dagli', 'dall', 'dagl', 'dalla',
                            'dalle', 'di', 'del', 'dello', 'dei', 'degli',
                            'dell', 'degl', 'della', 'delle', ...],
                strip_accents='unicode', sublinear_tf=True,
                token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<__main__.LemmaTokenizer object at 0x1a271f5bd0>,
                use_idf=True, vocabulary=None)

In [21]:
# Vectorize from the untouched source column rather than from X_train itself:
# `X_train = vectorizer.transform(X_train)` is not idempotent, so re-running
# the cell would feed the TF-IDF matrix back into the vectorizer and fail.
X_train = vectorizer.transform(dataset.iloc[:, 0])

# Number of features. `vocabulary_` has one entry per feature and, unlike
# get_feature_names() (removed in scikit-learn 1.2), exists in every version.
print(len(vectorizer.vocabulary_))

20081


### Model selection and cross-validation

In [29]:
# Targets: second column of the development set.
y_train = dataset.iloc[:, 1]

# L2-penalised logistic regression solved with liblinear.
clf = LogisticRegression(
    penalty='l2',
    C=6,
    solver='liblinear',
    max_iter=1000,
    random_state=0,
    verbose=1,
)

# 5-fold cross-validation scored with weighted F1, using all available cores.
# NOTE(review): X_train comes from a vectorizer fitted on the whole development
# set, so every validation fold has already contributed to the vocabulary and
# idf weights; the scores may be mildly optimistic.
cv_scores = cross_val_score(
    clf,
    X_train,
    y_train,
    scoring='f1_weighted',
    cv=5,
    n_jobs=-1,
)

print_f1score(cv_scores)

Weighted-f1 for each iteration: [0.96758311 0.97102079 0.96126049 0.96186063 0.96682949]
Weighted-f1 (statistics): 0.966 (+/- 0.007)


### Training and prediction

In [27]:
# TF-IDF features of the evaluation texts, computed with the already fitted vectorizer.
X_test = vectorizer.transform(testset.iloc[:, 0])
# Legacy alias: the matrix holds features (an "X"), not labels, but other
# cells still read it under the name `Y_test`.
Y_test = X_test

In [28]:
# Fit the classifier on the full development set, then predict a label for
# every evaluation text.
clf.fit(X_train, y_train)
y_pred = clf.predict(Y_test)  # despite its name, Y_test is the evaluation feature matrix
# Displayed to show the number of evaluation rows (and columns).
testset.shape

[LibLinear]

(12323, 1)

### Additional insights

### Writing the results

In [31]:
# Save the predictions as Labels.csv (header: Id, Predicted), relative to the
# current working directory.
write_to_file("Labels.csv", y_pred)