# Configuration parameters

In [1]:
import os

configuration = 1    # possible values are: 1, 2, 3

###
# configuration=1 --> P1: tokenization, stop word removal
# configuration=2 --> P2: tokenization, stop word removal, lemmatization
# configuration=3 --> P3: tokenization, stop word removal, lemmatization, keyphrase extraction
###

folder="configuration_"+str(configuration)

if configuration == 1:
    lemmatization=False
    bigram=False
elif configuration == 2:
    lemmatization=True
    bigram=False
elif configuration == 3:
    lemmatization=True
    bigram=True

if not os.path.exists(folder):
    os.makedirs(folder)
if not os.path.exists(folder+"/models"):
    os.makedirs(folder+"/models")
if not os.path.exists(folder+"/simple"):
    os.makedirs(folder+"/simple")
if not os.path.exists(folder+"/tfidf"):
    os.makedirs(folder+"/tfidf")
if not os.path.exists(folder+"/simple/csv"):
    os.makedirs(folder+"/simple/csv")
if not os.path.exists(folder+"/tfidf/csv"):
    os.makedirs(folder+"/tfidf/csv")

# 1. Database connection

In [2]:
from psycopg2 import sql, connect

In [3]:
#connection_string="dbname='crime_news' user='crime_news_ro' host='localhost' port=5532 password='**********'"
conn = connect(
                dbname='crime_news', 
                host='localhost', 
                port=5532,
                user='crime_news_ro', 
                password='**********'
            )

print("psycopg2 connection:", conn)

psycopg2 connection: <connection object at 0x000001C7A462BD00; dsn: 'user=crime_news_ro password=xxx dbname=crime_news host=localhost port=5532', closed: 0>


In [4]:
import pandas as pd
import numpy as np
import ast
import gensim, nltk, re
import spacy
from gensim.models.doc2vec import TaggedDocument
from gensim.models import KeyedVectors
import logging
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from mpl_toolkits import mplot3d
import matplotlib.pyplot as plt
from multi_rake import Rake
import warnings
warnings.filterwarnings(action='once')

# 2. Fetch data from table 'news'

We use psycopg2 to fetch all the records of table 'news' and create a pandas dataframe.

In [5]:
def get_columns_names(conn, table):

    # declare an empty list for the column names
    columns = []

    # declare cursor objects from the connection    
    col_cursor = conn.cursor()

    # concatenate string for query to get column names
    # SELECT column_name FROM INFORMATION_SCHEMA.COLUMNS WHERE table_name = 'some_table';
    sql_str = "SELECT column_name FROM INFORMATION_SCHEMA.COLUMNS WHERE "
    sql_str += "table_name = '{}';".format( table )

    try:
        sql_object = sql.SQL(
            # pass SQL statement to sql.SQL() method
            sql_str
        ).format(
            # pass the identifier to the Identifier() method
            sql.Identifier( table )
        )

        # execute the SQL string to get list with col names in a tuple
        col_cursor.execute( sql_object )

        # get the tuple element from the liast
        col_names = ( col_cursor.fetchall() )

        # iterate list of tuples and grab first element
        for tup in col_names:
            print(tup)
            # append the col name string to the list
            columns += [ tup[0] ]

        # close the cursor object to prevent memory leaks
        col_cursor.close()

    except Exception as err:
        print ("get_columns_names ERROR:", err)

    # return the list of column names
    return columns

In [6]:
def get_data(conn, table):
    print('start')
    # declare cursor objects from the connection    
    cursor = conn.cursor()

    # concatenate string for query to get column names
    # SELECT column_name FROM INFORMATION_SCHEMA.COLUMNS WHERE table_name = 'some_table';
    sql_str = "SELECT * FROM crime_news.{} ;".format( table )

    try:
        sql_object = sql.SQL(
            # pass SQL statement to sql.SQL() method
            sql_str
        ).format(
            # pass the identifier to the Identifier() method
            sql.Identifier(table)
        )

        # execute the SQL string to get list with col names in a tuple
        cursor.execute( sql_object )

        # get the tuple element from the liast
        records = cursor.fetchall()
        print('end')

    except Exception as err:
        print ("get_columns_names ERROR:", err)

    # return the list of column names
    return np.array(records)

In [7]:
def get_dataframe_from_table(conn, table):
    # get columns of table
    columns = get_columns_names(conn, table)
        
    # get data of table
    data = get_data(conn, table)
    
    # get the frame
    frame = pd.DataFrame(data=data, columns=columns)
    
    return frame

In [12]:
data = get_dataframe_from_table(conn, 'news')
data

('url',)
('title',)
('description',)
('text',)
('municipality',)
('area',)
('address',)
('date',)
('time',)
('geom',)
('object',)
('newspaper',)
('tag',)
('is_general',)
('date_event',)
('new_tag',)
start
end


Unnamed: 0,url,title,description,text,municipality,area,address,date,time,geom,object,newspaper,tag,is_general,date_event,new_tag
0,https://gazzettadimodena.gelocal.it/modena/cro...,Sassuolo. Banda del tombino in azione vetrine ...,Ancora furti e vandalismi in città. Nelle ore ...,SASSUOLO. Ancora furti e vandalismi in città. ...,sassuolo,,via radici,2021-06-25,01:00:00,0101000020E61000001F97827F5B91254022BAB1EABD45...,vandalismi,Gazzetta di Modena,furto,0,,
1,https://gazzettadimodena.gelocal.it/modena/cro...,Catturano un ladro: «Sono un podista» Agenti a...,I quattro ammanettarono l’uomo nelle vicinanze...,Serena Arbizzi Sta entrando nel vivo il proces...,modena,accademia militare,,2021-06-24,01:00:00,,«sono,Gazzetta di Modena,furto,0,,
2,https://gazzettadimodena.gelocal.it/modena/cro...,L’encomio di Mattarella per gli agenti .,,"Un arresto per evasione, un altro per furto fu...",modena,accademia militare,,2021-06-24,01:00:00,,,Gazzetta di Modena,furto,0,,
3,https://gazzettadimodena.gelocal.it/modena/cro...,Rapinarono un portavalori al Grandemilia di Mo...,Due in manette: uno è stato preso a Napoli a b...,"Serena Arbizzi Era a Napoli, sulla nave. Stava...",modena,accademia militare,,2021-09-01,01:00:00,,,Gazzetta di Modena,furto,0,,
4,https://gazzettadimodena.gelocal.it/modena/cro...,Emilia Romagna. A scuola i figli dei medici? «...,La Regione ha convenuto di inviare una richie...,Paola Ducci Per ora è certo che da oggi in cla...,modena,accademia militare,,2021-03-08,01:00:00,,,Gazzetta di Modena,furto,0,2021-03-08,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17527,http://www.modenatoday.it/video/trovata-arma-a...,"Agguato al primario di Cardiologia, ritrovata ...",Indagini a tutto tondo di Carabinieri e Procur...,Indagini a tutto tondo di Carabinieri e Procur...,marano,,,2016-11-14,12:38:00,0101000020E61000005820D50B997E2640EA31DB5F8732...,soda,ModenaToday,aggressione,0,2016-11-11,
17528,http://gazzettadimodena.gelocal.it/modena/cron...,"Sassuolo, anziana va dal medico e i ladri le s...",Hanno smurato la porta d’ingresso dell’abitazi...,SASSUOLO. Continuano con incredibile frequenza...,sassuolo,,via pergolesi rometta rometta,2017-01-31,01:00:00,0101000020E610000033593739D7932540E7E6768AB043...,svaligiano,Gazzetta di Modena,furto,0,2017-01-31,furto
17529,https://gazzettadimodena.gelocal.it/modena/cro...,"Boom di furti in casa a Modena, comitato in pi...",Domani mattina il presidio dei residenti del V...,"MODENA. Mentre parla, indica le inferriate al ...",modena,,,2020-01-01,01:00:00,0101000020E61000008B451D67E4D92540662E7079AC52...,casa,Gazzetta di Modena,furto,0,2020-01-01,furto
17530,http://www.modenatoday.it/cronaca/sequestri-pr...,"Finanza al mercato di Maranello, sequestrati 3...",Ambulante straniero vendeva merce priva dei re...,La Guardia di finanza prosegue nella lotta ai...,Maranello,,,2014-09-18,15:02:00,0101000020E61000004F1432A193BB25404E6B894B4443...,prodotti,ModenaToday,sequestro,0,2014-09-18,


In [20]:
data.shape

(17504, 16)

In [21]:
data['newspaper'].value_counts()

Gazzetta di Modena    11604
ModenaToday            5900
Name: newspaper, dtype: int64

# 3. Preprocessing

The preprocessing step includes lemmatization, stop words removal and tokenization

In [10]:
# Run this to download the file "italian.pickle"
# nltk.download()

In [11]:
tokenizer = nltk.data.load('tokenizers/punkt/italian.pickle')

### a. Without lemmatization

In [12]:
def review_to_wordlist(review, remove_stopwords=False):
    """
    Convert a review to a list of words. Removal of stop words is optional.
    """
    # remove non-letters
    review_text = (re.sub("[^a-zA-Z]"," ", review)).lower()
    
    # convert to lower case and split at whitespace
    words = review_text.split()
    
    # remove stop words (false by default)
    if remove_stopwords:
        stops = set(nltk.corpus.stopwords.words('italian'))
        words = [w for w in words if not w in stops]

    return words

In [13]:
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    """
    Split review into list of sentences where each sentence is a list of words.
    Removal of stop words is optional.
    """
    # use the NLTK tokenizer to split the paragraph into sentences
    raw_sentences = tokenizer.tokenize(review.strip())
    
    # each sentence is furthermore split into words
    sentences = []    
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            sentences += review_to_wordlist(raw_sentence, remove_stopwords)
            
    return sentences

In [14]:
data = data[['url', 'title', 'text', 'newspaper', 'date', 'time', 'tag']].copy()

In [15]:
%%time

#if lemmatization==False and bigram==False:
if configuration == 1:
    data.insert(3, 'preprocessed', data['text'].apply(lambda article: review_to_sentences(article, tokenizer, remove_stopwords=True)) )

Wall time: 0 ns


### b. With Lemmatization

In [16]:
class DocPreprocess(object):

    def __init__(self, nlp, stop_words, docs, #labels, 
                     build_bi=False, min_count=3, threshold=5, 
                     allowed_postags=['ADV', 'VERB', 'ADJ', 'NOUN', 'PROPN', 'NUM']):

        self.nlp = nlp
        self.stop_words = stop_words
        self.docs = docs
        # self.labels = labels
        self.doc_ids = np.arange(len(docs))
        self.simple_doc_tokens = [gensim.utils.simple_preprocess(doc, deacc=True) for doc in self.docs]

        if build_bi:
            self.bi_detector = self.build_bi_detect(self.simple_doc_tokens, min_count=min_count, threshold=threshold)
            self.new_docs = self.make_bigram_doc(self.bi_detector, self.simple_doc_tokens)
            
            rake = Rake(min_chars=3,
                    max_words=3,
                    min_freq=2,
                    language_code='it',
                    stopwords=stop_words)
                    #punctuations=',;.:-_()\'!?"')
            self.keyphrase = self.docs.apply(lambda text: rake.apply(text, text_for_stopwords=None))
            self.new_keyphrases = [self.make_keyphrase_doc(doc) for doc in range(len(self.new_docs))]
        else:
            self.new_docs = self.make_simple_doc(self.simple_doc_tokens)
        self.doc_words = [self.lemmatize(doc, allowed_postags=allowed_postags) for doc in self.new_docs]
        self.tagdocs = [TaggedDocument(words=words, tags=[tag]) for words, tag in zip(self.doc_words, self.doc_ids)]
        
        if build_bi:
            [self.doc_words[doc].extend(self.new_keyphrases[doc]) for doc in range(len(self.new_docs))]
        
    def build_bi_detect(self, simple_doc_tokens, min_count, threshold):
        # nostopword_doc_tokens = [token for token in simple_doc_tokens if (token not in self.stop_words)]
        nostopword_doc_tokens = []
        for array in simple_doc_tokens:
            new_array=[]
            for element in array:
                if element not in self.stop_words:
                    new_array.append(element)
            nostopword_doc_tokens.append(new_array)
        
        bi_ = gensim.models.phrases.Phrases(nostopword_doc_tokens, min_count=min_count, threshold=threshold, scoring='default')
        bi_detector = gensim.models.phrases.Phraser(bi_)  # wrapper enhance efficiency
        return bi_detector

    def make_keyphrase_doc(self, doc):
        keyphrases = []
        keys = ""
        for k in self.keyphrase[doc]:
            keys = keys + " " + k[0].replace(' ', '_')  # concatenate back to a sentence
        keyphrases.append(keys)
        return keyphrases
    
    def make_bigram_doc(self, bi_detector, simple_doc_tokens):
        #nostopword_doc_tokens = [token for token in simple_doc_tokens if (token not in self.stop_words)]
        bi_doc_tokens = [bi_detector[doc_tokens] for doc_tokens in simple_doc_tokens]
        bi_docs = []
        for bi_tokens in bi_doc_tokens:
            bi_doc = " ".join(bi_tokens)  # concatenate back to a sentence
            bi_docs.append(bi_doc)
        return bi_docs

    def make_simple_doc(self, simple_doc_tokens):
        simple_docs = []
        for doc_tokens in simple_doc_tokens:
            simple = " ".join(doc_tokens)  # concatenate back to a sentence
            simple_docs.append(simple)
        return simple_docs

    def lemmatize(self, doc, allowed_postags):
        doc = self.nlp(doc)
        tokens = [token.lemma_ for token in doc if (token.text not in self.stop_words)]
                  #(token.pos_ in allowed_postags) and (token.text not in self.stop_words)]
        return tokens

In [17]:
# follow the instructions at https://spacy.io/usage for downloading

nlp = spacy.load('it_core_news_sm')
stop_words = nltk.corpus.stopwords.words('italian')

In [18]:
%%time

if configuration == 2:
    all_docs = DocPreprocess(nlp, stop_words, data['text'])
    data.insert(3, "preprocessed", all_docs.doc_words, True)

Wall time: 8min 29s


### c. Keyphrase extraction

In [19]:
%%time

if configuration == 3:
    all_docs = DocPreprocess(nlp, stop_words, data['text'], True)
    data.insert(3, "preprocessed", all_docs.doc_words, True)

Wall time: 0 ns


In [20]:
data.preprocessed

0        [sassuolo, ancorare, furto, vandalismo, citta,...
1        [sereno, arbizzi, entrare, vivere, processare,...
2        [arrestare, evasione, altro, furto, fuori, ora...
3        [sereno, arbizzi, napoli, nave, partire, volto...
4        [paola, ducci, orare, certo, oggi, classe, pre...
                               ...                        
17499    [indagine, tondo, carabiniere, procurare, chiu...
17500    [sassuolo, continuare, incredibile, frequenza,...
17501    [modena, mentre, parlare, indire, inferriata, ...
17502    [guardia, finanza, proseguire, lottare, produr...
17503    [modena, stare, sorteggiare, lettera, alfabeto...
Name: preprocessed, Length: 17504, dtype: object

Delete those articles whose word list is empty

In [21]:
indexes = []
for index, sent in enumerate(data['preprocessed']):
    if len(sent) == 0:
        indexes.append(index)

In [22]:
data = data.drop(indexes, axis=0)
data.shape

(17455, 8)

In [23]:
data = data.reset_index(drop=True)

In [24]:
data

Unnamed: 0,url,title,text,preprocessed,newspaper,date,time,tag
0,https://gazzettadimodena.gelocal.it/modena/cro...,Sassuolo. Banda del tombino in azione vetrine ...,SASSUOLO. Ancora furti e vandalismi in città. ...,"[sassuolo, ancorare, furto, vandalismo, citta,...",Gazzetta di Modena,2021-06-25,01:00:00,furto
1,https://gazzettadimodena.gelocal.it/modena/cro...,Catturano un ladro: «Sono un podista» Agenti a...,Serena Arbizzi Sta entrando nel vivo il proces...,"[sereno, arbizzi, entrare, vivere, processare,...",Gazzetta di Modena,2021-06-24,01:00:00,furto
2,https://gazzettadimodena.gelocal.it/modena/cro...,L’encomio di Mattarella per gli agenti .,"Un arresto per evasione, un altro per furto fu...","[arrestare, evasione, altro, furto, fuori, ora...",Gazzetta di Modena,2021-06-24,01:00:00,furto
3,https://gazzettadimodena.gelocal.it/modena/cro...,Rapinarono un portavalori al Grandemilia di Mo...,"Serena Arbizzi Era a Napoli, sulla nave. Stava...","[sereno, arbizzi, napoli, nave, partire, volto...",Gazzetta di Modena,2021-09-01,01:00:00,furto
4,https://gazzettadimodena.gelocal.it/modena/cro...,Emilia Romagna. A scuola i figli dei medici? «...,Paola Ducci Per ora è certo che da oggi in cla...,"[paola, ducci, orare, certo, oggi, classe, pre...",Gazzetta di Modena,2021-03-08,01:00:00,furto
...,...,...,...,...,...,...,...,...
17450,http://www.modenatoday.it/video/trovata-arma-a...,"Agguato al primario di Cardiologia, ritrovata ...",Indagini a tutto tondo di Carabinieri e Procur...,"[indagine, tondo, carabiniere, procurare, chiu...",ModenaToday,2016-11-14,12:38:00,aggressione
17451,http://gazzettadimodena.gelocal.it/modena/cron...,"Sassuolo, anziana va dal medico e i ladri le s...",SASSUOLO. Continuano con incredibile frequenza...,"[sassuolo, continuare, incredibile, frequenza,...",Gazzetta di Modena,2017-01-31,01:00:00,furto
17452,https://gazzettadimodena.gelocal.it/modena/cro...,"Boom di furti in casa a Modena, comitato in pi...","MODENA. Mentre parla, indica le inferriate al ...","[modena, mentre, parlare, indire, inferriata, ...",Gazzetta di Modena,2020-01-01,01:00:00,furto
17453,http://www.modenatoday.it/cronaca/sequestri-pr...,"Finanza al mercato di Maranello, sequestrati 3...",La Guardia di finanza prosegue nella lotta ai...,"[guardia, finanza, proseguire, lottare, produr...",ModenaToday,2014-09-18,15:02:00,sequestro


In [25]:
data.to_csv(folder+"/dataset.csv", index=False)

# 4. Get a Word2Vec Model

### a. Use a pre-trained model

In [26]:
%%time

# Download model from https://mlunicampania.gitlab.io/italian-word2vec/

pretrained_word_vectors = KeyedVectors.load("W2V.kv", mmap='r+')
#pretrained_vocabs = pretrained_word_vectors.index_to_key
pretrained_vectors = pretrained_word_vectors.vectors

Wall time: 1.73 s


### b. Train a model from scratch

In [27]:
%%time

from gensim.models import Word2Vec

def train_model_from_scratch():
    w2v_model = Word2Vec(sg=1,
                     min_count=20,
                     window=10,
                     size=300,
                     sample=1e-3, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=7)
    w2v_model.build_vocab(data['preprocessed'], progress_per=10000)
    w2v_model.train(data['preprocessed'], total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)
    model_name = folder+"/models/from_scratch.kv"
    word_vectors = w2v_model.wv
    word_vectors.save(model_name)

train_model_from_scratch()
new_word_vectors = KeyedVectors.load(folder+"/models/from_scratch.kv", mmap='r+')
#new_vocabs = new_word_vectors.index_to_key
new_vectors = new_word_vectors.vectors

Wall time: 21min 3s


### c. Retrain a pre-trained Model

In [28]:
%%time

from gensim.models import Word2Vec, KeyedVectors

def retrain_pretrained_model():
    sentences = data['preprocessed'].values.tolist()
    w2v_model = Word2Vec(sg=1,
                    min_count=20,
                    window=10,
                    size=300,
                    sample=1e-3,
                    negative=20,
                    workers=7)
    w2v_model.build_vocab(sentences)
    w2v_model.intersect_word2vec_format("W2V.bin", binary=True, lockf=1.0)
    w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=w2v_model.epochs, report_delay=1)
    model_name = folder+"/models/retrained.kv"
    word_vectors = w2v_model.wv
    word_vectors.save(model_name)

retrain_pretrained_model()
retrained_word_vectors = KeyedVectors.load(folder+"/models/retrained.kv", mmap='r+')
#retrained_vocabs = retrained_word_vectors.index_to_key
retrained_vectors = retrained_word_vectors.vectors

  weights = fromstring(fin.read(binary_len), dtype=REAL)


Wall time: 3min 44s


# 5. Mean Word Embedding

### a. Simple Mean

In [29]:
class MeanEmbeddingVectorizer(object):

    def __init__(self, word_model):
        self.word_model = word_model
        self.vector_size = word_model.vector_size

    def fit(self):  # comply with scikit-learn transformer requirement
        return self

    def transform(self, docs):  # comply with scikit-learn transformer requirement
        doc_word_vector = self.word_average_list(docs)
        return doc_word_vector

    def word_average(self, sent):
        """
        Compute average word vector for a single doc/sentence.
        :param sent: list of sentence tokens
        :return: 
            mean: float of averaging word vectors
        """
        mean = []
        for word in sent:
            if word in self.word_model.vocab:
                mean.append(self.word_model.get_vector(word))
        
        
        if not mean:  # empty words
            # If a text is empty, return a vector of zeros.
            logging.warning("cannot compute average owing to no vector for {}".format(sent))
            return np.zeros(self.vector_size)
        else:
            mean = np.array(mean).mean(axis=0)
            return mean


    def word_average_list(self, docs):
        """
        Compute average word vector for multiple docs, where docs had been tokenized.
        :param docs: list of sentence in list of separated tokens
        :return:
            array of average word vector in shape (len(docs),)
        """
        return np.vstack([self.word_average(sent) for sent in docs])

### b. Mean weighted by TF-IDF score

In [30]:
class TfidfEmbeddingVectorizer(object):

    def __init__(self, word_model):

        self.word_model = word_model
        self.word_idf_weight = None
        self.vector_size = word_model.vector_size

    def fit(self, docs):  # comply with scikit-learn transformer requirement
        """
        Fit in a list of docs, which had been preprocessed and tokenized,
        such as word bi-grammed, stop-words removed, lemmatized, part of speech filtered.
        Then build up a tfidf model to compute each word's idf as its weight.
        Noted that tf weight is already involved when constructing average word vectors, and thus omitted.
        :param
        pre_processed_docs: list of docs, which are tokenized
        :return:
        self
        """

        text_docs = []
        for doc in docs:
            text_docs.append(" ".join(doc))

        tfidf = TfidfVectorizer()
        tfidf.fit(text_docs)  # must be list of text string

        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of
        # known idf's
        max_idf = max(tfidf.idf_)  # used as default value for defaultdict
        self.word_idf_weight = defaultdict(lambda: max_idf, [(word, tfidf.idf_[i]) for word, i in tfidf.vocabulary_.items()])
        return self


    def transform(self, docs):  # comply with scikit-learn transformer requirement
        doc_word_vector = self.word_average_list(docs)
        return doc_word_vector


    def word_average(self, sent):
        """
        Compute average word vector for a single doc/sentence.
        :param sent: list of sentence tokens
        :return:
            mean: float of averaging word vectors
        """

        mean = []
        for word in sent:
            if word in self.word_model.vocab:
                mean.append(self.word_model.get_vector(word) * self.word_idf_weight[word])  # idf weighted

        if len(mean) == 0:  # empty words
            # If a text is empty, return a vector of zeros.
            logging.warning("cannot compute average owing to no vector for {}".format(sent))
            return np.zeros(self.vector_size)
        else:
            mean = np.array(mean).mean(axis=0)
            return mean


    def word_average_list(self, docs):
        """
        Compute average word vector for multiple docs, where docs had been tokenized.
        :param docs: list of sentence in list of separated tokens
        :return:
            array of average word vector in shape (len(docs),)
        """
        return np.vstack([self.word_average(sent) for sent in docs])

### Compute word embeddings and store them on csv files

In [31]:
def vectors_to_csv(word_vectors, mean, lemmatization, bigram, filename):
    if mean=='simple':
        vectorizer = MeanEmbeddingVectorizer(word_vectors)
        vectorizer.fit()
        features = vectorizer.transform(data['preprocessed'])
    elif mean=='tfidf':
        vectorizer = TfidfEmbeddingVectorizer(word_vectors)
        vectorizer.fit(data['preprocessed'])
        features = vectorizer.transform(data['preprocessed'])
    
    vectors = pd.DataFrame(data=features)
    vectors.insert(0, 'url', data['url'].values)
    vectors.insert(1, 'title', data['title'].values)
    vectors.insert(2, 'newspaper', data['newspaper'].values)
    vectors.insert(3, 'text', data['text'].values)
    vectors.insert(4, 'date', data['date'].values)
    vectors.insert(5, 'time', data['time'].values)
    vectors.insert(6, 'preprocessed', data['preprocessed'].values)
    vectors.insert(7, 'target', data['tag'].values)
    vectors = vectors.reset_index(drop=True)
    
    vectors.to_csv(folder+'/'+mean+'/csv/'+filename+'_vectors.csv', index=False)

In [32]:
%%time

vectors_to_csv(pretrained_word_vectors, 'simple', lemmatization, bigram, 'pretrained')

Wall time: 1min 5s


In [33]:
%%time

vectors_to_csv(pretrained_word_vectors, 'tfidf', lemmatization, bigram, 'pretrained')

Wall time: 1min 55s


In [34]:
%%time

vectors_to_csv(new_word_vectors, 'simple', lemmatization, bigram, 'new')

Wall time: 38 s


In [35]:
%%time

vectors_to_csv(new_word_vectors, 'tfidf', lemmatization, bigram, 'new')

Wall time: 1min 28s


In [36]:
%%time

vectors_to_csv(retrained_word_vectors, 'simple', lemmatization, bigram, 'retrained')

Wall time: 41.2 s


In [37]:
%%time

vectors_to_csv(retrained_word_vectors, 'tfidf', lemmatization, bigram, 'retrained')

Wall time: 1min 32s
