# Minería de texto: clasificación de artículos

### Autor: Silvia García Hernández                                                                    
### Fecha: 3/06/2019

In [30]:
#from pybtex.database.input import bibtex

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re
import string
import pickle 
import time
import nltk

from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer

import warnings
warnings.filterwarnings("ignore")

### 1. Reading the dataset

The dataset has 3 columns of interest (title, keywords and abstract). A fouth column will be created to compare the classification scores of the four cases. The idea is to preprocess the columns, vectorized them and then implement a set of machine learning classifications models to compare results.

In [23]:
#Read the data
data_df = pd.read_csv('../dataset/articles.csv')

data_df.head()   

Unnamed: 0,entry,title,keywords,abstract,year,journal
0,AZAZA20181,Context proposals for saliency detection,"Computational saliency, Object segmentation, O...",One of the fundamental properties of a salient...,2018,Computer Vision and Image Understanding
1,BENSHABAT201812,Graph based over-segmentation methods for 3D p...,"3D point cloud over-segmentation, 3D point clo...","Over-segmentation, or super-pixel generation, ...",2018,Computer Vision and Image Understanding
2,YANG201843,Text effects transfer via distribution-aware t...,"Text effects, Texture synthesis, Spatial distr...","In this paper, we explore the problem of fanta...",2018,Computer Vision and Image Understanding
3,BARATH201870,Efficient energy-based topological outlier rej...,"Stereo vision, Outlier filtering, Energy minim...",An approach is proposed for outlier rejection ...,2018,Computer Vision and Image Understanding
4,ARRIGONI201895,Robust synchronization in SO(3) and SE(3) via ...,"Absolute rotations, Global rotations, Structur...",This paper deals with the synchronization prob...,2018,Computer Vision and Image Understanding


In [24]:
#Studying the data (there is an imbalance in the data)
data_df.groupby(['journal']).count()

Unnamed: 0_level_0,entry,title,keywords,abstract,year
journal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Computer Vision and Image Understanding,249,249,249,249,249
Data & Knowledge Engineering,99,99,99,99,99
Journal of Visual Communication and Image Representation,433,433,433,433,433


In [25]:
#Creating the fouth column
data_df['all_text'] = data_df['title'] + ' ' + data_df['keywords'] + ' ' + data_df['abstract'] 

In [26]:
#Encoding the label columns
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
data_df['journal_label'] = le.fit_transform(data_df['journal'])

In [27]:
print('The number of samples is {}'.format(len(data_df)))
data_df.head()

The number of samples is 781


Unnamed: 0,entry,title,keywords,abstract,year,journal,all_text,journal_label
0,AZAZA20181,Context proposals for saliency detection,"Computational saliency, Object segmentation, O...",One of the fundamental properties of a salient...,2018,Computer Vision and Image Understanding,Context proposals for saliency detection Compu...,0
1,BENSHABAT201812,Graph based over-segmentation methods for 3D p...,"3D point cloud over-segmentation, 3D point clo...","Over-segmentation, or super-pixel generation, ...",2018,Computer Vision and Image Understanding,Graph based over-segmentation methods for 3D p...,0
2,YANG201843,Text effects transfer via distribution-aware t...,"Text effects, Texture synthesis, Spatial distr...","In this paper, we explore the problem of fanta...",2018,Computer Vision and Image Understanding,Text effects transfer via distribution-aware t...,0
3,BARATH201870,Efficient energy-based topological outlier rej...,"Stereo vision, Outlier filtering, Energy minim...",An approach is proposed for outlier rejection ...,2018,Computer Vision and Image Understanding,Efficient energy-based topological outlier rej...,0
4,ARRIGONI201895,Robust synchronization in SO(3) and SE(3) via ...,"Absolute rotations, Global rotations, Structur...",This paper deals with the synchronization prob...,2018,Computer Vision and Image Understanding,Robust synchronization in SO(3) and SE(3) via ...,0


In [28]:
#Create dataframe with information of interest
data_to_proccess = data_df[['title', 'keywords', 'abstract', 'all_text', 'journal_label']]

### 2. Preprocessing Data 

In [33]:
#Removing punctuation signs, lowercase, stopwords
from nltk.corpus import stopwords
stop = stopwords.words('english')

column_names = ['title', 'keywords', 'abstract', 'all_text']

for column in column_names:
    data_to_proccess[column] = data_to_proccess[column].str.replace(r'[^\w\s]+', '')
    data_to_proccess[column] = data_to_proccess[column].str.lower()
    data_to_proccess[column] = data_to_proccess[column].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
    
data_to_proccess.head()

Unnamed: 0,title,keywords,abstract,all_text,journal_label
0,context proposals saliency detection,computational saliency object segmentation obj...,one fundamental properties salient object regi...,context proposals saliency detection computati...,0
1,graph based oversegmentation methods 3d point ...,3d point cloud oversegmentation 3d point cloud...,oversegmentation superpixel generation common ...,graph based oversegmentation methods 3d point ...,0
2,text effects transfer via distributionaware te...,text effects texture synthesis spatial distrib...,paper explore problem fantastic specialeffects...,text effects transfer via distributionaware te...,0
3,efficient energybased topological outlier reje...,stereo vision outlier filtering energy minimiz...,approach proposed outlier rejection set 2d poi...,efficient energybased topological outlier reje...,0
4,robust synchronization so3 se3 via lowrank spa...,absolute rotations global rotations structuref...,paper deals synchronization problem arises mul...,robust synchronization so3 se3 via lowrank spa...,0


### 2.1 Stemming

In [34]:
#Using Porter
porter = PorterStemmer()

for column in column_names:
    data_to_proccess[column] = data_to_proccess[column].apply(lambda x: ' '.join([porter.stem(word) for word in x.split()]))

data_to_proccess.head()

Unnamed: 0,title,keywords,abstract,all_text,journal_label
0,context propos salienc detect,comput salienc object segment object propos,one fundament properti salient object region c...,context propos salienc detect comput salienc o...,0
1,graph base oversegment method 3d point cloud,3d point cloud oversegment 3d point cloud segm...,oversegment superpixel gener common preliminar...,graph base oversegment method 3d point cloud 3...,0
2,text effect transfer via distributionawar text...,text effect textur synthesi spatial distribut ...,paper explor problem fantast specialeffect syn...,text effect transfer via distributionawar text...,0
3,effici energybas topolog outlier reject,stereo vision outlier filter energi minim poin...,approach propos outlier reject set 2d point co...,effici energybas topolog outlier reject stereo...,0
4,robust synchron so3 se3 via lowrank spars matr...,absolut rotat global rotat structurefrommot gl...,paper deal synchron problem aris multipl 3d po...,robust synchron so3 se3 via lowrank spars matr...,0


### 3. Feature extraction (BoW, TF-IDF)

In [36]:
#TF-IDF definition
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words = 'english',
        sublinear_tf = True,
        strip_accents = 'ascii',
        analyzer = 'word',
        token_pattern = r'\w{2,}', 
        ngram_range = (1, 3),
        max_features = 800,
        max_df = 0.7)

In [39]:
tf_idf_list = []

for column in column_names:
    tf_idf_list.append(tfidf.fit_transform(data_to_proccess[column]))

In [61]:
#Function to visualize tfidf output
def visualize_words_and_frequency_tfidf(tfidf_list, method):

    weights = np.asarray(tfidf_list.mean(axis = 0)).ravel().tolist()
    weights_df = pd.DataFrame({'Word': method.get_feature_names(), 'Weight': weights})
    weights_df = weights_df.sort_values(by = 'Weight', ascending = False).head()
    
    print(weights_df.head(5))

In [62]:
visualize_words_and_frequency_tfidf(tf_idf_list[3], tfidf)

       Word    Weight
331    imag  0.055628
424  method  0.047048
759     use  0.040084
434   model  0.037070
270  featur  0.035079


In [63]:
pd.DataFrame(tf_idf_list[3].toarray(), columns=tfidf.get_feature_names()).head()

Unnamed: 0,2d,3d,3d model,3d reconstruct,3dhevc,abil,abl,accord,account,accur,...,way,web,weight,wide,wide use,window,word,work,year,yield
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.100921,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.105624,0.238512,0.0,0.0,0.0,0.0,0.0,0.094357,0.101706,0.0,...,0.0,0.0,0.091337,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.116179,0.0,0.0,0.0
3,0.152822,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.117648,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.098776,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [59]:
#Bag of Words definition
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words = 'english',
        strip_accents = 'ascii',
        analyzer = 'word',
        token_pattern = r'\w{3,}',
        ngram_range = (1, 3),
        max_features = 800,
        max_df = 0.7)

In [60]:
cv_list = []

for column in column_names:
    cv_list.append(cv.fit_transform(data_to_proccess[column]))

In [67]:
visualize_words_and_frequency_tfidf(cv_list[3], cv)

       Word    Weight
327    imag  2.591549
421  method  1.791293
758     use  1.377721
431   model  1.153649
265  featur  1.021767


In [66]:
pd.DataFrame(cv_list[3].toarray(), columns=cv.get_feature_names()).head()

Unnamed: 0,3dhevc,abil,abl,accord,account,accur,accuraci,achiev,action,action recognit,...,web,weight,wide,wide use,window,word,work,work propos,year,yield
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 4. Dimensionality Reduction (LSA)

In [19]:
#print("El número de atributos calculados con TF-IDF es: %d" % docTrans0.get_shape()[1])

print("\n---Dimensionality reduction with LSA---")

%%time

svd = TruncatedSVD(n_components = 500, n_iter = 100, random_state = 42)
normalizer = Normalizer(copy = False)
lsa = make_pipeline(svd, normalizer)

for name in column_names:
    

tfidf_matrix_lsa0 = lsa.fit_transform(tfidf_matrix_list[0][0])
tfidf_matrix_lsa1 = lsa.fit_transform(tfidf_matrix_list[0][1])
tfidf_matrix_lsa2 = lsa.fit_transform(tfidf_matrix_list[0][2])

tfidf_matrix_lsa = [[tfidf_matrix_lsa0, tfidf_matrix_lsa1, tfidf_matrix_lsa2]]

print("Ha llevado %.3f segundos" % (time.time() - t0))

variance_lsa = svd.explained_variance_ratio_.sum()

print("Cantidad de información contenida en LSA (varianza): {}%".format(int(variance_lsa * 100)))

El número de atributos calculados con TF-IDF es: 800

---Reducción de dimensionalidad con LSA---
Ha llevado 7.405 segundos
Cantidad de información contenida en LSA (varianza): 97%
