# Tools for Text Mining

In [1]:
import pandas as pd
import numpy as np
import re
from bokeh.io import output_notebook
output_notebook()
import warnings
warnings.filterwarnings('ignore')

In [2]:
%reload_ext autoreload
%autoreload 2

Import the dataset

In [3]:
df = pd.read_csv("data/20newsgroup_body.csv")

In [4]:
df.head()

Unnamed: 0,text,class,filename
0,In article <1993Apr14.125813.21737@ncsu.edu> ...,talk.politics.mideast,75895
1,"ab4z@Virginia.EDU (""Andi Beyer"") writes: > F...",talk.politics.mideast,76248
2,Andrew Varvel writes: > > > Serdar Argic >(a...,talk.politics.mideast,76277
3,In article <1993Apr15.204930.9517@thunder.mcr...,talk.politics.mideast,76045
4,Srinivas Suder writes: >If the Haitian peopl...,talk.politics.mideast,77197


Dropping missing values:

In [5]:
df.isnull().sum()

text        8
class       0
filename    0
dtype: int64

In [6]:
df = df.dropna()

### Text cleaning and pre-processing 

In [7]:
print('{} classes:\n{}'.format(len(df['class'].unique()), df['class'].unique()))

20 classes:
['talk.politics.mideast' 'rec.autos' 'comp.sys.mac.hardware' 'alt.atheism'
 'rec.sport.baseball' 'comp.os.ms-windows.misc' 'rec.sport.hockey'
 'sci.crypt' 'sci.med' 'talk.politics.misc' 'rec.motorcycles'
 'comp.windows.x' 'comp.graphics' 'comp.sys.ibm.pc.hardware'
 'sci.electronics' 'talk.politics.guns' 'sci.space'
 'soc.religion.christian' 'misc.forsale' 'talk.religion.misc']


In [8]:
from mltools.textMining import TextPreprocessing

Using the class TextPreprocessing we eliminate the special characters and stopwords in the input text. The output will be a pandas Dataframe like the input one with a new column that contain the list of tokens extracted and a text field that contain the original text cleaned.

In [9]:
tp = TextPreprocessing(lemmatization = False)

The *fit* method take a pandas DataFrame as input and the name of the column that contain the text to processing.

In [10]:
output = tp.fit(df, "text")

Data cleaning...
Standardization...
Tokenization...
Removing stopwords...
Finish


In [11]:
output.head()

Unnamed: 0,text,class,filename,tokens
0,in article apr hernlem b...,talk.politics.mideast,75895,"[article, hernlem, brad, hernlem, writes, leba..."
1,ab z andi beyer writes first of all i...,talk.politics.mideast,76248,"[andi, beyer, writes, first, never, said, holo..."
2,andrew varvel writes serdar argic a...,talk.politics.mideast,76277,"[andrew, varvel, writes, serdar, argic, serdar..."
3,in article apr hasan writ...,talk.politics.mideast,76045,"[article, hasan, writes, article, astein, alan..."
4,srinivas suder writes if the haitian peopl...,talk.politics.mideast,77197,"[srinivas, suder, writes, haitian, people, cur..."


### Extracting Skip-Gram

### Data augmentation 

### Vectorization

In [12]:
#suddivisione training set / test set
from sklearn.model_selection import train_test_split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(output[["tokens", "filename"]], output["class"], 
                                                    test_size=0.3, 
                                                    random_state=40)

In [14]:
from mltools.textMining.text_preprocessing import VectorizeData

In [15]:
#as default we consider the tf-idf
token_toVect = VectorizeData()

In [16]:
X_train_tfidf, X_test_tfidf, vectorizer = token_toVect.fit(X_train["tokens"], X_test["tokens"])

In [17]:
X_train_tfidf.shape, X_test_tfidf.shape

((13970, 60000), (5988, 60000))

### Word Embeddings

In [None]:
from mltools.textMining.word2vec import *

In [None]:
word2vec_model = load_model("GoogleNews-vectors-negative300.bin.gz")

In [None]:
embeddings = get_word2vec_embeddings(word2vec_model, output, "tokens")

In [None]:
np.array(embeddings)

### Text Classification using the evaluateModels module

In [18]:
from mltools.evaluateModels import CrossValidation

In [19]:
cv = CrossValidation(models=["MultinomialNB", "LogisticRegression"], 
                     scores = ["accuracy", "f1_multiclass", "precision_multiclass", "recall_multiclass"],
                     params_file = "./param_file.txt")

In [20]:
cv.get_models_info

MultinomialNB
GaussianNB
SVM
SGD_SVM
SGD_regressor
SVM_regressor
KNN
KNN_regressor
DT
DT_regressor
RandomForest
RandomForest_regressor
LinearRegression
NonNegativeLinearRegression
LogisticRegression
LassoRegression
RidgeRegression
ElasticNet
LinearGAM
PoissonGAM
GammaGAM
MARS
GradientBoosting
GradientBoosting_regressor
AdaBoost
AdaBoost_regressor
LightGBM
LightGBM_regressor
XGBoost
XGBoost_regressor


In [21]:
cv.get_scores_info

accuracy
auc
f1
f1_multiclass
mcc
precision
precision_multiclass
recall
recall_multiclass
mse
rmse
mae
medae
msle
r2


In [22]:
res, model = cv.fit(X_train_tfidf, y_train)

Model: MultinomialNB
Searching the best MultinomialNB with grid search cv...

Best_estimator: MultinomialNB(alpha=0.1, class_prior=None, fit_prior=False)

Best_scores: 0.848890479599141
Evaluate the best model configuration with a new cross validation...

Model: LogisticRegression
Searching the best LogisticRegression with grid search cv...

Best_estimator: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

Best_scores: 0.8206871868289191
Evaluate the best model configuration with a new cross validation...

Finish


In [23]:
res

Unnamed: 0,LogisticRegression,MultinomialNB
computation_total,36.9073,0.851345
test_accuracy_mean,0.820699,0.848888
test_accuracy_sd,0.00809733,0.00958564
test_accuracy_ci_95%,"[0.8188, 0.8226]","[0.8466, 0.8512]"
test_accuracy_ci_99%,"[0.8179, 0.8235]","[0.8456, 0.8522]"
train_accuracy_mean,0.935107,0.962499
train_accuracy_sd,0.00107115,0.000693363
train_accuracy_ci_95%,"[0.9349, 0.9354]","[0.9623, 0.9627]"
train_accuracy_ci_99%,"[0.9347, 0.9355]","[0.9623, 0.9627]"
test_f1_multiclass_mean,0.818745,0.846937


In [24]:
from sklearn.metrics import accuracy_score, auc, f1_score, precision_score, recall_score, matthews_corrcoef

nb_clf = model['MultinomialNB']
nb_clf = nb_clf.fit(X_train_tfidf, y_train) 
y_predicted = nb_clf.predict(X_test_tfidf)

print("MultinomialNB accuracy on test set:", accuracy_score(y_test, y_predicted))

MultinomialNB accuracy on test set: 0.8560454241816967


### Feature Importance in classification 

In [25]:
from mltools.textMining import *

In [26]:
lgr_clf = model['LogisticRegression']
lgr_clf = lgr_clf.fit(X_train_tfidf, y_train)

In [27]:
importance = get_most_important_features(vectorizer, lgr_clf, n=10)

In [28]:
plot_important_words(importance, lgr_clf.classes_)

### Most frequent words

In [30]:
from mltools.textMining.features_importance import plot_word_freq

In [31]:
plot_word_freq(output, target="class", col="tokens")

## BBC NEWS SUMMARIZATION 

### Dataset construction

In [38]:
import re
import os
import sys
import codecs

In [39]:
path_news = "./data/BBC_Business_News/business_news"
path_summary = "./data/BBC_Business_News/business_summary"

In [40]:
def create_df(path, column_name):
    dataset = pd.DataFrame()
    filename_list = os.listdir(path)
    for filename in filename_list:
        with codecs.open("{}/{}".format(path, filename), "r", encoding='utf-8', errors='ignore') as file:
            text = file.readlines()
            clean_text = [re.sub(r'(\n)', '', line) for line in text]
            complete_text = " ".join(clean_text)
            df = np.array(complete_text).reshape(1,1)
        dataset = pd.concat([dataset, pd.DataFrame(df)])
        
    dataset.columns = [column_name]
    dataset.index = np.arange(len(dataset))
    return dataset

In [41]:
dataset_news = create_df(path_news, 'article')
dataset_summaries = create_df(path_summary, 'reference_summary')

In [43]:
dataset = pd.concat([dataset_news, dataset_summaries], axis = 1)
dataset.head()

Unnamed: 0,article,reference_summary
0,UK economy facing 'major risks' The UK manufa...,"""Despite some positive news for the export sec..."
1,Aids and climate top Davos agenda Climate cha...,"At the same time, about 100,000 people are exp..."
2,Asian quake hits European shares Shares in Eu...,The unfolding scale of the disaster in south A...
3,India power shares jump on debut Shares in In...,"Shares in India's largest power producer, Nati..."
4,Lacroix label bought by US firm Luxury goods ...,LVMH said the French designer's haute couture ...


In [44]:
dataset = dataset[dataset['article'].apply(len) > 100]

In [45]:
dataset_news['article'][0]

'UK economy facing \'major risks\'  The UK manufacturing sector will continue to face "serious challenges" over the next two years, the British Chamber of Commerce (BCC) has said.  The group\'s quarterly survey of companies found exports had picked up in the last three months of 2004 to their best levels in eight years. The rise came despite exchange rates being cited as a major concern. However, the BCC found the whole UK economy still faced "major risks" and warned that growth is set to slow. It recently forecast economic growth will slow from more than 3% in 2004 to a little below 2.5% in both 2005 and 2006.  Manufacturers\' domestic sales growth fell back slightly in the quarter, the survey of 5,196 firms found. Employment in manufacturing also fell and job expectations were at their lowest level for a year.  "Despite some positive news for the export sector, there are worrying signs for manufacturing," the BCC said. "These results reinforce our concern over the sector\'s persisten

In [46]:
dataset_finale.shape

(510, 2)

### Summarization

In particular, the class Summarization uses 'extraction-based' methods, which work by selecting a subset of the most important phrases existing in the original text to compose the summary.
:
- TextRank (gensim and sumy);
- LexRank (sumy);
- Lsa (sumy);
- Luhn (sumy). 

Furthermore, by setting the parameter 'keywords' equal to True, it's possible to extract the keywords from the input text.

First of all, it is necessary to filter the text field to eliminate any fields containing empty strings or texts with too few sentences:

In [61]:
from mltools.textMining import Summarization

In [62]:
Summarization.getInfo_models()

['textrank-g', 'textrank-s', 'lexrank-s', 'lsa-s', 'luhn-s']


In [63]:
dataset = dataset[dataset['article'].apply(Summarization.count_sentences) > 5]

In [64]:
dataset.shape

(510, 2)

In [65]:
SUM = Summarization(models = ['textrank-g', 'lexrank-s', 'lsa-s'])

In [66]:
df_summary = SUM.fit(dataset, field = 'article', n_sentences = 3)
df_summary.head()

Summarizing...
Finish



Unnamed: 0,article,reference_summary,textrank-g_summary,lexrank-s_summary,lsa-s_summary
0,UK economy facing 'major risks' The UK manufa...,"""Despite some positive news for the export sec...",UK economy facing 'major risks' The UK manufa...,The group's quarterly survey of companies foun...,Employment in manufacturing also fell and job ...
1,Aids and climate top Davos agenda Climate cha...,"At the same time, about 100,000 people are exp...",Other issues to be discussed at the five-day c...,"The World Health Organisation (WHO) said 700,0...","Some 2,000 business and political leaders from..."
2,Asian quake hits European shares Shares in Eu...,The unfolding scale of the disaster in south A...,Asian quake hits European shares Shares in Eu...,But the disaster has little impact on stock ma...,"More than 23,000 people have been killed follo..."
3,India power shares jump on debut Shares in In...,"Shares in India's largest power producer, Nati...","""Power needs in India are set to rise and NTPC...",The government's partial sell-off of NTPC is p...,It was India's second $1bn stock debut in thre...
4,Lacroix label bought by US firm Luxury goods ...,LVMH said the French designer's haute couture ...,LVMH said the French designer's haute couture ...,Lacroix label bought by US firm Luxury goods ...,Lacroix label bought by US firm Luxury goods ...


In [67]:
df_summary.loc[0]['article']

'UK economy facing \'major risks\'  The UK manufacturing sector will continue to face "serious challenges" over the next two years, the British Chamber of Commerce (BCC) has said.  The group\'s quarterly survey of companies found exports had picked up in the last three months of 2004 to their best levels in eight years. The rise came despite exchange rates being cited as a major concern. However, the BCC found the whole UK economy still faced "major risks" and warned that growth is set to slow. It recently forecast economic growth will slow from more than 3% in 2004 to a little below 2.5% in both 2005 and 2006.  Manufacturers\' domestic sales growth fell back slightly in the quarter, the survey of 5,196 firms found. Employment in manufacturing also fell and job expectations were at their lowest level for a year.  "Despite some positive news for the export sector, there are worrying signs for manufacturing," the BCC said. "These results reinforce our concern over the sector\'s persisten

In [68]:
df_summary.loc[0]['textrank-g_summary']

'UK economy facing \'major risks\'  The UK manufacturing sector will continue to face "serious challenges" over the next two years, the British Chamber of Commerce (BCC) has said. The BCC found confidence increased in the quarter across both the manufacturing and service sectors although overall it failed to reach the levels at the start of 2004.'

In [69]:
df_summary.loc[0]['lexrank-s_summary']

'The group\'s quarterly survey of companies found exports had picked up in the last three months of 2004 to their best levels in eight years. However, the BCC found the whole UK economy still faced "major risks" and warned that growth is set to slow. "Despite some positive news for the export sector, there are worrying signs for manufacturing," the BCC said.'

### Evaluation

One of the most commonly used metrics for this purpose is ROUGE, which essentially calculates the overlaps of the n-grams between the two summaries being compared. A high degree of overlap should indicate a high level of concepts shared between the two summaries.
 
- Rouge-1 (one-grams);
- Rouge-2 (bi-grams);
- Rouge-L (Longest Common Subsequence).

For each of them, by setting the parameter 'type_metric' equal to 'f', 'p' or 'r', one has the f-score, the precision or the recall respectively (default is 'f').

In [70]:
dictionary, df_scores = SUM.evaluate(df_summary, field_summary = 'reference_summary',
                            metrics = ['rouge-1', 'rouge-2', 'rouge-l'], type_metric = 'r')

Validation...

Dataframe for the rouge-1 metric has been saved...
Dataframe for the rouge-2 metric has been saved...
Dataframe for the rouge-l metric has been saved...

Finish


In [71]:
dictionary.keys()

dict_keys(['rouge-1_r_df', 'rouge-2_r_df', 'rouge-l_r_df'])

In [72]:
dictionary['rouge-1_r_df'].head()

Unnamed: 0,textrank-g_rouge-1_r,lexrank-s_rouge-1_r,lsa-s_rouge-1_r
0,0.447917,0.40625,0.125
1,0.335196,0.47486,0.346369
2,0.241176,0.370588,0.176471
3,0.689655,0.155172,0.344828
4,0.690476,1.0,0.714286


In [78]:
df_scores

Unnamed: 0,textrank-g,lexrank-s,lsa-s
rouge-1_r_mean,0.479335,0.463226,0.351689
rouge-1_r_std,0.157243,0.146458,0.151427
rouge-2_r_mean,0.387534,0.36371,0.225757
rouge-2_r_std,0.168674,0.157101,0.157348
rouge-l_r_mean,0.475142,0.456906,0.342256
rouge-l_r_std,0.158878,0.149238,0.15341


### Extracting keywords

In [74]:
from mltools.textMining import Keywords

In [75]:
kw = Keywords()

In [76]:
kw_res = kw.fit(dataset_finale, field = 'article')

Keywords Extraction...
Finish


In [77]:
kw_res.head()

Unnamed: 0,article,reference_summary,keywords
0,UK economy facing 'major risks' The UK manufa...,"""Despite some positive news for the export sec...","[bcc, despite, major]"
1,Aids and climate top Davos agenda Climate cha...,"At the same time, about 100,000 people are exp...","[president, anti, leaders, leader]"
2,Asian quake hits European shares Shares in Eu...,The unfolding scale of the disaster in south A...,"[shares, share, markets, market, hits, hit]"
3,India power shares jump on debut Shares in In...,"Shares in India's largest power producer, Nati...","[power, shares, share, firms, firm]"
4,Lacroix label bought by US firm Luxury goods ...,LVMH said the French designer's haute couture ...,"[label, labels, group]"
