<a href="https://colab.research.google.com/github/ThazSobral/tcc/blob/main/test_doc2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Bibliotecas

In [171]:
# dataframe
import pandas as pd
from sklearn.model_selection import train_test_split

# utils
import re
import numpy as np
import multiprocessing

import statsmodels.api as sm

# doc2vec
from gensim.models import doc2vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# classificação
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# avaliação
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

In [172]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## Funções

### Dados


In [173]:
def read_dataset(path):
  return pd.read_csv(path, header=0)

In [174]:
def build_train_test_data(dataset):

  X = dataset[['id_', 'tweet']]
  y = dataset[['flood']]

  x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False)

  return x_train, x_test, y_train, y_test, X

In [175]:
def add_column(df, array, column_name):
  df[column_name] = array
  return df

### Doc2Vec

In [176]:
def tagged_tweets_and_create_documents(corpus):

  documents = [TaggedDocument(clear_data(corpus.loc[i].tweet), [i]) for i in corpus.id_]

  return documents

In [177]:
def clear_data(text, nums=True):

  remove_ellipsis_re = re.compile(r'[^\.]\.{2,3}')
  punct_re = re.compile(r"[\"'\[\],'#.:;()&!@\u2026]")
  number_re = re.compile(r'\d+')
  # mentions_re = re.compile(r'\B@[A-z|0-9]+')
  mentions_re = re.compile(r'\B@')
  links_re = re.compile(r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})')

  text = text.lower()

  text = text.replace("'",'')

  text = re.sub(links_re, '', text)
  text = re.sub(mentions_re, '', text)
  text = re.sub(remove_ellipsis_re, '', text)
  text = re.sub(punct_re, '', text)

  if nums:
    text = re.sub(number_re, '', text)

  # text = ' '.join(text.split())

  return text.split()

In [178]:
def create_d2v(documents, vecs, type):
  # definir a quantidade de núcleos do processador para paralelizar o processamento
  cores = multiprocessing.cpu_count()

  # conforme o trabalho original foi selecionado os seguintes parâmentros:
  # vetor de 160 dimensões
  # janela de 10
  # contagem de frequencia mínima igual a 4
  # amostra de 1e-4
  if type == 'dm':
    # criar modelo dm
    model = Doc2Vec(vector_size=vecs, window=10, min_count=4, workers=cores, sample=1e-4, dm=1, dm_concat=1)
  elif type == 'dbow':
    # criar modelo dbow
    model = Doc2Vec(vector_size=vecs, window=10, min_count=4, workers=cores, sample=1e-4, dm=0)
  else: 
    print('Invalid type!')

  model.build_vocab(documents)
  return model

In [179]:
def save_model(model, fname):
  model.save(fname)

In [223]:
def train_d2v(model, documents, epochs ):
  for epoch in range(0, epochs):
    print(f'--treinamento na {epoch}ª época--')
    # logging.info('Training iteration #{}'.format(epoch))
    # random.shuffle(documents)
    model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)
    # model.alpha -= 0.0002
  
  return model

In [181]:
# def get_vector_d2v(model, data):
def get_vector_d2v(model_dm, model_dbow, data):
  # return ([list(model.docvecs[id]).sum() for id in range(0, len(data))])
  return ( [list(model_dm.docvecs[id]) + list(model_dbow.docvecs[id]) for id in range(0, len(data))])

### Classificação

#### Logistic Regression

In [182]:
def create_model_logistic_regression(x, y):
  model_log = LogisticRegression(C=1200, penalty=='l2', tol=0.0001, n_jobs=-1)
  model_log.fit(x, y)

  return model_log

#### Random Forest

In [183]:
def create_model_random_forest(x, y):
  model_rand = RandomForestClassifier(n_estimators=100)
  model_rand.fit(x, y)

  return model_rand

#### SVC

In [271]:
def create_model_svc(x, y):
  model_svc = SVC(C=1200, tol=0.0001)
  model_svc.fit(x, y)

  return model_svc

### Avaliação

In [185]:
def predict_tweet(model_dm, model_dbow, model_classifier, text):
  # vec = model_d2v.infer_vector(doc_words=text.split(), alpha=0.025, steps=20)
  vec_dm = model_dm.infer_vector(doc_words=text.split(), alpha=0.025, steps=20)
  vec_dbow = model_dbow.infer_vector(doc_words=text.split(), alpha=0.025, steps=20)

  vec = vec_dm + vec_dbow
  vec = sm.add_constant(vec)
  # print(len(vec))
  print(model_classifier.predict(vec.reshape(1, -1)))
  if (model_classifier.predict(vec.reshape(1, -1)) == 1):
    print('Avaliação positiva!\n')
  else: print('Avaliação negativa\n')

In [186]:
def avaliation_classification(model_dm, model_dbow, model_classifier, text, label):
  # vec = model_d2v.infer_vector(doc_words=text.split(), alpha=0.025, steps=20)
  vec_dm = model_dm.infer_vector(doc_words=text.split(), alpha=0.025, steps=20)
  vec_dbow = model_dbow.infer_vector(doc_words=text.split(), alpha=0.025, steps=20)

  vec = vec_dm + vec_dbow
  vec = sm.add_constant(vec)
  if (model_classifier.predict(vec.reshape(1, -1)) == label):
    print('right')
  else: print('wrong')

In [187]:
def classifier_test(model, data, label):
  predictions = model.predict(data)
  print('Testing predicted classes: {}'.format(np.unique(predictions)))
  print('Testing accuracy: {}'.format(accuracy_score(label, predictions)))
  print('Testing F1 score: {}'.format(f1_score(label, predictions, average='weighted')))

In [188]:
def confusion_matrix_test(model, data, label):
  predictions = model.predict(data)

  cm = confusion_matrix(label, predictions)
  print(f'''  
  verdadeiros negativos: {cm[0,0]}
  falsos negativos: {cm[0,1]}
  falsos positivos: {cm[1,0]}
  verdadeiros positivos: {cm[1,1]}
  ''')

In [189]:
def classification_report_test(model, data, label):
  predictions = model.predict(data)

  print(classification_report(label, predictions))

## Main

In [190]:
df = read_dataset('/content/drive/My Drive/TCC_Thalles Sobral/2-dados/germany/use-data/adjusted.csv')

In [191]:
df = df.drop(columns='hashtag')

In [192]:
x_train, x_test, y_train, y_test, all_data =  build_train_test_data(df)

### Doc2Vec

In [193]:
documents = tagged_tweets_and_create_documents(all_data)

In [194]:
%time model_d2v_dm = create_d2v(documents, 160, 'dm')

CPU times: user 16.2 s, sys: 118 ms, total: 16.3 s
Wall time: 16.4 s


In [195]:
%time model_d2v_dm = train_d2v(model_d2v_dm, documents, 2) # conforme o trabalho original foi selecionado 24 épocas

CPU times: user 1min 29s, sys: 2.47 s, total: 1min 31s
Wall time: 49.9 s


In [196]:
%time model_d2v_dbow = create_d2v(documents, 160, 'dbow')

CPU times: user 15 s, sys: 118 ms, total: 15.1 s
Wall time: 15.1 s


In [197]:
%time model_d2v_dbow = train_d2v(model_d2v_dbow, documents, 2) # conforme o trabalho original foi selecionado 24 épocas

CPU times: user 36.6 s, sys: 5.36 s, total: 41.9 s
Wall time: 31.6 s


In [198]:
%time all_regressors = get_vector_d2v(model_d2v_dm, model_d2v_dbow, all_data)

CPU times: user 2.59 s, sys: 902 ms, total: 3.49 s
Wall time: 3.5 s


In [199]:
%time df = add_column(df, all_regressors, 'd2v')

CPU times: user 336 ms, sys: 0 ns, total: 336 ms
Wall time: 336 ms


### Classificação

#### Logistic Regression

In [200]:
# %time model_log = create_model_logistic_regression(all_regressors[:len(y_train)], np.array(y_train))
%time model_log = create_model_logistic_regression(all_regressors[:len(y_train)], y_train.values.ravel())

CPU times: user 1.83 s, sys: 298 ms, total: 2.13 s
Wall time: 33 s


#### Random Forest

In [201]:
# %time model_rand = create_model_random_forest(all_regressors[:len(y_train)], np.array(y_train))
%time model_rand = create_model_random_forest(all_regressors[:len(y_train)], y_train.values.ravel())

CPU times: user 4min 34s, sys: 70.5 ms, total: 4min 34s
Wall time: 4min 34s


#### SVC (Support Vector Classification)

In [272]:
# %time model_svc = create_model_svc(all_regressors[:len(y_train)], np.array(y_train))
%time model_svc = create_model_svc(all_regressors[:len(y_train)], y_train.values.ravel())

CPU times: user 22min 39s, sys: 228 ms, total: 22min 39s
Wall time: 22min 40s


### Avaliação

#### predict classes - accuracy - F1 score (train datas)

In [203]:
%time train_regressors = get_vector_d2v(model_d2v_dm, model_d2v_dbow, x_train)

CPU times: user 3.45 s, sys: 917 ms, total: 4.36 s
Wall time: 4.37 s


Logistic Regression

In [204]:
%time classifier_test(model_log, train_regressors, y_train)

Testing predicted classes: [0]
Testing accuracy: 0.9929503772649667
Testing F1 score: 0.9894380341408324
CPU times: user 1.53 s, sys: 34.1 ms, total: 1.57 s
Wall time: 1.52 s


Random Forest

In [205]:
%time classifier_test(model_rand, train_regressors, y_train)

Testing predicted classes: [0 1]
Testing accuracy: 1.0
Testing F1 score: 1.0
CPU times: user 3.02 s, sys: 55.2 ms, total: 3.07 s
Wall time: 3.02 s


SVC

In [206]:
%time classifier_test(model_svc, train_regressors, y_train)

Testing predicted classes: [0]
Testing accuracy: 0.9929503772649667
Testing F1 score: 0.9894380341408324
CPU times: user 2min 3s, sys: 20.4 ms, total: 2min 3s
Wall time: 2min 3s


#### predict classes - accuracy - F1 score (test datas)

In [207]:
%time test_regressors = get_vector_d2v(model_d2v_dm, model_d2v_dbow, x_test)

CPU times: user 452 ms, sys: 66.3 ms, total: 518 ms
Wall time: 518 ms


Logistic Regression

In [208]:
%time classifier_test(model_log, test_regressors, y_test)

Testing predicted classes: [0]
Testing accuracy: 0.9976870973071205
Testing F1 score: 0.9965319848888025
CPU times: user 178 ms, sys: 12 ms, total: 190 ms
Wall time: 184 ms


Random Forest

In [209]:
%time classifier_test(model_rand, test_regressors, y_test)

Testing predicted classes: [0 1]
Testing accuracy: 0.9919048405749216
Testing F1 score: 0.99363246781653
CPU times: user 384 ms, sys: 77.4 ms, total: 461 ms
Wall time: 363 ms


SVC

In [210]:
%time classifier_test(model_svc, test_regressors, y_test)

Testing predicted classes: [0]
Testing accuracy: 0.9976870973071205
Testing F1 score: 0.9965319848888025
CPU times: user 13.7 s, sys: 4.4 ms, total: 13.7 s
Wall time: 13.7 s


#### Confusion matrix

Logistic Regression

In [211]:
%time confusion_matrix_test(model_log, test_regressors, y_test)

  
  verdadeiros negativos: 6039
  falsos negativos: 0
  falsos positivos: 14
  verdadeiros positivos: 0
  
CPU times: user 186 ms, sys: 13.1 ms, total: 199 ms
Wall time: 189 ms


Random Forest

In [212]:
%time confusion_matrix_test(model_rand, test_regressors, y_test)

  
  verdadeiros negativos: 6004
  falsos negativos: 35
  falsos positivos: 14
  verdadeiros positivos: 0
  
CPU times: user 346 ms, sys: 70.3 ms, total: 416 ms
Wall time: 343 ms


SVC

In [213]:
%time confusion_matrix_test(model_svc, test_regressors, y_test)

  
  verdadeiros negativos: 6039
  falsos negativos: 0
  falsos positivos: 14
  verdadeiros positivos: 0
  
CPU times: user 13.6 s, sys: 2.87 ms, total: 13.6 s
Wall time: 13.7 s


#### Classification report

Logistic Regression

In [214]:
%time classification_report_test(model_log, test_regressors, y_test)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6039
           1       0.00      0.00      0.00        14

    accuracy                           1.00      6053
   macro avg       0.50      0.50      0.50      6053
weighted avg       1.00      1.00      1.00      6053

CPU times: user 185 ms, sys: 9.63 ms, total: 194 ms
Wall time: 182 ms


  _warn_prf(average, modifier, msg_start, len(result))


Random Forest

In [215]:
%time classification_report_test(model_rand, test_regressors, y_test)

              precision    recall  f1-score   support

           0       1.00      0.99      1.00      6039
           1       0.00      0.00      0.00        14

    accuracy                           0.99      6053
   macro avg       0.50      0.50      0.50      6053
weighted avg       1.00      0.99      0.99      6053

CPU times: user 361 ms, sys: 73.3 ms, total: 434 ms
Wall time: 351 ms


SVC

In [216]:
%time classification_report_test(model_svc, test_regressors, y_test)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6039
           1       0.00      0.00      0.00        14

    accuracy                           1.00      6053
   macro avg       0.50      0.50      0.50      6053
weighted avg       1.00      1.00      1.00      6053

CPU times: user 13.6 s, sys: 552 µs, total: 13.6 s
Wall time: 13.6 s


  _warn_prf(average, modifier, msg_start, len(result))


#### Predict tweet

Logistic Regression

In [217]:
# %time predict_tweet(model_d2v_dm, model_log, df.loc[1].tweet)
%time predict_tweet(model_d2v_dm, model_d2v_dbow, model_log, df.loc[1].tweet)

[0]
Avaliação negativa

CPU times: user 12.2 ms, sys: 1.02 ms, total: 13.3 ms
Wall time: 13.9 ms


Random Forest

In [218]:
# predict_tweet(model_dm, model_rand, df.loc[1].tweet)
predict_tweet(model_d2v_dm, model_d2v_dbow, model_rand, df.loc[1].tweet)

[1]
Avaliação positiva!



SVC

In [219]:
# predict_tweet(model_d2v, model_svc, df.loc[1].tweet)
predict_tweet(model_d2v_dm, model_d2v_dbow, model_svc, df.loc[1].tweet)

[0]
Avaliação negativa



#### Predict tweet (with answer)

Logistic Regression

In [220]:
# avaliation_classification(model_d2v, model_log, df.loc[1].tweet, df.loc[1].flood)
avaliation_classification(model_d2v_dm, model_d2v_dbow, model_log, df.loc[1].tweet, df.loc[1].flood)

right


Random Forest

In [221]:
# avaliation_classification(model_d2v, model_rand, df.loc[1].tweet, df.loc[1].flood)
avaliation_classification(model_d2v_dm, model_d2v_dbow, model_rand, df.loc[1].tweet, df.loc[1].flood)

wrong


SVC

In [222]:
# avaliation_classification(model_d2v, model_svc, df.loc[1].tweet, df.loc[1].flood)
avaliation_classification(model_d2v_dm, model_d2v_dbow, model_svc, df.loc[1].tweet, df.loc[1].flood)

right


### Prediction application to the set and classification

In [224]:
df_test = df['tweet']

Logistic Regression 

In [None]:
df_test.apply( lambda x: predict_tweet(model_d2v_dm, model_d2v_dbow, model_log, x))

Random Forest

In [None]:
df_test.apply( lambda x: predict_tweet(model_d2v_dm, model_d2v_dbow, model_rand, x))

SVC

In [None]:
df_test.apply( lambda x: predict_tweet(model_d2v_dm, model_d2v_dbow, model_svc, x))

### Prediction application to the set and classification (with answer)


In [None]:
[print(df.loc[x].tweet) for x in range(0, len(df))]

# avaliation_classification(model_d2v_dm, model_d2v_dbow, model_log, df.loc[1].tweet, df.loc[1].flood)

Logistic Regression 

In [None]:
[(avaliation_classification(model_d2v_dm, model_d2v_dbow, model_log, df.loc[id].tweet, df.loc[id].flood)) for id in range(0, len(df))]

Random Forest

In [None]:
[(avaliation_classification(model_d2v_dm, model_d2v_dbow, model_rand, df.loc[id].tweet, df.loc[id].flood)) for id in range(0, len(df))]

SVC

In [None]:
[(avaliation_classification(model_d2v_dm, model_d2v_dbow, model_svc, df.loc[id].tweet, df.loc[id].flood)) for id in range(0, len(df))]