# Bibliotecas

In [None]:
''' dataframe '''
import pandas as pd
from sklearn.model_selection import train_test_split

''' utils '''
import re
import numpy as np
import multiprocessing
import statsmodels.api as sm

''' doc2vec '''
from gensim.models import doc2vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

''' classificação '''
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

''' avaliação '''
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

'''plotagem'''
import matplotlib.pyplot as plt

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Funções

## Dados


In [None]:
def read_dataset(path):
  """Load a CSV file into a DataFrame, taking the column names from its first row.

  Args:
    path: Location of the CSV file; forwarded unchanged to ``pd.read_csv``.

  Returns:
    The ``pandas.DataFrame`` produced by ``pd.read_csv``.
  """
  dataset = pd.read_csv(path, header=0)
  return dataset

In [None]:
def build_train_test_data(dataset, test_size=.3, random_state=0):
  """Split the labelled tweets into train and test partitions.

  Args:
    dataset: DataFrame containing the columns ``id_``, ``tweet`` and ``flood``.
    test_size: Share of the rows held out for testing, forwarded to
      ``train_test_split``. Defaults to 0.3, the value that used to be hard-coded.
    random_state: Seed forwarded to ``train_test_split``. Defaults to 0, the value
      that used to be hard-coded, so existing calls produce the same split.

  Returns:
    Tuple ``(x_train, x_test, y_train, y_test, X)`` where the ``x_*`` frames hold
    the ``id_`` and ``tweet`` columns, the ``y_*`` frames hold the ``flood``
    column, and ``X`` is the unsplit ``id_``/``tweet`` frame.
  """
  X = dataset[['id_', 'tweet']]
  y = dataset[['flood']]

  x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state
  )

  return x_train, x_test, y_train, y_test, X

In [None]:
def add_column(df, array, column_name):
  """Store ``array`` in ``df`` under ``column_name`` and hand the same frame back.

  The frame is modified in place; the returned object is ``df`` itself.

  Args:
    df: DataFrame that receives the column.
    array: Values assigned to the column.
    column_name: Name of the column to create or overwrite.

  Returns:
    ``df``, now containing ``column_name``.
  """
  df[column_name] = array

  return df

In [None]:
def show_values(df, label):
  """Print how many rows carry the values 0 and 1 in ``label`` and plot the counts.

  Args:
    df: DataFrame holding the column to summarise.
    label: Name of the column whose values are counted; the counts for the
      values 0 and 1 are looked up explicitly.
  """
  counts = df[label].value_counts()

  zeros = counts[0]
  print('Class 0:', zeros)
  ones = counts[1]
  print('Class 1:', ones)
  ratio = round(zeros / ones, 2)
  print('Proportion:', ratio, ': 1')

  # Bar chart of every counted value, in the order value_counts() returned them.
  bar_colors = ['#1F77B4', '#FF7F0E']
  counts.plot(kind='bar', title='Count (target)', color=bar_colors)

In [None]:
def balance_binary_values(df, label_binary, type, random_state=None):
  """Balance the rows labelled 0 and 1 by random under- or over-sampling.

  Class sizes are measured on the filtered frames themselves. The previous
  version unpacked ``value_counts()``, which is ordered by frequency rather than
  by label, so 'under' did nothing whenever class 1 was the larger class.

  Args:
    df: DataFrame containing the binary column.
    label_binary: Name of the column holding the 0/1 labels.
    type: ``'under'`` shrinks the larger class to the size of the smaller one.
      ``'over'`` keeps the first half of class 0 and draws class 1 with
      replacement up to that size. Any other value prints ``'fail operation'``
      and leaves ``df`` unchanged.
    random_state: Optional seed forwarded to ``DataFrame.sample`` so the result
      can be reproduced. ``None`` (default) keeps the sampling unseeded.

  Returns:
    The balanced DataFrame (or ``df`` itself for an unknown ``type``). For
    'under' and 'over' only rows labelled 0 or 1 are part of the result.

  Raises:
    ValueError: If ``df`` has no rows for class 0 or no rows for class 1.
  """
  df_class_0 = df[df[label_binary] == 0]
  df_class_1 = df[df[label_binary] == 1]

  if df_class_0.empty or df_class_1.empty:
    raise ValueError(f"column {label_binary!r} must contain both classes 0 and 1")

  if type == 'under':
    # Shrink whichever class is larger down to the size of the smaller one.
    n_minority = min(len(df_class_0), len(df_class_1))
    if len(df_class_0) > n_minority:
      df_class_0 = df_class_0.sample(n_minority, random_state=random_state)
    if len(df_class_1) > n_minority:
      df_class_1 = df_class_1.sample(n_minority, random_state=random_state)
    df = pd.concat([df_class_0, df_class_1], axis=0)
  elif type == 'over':
    # NOTE(review): half of class 0 is discarded before over-sampling class 1;
    # kept as in the previous version — confirm this is intended.
    df_class_0 = df_class_0[:int(len(df_class_0) / 2)]
    df_class_1_over = df_class_1.sample(len(df_class_0), replace=True, random_state=random_state)
    df = pd.concat([df_class_0, df_class_1_over], axis=0)
  else:
    print('fail operation')

  print(f'Random {type}-sampling:')
  print(df[label_binary].value_counts())

  return df

## Doc2Vec

In [None]:
def tagged_tweets_and_create_documents(corpus):
  """Turn every tweet of ``corpus`` into a ``TaggedDocument`` tagged with its ``id_``.

  Each row's ``id_`` is paired with that same row's ``tweet``. The previous
  version looked tweets up with ``corpus.loc[id_]``, which only works when the
  DataFrame index happens to equal the ``id_`` column and costs one label
  lookup per row.

  Args:
    corpus: DataFrame with the columns ``id_`` and ``tweet``.

  Returns:
    List of ``TaggedDocument`` objects whose words come from ``clear_data`` and
    whose single tag is the row's ``id_``.
  """
  return [
    TaggedDocument(clear_data(tweet), [doc_id])
    for doc_id, tweet in zip(corpus.id_, corpus.tweet)
  ]

In [None]:
def clear_data(text, nums=True):
  """Normalise a tweet and split it into whitespace-separated tokens.

  Steps, in order: lower-case, drop apostrophes, remove links, remove the ``@``
  of mentions, remove ellipses, remove the remaining punctuation and, when
  ``nums`` is true, remove digits.

  Args:
    text: Raw tweet text.
    nums: When true (default) every run of digits is removed.

  Returns:
    List of tokens obtained with ``str.split()`` on the cleaned text.
  """
  # The look-behind keeps the character in front of the dots. The previous
  # pattern consumed it as well, turning "wait..." into "wai".
  remove_ellipsis_re = re.compile(r'(?<=[^\.])\.{2,3}')
  punct_re = re.compile(r"[\"'\[\],'#.:;()&!@\u2026]")
  number_re = re.compile(r'\d+')
  mentions_re = re.compile(r'\B@')
  links_re = re.compile(r'(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})')

  text = text.lower()

  text = text.replace("'", '')

  # Links go first so their dots and slashes are not mangled by the later steps.
  text = re.sub(links_re, '', text)
  text = re.sub(mentions_re, '', text)
  text = re.sub(remove_ellipsis_re, '', text)
  text = re.sub(punct_re, '', text)

  if nums:
    text = re.sub(number_re, '', text)

  return text.split()

In [None]:
def create_d2v(documents, type):
  """Create an untrained Doc2Vec model and build its vocabulary.

  Args:
    documents: Tagged documents handed to ``model.build_vocab``.
    type: ``'dm'`` builds the model with ``dm=1, dm_concat=1``; ``'dbow'`` builds
      it with ``dm=0``.

  Returns:
    The ``Doc2Vec`` model with its vocabulary built from ``documents``.

  Raises:
    ValueError: If ``type`` is neither ``'dm'`` nor ``'dbow'``. The previous
      version printed a message and then failed on the unbound ``model``.
  """
  if type not in ('dm', 'dbow'):
    raise ValueError(f"Invalid type! Expected 'dm' or 'dbow', got {type!r}")

  # One worker per CPU core to parallelise the processing.
  cores = multiprocessing.cpu_count()

  # Hyper-parameters. The value in each trailing remark is the number that was
  # noted above the setting in the previous version (presumably an earlier or
  # alternative choice — confirm with the author).
  vecs = 28        # noted: 20
  window = 8       # noted: 10
  min_count = 2    # noted: 2
  alpha = 0.1      # noted: 0.1
  negative = 5     # noted: 5
  sample = 1e-1    # noted: 1e-1

  shared = dict(
    vector_size=vecs, window=window, min_count=min_count, workers=cores,
    alpha=alpha, sample=sample, negative=negative,
  )

  if type == 'dm':
    # DM model
    model = Doc2Vec(dm=1, dm_concat=1, **shared)
  else:
    # DBOW model
    model = Doc2Vec(dm=0, **shared)

  model.build_vocab(documents)
  return model

In [None]:
def save_model(model, fname):
  """Persist ``model`` by delegating to its own ``save`` method.

  Args:
    model: Object exposing ``save(fname)``.
    fname: Destination handed to ``model.save`` unchanged.
  """
  model.save(fname)

In [None]:
import random

def train_d2v(model, documents, epochs=20):
  """Train a Doc2Vec model on ``documents`` with a single ``train`` call.

  The previous version called ``model.train(..., epochs=model.epochs)`` inside a
  20-iteration loop. That runs ``20 * model.epochs`` passes and starts a new
  ``train`` call (and with it a new learning-rate schedule) on every iteration.
  One call with an explicit ``epochs`` performs the requested number of passes.

  Args:
    model: Doc2Vec model whose vocabulary has already been built.
    documents: Tagged documents to train on. They are shuffled on a copy, so
      the caller's list keeps its order.
    epochs: Number of passes over the corpus. Defaults to 20, the loop count of
      the previous version.

  Returns:
    The same ``model`` object, trained.
  """
  shuffled = list(documents)
  random.shuffle(shuffled)

  print(f'--treinamento por {epochs} épocas--')
  model.train(shuffled, total_examples=model.corpus_count, epochs=epochs)

  return model

In [None]:
# def get_vector_d2v(model, data):
def get_vector_d2v(model_dm, model_dbow, data):
  """Build one feature vector per row by concatenating its DM and DBOW vectors.

  Vectors are looked up by each row's ``id_`` — the tag the documents were
  registered under. The previous version always read tags ``0..len(data)-1``,
  which returned vectors of unrelated tweets for any subset or reordering of
  the corpus (for example the shuffled train/test partitions).

  Args:
    model_dm: Trained DM model exposing ``docvecs``.
    model_dbow: Trained DBOW model exposing ``docvecs``.
    data: DataFrame with an ``id_`` column. For inputs without such a column
      the positional tags ``0..len(data)-1`` are used, as before.

  Returns:
    List with one Python list per row: the DM vector followed by the DBOW
    vector.
  """
  if hasattr(data, 'columns') and 'id_' in data.columns:
    doc_ids = list(data['id_'])
  else:
    doc_ids = range(len(data))

  return [
    list(model_dm.docvecs[doc_id]) + list(model_dbow.docvecs[doc_id])
    for doc_id in doc_ids
  ]

## Classificação

### Logistic Regression

In [None]:
def create_model_logistic_regression(x, y):
  """Fit a logistic-regression classifier with the notebook's fixed settings.

  Args:
    x: Training feature vectors.
    y: Training labels.

  Returns:
    The fitted ``LogisticRegression`` instance.
  """
  settings = {
    'C': 3,
    'penalty': 'l2',
    'tol': 0.0001,
    'n_jobs': -1,
  }

  classifier = LogisticRegression(**settings)
  classifier.fit(x, y)

  return classifier

## Avaliação

In [None]:
def predict_tweet(model_dm, model_dbow, model_classifier, text):
  """Classify one raw tweet with the Doc2Vec models and the fitted classifier.

  The feature vector is the DM vector followed by the DBOW vector, the layout
  ``get_vector_d2v`` produces for training. The previous version added the two
  numpy vectors element-wise and then ran ``sm.add_constant`` on the result,
  which produced interleaved ones instead of the concatenated vector. The text
  is tokenised with ``clear_data``, like the training documents, instead of a
  bare ``split()``.

  Args:
    model_dm: Trained DM Doc2Vec model.
    model_dbow: Trained DBOW Doc2Vec model.
    model_classifier: Fitted classifier exposing ``predict``.
    text: Raw tweet text.

  Returns:
    1 if the classifier predicts class 1, otherwise 0.
  """
  tokens = clear_data(text)

  # NOTE(review): 'doc_words' and 'steps' are kept exactly as the notebook wrote
  # them; confirm these keyword names against the installed gensim version.
  vec_dm = model_dm.infer_vector(doc_words=tokens, alpha=0.025, steps=20)
  vec_dbow = model_dbow.infer_vector(doc_words=tokens, alpha=0.025, steps=20)

  vec = np.concatenate([vec_dm, vec_dbow])

  prediction = model_classifier.predict(vec.reshape(1, -1))
  return 1 if prediction[0] == 1 else 0

In [None]:
def avaliation_classification(model_dm, model_dbow, model_classifier, text, label):
  """Tell whether the classifier's prediction for one tweet matches ``label``.

  The feature vector is the DM vector followed by the DBOW vector, the layout
  ``get_vector_d2v`` produces for training. The previous version added the two
  numpy vectors element-wise and then ran ``sm.add_constant`` on the result,
  which produced interleaved ones instead of the concatenated vector. The text
  is tokenised with ``clear_data``, like the training documents, instead of a
  bare ``split()``.

  Args:
    model_dm: Trained DM Doc2Vec model.
    model_dbow: Trained DBOW Doc2Vec model.
    model_classifier: Fitted classifier exposing ``predict``.
    text: Raw tweet text.
    label: Expected class for the tweet.

  Returns:
    ``'right'`` when the predicted class equals ``label``, otherwise ``'wrong'``.
  """
  tokens = clear_data(text)

  # NOTE(review): 'doc_words' and 'steps' are kept exactly as the notebook wrote
  # them; confirm these keyword names against the installed gensim version.
  vec_dm = model_dm.infer_vector(doc_words=tokens, alpha=0.025, steps=20)
  vec_dbow = model_dbow.infer_vector(doc_words=tokens, alpha=0.025, steps=20)

  vec = np.concatenate([vec_dm, vec_dbow])

  prediction = model_classifier.predict(vec.reshape(1, -1))
  return 'right' if prediction[0] == label else 'wrong'

In [None]:
def classifier_test(model, data, label):
  """Print the predicted classes, the accuracy and the weighted F1 score.

  Args:
    model: Fitted classifier exposing ``predict``.
    data: Feature vectors to classify.
    label: True labels matching ``data``.
  """
  predicted = model.predict(data)

  predicted_classes = np.unique(predicted)
  print(f'Testing predicted classes: {predicted_classes}')

  accuracy = accuracy_score(label, predicted)
  print(f'Testing accuracy: {accuracy}')

  weighted_f1 = f1_score(label, predicted, average='weighted')
  print(f'Testing F1 score: {weighted_f1}')

In [None]:
def confusion_matrix_test(model, data, label):
  """Print the confusion matrix of ``model`` on ``data`` and draw it as a heat map.

  Args:
    model: Fitted classifier exposing ``predict``.
    data: Feature vectors to classify.
    label: True labels matching ``data``.
  """
  predictions = model.predict(data)

  cm = confusion_matrix(label, predictions)
  print('Confusion matrix:\n', cm)

  labels = ['Class 0', 'Class 1']
  fig = plt.figure()
  ax = fig.add_subplot(111)
  cax = ax.matshow(cm, cmap=plt.cm.Blues)
  fig.colorbar(cax)

  # Pin the ticks to the cell centres before naming them. The previous
  # "[''] + labels" trick depended on where the automatic locator happened to
  # place its ticks, which can misalign the class names.
  tick_positions = list(range(len(labels)))
  ax.set_xticks(tick_positions)
  ax.set_yticks(tick_positions)
  ax.set_xticklabels(labels)
  ax.set_yticklabels(labels)

  plt.xlabel('Predicted')
  plt.ylabel('Expected')
  plt.show()

In [None]:
def classification_report_test(model, data, label):
  """Print scikit-learn's classification report for ``model`` on ``data``.

  Args:
    model: Fitted classifier exposing ``predict``.
    data: Feature vectors to classify.
    label: True labels matching ``data``.
  """
  print(classification_report(label, model.predict(data)))

# Main

In [None]:
df = read_dataset('/content/drive/My Drive/TCC_Thalles Sobral/2-dados/germany/use-data/adjusted.csv')

In [None]:
df = df.drop(columns='hashtag')

TESTE

In [None]:
# Work on an independent copy: a plain assignment would only alias df, so the
# 'flood' column added to df_d2v afterwards would silently appear in df as well.
df_d2v = df.copy()

In [None]:
# Derive the binary target column: relevance 1 becomes flood 0, relevance 2 becomes flood 1.
df_d2v['flood'] = df_d2v['relevance'].replace([1, 2], [0, 1])

In [None]:
# df_d2v = balance_binary_values(df_d2v, 'flood', 'under')

In [None]:
# show_values(df_d2v, 'flood')

In [None]:
len(df_d2v)

FIM TESTE

In [None]:
# x_train, x_test, y_train, y_test, all_data =  build_train_test_data(df)
x_train, x_test, y_train, y_test, all_data =  build_train_test_data(df_d2v)

## Doc2Vec

In [None]:
# Tag the tweets of all_data (the unsplit id_/tweet frame), not only the training rows.
documents = tagged_tweets_and_create_documents(all_data)
# documents = tagged_tweets_and_create_documents(x_train)

In [None]:
%time model_d2v_dm = create_d2v(documents, 'dm')

In [None]:
# %time model_d2v_dm = train_d2v(model_d2v_dm, train_documents, epochs) # according to the original work, 24 epochs were selected
# NOTE(review): the remark below mentions 24 epochs, but train_d2v uses 20 — confirm which is intended.
%time model_d2v_dm = train_d2v(model_d2v_dm, documents) # according to the original work, 24 epochs were selected

In [None]:
%time model_d2v_dbow = create_d2v(documents, 'dbow')

In [None]:
# %time model_d2v_dbow = train_d2v(model_d2v_dbow, train_documents, epochs) # according to the original work, 24 epochs were selected
# NOTE(review): the remark below mentions 24 epochs, but train_d2v uses 20 — confirm which is intended.
%time model_d2v_dbow = train_d2v(model_d2v_dbow, documents) # according to the original work, 24 epochs were selected

In [None]:
%time all_regressors = get_vector_d2v(model_d2v_dm, model_d2v_dbow, all_data)

In [None]:
# %time df = add_column(df, all_regressors, 'd2v')

## Classificação

In [None]:
df = read_dataset('/content/drive/My Drive/TCC_Thalles Sobral/2-dados/germany/use-data/adjusted.csv')
df = df.drop(columns='hashtag')

In [None]:
df['flood'] = df['relevance'].replace([1, 2], [0, 1])

In [None]:
show_values(df, 'flood') # tmp

In [None]:
df = balance_binary_values(df, 'flood', 'under') # tmp

In [None]:
show_values(df, 'flood') # tmp

In [None]:
# df_d2v = df_d2v.sample(frac=1, axis=0).reset_index(drop=True)

In [None]:
# x_train, x_test, y_train, y_test, all_data =  build_train_test_data(df_d2v) # tmp
x_train, x_test, y_train, y_test, all_data =  build_train_test_data(df) # tmp

### Logistic Regression

In [None]:
%time train_regressors = get_vector_d2v(model_d2v_dm, model_d2v_dbow, x_train)

In [None]:
# %time model_log = create_model_logistic_regression(all_regressors[:len(y_train)], np.array(y_train))
%time model_log = create_model_logistic_regression(train_regressors, y_train.values.ravel())

# Avaliação

## Predicted classes, accuracy and F1 score (training data)

In [None]:
# NOTE(review): same call as the one made before fitting model_log; it recomputes train_regressors from x_train.
%time train_regressors = get_vector_d2v(model_d2v_dm, model_d2v_dbow, x_train)

Logistic Regression

In [None]:
%time classifier_test(model_log, train_regressors, y_train)

## Predicted classes, accuracy and F1 score (test data)

In [None]:
%time test_regressors = get_vector_d2v(model_d2v_dm, model_d2v_dbow, x_test)

Logistic Regression

In [None]:
%time classifier_test(model_log, test_regressors, y_test)

## Predict tweet

Logistic Regression

In [None]:
# %time predict_tweet(model_d2v_dm, model_log, df.loc[1].tweet)
# %time predict_tweet(model_d2v_dm, model_d2v_dbow, model_log, df.iloc[1].tweet)

## Predict tweet (with answer)

Logistic Regression

In [None]:
# avaliation_classification(model_d2v, model_log, df.loc[1].tweet, df.loc[1].flood)
# avaliation_classification(model_d2v_dm, model_d2v_dbow, model_log, df.iloc[1].tweet, df.iloc[1].flood)

## Prediction application to the set and classification

In [None]:
# df_test = df['tweet']

Logistic Regression 

In [None]:
# df_test.apply( lambda x: predict_tweet(model_d2v_dm, model_d2v_dbow, model_log, x))

## Prediction application to the set and classification (with answer)


Logistic Regression 

In [None]:
# [print(avaliation_classification(model_d2v_dm, model_d2v_dbow, model_log, df.iloc[id].tweet, df.iloc[id].flood)) for id in range(0, len(df))]

Salvando resultados

In [None]:
# df.to_csv('tweets-with-d2v.csv')
# !cp tweets-with-d2v.csv "drive/My Drive/TCC_Thalles Sobral/2-dados/germany/use-data"

## Curva ROC

Data

In [None]:
''' All data'''
# labels = df[['flood']].values
# %time regressors = get_vector_d2v(model_d2v_dm, model_d2v_dbow, all_data)
''' Test data'''
labels = y_test
%time regressors = get_vector_d2v(model_d2v_dm, model_d2v_dbow, x_test)

Plot ROC curve

In [None]:

from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

# "No skill" baseline: a constant score of 0 for every sample.
ns_probs = [0] * len(labels)

# Predicted probabilities; only column 1 (the positive outcome) is kept.
lr_probs = model_log.predict_proba(regressors)[:, 1]

# ROC AUC of the baseline and of the logistic-regression model.
ns_auc = roc_auc_score(labels, ns_probs)
lr_auc = roc_auc_score(labels, lr_probs)

print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('Logistic: ROC AUC=%.3f' % (lr_auc))

# Points of both ROC curves (the thresholds are not needed).
ns_fpr, ns_tpr, _ = roc_curve(labels, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(labels, lr_probs)

# Draw the baseline as a dashed line and the model with point markers.
plt.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
plt.plot(lr_fpr, lr_tpr, marker='.', label='Logistic')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

plt.legend()

plt.show()

In [None]:
lr_probs

In [None]:
labels.values

## Confusion matrix

Logistic Regression

In [None]:
%time confusion_matrix_test(model_log, regressors, labels)

## Accuracy

In [None]:
y_pred = model_log.predict(regressors)

In [None]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

## Report

In [None]:
%time classification_report_test(model_log, regressors, labels)

## dados gerados para um dataset novo

In [None]:
df = read_dataset('/content/drive/My Drive/TCC_Thalles Sobral/2-dados/germany/use-data/adjusted.csv')
df = df.drop(columns='hashtag')

In [None]:
# avaliations = []
# %time [(avaliations.append(predict_tweet(model_d2v_dm, model_d2v_dbow, model_log, df.iloc[id].tweet))) for id in range(0, len(df))]

In [None]:
# avaliations

In [None]:
# %time df = add_column(df, avaliations, 'd2v_avaliation')

In [None]:
# df.query('d2v_avaliation == 1')

In [None]:
# len(df.query('relevance == 2'))

In [None]:
# len(df.query('relevance == 2 and d2v_avaliation == 1'))

TODO: fazer gráfico de dispersão

## Salvando os dados

In [None]:
# df.to_csv('tweets-with-d2v.csv')
# !cp tweets-with-d2v.csv "drive/My Drive/TCC_Thalles Sobral/2-dados/germany/use-data/tweets-with-d2v.csv"