In [1]:
import pandas as pd
import re
import numpy as np
import pickle
import nltk
import spacy
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from catboost import CatBoostRegressor
from gensim.models import KeyedVectors
from gensim.parsing.preprocessing import STOPWORDS
%matplotlib inline
from bert_serving.client import BertClient


In [36]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /home/riedel/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Se leen los .csv

In [2]:
train = pd.read_csv('csv/train.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [3]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
ids_with_target_error = [328,443,513,2619,3640,3900,4342,5781,6552,6554,6570,6701,6702,6729,6861,7226]
train.loc[train['id'].isin(ids_with_target_error),'target'] = 0

In [5]:
test = pd.read_csv('csv/test.csv')

In [6]:
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [7]:
# Se leen las abreviaturas 
file = open('abreviaturas.pkl','rb')
abbreviations = pickle.load(file)
file.close()

# Elaboración del modelo

Para este modelo se aplicará NLP para el procesamiento de los tweets y se utilizarán distintos árboles de decisión para realizar las predicciones:

In [8]:
!pip install bert-serving-server  # server
!pip install bert-serving-client  # client, independent of `bert-serving-server`



In [9]:
#!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip && unzip uncased_L-12_H-768_A-12.zip

In [10]:
# esta linea correrla en terminal
# !bert-serving-start -model_dir uncased_L-12_H-768_A-12/ -num_worker=2 -max_seq_len 50

### Preparación del set de datos

In [11]:
# Devuelve lista de tweets con abreviaturas expandidas
def expand_abbreviations(sentences):
    expanded_sentences = []
    
    for sentence in sentences:
        expanded_sentence = []
        
        for word in sentence:
            if word in abbreviations:
                word = word.replace(word,abbreviations[word])
            expanded_sentence.append(word)
            
        tokenized = nltk.word_tokenize(" ".join(expanded_sentence))
        expanded_sentences.append(tokenized)
        
    return expanded_sentences

In [12]:
#Devuelve los tweets en minuscula eliminando simbolos y numeros
def remove_symbols_and_numbers(sentences):
    
    clean_sentences = []

    for sentence in sentences:
        clean_sentence = []
    
        for word in sentence:
            clean_word = re.sub('[^a-zA-Z]',' ', word)
            if(clean_word != ' '):
                clean_sentence.append(clean_word.lower())
                
        tokenized = nltk.word_tokenize(" ".join(clean_sentence))
        clean_sentences.append(tokenized)

    return clean_sentences

In [13]:
def remove_stopwords(sentences):
    clean_sentences = []
    for sentence in sentences:
        clean_sentence = []
        for word in sentence:
            if word not in stop_words:
                clean_sentence.append(word)
        clean_sentences.append(clean_sentence)
    return clean_sentences

In [14]:
# PARA QUE CORRA ESTA FUNCION HAY QUE PONER EN CONSOLA:
# pip install spacy
# spacy download en
# Devuelve los tweets lematizados
def lemmatize_tweets(sentences):
    nlp = spacy.load('en')
    lemmatized = []
    
    for sentence in sentences:
        sentence = ' '.join(sentence)
        doc = nlp(sentence)
        lemmatized.append(" ".join( [token.lemma_ for token in doc] ) )
    
    return lemmatized    

In [15]:
#Devuelve una lista de tweets dejando solo letras y lematizando las palabras
def clean_text(df):
    
    words = df['text'].str.split()
    words = remove_symbols_and_numbers(words)
    words = expand_abbreviations(words)
    words = remove_stopwords(words)
    
    return lemmatize_tweets(words)

In [16]:
# agrego campo ['clean_text']

train ['clean_text'] =  clean_text(train)  
train.head()

Unnamed: 0,id,keyword,location,text,target,clean_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,deeds reason earthquake allah forgive
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,resident ask shelter place notify officer evac...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfire evacuation order calif...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,get send photo ruby alaska smoke wildfire pour...


In [17]:
test ['clean_text'] = clean_text(test)
test.head()

Unnamed: 0,id,keyword,location,text,clean_text
0,0,,,Just happened a terrible car crash,happen terrible car crash
1,2,,,"Heard about #earthquake is different cities, s...",hear earthquake different city stay safe
2,3,,,"there is a forest fire at spot pond, geese are...",forest spot pond geese flee street save
3,9,,,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting spokane wildfire
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor kill china taiwan


### Embedding de los tweets con bert

In [18]:
bc = BertClient()

In [33]:
### arreglo el vacio
train['text'][3749]

'I See Fire'

In [31]:
bert_train = bc.encode (train['clean_text'].to_list() )
bert_train.shape

ValueError: all elements in the list must be non-empty string, but element 3749 is ' '

In [None]:
bert_test = bc.encode(test['clean_text'].to_list())
bert_test.shape

In [None]:
train_embedding_file = open("train_embedding_file_bert.txt", "w")

for i in bert_train:
    np.savetxt(train_embedding_file, i)

train_embedding_file.close()

In [None]:
test_embedding_file = open("test_embedding_file_bert.txt", "w")

for i in bert_test:
    np.savetxt(test_embedding_file, i)

test_embedding_file.close()

In [None]:
bert_train = np.loadtxt("train_embedding_file_bert.txt").reshape(len(train.index),768)
bert_train.shape

In [None]:
bert_test = np.loadtxt("test_embedding_file_bert.txt").reshape(len(test.index),768)
bert_test.shape

In [None]:
type(bert_test[0])

### Se entrena el modelo

In [None]:
X, y = bert_train, train['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=13)

In [None]:
xg_reg = xgb.XGBRegressor(objective ='binary:logistic', 
                colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 10, alpha = 10, n_estimators = 5)

xg_reg.fit(X_train,y_train)
preds = xg_reg.predict(X_test)
print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

In [None]:
rf_model = RandomForestRegressor(random_state=13, n_estimators=5, max_depth=10)
rf_model.fit(X_train, y_train)
preds = rf_model.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

In [None]:
lgb_class = lgb.LGBMRegressor(learning_rate = 0.1,
                max_depth = 10, alpha = 10, n_estimators = 5)
lgb_class.fit(X_train, y_train)
preds = lgb_class.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

In [None]:
catb = CatBoostRegressor(iterations=5, depth=10)
catb.fit(X_train, y_train)
preds = catb.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

In [None]:
gb = GradientBoostingRegressor(n_estimators=5, learning_rate=0.1, 
                                max_features=2, max_depth = 10, random_state = 0)
gb.fit(X_train, y_train)
preds = gb.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

In [None]:
eclf2 = VotingRegressor(estimators=[
         ('xgb', xg_reg), ('rf', rf_model), ('catb', catb), ('gb', gb), ('lgbm',lgb_class)])

eclf2 = eclf2.fit(X, y)
preds = eclf2.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

In [None]:
df = pd.DataFrame(eclf2.predict(X),columns=["bert_score"])
df.head()

In [None]:
final = df.to_csv('csv/solo_embedding_bert_train.csv')

### Predicciones

In [None]:
test['target'] = eclf2.predict(bert_test)

In [None]:
test.head()

In [None]:
# test.drop(columns=['keyword','location','text'], inplace=True)
# test.set_index('id', inplace=True)
test.head(10)

In [None]:
final = test.to_csv('csv/submission_bert.csv')

In [None]:
df = pd.DataFrame(eclf2.predict(bert_test),columns=["bert_score"])
df.head()
final = df.to_csv('csv/solo_embedding_bert_test.csv')