# Se importan las bibliotecas necesarias

In [1]:
import pandas as pd
import re
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from catboost import CatBoostClassifier
%matplotlib inline

# Se lee el .csv

In [2]:
train = pd.read_csv('csv/train.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [3]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
test = pd.read_csv('csv/test.csv')
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [5]:
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


# Elaboración del modelo

Para este modelo se utilizarán árboles de decisión para realizar las predicciones y se tendrán en cuenta los siguientes features extraídos del análisis exploratorio, los cuales fueron seleccionados por el grupo como aquellos que podrían resultan más interesantes:

- Palabras con mayor y menor porcentaje de veracidad.
- Pares de 2 palabras con mayor y menor porcentaje de veracidad.
- Tweets que contienen @, ¿?, ¡! tienden a ser falsos.
- Keywords con mayor y menor porcentaje de veracidad.
- Locaciones más y menos veraces.
- Longitud.

### Preparación del set de datos

#### Parte inicial

In [6]:
train['location'].fillna("none_location",inplace=True)
train['keyword'].fillna("none_keyword",inplace=True)
display(train.head())
test['location'].fillna("none_location",inplace=True)
test['keyword'].fillna("none_keyword",inplace=True)
test.head()

Unnamed: 0,id,keyword,location,text,target
0,1,none_keyword,none_location,Our Deeds are the Reason of this #earthquake M...,1
1,4,none_keyword,none_location,Forest fire near La Ronge Sask. Canada,1
2,5,none_keyword,none_location,All residents asked to 'shelter in place' are ...,1
3,6,none_keyword,none_location,"13,000 people receive #wildfires evacuation or...",1
4,7,none_keyword,none_location,Just got sent this photo from Ruby #Alaska as ...,1


Unnamed: 0,id,keyword,location,text
0,0,none_keyword,none_location,Just happened a terrible car crash
1,2,none_keyword,none_location,"Heard about #earthquake is different cities, s..."
2,3,none_keyword,none_location,"there is a forest fire at spot pond, geese are..."
3,9,none_keyword,none_location,Apocalypse lighting. #Spokane #wildfires
4,11,none_keyword,none_location,Typhoon Soudelor kills 28 in China and Taiwan


In [7]:
min_repetitions = ((0.2/100)*len(train.index))

In [8]:
#Devuelve una lista de tweets dejando solo caracteres alfanumericos
def clean_text(df):
    words = df['text'].str.split()
    clean_words = []

    for sentence in words:
        clean_sentence = []
        for word in sentence:
            clean_word = re.sub('[^A-Za-z0-9]+','', word)
            if(clean_word != ''):
                clean_sentence.append(clean_word.lower())
        clean_words.append(clean_sentence)
    
    return clean_words

In [9]:
# Devuelve una lista de tuplas de la forma (col, %veracidad)
def groupby_veracity(df, col):
    
    veracity = []
    aux = df.groupby(col).agg({'target':['sum','count']})
    aux = aux[aux['target']['count']>min_repetitions]
    aux['veracity'] = aux['target']['sum']/aux['target']['count']
    aux.reset_index(inplace=True)
    
    for i in range(len(aux.index)):
        veracity.append(tuple((aux[col][i],aux['veracity'][i])))
    
    return veracity

In [10]:
train_aux = train.copy()
clean_tweets = clean_text(train_aux)
train_aux['words'] = clean_tweets
train_aux['clean_text'] = [' '.join(x) for x in clean_tweets]

In [11]:
words_pairs = []

for sentence in train_aux['words']:
    pairs = []
    for i in range(len(sentence)-1):
        pairs.append(sentence[i] + ' ' + sentence[i+1])
    words_pairs.append(pairs)

train_aux['pairs'] = words_pairs

In [12]:
train_aux.head()

Unnamed: 0,id,keyword,location,text,target,words,clean_text,pairs
0,1,none_keyword,none_location,Our Deeds are the Reason of this #earthquake M...,1,"[our, deeds, are, the, reason, of, this, earth...",our deeds are the reason of this earthquake ma...,"[our deeds, deeds are, are the, the reason, re..."
1,4,none_keyword,none_location,Forest fire near La Ronge Sask. Canada,1,"[forest, fire, near, la, ronge, sask, canada]",forest fire near la ronge sask canada,"[forest fire, fire near, near la, la ronge, ro..."
2,5,none_keyword,none_location,All residents asked to 'shelter in place' are ...,1,"[all, residents, asked, to, shelter, in, place...",all residents asked to shelter in place are be...,"[all residents, residents asked, asked to, to ..."
3,6,none_keyword,none_location,"13,000 people receive #wildfires evacuation or...",1,"[13000, people, receive, wildfires, evacuation...",13000 people receive wildfires evacuation orde...,"[13000 people, people receive, receive wildfir..."
4,7,none_keyword,none_location,Just got sent this photo from Ruby #Alaska as ...,1,"[just, got, sent, this, photo, from, ruby, ala...",just got sent this photo from ruby alaska as s...,"[just got, got sent, sent this, this photo, ph..."


In [13]:
aux = train_aux.copy()
aux = aux.explode('words')
veracity_words = groupby_veracity(aux, 'words')

In [14]:
aux = train_aux.copy()
aux = aux.explode('pairs')
veracity_pairs = groupby_veracity(aux, 'pairs')

In [15]:
veracity_location = groupby_veracity(aux, 'location')

In [16]:
veracity_keyword = groupby_veracity(aux, 'keyword')

In [17]:
filtered_data = {}

#### Parte final

In [18]:
#Devuelve una lista con la cantidad de palabras en 'words' que aparecen en cada elemento de 'texts'
def count_text_ocurrences(texts, words):
    ocurrences = []
    for text in texts:
        count = 0
        for word in words:
            if word in text:
                count = count+1
        ocurrences.append(count)
    return ocurrences

#Devuelve una lista binaria que indica si cada elemento de 'searched' aparece en 'series' o no
def count_ocurrences(searched, series):
    xd = ' '.join(series)
    ocurrences = []
    for element in searched:
        count = 0
        if element in xd:
            count = count+1
        ocurrences.append(count)
    return ocurrences

#Devuelve una lista binaria que indica si cada elemento de 'texts' contiene o no los chars recibidos
def contains_char(texts, char1, char2=''):
    ocurrences = []
    for text in texts:
        count = 0
        if char1 in text:
            count = 1
        if (count==0)&(char2!=''):
            if char2 in text:
                count = 1
        ocurrences.append(count)
    return ocurrences

In [19]:
# Agrega 2 columnas al df las cuales indican el número de palabras con mayor 
# y menor % de veracidad contiene cada tweet
def relevant_words_occurrences(df, top_limit, low_limit):
    
    top_words = [x[0] for x in veracity_words if x[1]>top_limit]
    worst_words = [x[0] for x in veracity_words if x[1]<low_limit]

    df['top_words'] = count_text_ocurrences(df['clean_text'], top_words)
    df['worst_words'] = count_text_ocurrences(df['clean_text'], worst_words)
    
    filtered_data.update([('top_words',top_words),('worst_words',worst_words)]) 
    

In [20]:
# Agrega 2 columnas al df las cuales indican el número de pares de palabras con mayor 
# y menor % de veracidad contiene cada tweet
def relevant_words_pairs_occurrences(df,top_limit, low_limit):
    
    top_words_pairs = [x[0] for x in veracity_pairs if x[1]>top_limit]
    worst_words_pairs = [x[0] for x in veracity_pairs if x[1]<low_limit]
        
    df['top_words_pairs'] = count_text_ocurrences(df['clean_text'], top_words_pairs)
    df['worst_words_pairs'] = count_text_ocurrences(df['clean_text'], worst_words_pairs)
    
    filtered_data.update([('top_words_pairs',top_words_pairs),('worst_words_pairs',worst_words_pairs)]) 
    

In [21]:
# Agrega 2 columnas binarias al df las cuales indican si el tweet posee un keyword 
# con un alto o un bajo % de veracidad
def relevant_keywords_occurrences(df, top_limit, low_limit):
    
    top_keywords = [x[0] for x in veracity_keyword if x[1]>top_limit]
    worst_keywords = [x[0] for x in veracity_keyword if x[1]<low_limit]

    df['top_keywords'] = count_ocurrences(df['keyword'], top_keywords)
    df['worst_keywords'] = count_ocurrences(df['keyword'], worst_keywords)
    
    filtered_data.update([('top_keywords',top_keywords),('worst_keywords',worst_keywords)])
    

In [22]:
# Agrega 2 columnas binarias al df las cuales indican si el tweet proviene de una locacion 
# con un alto o un bajo % de veracidad

def relevant_location_occurrences(df, top_limit, low_limit):
    
    top_locations = [x[0] for x in veracity_location if x[1]>top_limit]
    worst_locations = [x[0] for x in veracity_location if x[1]<low_limit]
    
    df['top_locations'] = count_ocurrences(df['location'], top_locations)
    df['worst_locations'] = count_ocurrences(df['location'], worst_locations)
    
    filtered_data.update([('top_locations',top_locations),('worst_locations',worst_locations)])
    

In [23]:
# Agrega 3 columnas binarias al df las cuales indican si el tweet contiene @, (¡,!) y (¿,?) 
def relevant_chars_ocurrences(df):
    
    df['arroba'] = contains_char(df['text'],'@')
    df['signos_interrogacion'] = contains_char(df['text'],'?', '¿')
    df['signos_exclamacion'] = contains_char(df['text'],'!','¡')
    

In [24]:
# Agrega 2 columnas al df las cuales indican la longitud en palabras y caracteres de cada tweet
def length_count(df):
    
    aux = []
    words = df['text'].str.split()
    
    for i in words:
        aux.append(len(i))

    df['long(char)'] = df['text'].str.len()
    df['long(word)'] = aux
    

In [28]:
# Devuelve un dataframe con todos los features considerados para el modelo
def prepare_training_set(df):
    
    df['clean_text'] = [' '.join(x) for x in clean_tweets]
    filtered_data.clear()
    relevant_words_occurrences(df, 0.9, 0.1)
    relevant_words_pairs_occurrences(df, 0.9, 0.1)
    relevant_keywords_occurrences(df, 0.9, 0.1)
    relevant_location_occurrences(df, 0.9, 0.1)
    relevant_chars_ocurrences(df)
    length_count(df)
    
    df = df.drop(columns=['keyword','location','text','clean_text'])
    df = df.set_index('id')
    
    return df
    

In [26]:
# Devuelve un dataframe con todos los features considerados para el modelo
def prepare_test_set(df):
    
    clean_tweets = clean_text(df)
    df['words'] = clean_tweets
    df['clean_text'] = [' '.join(x) for x in clean_tweets]
    
    df['top_words'] = count_text_ocurrences(df['clean_text'], filtered_data['top_words'])
    df['worst_words'] = count_text_ocurrences(df['clean_text'], filtered_data['worst_words'])
    df['top_words_pairs'] = count_text_ocurrences(df['clean_text'], filtered_data['top_words_pairs'])
    df['worst_words_pairs'] = count_text_ocurrences(df['clean_text'], filtered_data['worst_words_pairs'])
    df['top_keywords'] = count_ocurrences(df['keyword'], filtered_data['top_keywords'])
    df['worst_keywords'] = count_ocurrences(df['keyword'], filtered_data['worst_keywords'])
    df['top_locations'] = count_ocurrences(df['location'], filtered_data['top_locations'])
    df['worst_locations'] = count_ocurrences(df['location'], filtered_data['worst_locations'])
    relevant_chars_ocurrences(df)
    length_count(df)
    
    df = df.drop(columns=['keyword','location','text','words','clean_text'])
    df = df.set_index('id')
    
    return df

In [29]:
training_set = prepare_training_set(train.copy())
training_set.head()

Unnamed: 0_level_0,target,top_words,worst_words,top_words_pairs,worst_words_pairs,top_keywords,worst_keywords,top_locations,worst_locations,arroba,signos_interrogacion,signos_exclamacion,long(char),long(word)
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,1,0,0,0,0,1,0,0,0,0,0,0,69,13
4,1,0,0,0,0,1,0,0,0,0,0,0,38,7
5,1,1,0,0,0,1,0,0,0,0,0,0,133,22
6,1,2,0,0,0,1,0,0,0,0,0,0,65,8
7,1,1,0,0,0,1,0,0,0,0,0,0,88,16


In [30]:
test_set = prepare_test_set(test.copy())
test_set.head()

Unnamed: 0_level_0,top_words,worst_words,top_words_pairs,worst_words_pairs,top_keywords,worst_keywords,top_locations,worst_locations,arroba,signos_interrogacion,signos_exclamacion,long(char),long(word)
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,0,0,0,0,1,0,0,0,0,0,0,34,6
2,0,0,0,0,1,0,0,0,0,0,0,64,9
3,0,0,0,0,1,0,0,0,0,0,0,96,19
9,1,0,0,0,1,0,0,0,0,0,0,40,4
11,3,0,1,0,1,0,0,0,0,0,0,45,8


### Entrenamiento del set de datos

In [41]:
X, y = training_set.iloc[:,1:-1], training_set.iloc[:,0]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=13)

#### XGBoost

In [42]:
xg_reg = xgb.XGBClassifier(objective ='binary:logistic', 
                colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 20, alpha = 10, n_estimators = 20)

In [43]:
xg_reg.fit(X_train,y_train)

XGBClassifier(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.3, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=20,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=20, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=10,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [44]:
preds = xg_reg.predict(X_test)

In [45]:
print('Accuracy Score : ' + str(accuracy_score(y_test,preds)))
print('Precision Score : ' + str(precision_score(y_test,preds)))
print('Recall Score : ' + str(recall_score(y_test,preds)))
print('F1 Score : ' + str(f1_score(y_test,preds)))

Accuracy Score : 0.7690288713910761
Precision Score : 0.8382978723404255
Recall Score : 0.5880597014925373
F1 Score : 0.6912280701754386


In [None]:
# df = pd.DataFrame(preds)
# final = df.to_csv('csv/xgboost.csv')
# df.head()

In [None]:
# params = {
#         'min_child_weight': [1, 5, 10],
#         'gamma': [0.5, 1, 1.5, 2, 5],
#         'subsample': [0.6, 0.8, 1.0],
#         'colsample_bytree': [0.2, 0.3, 0.4],
#         'max_depth': [10, 15, 20]
#         }
# grid_acc = GridSearchCV(xg_reg, param_grid = params)
# grid_acc.fit(X_train, y_train)
# y_pred_acc = grid_acc.predict(X_test)

# # New Model Evaluation metrics 
# print('Accuracy Score : ' + str(accuracy_score(y_test,y_pred_acc.round())))
# print('Precision Score : ' + str(precision_score(y_test,y_pred_acc.round())))
# print('Recall Score : ' + str(recall_score(y_test,y_pred_acc.round())))
# print('F1 Score : ' + str(f1_score(y_test,y_pred_acc.round())))

#### RandomForest

In [None]:
# rf_model = RandomForestClassifier(random_state=13, n_estimators=80, max_depth=20)
# rf_model.fit(X_train, y_train)
# preds = rf_model.predict(X_test)

# print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
# print('Precision Score : ' + str(precision_score(y_test,preds.round())))
# print('Recall Score : ' + str(recall_score(y_test,preds.round())))
# print('F1 Score : ' + str(f1_score(y_test,preds.round())))

In [None]:
# df = pd.DataFrame(preds)
# final = df.to_csv('csv/randomForest.csv')
# df.head()

rf_model.feature_importances_
plt.figure(figsize=(17,9))
plt.bar(X_train.columns, rf_model.feature_importances_)
plt.xlabel('Features')
plt.ylabel('Importancia')
plt.title('Importancia Features con RF')
plt.show()

#### LightGBM

In [None]:
# lgb_train = lgb.Dataset(X_train, y_train)
# lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# params = {
#      'objective': 'regression',
#      'metric': 'rmse',
#      'num_leaves': 5,
#      'learning_rate': 0.1,
#      'feature_fraction': 0.9,
#  }

# params = {
#      'objective': 'regression',
#  }

# gbm = lgb.train(params,
#                  lgb_train,
#                  num_boost_round=100,
#                  valid_sets=lgb_eval,
#                  early_stopping_rounds=10)

# preds = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
# print('Precision Score : ' + str(precision_score(y_test,preds.round())))
# print('Recall Score : ' + str(recall_score(y_test,preds.round())))
# print('F1 Score : ' + str(f1_score(y_test,preds.round())))
# rmse = np.sqrt(mean_squared_error(y_test, preds))
# print("RMSE: %f" % (rmse))

In [None]:
# df = pd.DataFrame(preds)
# final = df.to_csv('csv/lightgbm.csv')
# df.head()

#### CatBoost

In [None]:
# model = CatBoostClassifier()
# model.fit(X_train, y_train)
# preds = model.predict(X_test)

# print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
# print('Precision Score : ' + str(precision_score(y_test,preds.round())))
# print('Recall Score : ' + str(recall_score(y_test,preds.round())))
# print('F1 Score : ' + str(f1_score(y_test,preds.round())))

In [None]:
# df = pd.DataFrame(preds)
# final = df.to_csv('csv/catboost.csv')
# df.head()

In [None]:
# model = LogisticRegression()
# model.fit(X_train, y_train)
# preds = model.predict(X_test)

# print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
# print('Precision Score : ' + str(precision_score(y_test,preds.round())))
# print('Recall Score : ' + str(recall_score(y_test,preds.round())))
# print('F1 Score : ' + str(f1_score(y_test,preds.round())))

In [None]:
# df = pd.DataFrame(preds)
# final = df.to_csv('csv/logicRegression.csv')
# df.head()

In [None]:
# xg_reg = xgb.XGBClassifier(objective ='binary:logistic', 
#                 colsample_bytree = 0.3, learning_rate = 0.1,
#                 max_depth = 20, alpha = 10, n_estimators = 80)
# rf_model = RandomForestClassifier(random_state=13, n_estimators=80, max_depth=20)

# eclf2 = VotingClassifier(estimators=[
#          ('xgb', xg_reg), ('rf', rf_model)])

In [None]:
# eclf2 = eclf2.fit(X, y)

In [None]:
# preds = eclf2.predict(X_test)

# print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
# print('Precision Score : ' + str(precision_score(y_test,preds.round())))
# print('Recall Score : ' + str(recall_score(y_test,preds.round())))
# print('F1 Score : ' + str(f1_score(y_test,preds.round())))