# Se importan las bibliotecas necesarias

In [1]:
import pandas as pd
import re

# Se lee el .csv

In [2]:
train = pd.read_csv('csv/train.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [3]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


# Elaboración del modelo

Para este modelo se utilizará XGBoost para realizar las predicciones y se tendrán en cuenta los siguientes features extraídos del análisis exploratorio, los cuales fueron seleccionados por el grupo como aquellos que podrían resultan más interesantes:

- Palabras con mayor y menor porcentaje de veracidad.
- Pares de 2 palabras con mayor y menor porcentaje de veracidad.
- Tweets que contienen @, ¿?, ¡! tienden a ser falsos.
- Keywords con mayor y menor porcentaje de veracidad.
- Locaciones más y menos veraces.
- Longitud.

### Preparación del set de datos

#### Parte inicial

In [4]:
#Palabras con mayor y menor porcentaje de veracidad
min_repetitions = ((0.2/100)*len(train.index))
words = train['text'].str.split()
clean_words = []

for sentence in words:
    clean_sentence = []
    for word in sentence:
        clean_word = re.sub('[^A-Za-z0-9]+','', word)
        clean_sentence.append(clean_word.lower())
    clean_words.append(clean_sentence)


In [5]:
train_aux = train
train_aux['words'] = clean_words
train_aux = train_aux.explode('words')
train_aux.head()

Unnamed: 0,id,keyword,location,text,target,words
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,deeds
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,are
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,the
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,reason


In [6]:
train_aux = train_aux.groupby('words').agg({'target':['sum','count']})
train_aux = train_aux[train_aux['target']['count']>min_repetitions]
train_aux['veracity'] = train_aux['target']['sum']/train_aux['target']['count']
train_aux.head()

Unnamed: 0_level_0,target,target,veracity
Unnamed: 0_level_1,sum,count,Unnamed: 3_level_1
words,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
,904,2150,0.420465
5.0,14,18,0.777778
1.0,29,64,0.453125
10.0,9,23,0.391304
100.0,6,17,0.352941


In [7]:
train_aux.reset_index(inplace=True) 

In [8]:
#Pares de 2 palabras con mayor y menor porcentaje de veracidad.
train_aux2 = train
train_aux2['words'] = clean_words
words_pairs = []

for sentence in train_aux2['words']:
    pairs = []
    for i in range(len(sentence)-1):
        pairs.append(sentence[i] + ' ' + sentence[i+1])
    words_pairs.append(pairs)
    
train_aux2['words'] = words_pairs
train_aux2 = train_aux2.explode('words')
train_aux2.head()

Unnamed: 0,id,keyword,location,text,target,words
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deeds
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,deeds are
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,are the
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,the reason
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,reason of


In [9]:
train_aux2 = train_aux2.groupby('words').agg({'target':['sum','count']})
train_aux2 = train_aux2[train_aux2['target']['count']>min_repetitions]
train_aux2['veracity'] = train_aux2['target']['sum']/train_aux2['target']['count']

In [10]:
#las palabras que son '' la cagan
train_aux2.head(10)

Unnamed: 0_level_0,target,target,veracity
Unnamed: 0_level_1,sum,count,Unnamed: 3_level_1
words,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
,66,118,0.559322
abc,19,19,1.0
full,6,67,0.089552
i,6,19,0.315789
the,32,57,0.561404
11yearold boy,20,20,1.0
16yr old,28,28,1.0
40 families,26,26,1.0
70 years,30,31,0.967742
a blast,3,16,0.1875


In [11]:
train_aux2.reset_index(inplace=True)

In [12]:
# Keywords más y menos veraces
train_aux3 = train
train_aux3 = train_aux3.groupby('keyword').agg({'target':['sum','count']})
train_aux3 = train_aux3[train_aux3['target']['count']>min_repetitions]
train_aux3['veracity'] = train_aux3['target']['sum']/train_aux3['target']['count']
train_aux3.head()

Unnamed: 0_level_0,target,target,veracity
Unnamed: 0_level_1,sum,count,Unnamed: 3_level_1
keyword,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
ablaze,13,36,0.361111
accident,24,35,0.685714
aftershock,0,34,0.0
airplane%20accident,30,35,0.857143
ambulance,20,38,0.526316


In [17]:
train_aux3.reset_index(inplace=True)

Unnamed: 0_level_0,index,keyword,target,target,veracity
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,sum,count,Unnamed: 5_level_1
142,142,military,8,34,0.235294
143,143,mudslide,12,37,0.324324
144,144,natural%20disaster,18,34,0.529412
145,145,nuclear%20disaster,31,34,0.911765
146,146,nuclear%20reactor,14,36,0.388889
...,...,...,...,...,...
212,212,wounded,26,37,0.702703
213,213,wounds,10,33,0.303030
214,214,wreck,7,37,0.189189
215,215,wreckage,39,39,1.000000


In [18]:
# Locaciones más y menos veraces
train_aux4 = train
train_aux4 = train_aux4.groupby('location').agg({'target':['sum','count']})
train_aux4 = train_aux4[train_aux4['target']['count']>min_repetitions]
train_aux4['veracity'] = train_aux4['target']['sum']/train_aux4['target']['count']
train_aux4.head()

Unnamed: 0_level_0,target,target,veracity
Unnamed: 0_level_1,sum,count,Unnamed: 3_level_1
location,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Australia,9,18,0.5
California,7,17,0.411765
Canada,13,29,0.448276
"Chicago, IL",9,18,0.5
India,20,24,0.833333


In [19]:
train_aux4.reset_index(inplace=True)

#### Parte final

In [14]:
top_words = train_aux[train_aux['veracity']>0.9]['words']
worst_words = train_aux[train_aux['veracity']<0.1]['words']

In [15]:
top_words_pairs = train_aux2[train_aux2['veracity']>0.9]['words']
worst_words_pairs = train_aux2[train_aux2['veracity']<0.1]['words']

In [21]:
top_keywords = train_aux3[train_aux3['veracity']>0.9]['keyword']
worst_keywords = train_aux3[train_aux3['veracity']<0.1]['keyword']

In [20]:
top_locations = train_aux4[train_aux4['veracity']>0.9]['location']
worst_locations = train_aux4[train_aux4['veracity']<0.1]['location']

In [16]:
train['long(char)'] = train['text'].str.len()

aux = []
for i in words:
    aux.append(len(i))

train['long(word)'] = aux
train.head()

Unnamed: 0,id,keyword,location,text,target,words,long(char),long(word)
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,"[our deeds, deeds are, are the, the reason, re...",69,13
1,4,,,Forest fire near La Ronge Sask. Canada,1,"[forest fire, fire near, near la, la ronge, ro...",38,7
2,5,,,All residents asked to 'shelter in place' are ...,1,"[all residents, residents asked, asked to, to ...",133,22
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[13000 people, people receive, receive wildfir...",65,8
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[just got, got sent, sent this, this photo, ph...",88,16
