### En este notebook voy a intentar clasificar las ubicaciones entre validas o no

In [1]:
import pandas as pd
import re
import numpy as np
import pickle
import nltk
import xgboost as xgb
import matplotlib.pyplot as plt
import lightgbm as lgb
from catboost import CatBoostRegressor
from gensim.models import KeyedVectors
from gensim.parsing.preprocessing import STOPWORDS
%matplotlib inline

#nltk.download('punkt')


In [2]:
train = pd.read_csv('csv/train.csv')
test  = pd.read_csv('csv/test.csv')

In [3]:
train['location'].isnull().mean() # tenemos un 30% de locations que son nul

0.33272034677525286

In [4]:
train['loc_nula'] = train.apply(lambda x: ( 1 if pd.isna(x['location']) else 0  ), axis = 1 )
test['loc_nula'] = test.apply(lambda x: ( 1 if pd.isna(x['location']) else 0  ), axis = 1 )

In [5]:
test.sample(5)

Unnamed: 0,id,keyword,location,text,loc_nula
2507,8354,ruin,,They like raunchy trash then make it real and ...,1
2696,8988,storm,Oman,Man united is not just about playing football ...,0
1578,5329,fire,OUTERSPACE,I just got electrocuted by my #HP Chromebook c...,0
3035,10026,twister,,Playing twister with a broken arm wasn't the s...,1
410,1329,blown%20up,IG: xbougiebri,Luckily I'm up cause if these Mexicans had cam...,0


In [6]:
### Esto ya lo habiamos hecho, ahora voy a agregar un nuevo 
### feature donde diga si la location es valida o no (si se encuentra en un txt con ubicaciones)

In [7]:
#traigo el csv de ubicaciones
ubicaciones = pd.read_csv('csv/worldcities.csv')

In [8]:
ubicaciones.head()

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id
0,Tokyo,Tokyo,35.685,139.7514,Japan,JP,JPN,Tōkyō,primary,35676000.0,1392685764
1,New York,New York,40.6943,-73.9249,United States,US,USA,New York,,19354922.0,1840034016
2,Mexico City,Mexico City,19.4424,-99.131,Mexico,MX,MEX,Ciudad de México,primary,19028000.0,1484247881
3,Mumbai,Mumbai,19.017,72.857,India,IN,IND,Mahārāshtra,admin,18978000.0,1356226629
4,São Paulo,Sao Paulo,-23.5587,-46.625,Brazil,BR,BRA,São Paulo,admin,18845000.0,1076532519


In [9]:
# quiero una lista que contenga todas las ciudades con todos los nombres posibles.

In [10]:
ubicaciones_serie = pd.Series(ubicaciones['city_ascii']  )
ubicaciones_serie = ubicaciones_serie.append(ubicaciones['country'], ignore_index=True )
ubicaciones_serie  = ubicaciones_serie.append(ubicaciones['iso2'], ignore_index=True  )
ubicaciones_serie = ubicaciones_serie.append(ubicaciones['iso3'], ignore_index=True )

In [11]:
ubicaciones_serie

0              Tokyo
1           New York
2        Mexico City
3             Mumbai
4          Sao Paulo
            ...     
61967            GRL
61968            UKR
61969            RUS
61970            RUS
61971            CAN
Length: 61972, dtype: object

In [12]:
ubicaciones_serie[3]

'Mumbai'

In [13]:
ubicaciones_serie = ubicaciones_serie.drop_duplicates()

In [14]:
ubicaciones_serie

0              Tokyo
1           New York
2        Mexico City
3             Mumbai
4          Sao Paulo
            ...     
60228            TCA
60712            CYM
61404            COK
61626            FLK
61913            SGS
Length: 14128, dtype: object

In [15]:
ubicaciones_serie = ubicaciones_serie.apply(lambda x: str(x).replace(" ", "") )

In [16]:
ubicaciones_serie = ubicaciones_serie.apply(lambda x: str(x).lower() )

In [17]:
ubicaciones_serie

0             tokyo
1           newyork
2        mexicocity
3            mumbai
4          saopaulo
            ...    
60228           tca
60712           cym
61404           cok
61626           flk
61913           sgs
Length: 14128, dtype: object

In [18]:
### ahora tengo que separar las ciudades de mi csv de train.
ubicaciones_train = train['location'].copy()

In [19]:
ubicaciones_train = ubicaciones_train.fillna("invalida") #esto va a servir para filtrar.

In [20]:
ubicaciones_train = ubicaciones_train.apply(lambda x: x.lower() )

In [21]:
print(ubicaciones_train[6663], ";", ubicaciones_train[999] )

ohio, usa ; former yugoslav republic of macedonia


In [22]:
ubicaciones_train = ubicaciones_train.apply(lambda x: re.sub('[^A-Za-z0-9]+', '_', x) )

In [23]:
print(ubicaciones_train[6663], ";", ubicaciones_train[999] )

ohio_usa ; former_yugoslav_republic_of_macedonia


In [24]:
### Spliteo las ubicaciones del train, en espacios.
ubicaciones_train  = ubicaciones_train.apply(lambda x: x.split('_') )

In [25]:
print(ubicaciones_train[6663], ";", ubicaciones_train[999] )

['ohio', 'usa'] ; ['former', 'yugoslav', 'republic', 'of', 'macedonia']


In [26]:
ubicaciones_train.hasnans

False

In [27]:
print(ubicaciones_train[4771])

['reddit', '']


In [28]:
###cargo el w2v en memoria, voy a agregar un feature que sea la distancia cos en w2v
### a la ubicacion que mas se le parece.

In [29]:
# #descargar de aca https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz, pesa 1.5gb
EMBEDDING_FILE = 'GoogleNews-vectors-negative300.bin.gz'
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

In [30]:
def cos(x1, x2):
  return np.dot(x1, x2)/(np.linalg.norm(x1)*np.linalg.norm(x2))

In [31]:
def calcular_dist (ingresadas, validas, diccionario):
    i = 0
    maximo = 0.0
    distancia = 0.0
    palabra = 'nula'
    encontrada = 'nula'
    vocabulario = diccionario.vocab
    
    for buscada in ingresadas :
        
        if (buscada != '' and buscada != 'invalida' and buscada in vocabulario) :
            for ubicacion in validas :
                i+=1
                if(ubicacion in vocabulario):
                    distancia = cos( diccionario[buscada], diccionario[ubicacion] )
                    if distancia > maximo : 
                        maximo = distancia
                        palabra = buscada
                        encontrada = ubicacion
    print (i)
    return (maximo, buscada, encontrada)

In [32]:
distancia_minima = []
for ubicacion in ubicaciones_train :
    distancia = calcular_dist(ubicacion, ubicaciones_serie, word2vec)
    distancia_minima.append(distancia)


0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
14128
42384
14128
28256
28256
0
28256
0
14128
42384
0
28256
0
0
0
42384
0
42384
14128
42384
14128
0
14128
14128
14128
28256
42384
0
56512
0
0
56512
28256
28256
14128
28256
0
28256
42384
14128
42384
42384
14128
28256
0
14128
42384
28256
14128
0
56512
0
28256
28256
0
56512
0
28256
28256
0
56512
0
14128
28256
14128
42384
14128
14128
0
14128
28256
0
0
14128
0
14128
0
0
0
0
56512
0
14128
14128
14128
0
28256
14128
14128
14128
42384
0
0
28256
0
0
14128
0
28256
0
0
0
0
0
0
0
14128
56512
42384
0
56512
28256
42384
42384
14128
42384
0
14128
56512
42384
42384
0
28256
28256
56512
0
28256
14128
0
0
0
14128
0
0
14128
42384
42384
14128
14128
14128
0
14128
42384
0
42384
28256
0
70640
28256
14128
14128
0
14128
28256
0
70640
0
14128
42384
0
14128
70640
70640
14128
84768
0
14128
0
14128
56512
14128
0
14128
14128
0
0
14128
28256
28256
14128
42384
0
0
0
0
56512
42384
14128
0
0
42384
0
14128
0
0
56512
14128
14128
0
14128
0
28256
14128
0
56512
0
28

28256
0
0
14128
14128
0
14128
14128
0
28256
0
70640
0
0
14128
28256
28256
0
14128
0
0
0
42384
28256
0
0
14128
0
42384
14128
0
14128
0
0
28256
56512
14128
42384
0
0
14128
28256
0
14128
0
14128
0
14128
14128
28256
28256
0
0
0
0
0
14128
56512
0
0
28256
0
0
28256
14128
0
0
14128
0
14128
28256
0
14128
42384
42384
14128
14128
14128
0
0
0
14128
56512
28256
42384
0
28256
0
0
14128
42384
28256
0
42384
42384
28256
28256
0
0
14128
0
28256
28256
42384
28256
0
28256
14128
0
0
14128
28256
14128
28256
42384
28256
0
0
28256
14128
0
28256
28256
28256
14128
42384
42384
0
0
0
0
14128
28256
14128
42384
28256
14128
28256
42384
14128
28256
28256
14128
0
0
0
0
0
14128
0
14128
0
0
42384
42384
0
42384
28256
28256
28256
70640
0
0
0
28256
0
28256
14128
70640
42384
14128
14128
14128
14128
42384
0
42384
0
0
56512
42384
70640
0
0
0
28256
0
14128
0
0
14128
28256
0
42384
14128
0
0
14128
0
14128
42384
14128
14128
0
28256
14128
42384
28256
0
28256
14128
42384
28256
28256
42384
14128
56512
28256
28256
0
0
0
56512
42384


28256
0
28256
28256
56512
0
28256
14128
28256
56512
0
14128
14128
0
14128
42384
0
28256
0
28256
42384
28256
0
0
0
42384
28256
0
28256
14128
0
0
0
28256
14128
0
14128
14128
28256
28256
14128
14128
0
0
0
28256
0
14128
84768
0
0
0
28256
0
14128
28256
14128
28256
0
0
28256
28256
28256
0
14128
0
14128
42384
0
0
14128
0
28256
42384
14128
0
28256
14128
14128
14128
14128
42384
28256
42384
14128
0
42384
0
0
70640
0
0
0
28256
0
0
0
14128
14128
42384
42384
0
0
42384
0
42384
28256
0
0
14128
70640
14128
0
28256
28256
42384
28256
28256
42384
0
42384
0
28256
42384
28256
28256
0
0
28256
42384
42384
28256
0
0
0
0
0
42384
56512
0
70640
28256
42384
70640
0
28256
0
42384
70640
0
14128
0
0
0
0
0
0
28256
14128
42384
14128
0
42384
42384
0
42384
42384
28256
0
42384
0
42384
28256
14128
28256
14128
0
28256
0
42384
0
0
28256
56512
42384
0
28256
28256
28256
0
28256
28256
0
0
28256
0
0
14128
28256
28256
56512
0
0
28256
0
56512
0
0
14128
14128
28256
28256
42384
14128
0
0
28256
28256
56512
42384
14128
28256
14128
0


42384
14128
56512
42384
0
28256
0
0
42384
14128
28256
56512
0
0
14128
42384
70640
0
0
0
0
28256
14128
28256
28256
14128
14128
0
28256
0
14128
14128
0
0
14128
28256
0
0
0
14128
0
28256
0
14128
0
0
14128
0
0
28256
0
0
14128
28256
14128
14128
42384
0
0
14128
56512
0
28256
0
0
42384
14128
28256
14128
14128
28256
0
42384
0
42384
42384
14128
14128
28256
0
0
0
42384
14128
28256
28256
28256
28256
14128
28256
56512
14128
70640
42384
14128
42384
28256
0
14128
0
0
14128
28256
0
0
0
0
14128
0
0
0
0
0
14128
0
0
0
28256
0
0
0
42384
28256
28256
0
0
0
0
0
0
28256
0
0
28256
0
28256
28256
0
28256
14128
70640
0
0
0
14128
0
0
0
0
0
0
14128
28256
0
0
0
0
28256
0
14128
28256
42384
0
0
42384
0
56512
0
28256
28256
14128
42384
0
28256
28256
56512
42384
0
0
28256
42384
0
0
14128
28256
14128
28256
14128
0
0
14128
70640
28256
0
14128
14128
28256
14128
28256
14128
14128
28256
14128
0
42384
0
0
0
28256
42384
0
42384
28256
14128
56512
14128
0
0
42384
14128
0
0
28256
28256
14128
42384
28256
14128
0
0
0
0
0
14128
0
14

14128
14128
14128
0
14128
56512
0
0
28256
0
28256
0
28256
28256
0
14128
28256
0
28256
28256
0
28256
28256
42384
28256
28256
0
70640
0
0
14128
0
0
56512
0
14128
42384
42384
14128
84768
28256
0
0
28256
28256
28256
28256
42384
28256
14128
28256
42384
28256
28256
28256
28256
0
14128
28256
0
0
0
28256
0
14128
0
42384
0
28256
14128
0
0
28256
42384
14128
0
56512
0
42384
14128
28256
28256
14128
0
14128
42384
14128
0
14128
28256
42384
14128
0
28256
56512
0
28256
28256
0
28256
28256
42384
70640
28256
56512
42384
28256
42384
42384
42384
0
14128
42384
0
0
28256
0
14128
42384
28256
42384
28256
28256
28256
0
0
28256
98896
14128
28256
0
0
28256
28256
28256
0
28256
14128
14128
42384
28256
14128
0
0
14128
0
28256
28256
0
14128
14128
0
14128
0
14128
14128
0
0
0
14128
0
28256
14128
14128
14128
14128
0
28256
0
0
42384
14128
14128
14128
0
56512
14128
14128
14128
0
14128
14128
0
14128
14128
42384
42384
14128
28256
56512
0
28256
14128
0
0
0
42384
0
0
28256
70640
0
42384
0
0
14128
0
0
0
0
28256
42384
0
28256


In [33]:
len (distancia_minima)

7613

In [34]:
ubicaciones_train

0       [invalida]
1       [invalida]
2       [invalida]
3       [invalida]
4       [invalida]
           ...    
7608    [invalida]
7609    [invalida]
7610    [invalida]
7611    [invalida]
7612    [invalida]
Name: location, Length: 7613, dtype: object

In [35]:
len(train)

7613

In [36]:
len(ubicaciones_train)

7613

In [37]:
print(- (len(ubicaciones_train) - len (distancia_minima) ) )

0


In [38]:
train ['cos loc'] = distancia_minima

In [41]:
train.sample(50)

Unnamed: 0,id,keyword,location,text,target,loc_nula,cos loc
1746,2513,collision,"Los Angeles, CA",Santa Fe Springs Studebaker Rd / South St **Tr...,0,0,"(1.0000001, ca, ca)"
6421,9181,suicide%20bomber,,@bbclaurak Why is no one talking about the ris...,1,1,"(0.0, invalida, nula)"
4357,6191,hijacker,"Louisville, KY",Remove the http://t.co/VbqmZ5aPwj and Linkury ...,1,0,"(1.0000001, ky, louisville)"
5532,7891,quarantined,"Poplar, London",can't DL a patch to fix the error in symantec ...,0,0,"(1.0000001, london, london)"
5673,8096,rescued,Nigeria,4 kidnapped ladies rescued by police in Enugu ...,1,0,"(1.0, nigeria, nigeria)"
2891,4154,drown,East Coast,We all carry these things inside that no one e...,0,0,"(0.81980747, coast, southeast)"
2304,3304,demolished,NJ,Uribe demolished that ball ??????,0,0,"(0.61655015, nj, delaware)"
2319,3335,demolished,,Take this China get demolished and sent back t...,0,1,"(0.0, invalida, nula)"
4343,6167,hijack,"Near Richmond, VA",Another Mac vuln!\n\nhttps://t.co/OxXRnaB8Un,0,0,"(0.99999994, va, richmond)"
3354,4800,evacuated,"Hensley Street, Portland",KATUNews: #SR14 remains closed as brush fire b...,1,0,"(0.9999999, portland, portland)"


In [42]:
train.to_csv("train_cos.csv")