# PRÁCTICA 2 - UFO TABLE

In [1]:
# Data wrangling
import json as js
import numpy as np
import pandas as pd
from sqlalchemy import create_engine

from funpymodeling.exploratory import freq_tbl, status, profiling_num, cat_vars, num_vars
from pandas_profiling import ProfileReport
import reverse_geocoder as rg

import warnings
warnings.filterwarnings("ignore")

# Data viz
import cufflinks as cf
from sklearn import set_config

# Preprocesamiento
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif

# Modelado
from sklearn.svm import SVC
from xgboost.sklearn import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin

# Enviroment setup
cf.go_offline()
set_config(display='diagram')
pd.set_option('display.max_columns', 50)
pd.set_option('display.float_format', lambda x: '%.4f' % x)

## Funciones

In [2]:
def na_cols_freq(df):
    features_num = df.columns
    na_columns = df.isna().sum(axis=0)
    na_columns_num = na_columns[features_num] / df.shape[0]
    print(na_columns_num[na_columns_num > 0], '\n')
    
class TransformDate(TransformerMixin):
    def __init__(self,col):
        self.col = col

    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        
        X_aux = X.copy()
        
        X_aux[self.col] = pd.to_datetime(X_aux[self.col].map(lambda x: str(x).split('+')[0]))
        X_aux["C_hour_"+ self.col ] = X_aux.loc[:,self.col].dt.hour
        X_aux["C_min_"+ self.col ] = X_aux.loc[:,self.col].dt.minute
        X_aux["C_sec_"+ self.col ] = X_aux.loc[:,self.col].dt.second
        X_aux["C_DAY_" + self.col ] = X_aux.loc[:,self.col].dt.day
        X_aux["C_WEEK_" + self.col] = X_aux.loc[:,self.col].dt.week
        X_aux["C_WEEKDAY_" + self.col] = X_aux.loc[:,self.col].dt.weekday
        X_aux["C_MONTHDAY_" + self.col] = X_aux.loc[:,self.col].dt.daysinmonth
        X_aux["C_MONTH_" + self.col] = X_aux.loc[:,self.col].dt.month
        X_aux["C_YEAR_" + self.col] = X_aux.loc[:,self.col].dt.year ##----> no creo nos sea útil
        X_aux["C_DAY_YEAR_" + self.col] = X_aux.loc[:,self.col].dt.dayofyear
        X_aux["C_QUARTER_" + self.col] = X_aux.loc[:,self.col].dt.quarter
        X_aux["C_WEEKEND_" + self.col] = np.where(X_aux["C_WEEKDAY_" + self.col] > 4 ,1,0)
        X_aux["C_SEMESTER_" + self.col] = np.where(X_aux["C_MONTH_" + self.col] > 6 ,1,0)
        
        X_aux = X_aux.drop(columns= self.col)
        
        return X_aux

## Lectura de datos

In [3]:
cols = ['datetime', 'city', 'state', 'country', 'shape', 'duration (seconds)',
       'duration (hours/min)', 'comments', 'date posted', 'latitude',
       'longitude', 'missed']

df = pd.read_csv("ufo_data.csv", error_bad_lines=False , names=cols)
df.drop([0], inplace=True)
df.shape

(88875, 12)

## Estado de salud de los datos

In [4]:
status(df)

Unnamed: 0,variable,q_nan,p_nan,q_zeros,p_zeros,unique,type
0,datetime,0,0.0,0,0.0,76305,object
1,city,196,0.0022,0,0.0,22018,object
2,state,7519,0.0846,0,0.0,68,object
3,country,12561,0.1413,0,0.0,5,object
4,shape,3118,0.0351,0,0.0,29,object
5,duration (seconds),2,0.0,1678,0.0189,732,object
6,duration (hours/min),3103,0.0349,0,0.0,9806,object
7,comments,126,0.0014,0,0.0,88348,object
8,date posted,0,0.0,0,0.0,513,object
9,latitude,0,0.0,0,0.0,19572,object


## Limpieza

In [5]:
comple=pd.DataFrame(df.isnull().sum())
comple.reset_index(inplace=True)

comple=comple.rename(columns={"index":"columna",0:"total"})
comple["completitud"]=(1-comple["total"]/df.shape[0])
comple=comple.sort_values(by="completitud",ascending=True)
comple.reset_index(drop=True,inplace=True)
comple

Unnamed: 0,columna,total,completitud
0,missed,88679,0.0022
1,country,12561,0.8587
2,state,7519,0.9154
3,shape,3118,0.9649
4,duration (hours/min),3103,0.9651
5,city,196,0.9978
6,comments,126,0.9986
7,duration (seconds),2,1.0
8,datetime,0,1.0
9,date posted,0,1.0


In [6]:
df["missed"].value_counts

<bound method IndexOpsMixin.value_counts of 1       NaN
2       NaN
3       NaN
4       NaN
5       NaN
         ..
88871   NaN
88872   NaN
88873   NaN
88874   NaN
88875   NaN
Name: missed, Length: 88875, dtype: float64>

In [7]:
uncorrect = df.loc[df['missed'] == 0]
print('Uncorrect rows:', uncorrect.shape[0], '\n')
print('Nan elements missed:\n')
na_cols_freq(uncorrect)
uncorrect.head()

Uncorrect rows: 196 

Nan elements missed:

city                   1.0000
state                  0.5612
country                1.0000
shape                  1.0000
duration (hours/min)   0.4286
comments               0.4643
dtype: float64 



Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude,missed
877,10/1/2006 12:00,,,,,0,,,((EDITORIAL COMMENT ABOUT THE UFO PHENOMEN)) ...,10/30/2006,0,0.0
1712,10/14/2004 13:00,,,,,0,,,With all the guns in this country...why hasn&#...,10/27/2004,0,0.0
1814,10/14/2011 22:30,,nv,,,0,light,22,3 Green lights,10/19/2011,0,0.0
2857,10/17/2008 20:30,,tx,,,0,oval,5 minutes,An oval shaped object in a photograph. ((NUF...,10/31/2008,0,0.0
3733,10/20/2013 18:30,,ct,,,0,egg,2 hours,Bright light visible in W sky for long periods...,11/11/2013,0,0.0


In [8]:
df = df.loc[df['missed'].isna()]
df.drop(['missed'], axis=1, inplace=True)
df.shape

(88679, 11)

## Conversión de tipo de datos

In [9]:
# verifico si efectivamente hay registros con la hora cargada como '24:00'
df[df['datetime'].str.contains("24:00")]

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
167,10/10/2005 24:00,franklin,in,us,disk,0,?,two yellow objects sitting over the corn field...,10/20/2005,39.4805556,-86.0550000
317,10/11/1994 24:00,hot springs and custer,sd,,triangle,0,,October 11&#44 1994--Hot Springs/Custer--trian...,9/2/2005,43.431646,-103.474362
418,10/11/2006 24:00,rome,ny,us,oval,120,a min or two,I was walking from the garage to the house&#44...,2/1/2007,43.2127778,-75.4561111
488,10/11/2012 24:00,truth or consequences,nm,us,unknown,0,,Fast moving red ball and then flashing light,10/30/2012,33.1283333,-107.2522222
568,10/1/1972 24:00,sweet home,or,us,unknown,0,ufo,Ligthning coming out of a cloud hovering&#44 w...,1/10/2009,44.3977778,-122.7350000
...,...,...,...,...,...,...,...,...,...,...,...
88274,9/7/2007 24:00,new york city (bronx),ny,us,unknown,0.0,?,Help. 500 Lights On Object0: Yes,10/8/2007,40.7141667,-74.0064
88313,9/7/2010 24:00,olin,nc,us,,0.0,,Related UUO/UFO activity vicinity of NC Coast ...,11/21/2010,35.9525000,-80.8400
88341,9/7/2012 24:00,big bear city,ca,us,circle,0.0,,It was bright wright&#44and stopped then moved...,9/24/2012,34.2611111,-116.8442
88449,9/8/2003 24:00,cedar rapids,ia,us,changing,900.0,15 min,WE SEEN IT,9/17/2003,42.0083333,-91.6439


In [10]:
# donde encuentra un '24:00' le asigna '00:00'
df['datetime_dat'] = df['datetime'].replace({'24:00': '00:00'}, regex=True)

In [11]:
df.head()

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude,datetime_dat
1,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,4/27/2004,29.8830556,-97.9411111,10/10/1949 20:30
2,10/10/1949 21:00,lackland afb,tx,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,12/16/2005,29.38421,-98.581082,10/10/1949 21:00
3,10/10/1955 17:00,chester (uk/england),,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,1/21/2008,53.2,-2.916667,10/10/1955 17:00
4,10/10/1956 21:00,edna,tx,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.9783333,-96.6458333,10/10/1956 21:00
5,10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.4180556,-157.8036111,10/10/1960 20:00


In [12]:
# Probemos ahora la conversión al tipo de datos datetime
df['datetime_dat']=df['datetime_dat'].astype('datetime64')

In [13]:
## registros errorneos en latitud
df[df['latitude'].str.contains('[A-Za-z]', na=False)] 

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude,datetime_dat
48575,5/22/1974 05:30,mescalero indian reservation,nm,,rectangle,180,two hours,Huge rectangular object emmitting intense whit...,4/18/2012,33q.200088,-105.624152,1974-05-22 05:30:00


In [14]:
df['latitude']=df["latitude"].str.replace("33q.200088", "33.200088")

In [15]:
# probemos otra vez la conversión de object a numerico
df['latitude_num']=df['latitude'].astype('float64')

In [16]:
# probemos otra vez la conversión de object a numerico
df['longitude_num']=df['longitude'].astype('float64')

In [17]:
df = df.reset_index(drop=True)

In [18]:
df[df['duration (seconds)'].str.contains("\W",na=False)]

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude,datetime_dat,latitude_num,longitude_num
302,10/10/2013 21:00,kingsville (canada),on,ca,,0.0,on going,Extremely fast multiple colored lights&#44 con...,10/14/2013,42.033333,-82.75,2013-10-10 21:00:00,42.0333,-82.7500
716,10/1/1998 22:30,moundsville,wv,us,triangle,0.0,continous,5 large objects observed&#44 4 of them flying ...,8/5/2001,39.9202778,-80.7433333,1998-10-01 22:30:00,39.9203,-80.7433
1509,10/13/2010 19:20,new york city,ny,us,light,0.0,on going,Multiple UFO sighting in NYC. ((NUFORC Note: ...,11/21/2010,40.7141667,-74.0063889,2010-10-13 19:20:00,40.7142,-74.0064
1747,10/14/2007 21:25,surprise,az,us,other,1.5,1.5 seconds,Streak in sky changes direction,11/28/2007,33.6305556,-112.3325000,2007-10-14 21:25:00,33.6306,-112.3325
1801,10/14/2011 18:30,strawberry,az,us,flash,1.5,1.5 seconds,White flash appears in Geosynchronous orbit la...,12/12/2011,34.4077778,-111.4927778,2011-10-14 18:30:00,34.4078,-111.4928
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63560,7/16/2000 00:34,portland,or,us,light,1.5,1.5sec.,Bright Green&#44 Orange&#44 and White lights,7/23/2000,45.5236111,-122.6750000,2000-07-16 00:34:00,45.5236,-122.6750
63877,7/17/2006 00:00,kingston,ny,us,light,0.5,.5 seconds,Bright light moving quickly before blinking ou...,10/30/2006,41.9269444,-73.9977778,2006-07-17 00:00:00,41.9269,-73.9978
63988,7/17/2012 05:11,stockton,ca,us,light,0.0,on going,Bright light with large light aura. NE of my l...,8/5/2012,37.9577778,-121.2897222,2012-07-17 05:11:00,37.9578,-121.2897
64390,7/19/2005 22:00,tempesque bridge (west of palma approach to) (...,,,other,0.38,.38 second,streak of light/object moved across the sky at...,9/2/2005,9.920987,-84.098549,2005-07-19 22:00:00,9.9210,-84.0985


In [19]:
df[df['duration (seconds)'].str.contains("`",na=False)]

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude,datetime_dat,latitude_num,longitude_num
30821,2/2/2000 19:33,bouse,az,us,,2`,each a few seconds,Driving through Plomosa Pass towards Bouse Loo...,2/16/2000,33.9325,-114.005,2000-02-02 19:33:00,33.9325,-114.005
39519,4/10/2005 22:52,santa cruz,ca,us,,8`,eight seconds,2 red lights moving together and apart with a ...,4/16/2005,36.9741667,-122.0297222,2005-04-10 22:52:00,36.9742,-122.0297
64975,7/21/2006 13:00,ibague (colombia),,,circle,0.5`,1/2 segundo,Viajaba a 27.000 pies en un avion comercial ve...,10/30/2006,4.440663,-75.244141,2006-07-21 13:00:00,4.4407,-75.2441


In [20]:
df.loc[30821,'duration (seconds)'] = 2
df.loc[39519,'duration (seconds)'] = 8
df.loc[64975,'duration (seconds)'] = 0.5

In [21]:
df[(df["duration (seconds)"] == "0")]

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude,datetime_dat,latitude_num,longitude_num
65,10/10/1995 17:05,puerto rico (puerto rico),pr,,,0,,Woman calls regarding fiance who is having pec...,11/2/1999,18.220833,-66.590149,1995-10-10 17:05:00,18.2208,-66.5901
94,10/10/1999 21:00,ashland (at our home),mo,us,light,0,two seperate times&#44 10 se,We saw on our local news that in Jefferson Cit...,12/16/1999,38.7744444,-92.2569444,1999-10-10 21:00:00,38.7744,-92.2569
126,10/10/2002 22:00,bahamas,,,egg,0,,we are sitting outside and commenting on all t...,10/15/2002,25.03428,-77.39628,2002-10-10 22:00:00,25.0343,-77.3963
127,10/10/2002 22:00,burnie (tasmania) (australia),,au,cross,0,12,the craft was large and noisy,7/13/2005,-41.05584,145.903748,2002-10-10 22:00:00,-41.0558,145.9037
157,10/10/2005 11:30,edgewater,fl,us,,0,300,orange ball in sky video taped over edgewater ...,10/11/2005,28.9886111,-80.9025000,2005-10-10 11:30:00,28.9886,-80.9025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65284,7/2/2011 04:30,central point,or,us,flash,0,,3-4 crafts that were flighing very fast it alm...,7/4/2011,42.3761111,-122.9152778,2011-07-02 04:30:00,42.3761,-122.9153
65323,7/2/2012 22:00,deadwood,sd,us,,0,unknown,Three red lights in sky found in photos,8/19/2012,44.3766667,-103.7291667,2012-07-02 22:00:00,44.3767,-103.7292
65341,7/2/2013 21:46,buckeye,az,us,light,0,going on now,Orange/amber orbs.,7/3/2013,33.3702778,-112.5830556,2013-07-02 21:46:00,33.3703,-112.5831
65351,7/22/1970 03:00,grand blanc,mi,us,,0,,Brother not joking of his encounter. He is a h...,2/14/2008,42.9275000,-83.6300000,1970-07-22 03:00:00,42.9275,-83.6300


In [22]:
df['duration (seconds)'].fillna(0, inplace=True)
df[df['duration (seconds)'].isna()]

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude,datetime_dat,latitude_num,longitude_num


In [23]:
df['duration (seconds)'] = df['duration (seconds)'].map(lambda x: int(float(x)))

In [24]:
df['duration(seconds)_num'] = df['duration (seconds)'].astype('float64')

In [25]:
df['datePosted'] = df['date posted'].astype('datetime64')

## Tratamiento de Nulos

In [26]:
status(df)

Unnamed: 0,variable,q_nan,p_nan,q_zeros,p_zeros,unique,type
0,datetime,0,0.0,0,0.0,76159,object
1,city,0,0.0,0,0.0,22018,object
2,state,7409,0.0835,0,0.0,68,object
3,country,12365,0.1394,0,0.0,5,object
4,shape,2922,0.033,0,0.0,29,object
5,duration (seconds),0,0.0,7122,0.0803,489,int64
6,duration (hours/min),3019,0.034,0,0.0,9791,object
7,comments,35,0.0004,0,0.0,88283,object
8,date posted,0,0.0,0,0.0,317,object
9,latitude,0,0.0,0,0.0,19469,object


**La variable country contiene muchisimos valores nulos al igual que duration (seconds), country la obtenemos a través de latitutud y longitud.**

In [27]:
coordinates = list(zip(df['latitude_num'], df['longitude']))

In [28]:
# le paso al geocoding mis coordenadas para que me devuelva países en una lista
results = rg.search(coordinates)

Loading formatted geocoded file...


In [29]:
results

[{'lat': '29.88327',
  'lon': '-97.94139',
  'name': 'San Marcos',
  'admin1': 'Texas',
  'admin2': 'Hays County',
  'cc': 'US'},
 {'lat': '29.38663',
  'lon': '-98.61797',
  'name': 'Lackland Air Force Base',
  'admin1': 'Texas',
  'admin2': 'Bexar County',
  'cc': 'US'},
 {'lat': '53.20832',
  'lon': '-2.9253',
  'name': 'Blacon',
  'admin1': 'England',
  'admin2': 'Cheshire West and Chester',
  'cc': 'GB'},
 {'lat': '28.97859',
  'lon': '-96.64609',
  'name': 'Edna',
  'admin1': 'Texas',
  'admin2': 'Jackson County',
  'cc': 'US'},
 {'lat': '21.40929',
  'lon': '-157.80092',
  'name': "Kane'ohe",
  'admin1': 'Hawaii',
  'admin2': 'Honolulu County',
  'cc': 'US'},
 {'lat': '36.59511',
  'lon': '-82.18874',
  'name': 'Bristol',
  'admin1': 'Tennessee',
  'admin2': 'Sullivan County',
  'cc': 'US'},
 {'lat': '51.4386',
  'lon': '-3.17342',
  'name': 'Penarth',
  'admin1': 'Wales',
  'admin2': 'Vale of Glamorgan',
  'cc': 'GB'},
 {'lat': '41.1176',
  'lon': '-73.4079',
  'name': 'Norwalk

In [30]:
results_df = pd.DataFrame(results)
print(results_df.shape)
results_df.head()

(88679, 6)


Unnamed: 0,lat,lon,name,admin1,admin2,cc
0,29.88327,-97.94139,San Marcos,Texas,Hays County,US
1,29.38663,-98.61797,Lackland Air Force Base,Texas,Bexar County,US
2,53.20832,-2.9253,Blacon,England,Cheshire West and Chester,GB
3,28.97859,-96.64609,Edna,Texas,Jackson County,US
4,21.40929,-157.80092,Kane'ohe,Hawaii,Honolulu County,US


In [31]:
df['country_c']=results_df['cc']
df['city_c']=results_df['name']
df['state_c'] = results_df['admin1']
df["lat_c"] = results_df['lat']
df["lon_c"] = results_df['lon']
df.head()

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude,datetime_dat,latitude_num,longitude_num,duration(seconds)_num,datePosted,country_c,city_c,state_c,lat_c,lon_c
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,4/27/2004,29.8830556,-97.9411111,1949-10-10 20:30:00,29.8831,-97.9411,2700.0,2004-04-27,US,San Marcos,Texas,29.88327,-97.94139
1,10/10/1949 21:00,lackland afb,tx,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,12/16/2005,29.38421,-98.581082,1949-10-10 21:00:00,29.3842,-98.5811,7200.0,2005-12-16,US,Lackland Air Force Base,Texas,29.38663,-98.61797
2,10/10/1955 17:00,chester (uk/england),,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,1/21/2008,53.2,-2.916667,1955-10-10 17:00:00,53.2,-2.9167,20.0,2008-01-21,GB,Blacon,England,53.20832,-2.9253
3,10/10/1956 21:00,edna,tx,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.9783333,-96.6458333,1956-10-10 21:00:00,28.9783,-96.6458,20.0,2004-01-17,US,Edna,Texas,28.97859,-96.64609
4,10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.4180556,-157.8036111,1960-10-10 20:00:00,21.4181,-157.8036,900.0,2004-01-22,US,Kane'ohe,Hawaii,21.40929,-157.80092


In [32]:
df['shape'] = df['shape'].fillna("No data")
df['comments'] = df['comments'].fillna("No data")

**Creamos la columna de duración en minutos:**

In [33]:
df['duration_minutes'] = df['duration(seconds)_num'] /60

In [34]:
to_drop = ["datetime","latitude","longitude","duration (seconds)", "date posted",
           "duration(seconds)_num","duration (hours/min)",'country','state','city',"latitude_num","longitude_num"]
df = df.drop(to_drop, axis=1)

In [35]:
df = df.rename(columns={"datetime_dat": "datetime",
                             'country_c':'country',
                             'state_c':'state',
                             'city_c': 'city',
                             'lat_c':'latitud',
                             'lon_c':'longitud',  
                             'longitude ': 'longitude'})
df.head()

Unnamed: 0,shape,comments,datetime,datePosted,country,city,state,latitud,longitud,duration_minutes
0,cylinder,This event took place in early fall around 194...,1949-10-10 20:30:00,2004-04-27,US,San Marcos,Texas,29.88327,-97.94139,45.0
1,light,1949 Lackland AFB&#44 TX. Lights racing acros...,1949-10-10 21:00:00,2005-12-16,US,Lackland Air Force Base,Texas,29.38663,-98.61797,120.0
2,circle,Green/Orange circular disc over Chester&#44 En...,1955-10-10 17:00:00,2008-01-21,GB,Blacon,England,53.20832,-2.9253,0.3333
3,circle,My older brother and twin sister were leaving ...,1956-10-10 21:00:00,2004-01-17,US,Edna,Texas,28.97859,-96.64609,0.3333
4,light,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1960-10-10 20:00:00,2004-01-22,US,Kane'ohe,Hawaii,21.40929,-157.80092,15.0


In [36]:
countries = pd.read_csv("https://raw.githubusercontent.com/umpirsky/country-list/master/data/en_US/country.csv")

# countries = pd.read_csv("https://gist.githubusercontent.com/marijn/396531/raw/188caa065e3cd319fed7913ee3eecf5eec541918/countries.txt"
#                         ,sep="|" , names=["1","2"])
countries

Unnamed: 0,id,value
0,AF,Afghanistan
1,AX,Åland Islands
2,AL,Albania
3,DZ,Algeria
4,AS,American Samoa
...,...,...
244,WF,Wallis & Futuna
245,EH,Western Sahara
246,YE,Yemen
247,ZM,Zambia


In [37]:
df = df.merge(countries , left_on= "country" , right_on= "id", how = "left")
df.head()

Unnamed: 0,shape,comments,datetime,datePosted,country,city,state,latitud,longitud,duration_minutes,id,value
0,cylinder,This event took place in early fall around 194...,1949-10-10 20:30:00,2004-04-27,US,San Marcos,Texas,29.88327,-97.94139,45.0,US,United States
1,light,1949 Lackland AFB&#44 TX. Lights racing acros...,1949-10-10 21:00:00,2005-12-16,US,Lackland Air Force Base,Texas,29.38663,-98.61797,120.0,US,United States
2,circle,Green/Orange circular disc over Chester&#44 En...,1955-10-10 17:00:00,2008-01-21,GB,Blacon,England,53.20832,-2.9253,0.3333,GB,United Kingdom
3,circle,My older brother and twin sister were leaving ...,1956-10-10 21:00:00,2004-01-17,US,Edna,Texas,28.97859,-96.64609,0.3333,US,United States
4,light,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1960-10-10 20:00:00,2004-01-22,US,Kane'ohe,Hawaii,21.40929,-157.80092,15.0,US,United States


In [38]:
df['id'] = df['id'].fillna(df['country'])

In [39]:
df.loc[df.id == "NA", ['value']] = 'Namibia'
df.loc[df.id == "XK", ['value']] = 'Kosovo'

In [40]:
df['comments'] = df['comments'].replace({';': ' '}, regex=True)

In [41]:
df.head()

Unnamed: 0,shape,comments,datetime,datePosted,country,city,state,latitud,longitud,duration_minutes,id,value
0,cylinder,This event took place in early fall around 194...,1949-10-10 20:30:00,2004-04-27,US,San Marcos,Texas,29.88327,-97.94139,45.0,US,United States
1,light,1949 Lackland AFB&#44 TX. Lights racing acros...,1949-10-10 21:00:00,2005-12-16,US,Lackland Air Force Base,Texas,29.38663,-98.61797,120.0,US,United States
2,circle,Green/Orange circular disc over Chester&#44 En...,1955-10-10 17:00:00,2008-01-21,GB,Blacon,England,53.20832,-2.9253,0.3333,GB,United Kingdom
3,circle,My older brother and twin sister were leaving ...,1956-10-10 21:00:00,2004-01-17,US,Edna,Texas,28.97859,-96.64609,0.3333,US,United States
4,light,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1960-10-10 20:00:00,2004-01-22,US,Kane'ohe,Hawaii,21.40929,-157.80092,15.0,US,United States


In [42]:
col_transform = TransformDate("datetime")
df = col_transform.transform(df)
df[:2]

Unnamed: 0,shape,comments,datePosted,country,city,state,latitud,longitud,duration_minutes,id,value,C_hour_datetime,C_min_datetime,C_sec_datetime,C_DAY_datetime,C_WEEK_datetime,C_WEEKDAY_datetime,C_MONTHDAY_datetime,C_MONTH_datetime,C_YEAR_datetime,C_DAY_YEAR_datetime,C_QUARTER_datetime,C_WEEKEND_datetime,C_SEMESTER_datetime
0,cylinder,This event took place in early fall around 194...,2004-04-27,US,San Marcos,Texas,29.88327,-97.94139,45.0,US,United States,20,30,0,10,41,0,31,10,1949,283,4,0,1
1,light,1949 Lackland AFB&#44 TX. Lights racing acros...,2005-12-16,US,Lackland Air Force Base,Texas,29.38663,-98.61797,120.0,US,United States,21,0,0,10,41,0,31,10,1949,283,4,0,1


In [43]:
df.shape

(88679, 24)

### Definiendo la variable objetivo

In [44]:
df['sighting_count']=df['country']+"_"+df['state']+"_"+df['city']+'_'+(df['datePosted'].astype(str))

In [45]:
a=df['sighting_count'].value_counts()

In [46]:
a=pd.DataFrame(a)

In [47]:
a.reset_index(inplace=True)

In [48]:
a=a.rename(columns={'index': 'sighting_count', 'sighting_count':'count'})

In [49]:
df=pd.merge(df, a, on='sighting_count', how='inner')

In [50]:
df['count']=df['count'].map(lambda x: 1 if x>1 else 0)

In [51]:
df=df.rename(columns={'count': 'target'})

**limpiando la variable shape**

In [52]:
others = [kvp[0] for kvp in dict(df["shape"].value_counts(1)).items() if kvp[1]<0.05]
df.loc[df['shape'].isin(others), 'shape'] = "others"

In [53]:
df["shape"].value_counts(1)

others     0.2591
light      0.2015
triangle   0.0957
circle     0.0953
fireball   0.0740
unknown    0.0713
other      0.0704
disk       0.0677
sphere     0.0649
Name: shape, dtype: float64

**limpiando la variable de country**

In [54]:
df["country"].value_counts(1)

US   0.8654
CA   0.0443
GB   0.0291
GH   0.0169
AU   0.0079
      ...  
SV   0.0000
LA   0.0000
GG   0.0000
TO   0.0000
OM   0.0000
Name: country, Length: 170, dtype: float64

Se decide eliminar a todos los paises fuera del mundo blanco primermundista que habla ingés

In [55]:
mundo_blanco = ["US","CA","GB", "AU","NZ"]
df = df[df['country'].isin(mundo_blanco)]

**limpiando estado**  
Se agrupan segín región, los de otros paises se agrupan en foreign.

In [56]:
regions = {
    "far west" : ['California', 'Hawaii', 'Nevada'],
    "great lakes": ['Illinois', 'Indiana', 'Michigan', 'Minnesota', 'Ohio', 'Wisconsin'],
    "midshouth": ['Delaware', 'District of Columbia', 'Kentucky', 'Maryland', 'North Carolina', 'Tennessee', 'Virginia', 'West Virginia'],
    "midwest": ['Iowa', 'Kansas', 'Missouri', 'Nebraska', 'North Dakota', 'South Dakota'],
    "mountain west": ['Arizona', 'Colorado', 'Idaho', 'Montana', 'New Mexico','Utah', 'Wyoming'],
    "new england" : ['Connecticut', 'Maine', 'Massachusetts', 'New Hampshire', 'Rhode Island', 'Vermont'],
    "northeast": ['New Jersey', 'New York', 'Pennsylvania'],
    "northwest" : ['Oregon', 'Washington'],
    "south central": ['Arkansas', 'Louisiana', 'Oklahoma', 'Texas'],
    "southeast": ['Alabama', 'Florida', 'Georgia', 'Mississippi', 'South Carolina', 'Puerto Rico']
}

In [57]:
df["region"]=[[kvp[0] for kvp in regions.items() if state in kvp[1]]  for state in df["state"]]

In [58]:
df.loc[df['region'].apply(str).eq('[]'), 'region'] = "foreign"

In [59]:
df["region"] = df["region"].apply(str).map(lambda x: x.lstrip("'[]").rstrip("'[]"))

In [60]:
df["region"].value_counts(1)

great lakes     0.1406
far west        0.1394
southeast       0.1000
northeast       0.0944
foreign         0.0929
mountain west   0.0911
midshouth       0.0899
northwest       0.0786
south central   0.0736
new england     0.0521
midwest         0.0474
Name: region, dtype: float64

**categorizando usando dummies**

In [61]:
df = pd.get_dummies(df,columns=['shape','region'])

**generando variables a partir de los comentarios**

In [62]:
df["comments_len_char"]=df["comments"].str.len()
df["comments_len_word"]=df['comments'].str.split().apply(len)

**eliminando columnas que no nos sirven**

In [63]:
df = df.drop(columns=['country','city','state','latitud','longitud','id','value','comments','datePosted','sighting_count'])

In [64]:
status(df)

Unnamed: 0,variable,q_nan,p_nan,q_zeros,p_zeros,unique,type
0,duration_minutes,0,0.0,6537,0.0778,472,float64
1,C_hour_datetime,0,0.0,5353,0.0637,24,int64
2,C_min_datetime,0,0.0,35207,0.4188,60,int64
3,C_sec_datetime,0,0.0,84061,1.0,1,int64
4,C_DAY_datetime,0,0.0,0,0.0,31,int64
5,C_WEEK_datetime,0,0.0,0,0.0,53,int64
6,C_WEEKDAY_datetime,0,0.0,10552,0.1255,7,int64
7,C_MONTHDAY_datetime,0,0.0,0,0.0,4,int64
8,C_MONTH_datetime,0,0.0,0,0.0,12,int64
9,C_YEAR_datetime,0,0.0,0,0.0,87,int64


## Modelado

In [65]:
import plotly.graph_objects as go
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score,StratifiedKFold
from sklearn.metrics import confusion_matrix, roc_curve, auc

In [66]:
##Funciones

#modelado buscando mejores hiperparámetros con grid search, validado don cross validation
def bestmodel_GridSearchCV(X_train, y_train, model, param_dict={}, 
                           k_fold=10, score='roc_auc'):
    grid = GridSearchCV(
        cv = StratifiedKFold(k_fold),
        verbose=True,
        scoring=score,
        estimator=model,
        n_jobs=-1,
        param_grid=param_dict
    )
    grid.fit(X_train,y_train)
    print(f"Best Score ({score}) :  {grid.best_score_}")
    print(f"Best Params :  {grid.best_params_}")
    return grid.best_estimator_

#matriz de confusion
def get_cm(y_test, y_pred):
    return confusion_matrix(y_test, y_pred)

def df_cm(cm):
    df = pd.DataFrame({'Observacion_ ':cm[0][:],
                       'Observacion ':cm[1][:],},
                      index=['Prediccion_ ','Prediccion '])
    return df

#metricas a partir de matriz de confusión
def metrics(cm):
    vp=cm[0][0]
    fp=cm[0][1]
    fn=cm[1][0]
    vn=cm[1][1]
    exactitud = ( vp+vn )/( vp+fp+fn+vn )
    print(f"   Exactitud : {exactitud}")
    
    precision=vp/(vp+fp)
    print(f"   Precision : {precision}")

    recall=(vp/(vp+fn))
    print(f"   Recall : {recall}")

    f1_score=((2*precision*recall)/(recall+precision))
    print(f"   f1_score : {f1_score}")
    
    TPR=recall
    print(f"   TPR : {TPR}")

    FPR=(fp/(fp+vn))
    print(f"   FPR : { FPR}")    
    
    return precision,recall,f1_score,TPR,FPR

def roc_fig(y_test, y_score):
    
    fpr, tpr, thresholds = roc_curve(y_test, y_score)
    
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=fpr, y=tpr,
                        fill='tozeroy', fillcolor='darkviolet',
                        hoveron = 'points+fills', # select where hover is active
                        line_color='darkviolet',
                        text="Points + Fills",
                        hoverinfo = 'text+x+y'))
    fig.add_shape(
        type='line', line=dict(dash='dash'),
        x0=0, x1=1, y0=0, y1=1
    )
    fig.update_layout(
        title=f'ROC Curve (AUC={auc(fpr, tpr):.4f})',
    )
    return fig.show()
    
    
def metrics_master(X_train, y_train, X_test, y_test, model):
    y_pred_train=model.predict(X_train)
    y_pred_test=model.predict(X_test)
    
    y_score_train=model.predict_proba(X_train)[:,1]
    y_score_test=model.predict_proba(X_test)[:,1]
    
    cm_train = get_cm(y_train,y_pred_train)
    cm_test = get_cm(y_test,y_pred_test)
    
    print("Métricas Train")
    metrics(cm_train)
    print('\n')
    print("Métricas Test")
    metrics(cm_test)
    print("Matríz de confusión de Train")
    display(df_cm(cm_train))
    print("Matríz de confusión de Test")
    display(df_cm(cm_test))
    print('ROC')
    display(roc_fig(y_train, y_score_train))

#### División train-test

In [67]:
y = df['target']
X = df[[x for x in df.columns if x != "target"]]

In [68]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1234)
y_train.mean(), y_test.mean()

(0.2452058053771116, 0.24081652074609822)

#### escalado

In [69]:
sc = MinMaxScaler()
X_train=sc.fit_transform(X_train)
X_test=sc.transform(X_test)

In [70]:
import plotly.graph_objects as go
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score,StratifiedKFold
from sklearn.metrics import confusion_matrix, roc_curve, auc

In [71]:
##Funciones

#modelado buscando mejores hiperparámetros con grid search, validado don cross validation
def bestmodel_GridSearchCV(X_train, y_train, model, param_dict={}, 
                           k_fold=10, score='roc_auc'):
    grid = GridSearchCV(
        cv = StratifiedKFold(k_fold),
        verbose=True,
        scoring=score,
        estimator=model,
        n_jobs=-1,
        param_grid=param_dict
    )
    grid.fit(X_train,y_train)
    print(f"Best Score ({score}) :  {grid.best_score_}")
    print(f"Best Params :  {grid.best_params_}")
    return grid.best_estimator_

#matriz de confusion
def get_cm(y_test, y_pred):
    return confusion_matrix(y_test, y_pred)

def df_cm(cm):
    df = pd.DataFrame({'Observacion_ ':cm[0][:],
                       'Observacion ':cm[1][:],},
                      index=['Prediccion_ ','Prediccion '])
    return df

#metricas a partir de matriz de confusión
def metrics(cm):
    vp=cm[0][0]
    fp=cm[0][1]
    fn=cm[1][0]
    vn=cm[1][1]
    exactitud = ( vp+vn )/( vp+fp+fn+vn )
    print(f"   Exactitud : {exactitud}")
    
    precision=vp/(vp+fp)
    print(f"   Precision : {precision}")

    recall=(vp/(vp+fn))
    print(f"   Recall : {recall}")

    f1_score=((2*precision*recall)/(recall+precision))
    print(f"   f1_score : {f1_score}")
    
    TPR=recall
    print(f"   TPR : {TPR}")

    FPR=(fp/(fp+vn))
    print(f"   FPR : { FPR}")    
    
    return precision,recall,f1_score,TPR,FPR

def roc_fig(y_test, y_score):
    
    fpr, tpr, thresholds = roc_curve(y_test, y_score)
    
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=fpr, y=tpr,
                        fill='tozeroy', fillcolor='darkviolet',
                        hoveron = 'points+fills', # select where hover is active
                        line_color='darkviolet',
                        text="Points + Fills",
                        hoverinfo = 'text+x+y'))
    fig.add_shape(
        type='line', line=dict(dash='dash'),
        x0=0, x1=1, y0=0, y1=1
    )
    fig.update_layout(
        title=f'ROC Curve (AUC={auc(fpr, tpr):.4f})',
    )
    return fig.show()
    
    
def metrics_master(X_train, y_train, X_test, y_test, model):
    y_pred_train=model.predict(X_train)
    y_pred_test=model.predict(X_test)
    
    y_score_train=model.predict_proba(X_train)[:,1]
    y_score_test=model.predict_proba(X_test)[:,1]
    
    cm_train = get_cm(y_train,y_pred_train)
    cm_test = get_cm(y_test,y_pred_test)
    
    print("Métricas Train")
    metrics(cm_train)
    print('\n')
    print("Métricas Test")
    metrics(cm_test)
    print("Matríz de confusión de Train")
    display(df_cm(cm_train))
    print("Matríz de confusión de Test")
    display(df_cm(cm_test))
    print('ROC')
    display(roc_fig(y_train, y_score_train))

# Naive Bayes

### Naive Bayes Gaussiano

In [72]:
from sklearn.naive_bayes import GaussianNB

In [73]:
nb_gaussian = GaussianNB()

In [74]:
best_nb_gaussian1 = bestmodel_GridSearchCV(X_train, y_train, nb_gaussian)

Fitting 10 folds for each of 1 candidates, totalling 10 fits
Best Score (roc_auc) :  0.6018535612466658
Best Params :  {}


In [75]:
metrics_master(X_train, y_train, X_test, y_test, best_nb_gaussian1)

Métricas Train
   Exactitud : 0.6594178761202316
   Precision : 0.7433278695414618
   Recall : 0.7925610575845844
   f1_score : 0.7671553743385096
   TPR : 0.7925610575845844
   FPR : 0.6632636437686669


Métricas Test
   Exactitud : 0.6578797106966121
   Precision : 0.7420244437480413
   Recall : 0.7938711191577817
   f1_score : 0.7670726966437734
   TPR : 0.7938711191577817
   FPR : 0.6744224152056366
Matríz de confusión de Train


Unnamed: 0,Observacion_,Observacion
Prediccion_,35372,9258
Prediccion,12214,6201


Matríz de confusión de Test


Unnamed: 0,Observacion_,Observacion
Prediccion_,11839,3074
Prediccion,4116,1987


ROC


None

### SMV

In [76]:
from sklearn.svm import SVC

In [77]:
classifier_ker = SVC(random_state = 0,probability=True)

param_grid = dict(kernel=['poly', 'rbf', 'sigmoid'])

In [78]:
best_smv = bestmodel_GridSearchCV(X_train, y_train, classifier_ker, k_fold=3)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Best Score (roc_auc) :  0.5554350557633592
Best Params :  {}


In [79]:
metrics_master(X_train, y_train, X_test, y_test, best_smv)

Métricas Train
   Exactitud : 0.7547941946228884
   Precision : 1.0
   Recall : 0.7547941946228884
   f1_score : 0.8602652059549313
   TPR : 0.7547941946228884
   FPR : nan


Métricas Test
   Exactitud : 0.7591834792539018
   Precision : 1.0
   Recall : 0.7591834792539018
   f1_score : 0.8631089232100836
   TPR : 0.7591834792539018
   FPR : nan
Matríz de confusión de Train


Unnamed: 0,Observacion_,Observacion
Prediccion_,47586,15459
Prediccion,0,0


Matríz de confusión de Test


Unnamed: 0,Observacion_,Observacion
Prediccion_,15955,5061
Prediccion,0,0


ROC


None

### Regresión Logística

In [80]:
from sklearn.linear_model import LogisticRegression

In [81]:
log= LogisticRegression()

param_grid = dict(solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
                 penalty=["l2","l1","elasticnet"])

In [82]:
best_log = bestmodel_GridSearchCV(X_train, y_train, log, param_grid,k_fold=5)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
Best Score (roc_auc) :  0.6052227477425225
Best Params :  {'penalty': 'l1', 'solver': 'saga'}


In [83]:
metrics_master(X_train, y_train, X_test, y_test, best_log)

Métricas Train
   Exactitud : 0.7547941946228884
   Precision : 1.0
   Recall : 0.7547941946228884
   f1_score : 0.8602652059549313
   TPR : 0.7547941946228884
   FPR : nan


Métricas Test
   Exactitud : 0.7591834792539018
   Precision : 1.0
   Recall : 0.7591834792539018
   f1_score : 0.8631089232100836
   TPR : 0.7591834792539018
   FPR : nan
Matríz de confusión de Train


Unnamed: 0,Observacion_,Observacion
Prediccion_,47586,15459
Prediccion,0,0


Matríz de confusión de Test


Unnamed: 0,Observacion_,Observacion
Prediccion_,15955,5061
Prediccion,0,0


ROC


None

### Guardando el modelo

In [84]:
import pickle
filename = 'modelo_equipo_dinamita.pkl'

with open(filename,'wb') as f:
    pickle.dump(best_smv, f)

### Guardando el data frame 

In [85]:
df.to_csv('datos_equipo_dinamita.csv', index=False)