In [257]:
import pandas as pd
import numpy as np

### Firstly, the csv file could not be read because for some row there were extra columns so I manually delete the columns and I ensured to have the same number of columns for each row.

In [258]:
ufos = pd.read_csv("complete.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


### The error says us, the csv file has a column that consists out of multiple data types. So we can set the low_memory=False or we can set the dtype for a column manually. I choose manually.

In [259]:
ufos.head()

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,4/27/2004,29.8830556,-97.941111
1,10/10/1949 21:00,lackland afb,tx,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,12/16/2005,29.38421,-98.581082
2,10/10/1955 17:00,chester (uk/england),,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,1/21/2008,53.2,-2.916667
3,10/10/1956 21:00,edna,tx,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.9783333,-96.645833
4,10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.4180556,-157.803611


In [260]:
for col in ufos.columns:
    print(col, "of the dtype:", ufos[col].dtype)
    

datetime of the dtype: object
city of the dtype: object
state of the dtype: object
country of the dtype: object
shape of the dtype: object
duration (seconds) of the dtype: object
duration (hours/min) of the dtype: object
comments of the dtype: object
date posted of the dtype: object
latitude of the dtype: object
longitude of the dtype: float64


### As you see, apart from longitude, we need to change their dtype!

In [261]:
ufos["datetime"] = pd.to_datetime(ufos['datetime'], errors="coerce") 
# Coerce command sets invalid data to Nat.

In [262]:
#Let's check the Nat values
ufos[ufos["datetime"].isnull()]

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
166,NaT,franklin,in,us,disk,0,?,two yellow objects sitting over the corn field...,10/20/2005,39.4805556,-86.055000
316,NaT,hot springs and custer,sd,,triangle,0,,October 11&#44 1994--Hot Springs/Custer--trian...,09/02/2005,43.431646,-103.474362
417,NaT,rome,ny,us,oval,120,a min or two,I was walking from the garage to the house&#44...,02/01/2007,43.2127778,-75.456111
487,NaT,truth or consequences,nm,us,unknown,0,,Fast moving red ball and then flashing light,10/30/2012,33.1283333,-107.252222
567,NaT,sweet home,or,us,unknown,0,ufo,Ligthning coming out of a cloud hovering&#44 w...,01/10/2009,44.3977778,-122.735000
...,...,...,...,...,...,...,...,...,...,...,...
88273,NaT,new york city (bronx),ny,us,unknown,0.0,?,Help. 500 Lights On Object0: Yes,10/08/2007,40.7141667,-74.006389
88312,NaT,olin,nc,us,,0.0,,Related UUO/UFO activity vicinity of NC Coast ...,11/21/2010,35.9525,-80.840000
88340,NaT,big bear city,ca,us,circle,0.0,,It was bright wright&#44and stopped then moved...,9/24/2012,34.2611111,-116.844167
88448,NaT,cedar rapids,ia,us,changing,900.0,15 min,WE SEEN IT,9/17/2003,42.0083333,-91.643889


### Let's get rid of these useless data!

In [263]:
ufos = ufos.dropna(subset=["datetime"])

In [264]:
ufos[ufos["datetime"].isnull()]

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude


## Let's deal with the "city" column!

In [265]:
ufos["city"] = pd.Series(ufos["city"], dtype="string")

In [266]:
print(ufos["city"].dtype)

string


In [267]:
ufos["state"] = pd.Series(ufos["state"], dtype="string")

In [268]:
ufos[ufos["state"].isnull()].head()

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
2,1955-10-10 17:00:00,chester (uk/england),,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,1/21/2008,53.2,-2.916667
6,1965-10-10 21:00:00,penarth (uk/wales),,gb,circle,180,about 3 mins,penarth uk circle 3mins stayed 30ft above m...,2/14/2006,51.434722,-3.18
18,1973-10-10 23:00:00,bermuda nas,,,light,20,20 sec.,saw fast moving blip on the radar scope thin w...,01/11/2002,32.364167,-64.678611
21,1974-10-10 21:30:00,cardiff (uk/wales),,gb,disk,1200,20 minutes,back in 1974 I was 19 at the time and lived i...,02/01/2007,51.5,-3.2
25,1976-10-10 22:00:00,stoke mandeville (uk/england),,gb,cigar,3,3 seconds,White object over Buckinghamshire UK.,12/12/2009,51.783333,-0.783333


### As we see here, non values are exist because the countries are different from USA.

In [269]:
ufos["country"].unique()

array(['us', nan, 'gb', 'ca', 'au', 'de'], dtype=object)

In [270]:
ufos[(ufos["state"].isnull()) & (ufos["country"] == "us")]

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude


### That's nice that if country is US, then the state is not non-value 

# Let's deal with the country column!

In [271]:
ufos["country"] = pd.Series(ufos["country"], dtype="string")

In [272]:
ufos[ufos["country"].isnull()]

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
1,1949-10-10 21:00:00,lackland afb,tx,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,12/16/2005,29.38421,-98.581082
18,1973-10-10 23:00:00,bermuda nas,,,light,20,20 sec.,saw fast moving blip on the radar scope thin w...,01/11/2002,32.364167,-64.678611
19,1974-10-10 17:00:00,willow beach,az,,light,120,2 min,The object was a brillant white light standing...,2/18/2001,0,0.000000
30,1979-10-10 22:00:00,saddle lake (canada),ab,,triangle,270,4.5 or more min.,Lights far above&#44 that glance; then flee f...,1/19/2005,53.970571,-111.689885
36,1982-10-10 07:00:00,gisborne (new zealand),,,disk,120,2min,gisborne nz 1982 wainui beach to sponge bay,01/11/2002,-38.662334,178.017649
...,...,...,...,...,...,...,...,...,...,...,...
88819,2011-09-09 22:45:00,denmark,,,light,300.0,5 minutes,slow moving light about 5 meters up in the skye.,10/10/2011,0,0.000000
88855,2013-09-09 12:00:00,star tannery,va,,unknown,0.0,unk,Object seen in photo after it was taken and ph...,10/03/2013,39.078889,-78.427222
88859,2013-09-09 20:15:00,clifton,nj,,other,3600.0,~1hr+,Luminous line seen in New Jersey sky.,9/30/2013,40.858433,-74.163755
88862,2013-09-09 21:00:00,aleksandrow (poland),,,light,15.0,15 seconds,Two points of light following one another in a...,9/30/2013,50.465843,22.891814


In [276]:
ufos["country"].iloc[36]

<NA>

In [277]:
type(ufos["country"].iloc[36])

pandas._libs.missing.NAType

### As we see here, the country with non-values, the city column also can contain the country information. We can also write us in the country column of the series whose state column is valid.

In [278]:
def put_us(state, country):
    if state is not pd.NA and country is pd.NA:
        return "us"
    return country
        


ufos["country"] = np.vectorize(put_us)(ufos["state"], ufos["country"])

### Let's check!

In [279]:
ufos["country"].dtype

dtype('O')

In [280]:
ufos["country"].iloc[36]

'<NA>'

In [281]:
type(ufos["country"].iloc[36])

str

# I don't want to nan values as str!

In [256]:
ufos[ufos["country"].isnull()]

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude


In [282]:
ufos["country"] = pd.Series(ufos["country"], dtype="string")

In [285]:
ufos["country"] = ufos["country"].replace('<NA>', pd.NA)

In [287]:
ufos[ufos["country"].isnull()]

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
18,1973-10-10 23:00:00,bermuda nas,,,light,20,20 sec.,saw fast moving blip on the radar scope thin w...,01/11/2002,32.364167,-64.678611
36,1982-10-10 07:00:00,gisborne (new zealand),,,disk,120,2min,gisborne nz 1982 wainui beach to sponge bay,01/11/2002,-38.662334,178.017649
58,1993-10-10 03:00:00,zlatoust (russia),,,sphere,1200,20 minutes,I woke up at night and looked out the window n...,12/14/2004,55.183333,59.650000
69,1996-10-10 20:00:00,lake macquarie (nsw&#44 australia),,,light,300,5 min,RED LIGHT WITH OTHER RED FLASHING LIGHT&#44 ON...,5/24/1999,-33.093373,151.588982
76,1998-10-10 02:00:00,turin (italy),,,disk,15,15 sec,We were doing a patrol during the night&#44me ...,1/19/2005,0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
88685,2002-09-09 19:30:00,thailand (northeast),,,changing,600.0,5-10 mins,you can see or documented another type ball of...,10/15/2002,15.870032,100.992541
88721,2004-09-09 21:00:00,dubai (uae),,,disk,15.0,15 secs,silver&#44 disk shaped object with lights surr...,9/29/2004,25.047664,55.181741
88725,2004-09-09 21:45:00,krimpen a/d ijssel (netherlands),,,light,5.0,5 sec,Two fast as a couple moving red lights,9/29/2004,51.915929,4.600337
88819,2011-09-09 22:45:00,denmark,,,light,300.0,5 minutes,slow moving light about 5 meters up in the skye.,10/10/2011,0,0.000000
