In [1]:
# import relevant libraries
import pandas as pd
import numpy as np

In [2]:
# load the cleaned UFO sightings export into a dataframe
# NOTE(review): the file carries extra index-like columns ('Unnamed: 0.1', 'Unnamed: 0', 'index'),
# presumably left over from earlier saves -- confirm whether they are still needed
data = pd.read_csv(filepath_or_buffer='../Data/ufo_clean.csv')

In [3]:
# preview the first five rows to check which columns were loaded
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,index,year_week,date_of_sight,year,month,week_of_year,state,city,sight_summary,ufo_shape
0,0,0,2006-52,2006-01-01,2006,1,52,NY,I-80 (unknown city proximity),"Traveling from Brooklyn, NY to Groton, NY on I...",Formation
1,1,1,2006-52,2006-01-01,2006,1,52,TX,San Antonio/Laredo (between),Three separate sightins which ocurred in rural...,Triangle
2,2,2,2006-1,2006-01-02,2006,1,1,MI,Ann Arbor,"Probable HOAX: 5 ufos, silent black objects, m...",Changing
3,3,3,2006-1,2006-01-02,2006,1,1,VA,Fredericksburg,clear sky looked like a star 10:00 am in the m...,Light
4,4,4,2006-1,2006-01-04,2006,1,1,PA,Scotland,"Small, white, light in the northeast sky...dis...",Light


In [4]:
# isolate the shape column as its own single-column dataframe
shape_data = data['ufo_shape'].to_frame()
shape_data

Unnamed: 0,ufo_shape
0,Formation
1,Triangle
2,Changing
3,Light
4,Light
...,...
71560,Light
71561,Chevron
71562,Oval
71563,Triangle


In [5]:
# tally missing (True) versus present (False) entries in the shape column
shape_data['ufo_shape'].isnull().value_counts()

False    71213
True       352
Name: ufo_shape, dtype: int64

In [6]:
# roughly 0.5% of the shape values are missing (352 of 71,565 rows); that is negligible, so drop those rows
shape_data1 = shape_data.dropna(subset=['ufo_shape'])
# sanity check: only False (no NaN) should remain
shape_data1['ufo_shape'].isnull().value_counts()

False    71213
Name: ufo_shape, dtype: int64

In [7]:
# frequency of every distinct value found in the shape column
shape_data1.loc[:, 'ufo_shape'].value_counts()

Light       16065
Circle       8536
Triangle     6374
Fireball     6149
Sphere       5087
            ...  
TN              1
AK              1
SD              1
ID              1
UT              1
Name: ufo_shape, Length: 67, dtype: int64

In [8]:
# lift the display row limit so every shape is listed
# (this is a global pandas option: it affects all later outputs in this session)
pd.options.display.max_rows = None

In [9]:
# same tally as before, now shown in full because the row limit has been lifted
shape_data1['ufo_shape'].value_counts(sort=True, ascending=False)

Light            16065
Circle            8536
Triangle          6374
Fireball          6149
Sphere            5087
Unknown           5026
Other             4572
Oval              3081
Disk              2901
Formation         2685
Changing          1749
Cigar             1543
Flash             1355
Rectangle         1308
Cylinder          1089
Diamond           1039
Chevron            817
Teardrop           644
Egg                509
Cone               274
Cross              255
CA                  18
NY                  14
AZ                   9
PA                   7
NM                   7
WA                   7
CO                   7
TX                   6
OR                   6
FL                   5
IL                   4
KS                   4
GA                   4
NJ                   3
OH                   3
NC                   3
IA                   3
WY                   2
VT                   2
IN                   2
NE                   2
AR                   2
MO         

It looks like some state abbreviations were entered in the shape column instead of a shape. These could be cross-checked and moved to the actual state column, but because there are so few of them, I drop them instead. Dropping removes around 150 rows (155, to be exact), which is negligible given the size of the dataset.

In [10]:
# list of all valid shapes spotted; every other value in the column
# (e.g. state abbreviations entered by mistake) is treated as invalid
shape_lst = ['Light', 'Circle', 'Triangle', 'Fireball', 'Sphere', 'Unknown', 'Other', 'Oval', 'Disk', 'Formation', 
             'Changing', 'Cigar', 'Flash', 'Rectangle', 'Cylinder', 'Diamond', 'Chevron', 'Teardrop', 'Egg', 'Cone',
             'Cross']

# keep only the rows whose shape entry is one of the valid shapes listed above;
# Series.isin does the membership test in one vectorised call instead of a Python lambda per row
shape_data2 = shape_data1[shape_data1['ufo_shape'].isin(shape_lst)]
shape_data2.shape

(71058, 1)

In [11]:
# aggregate the cleaned shapes into a one-column dataframe holding the count per shape
shape_data3 = shape_data2['ufo_shape'].value_counts().to_frame()
shape_data3

Unnamed: 0,ufo_shape
Light,16065
Circle,8536
Triangle,6374
Fireball,6149
Sphere,5087
Unknown,5026
Other,4572
Oval,3081
Disk,2901
Formation,2685


In [13]:
# 'Other' and 'Unknown' shapes can be added together to 'Unknown'.
# Relabel 'Other' directly on the index and sum the rows that now share a label.
# Grouping on the index level (instead of on a column called 'index' produced by
# reset_index) avoids a KeyError on pandas >= 2.0, where value_counts() names its
# index after the source column and reset_index() no longer yields an 'index' column.
shape_data4 = (
    shape_data3
    .rename(index={'Other': 'Unknown'})
    .groupby(level=0, sort=False)  # sort=False keeps the first-appearance order of the shapes
    .sum()
    .rename_axis('index')          # keep the index label the previous implementation produced
)
shape_data4

Unnamed: 0_level_0,ufo_shape
index,Unnamed: 1_level_1
Light,16065
Circle,8536
Triangle,6374
Fireball,6149
Sphere,5087
Unknown,9598
Oval,3081
Disk,2901
Formation,2685
Changing,1749


In [14]:
# save shape dataset to continue plotting in Tableau
# (the export is currently disabled; uncomment the line below to write the CSV)
# shape_data4.to_csv('../Data/Datasets/shape_data_tableau.csv')