In [None]:
import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns

import plotly.plotly as py
import plotly.graph_objs as go
import matplotlib.pyplot as plt

from plotly import tools
from plotly.offline import iplot, init_notebook_mode
# Initialise plotly's offline mode so plotly figures can render inside the notebook.
init_notebook_mode()

# Default size (width, height) used by every matplotlib figure created below.
plt.rcParams['figure.figsize'] = (12, 6)

In [None]:
# Load the raw sightings CSV, keeping only the columns at positions 0-10.
# low_memory=False asks pandas to parse the file in one piece instead of
# chunk by chunk while inferring column dtypes.
df0 = pd.read_csv(
    '../input/complete.csv',
    usecols=list(range(11)),
    low_memory=False,
)

In [None]:
# Check whether the raw dataset contains any missing value at all
# (evaluates to a single boolean).
df0.isnull().values.any()

In [None]:
# Count the NaN values in each column of the raw dataset.
df0.isnull().sum()

In [None]:
# Fill every NaN with the integer 0. This is applied to all columns, so
# missing entries in the text columns (city, state, country, shape) also
# become the number 0 rather than a string.
df = df0.fillna(value=0)

# Preview only the first rows instead of rendering the whole frame.
df.head()

In [None]:
# Inspect the data type pandas assigned to each column.
df.dtypes

In [None]:
# Coerce latitude and duration to numeric dtypes; any entry that cannot be
# parsed as a number becomes NaN (errors='coerce').
for numeric_col in ('latitude', 'duration (seconds)'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

In [None]:
# Re-check the column dtypes after the numeric conversion.
df.dtypes

In [None]:
# Normalise the casing of the location columns:
# title case for city names, upper case for state and country codes.
df['city'] = df['city'].str.title()
for code_col in ('state', 'country'):
    df[code_col] = df[code_col].str.upper()

In [None]:
# Look for implausible values: summary statistics of the numeric columns,
# cast to integers and laid out with one row per column.
(
    df.describe()
      .T
      .astype(np.int64)
)

In [None]:
# Replace implausible durations with the mean of the plausible ones.
#
# The previous version called df.replace(...) without assigning the result,
# so nothing was actually cleaned, and it targeted every column of the frame
# rather than just the duration column.
#
# 97836000 s is roughly 3.1 years (presumably the outlier spotted in the
# describe() output -- confirm against the data); 0 covers both zero-length
# reports and the NaNs that were filled with 0 earlier.
INVALID_DURATIONS = [97836000, 0]

duration = df['duration (seconds)']
# Compute the mean from valid entries only, so the values being replaced do
# not skew their own replacement. NaNs are skipped by mean().
valid_mean = duration[~duration.isin(INVALID_DURATIONS)].mean()
df['duration (seconds)'] = duration.replace(INVALID_DURATIONS, valid_mean)

# Show the cleaned column's summary as a sanity check.
df['duration (seconds)'].describe()

In [None]:
# Parse the timestamp column; entries that cannot be parsed become NaT.
parsed = pd.to_datetime(df['datetime'], errors='coerce')
df['datetime'] = parsed

# Derive integer year / hour columns. Rows whose timestamp failed to parse
# (NaT) end up with the placeholder value 0 in both columns.
df['year'] = parsed.dt.year.fillna(0).astype(int)
df['hour'] = parsed.dt.hour.fillna(0).astype(int)

# Normalise text casing. The city/state/country steps repeat an earlier cell;
# re-applying them is harmless because the transformations are idempotent.
for title_col in ('city', 'shape'):
    df[title_col] = df[title_col].str.title()
for upper_col in ('state', 'country'):
    df[upper_col] = df[upper_col].str.upper()

# Make sure latitude is numeric (non-numeric entries become NaN).
df['latitude'] = pd.to_numeric(df['latitude'], errors='coerce')

# Two-letter postal codes for the 50 US states plus DC, in alphabetical order.
us_states = np.asarray((
    'AK AL AR AZ CA CO CT DC DE FL GA '
    'HI IA ID IL IN KS KY LA MA MD ME '
    'MI MN MO MS MT NC ND NE NH NJ NM '
    'NV NY OH OK OR PA RI SC SD TN TX '
    'UT VA VT WA WI WV WY'
).split())

In [None]:
# Summary statistics after the cleaning steps.
df.describe()

In [None]:
# Column names, dtypes and non-null counts of the cleaned frame.
df.info()

In [None]:
# Count the number of sightings reported for each state code.
df["state"].value_counts()

In [None]:
# Show the current structure of the dataset (columns, dtypes, non-null counts).
df.info()

# Density plots

From this density plot, we can see that UFO sightings peak around the year 2000.

In [None]:
# Distribution of sightings by year.
# Rows whose timestamp could not be parsed were given the placeholder year 0
# during cleaning. They are left out here: although the x-limits hide them,
# they would still stretch the histogram bins and the density estimate.
valid_years = df.loc[df['year'] > 0, 'year']
sns.distplot(valid_years)
plt.xlim(1900, 2015)
plt.show()

From this density plot, we see that most UFO sightings were reported at night, around 20:00 (8 p.m.). UFOs are seldom reported during the day.

In [None]:
# Distribution of sightings by hour of day.
# Rows with an unparseable timestamp (NaT) were assigned hour 0 during
# cleaning; exclude them so they are not counted as midnight sightings.
valid_hours = df.loc[df['datetime'].notnull(), 'hour']
sns.distplot(valid_hours)
plt.show()

From this bar plot, we can see that the dataset also contains UFO sightings reported in Canada, Britain and Australia.

In [None]:
# Bar plot of the five countries with the most reported sightings.
countries = df['country']
country_count = countries.value_counts()
country_count.iloc[:5].plot.bar()
plt.show()

### Which states have reported the most UFO sightings in the past century?

From this bar plot, we can see that the top three states by number of reported UFO sightings are CA, TX and FL, and that the count for CA is double that of the second-ranked state. More than 300 UFO sightings were marked with an unknown location.

In [None]:
# Bar plot of sighting counts for the 51 most frequently reported state codes.
states = df['state']
state_count = states.value_counts()
state_count.iloc[:51].plot.bar()
plt.show()

### Which shape of UFO has been reported most?

From this bar plot, we can see that "Light" is the most frequently reported UFO shape, followed by "Triangle" and "Circle". Many sightings are also reported with an unknown shape.

In [None]:
# Bar plot of sighting counts for the 15 most frequently reported shapes.
shapes = df['shape']
shape_count = shapes.value_counts()
shape_count.iloc[:15].plot.bar()
plt.show()

### Generates descriptive statistics that summarize the central tendency, dispersion and shape of a dataset’s distribution.

In [None]:
# Summary statistics for every column, non-numeric ones included (include='all').
df.describe(include='all')

# Box-Plot

In [None]:
# Box plot of sighting longitude for each reported shape.
fig, ax = plt.subplots()
sns.boxplot(data=df, x="shape", y="longitude", ax=ax)
plt.show()

In [None]:
# Jittered strip plot of sighting longitude for each reported shape.
fig, ax = plt.subplots()
sns.stripplot(data=df, x="shape", y="longitude", jitter=True, ax=ax)
plt.show()

In [None]:
# Box plot of sighting latitude for each reported shape.
fig, ax = plt.subplots()
sns.boxplot(data=df, x="shape", y="latitude", ax=ax)
plt.show()

In [None]:
# Jittered strip plot of sighting latitude for each reported shape.
fig, ax = plt.subplots()
sns.stripplot(data=df, x="shape", y="latitude", jitter=True, ax=ax)
plt.show()

# Bar charts

From this bar chart, we see that there are more than 500 UFO sightings reported in California. And there are 6 states that have reported more than 200 UFO sightings.

In [None]:
# Number of sightings per state code, drawn with a dark green palette.
fig, ax = plt.subplots()
sns.countplot(data=df, x="state", palette="Greens_d", ax=ax)
plt.show()

In [None]:
# Number of sightings per reported shape, drawn with a dark green palette.
fig, ax = plt.subplots()
sns.countplot(data=df, x="shape", palette="Greens_d", ax=ax)
plt.show()

In [None]:
# Linear regression of sighting latitude on year.
# Unparseable timestamps were coded as year 0 during cleaning. Those rows are
# dropped before fitting: the x-limits below would hide them from view, but
# they would still pull on the fitted line as extreme-leverage points.
dated = df[df['year'] > 0]
sns.regplot(x="year", y="latitude", data=dated)
plt.xlim(1960, 2015)
plt.show()

In [None]:
# Scatter plot of latitude against longitude with a fitted regression line.
sns.lmplot(x="longitude", y="latitude", data=df)
plt.show()