# UFO Sightings Analysis

## Load the CSV file
### Noah Code Starts here

In [None]:
# Import any needed functionality
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st
import gmaps
from IPython.display import Image
from scipy.stats import linregress

# This will hide any of the small pink messages
import warnings
warnings.filterwarnings("ignore")

# Import API Key from Config
from config import g_key

In [None]:
# Location of the cleaned US sightings export, relative to the notebook
file = "UFO_Data/US_UFO_Sightings.csv"

# Load the CSV contents with pandas
us_ufo_data = pd.read_csv(filepath_or_buffer = file)

In [None]:
# Wrap the loaded CSV data in a DataFrame and preview its first rows
# NOTE(review): pd.read_csv normally already returns a DataFrame, so this
# wrap looks redundant — confirm before removing
us_ufo_df = pd.DataFrame(us_ufo_data)
us_ufo_df.head()

In [None]:
# Discard the stray "Unnamed: 0" column (an old index that accidentally came
# through in the cleaned file), then preview the result
us_ufo_data = us_ufo_df.drop("Unnamed: 0", axis = 1)
us_ufo_data.head()

## Initial Analysis

### Learning more about our dataset

In [None]:
# Show the illustration for this section, loaded from the Images folder
Image(filename = "Images/questions.jpg")

In [None]:
# What is the shape of our dataframe?
# How many sightings are recorded?
# .shape is a (rows, columns) tuple; presumably each row is one sighting
us_ufo_data.shape

In [None]:
# What kind of datatypes are we looking at?
# .dtypes lists the dtype pandas holds for every column
us_ufo_data.dtypes

In [None]:
# What is the first year we have a recorded sighting for, and what is the most recent in this dataframe?

# The "date" column holds "/"-separated text, so min()/max() on the raw strings
# would compare them character by character instead of chronologically.
# Pull out the year (the piece after the second "/") and compare it as a number.
sighting_years = (
    us_ufo_data["date"]
    .str.split("/", n = 2, expand = True)[2]
    .dropna()
    .astype(int)
)

us_ufo_min_year = sighting_years.min()
us_ufo_max_year = sighting_years.max()

print(f"This dataset ranges from sightings recorded in {us_ufo_min_year} to {us_ufo_max_year}")

In [None]:
# Which distinct values appear in the "shape" column?

# The group thinks of the shape as the aliens' car (or "whip")
alien_whips = pd.unique(us_ufo_data["shape"])
alien_whips

### Where do aliens like to visit?

In [None]:
# Show the illustration for this section, loaded from the Images folder
Image(filename = "Images/map_pin.jpg")

In [None]:
# Which state is visited the most?

# Bucket the sightings by their state value
state_group = us_ufo_data.groupby(by = "state")

# Sightings per state = number of "time" entries recorded in each bucket
state_visits = state_group["time"].count()
state_visits

In [None]:
# Prepare the x-axis labels for a bar graph of the per-state counts

# Upper-case the state abbreviations and keep one copy of each
states = pd.unique(us_ufo_data["state"].str.upper())
states


In [None]:
# Sort the labels alphabetically so they match the order of the groupby
# results when used as tick labels
states_alph = sorted(states)
states_alph

In [None]:
# Generate a bar plot showing the total number of sightings for each state

# Take the tick labels straight from the grouped counts so every label is
# guaranteed to sit under its own bar. A label list built separately with
# unique() can drift out of step with the counts (for example, a missing state
# value is kept by unique() but dropped by groupby).
state_labels = state_visits.index.str.upper()
x_axis = np.arange(len(state_visits))
tick_locations = list(x_axis)

# Change the size of the chart to make it more readable
plt.figure(figsize = (15, 5))

# Plot the bar chart
plt.bar(x_axis, state_visits, align = "center")
plt.xticks(tick_locations, state_labels, rotation = 90)

# Add legend
plt.legend(["Visits"], loc = "upper center")

# Create labels for the bar plot
plt.title("Total Documented Sightings per State")
plt.xlabel("State")
plt.ylabel("Number of Sightings in the State")

# Set limits for the bar chart (a little padding right of the last bar and
# above the tallest bar)
plt.xlim(-1, len(state_visits) + 0.25)
plt.ylim(0, state_visits.max() + 1000)

# Best practice to always include plt.show
# Even though it is not technically needed in jupyter notebooks
plt.show()

So far from our analysis we can see that most of the sightings for aliens happen in California, followed by Florida, and then by Washington. This is a good start, but we have a city column in our dataset, so we can look even closer to see where these guys are going!!

In [None]:
# What city is visited the most regardless of state visits? (Best vacation spot?)

# Bucket the sightings by city name
city_group = us_ufo_data.groupby("city")

# Sightings per city: count the "time" entries in each bucket, then move the
# city names out of the index into an ordinary column
city_group_visits = city_group["time"].count().reset_index()

# Rank the cities from most to fewest sightings and show the top 5
popular_cities = city_group_visits.sort_values(by = "time", ascending = False)
popular_cities.head(5)

Interestingly, despite California being far and away the state with the most sightings, a city in Washington is the most visited! Maybe it is because the original Starbucks is there? Also very interesting is that, despite Florida being the state with the second-highest number of sightings, none of its cities crack the top 5.

In [None]:
# What are the top cities within the most visited state?

# Keep only the rows whose state column equals "ca" (California)
california = us_ufo_data.loc[us_ufo_data["state"] == "ca"]

# Bucket the California sightings by city.
# This could help determine the best vacation spot for aliens!
ca_city_group = california.groupby("city")

# Sightings per city: count the "time" entries in each bucket, then move the
# city names out of the index into an ordinary column.
# (This cell uses only the ca_* variables; the all-states city variables from
# the earlier cell are not needed here.)
ca_city_group_visits = ca_city_group["time"].count().reset_index()

# The 5 most popular cities for alien sightings in California
# Sort descending (ascending = False) and display the top 5 results
popular_ca_cities = ca_city_group_visits.sort_values(["time"], ascending = False)
print("These are the 5 California cities with most sightings:")
popular_ca_cities.head(5)

In [None]:
# What are the top cities within Florida?

# Keep only the rows whose state column equals "fl" (Florida)
florida = us_ufo_data.loc[us_ufo_data["state"] == "fl"]

# Bucket the Florida sightings by city.
# This could help determine the best vacation spot for aliens!
fl_city_group = florida.groupby("city")

# Sightings per city: count the "time" entries in each bucket, then move the
# city names out of the index into an ordinary column.
# (This cell uses only the fl_* variables; the all-states city variables from
# the earlier cell are not needed here.)
fl_city_group_visits = fl_city_group["time"].count().reset_index()

# The 5 most popular cities for alien sightings in Florida
# Sort descending (ascending = False) and display the top 5 results
popular_fl_cities = fl_city_group_visits.sort_values(["time"], ascending = False)
print("These are the 5 Florida cities with most sightings:")
popular_fl_cities.head(5)

In [None]:
# What are the top cities within Washington?

# Keep only the rows whose state column equals "wa" (Washington)
washington = us_ufo_data.loc[us_ufo_data["state"] == "wa"]

# Bucket the Washington sightings by city.
# This could help determine the best vacation spot for aliens!
wa_city_group = washington.groupby("city")

# Sightings per city: count the "time" entries in each bucket, then move the
# city names out of the index into an ordinary column.
# (This cell uses only the wa_* variables; the all-states city variables from
# the earlier cell are not needed here.)
wa_city_group_visits = wa_city_group["time"].count().reset_index()

# The 5 most popular cities for alien sightings in Washington
# Sort descending (ascending = False) and display the top 5 results
popular_wa_cities = wa_city_group_visits.sort_values(["time"], ascending = False)
print("These are the 5 Washington cities with most sightings:")
popular_wa_cities.head(5)

# Need help here!!

# We need a value for the weight of each, I have been trying to put the count per state as a new "visits per state" column but I can't get it to read the state from the state column and append the sightings in that state to each row. There has to be a column with a numerical range that can be set to weight otherwise the map won't populate

# Maybe cities is better than state? Might be a TA question I cannot figure this out

In [None]:
# US heatmap for alien visits over the years: prepare the inputs
# Configure gmaps with the API key imported from config
gmaps.configure(api_key = g_key)

# Use the latitude and longitude columns as the locations variable

# NOTE for the team: the longitude column name has a trailing space
# ("longitude "), so the space must be included when selecting it
locations = us_ufo_data[["latitude", "longitude "]]

# Weight column for the heat layer. Every sighting currently gets the same
# constant weight, so no location counts more than any other.
# Open question from the team: should per-state or per-city counts be used
# as the weight instead?
us_ufo_data["visits"] = 5
us_ufo_data
#locations.count()

In [None]:
### I DONT THINK WE NEED THIS

#state_value = us_ufo_data["state"]

# for x in state_value:
#     print(x)
#     if x == state_visits.get_group("states")["time"]:
#          print("match1")
#          us_ufo_data["visits per state"] = state_visits["time"]
                   
#us_ufo_data

In [None]:
#state_visits["state"][2]

In [None]:
# Build the base map
# Per class discussion, giving the map an explicit center and zoom level helps
# it display properly; both values below were found through trial and error
fig = gmaps.figure(zoom_level = 3.8, center = (41, -87))

# Heat layer built from the sighting locations, weighted by the "visits" column
heat_layer = gmaps.heatmap_layer(
    locations,
    weights = us_ufo_data["visits"],
    dissipating = False,
    max_intensity = 100,
    point_radius = 0.3,
)

# Attach the layer to the map
fig.add_layer(heat_layer)

# Display figure
fig

### What are they driving??

In [None]:
# Show the illustration for this section, loaded from the Images folder
Image(filename = "Images/space_tesla.jpg")

In [None]:
# What are the aliens driving? Use the shape recorded for each sighting.
alien_cars = us_ufo_data.groupby("shape")

# Sightings per shape = number of "time" entries in each shape bucket
shape_count = alien_cars["time"].count()

# Move "shape" out of the index into a column, then rank most-seen first
alien_fav_whip = shape_count.reset_index().sort_values(by = "time", ascending = False)
alien_fav_whip

In [None]:
# Narrow the ranking down to the most popular UFO shapes
# "Top" here means shapes that have been spotted more than 2000 times
top_alien_fav_whips = alien_fav_whip[alien_fav_whip["time"].gt(2000)]
top_alien_fav_whips

# Not sure if we should use the pie chart or the horizontal bar for this? Since we are not looking at 100% with the top shapes, maybe the bar is better?

In [None]:
# Pie chart of how the sightings split between the top UFO shapes
# Slice sizes are the sighting counts; slice labels are the shape names
sizes = top_alien_fav_whips["time"]
labels = top_alien_fav_whips["shape"]

plt.figure(figsize = (35, 7))

# Draw the slices with one-decimal percentage labels, rotated to start at 120 degrees
plt.pie(x = sizes, labels = labels, startangle = 120, autopct = "%1.1f%%")
plt.title("What is the Toyota Camry of UFO's?")
plt.legend(labels, bbox_to_anchor = (-0.1, 1.), loc = "best")

# Display resulting plot
plt.show()

In [None]:
# Horizontal bar chart of the top UFO shapes

# Reset matplotlib's settings to their defaults before building the figure
plt.rcdefaults()
fig, ax = plt.subplots()

# One bar per top shape; bar length is the number of times it was seen
shapes = top_alien_fav_whips["shape"]
times_seen = top_alien_fav_whips["time"]
y_pos = np.arange(shapes.size)

ax.barh(y_pos, times_seen, align = "center")
ax.set_yticks(y_pos)
ax.set_yticklabels(shapes)
# Flip the y axis so the labels read top-to-bottom
ax.invert_yaxis()
ax.set(xlabel = "Number of times seen", title = "What is the Toyota Camry of UFO's")

plt.show()

So maybe the aliens aren't driving around in Teslas... yet. It seems like they travel in style, or at least with their headlights on, as 'light' is the number one descriptor for UFO sightings, with nearly twice as many sightings as the next closest, the ever-frightening 'triangle'.

### Best time for cow abduction?

In [None]:
# Show the illustration for this section, loaded from the Images folder
Image(filename = "Images/cow.jpg")

In [None]:
# Preview the first rows again before working with the time column
us_ufo_data.head()

In [None]:
# Re-check the column dtypes (the next step relies on "time" being text)
us_ufo_data.dtypes

In [None]:
# Bucket every sighting into a one-hour segment, stored in a new "hour" column

# Strip the colons so each time becomes one integer, then drop its last two
# digits (divide by 100 and round down) to leave only the hour
clock_digits = us_ufo_data["time"].str.replace(":", "").astype("int")

# Taking the remainder after dividing by 24 folds a "24" hour value into the
# 0 hour (midnight) — like fizzbuzz!
us_ufo_data["hour"] = np.floor(clock_digits / 100) % 24

us_ufo_data.head()

In [None]:
# We will look at the hour recorded for each sighting to determine peak times of alien sightings (cow abductions!!)
time_group = us_ufo_data.groupby("hour")

# Get the count of visits by seeing how many times each hour is recorded
num_visits = time_group["hour"].count()

# Get the list of hours (in military time) from the counts' own index, so each
# hour is always paired with its own count. groupby returns the hours already
# sorted; a separate unique() list could differ in length from the counts
# (for example when an hour value is missing).
times_of_day = num_visits.index.to_numpy()

num_visits

In [None]:
# Scatter plot: does the time of day relate to the number of sightings?

# Use a wide figure so the chart is easy to read
plt.figure(figsize = (15, 5))

# x axis is the time of day, y axis is the number of visits (sightings)
plt.scatter(x = times_of_day, y = num_visits)

# One tick for every hour from 0 through 23
plt.xticks(np.arange(24))

# Title, axis labels and grid
plt.title("Number of Sightings at Time of Day")
plt.xlabel("Time of day")
plt.ylabel("Number of sightings")
plt.grid()

# Best practice to use plt.show()
plt.show()

Here we can see that most recorded sightings of UFOs occur during the evening and night hours. This would make sense, as it is darker and probably easier to see their spaceships at night than during the day.

### Do they keep coming back?

In [None]:
# Show the illustration for this section, loaded from the Images folder
Image(filename = "Images/old_ufo.jpg")

In [None]:
# Are sightings becoming more or less frequent over the years?

# Break each "date" string into its three "/"-separated pieces and store them
# as new columns; the year piece is what the per-year counts are built from
# NOTE(review): the first two pieces are labelled day then month — confirm the
# file really stores its dates in that order
calendar = us_ufo_data["date"].str.split(pat = "/", n = 2, expand = True)
for position, part_name in enumerate(("day", "month", "year")):
    us_ufo_data[part_name] = calendar[position]

us_ufo_data.head()

In [None]:
# We will look at the year recorded for each sighting
year_group = us_ufo_data.groupby("year")

# Get the count of each time there is a UFO sighting that year
visits_per_year = year_group["year"].count()

# Get the years recorded from the counts' own index, so each year is always
# paired with its own count. groupby returns the years already sorted; a
# separate unique() list could differ in length from the counts (for example
# when a year value is missing).
years = visits_per_year.index.to_numpy()

visits_per_year


In [None]:
# Combine the year labels and their sighting counts into one table, replacing
# the year-based index with a plain 0..n-1 index
total_visits = pd.DataFrame(
    {"years": years, "visits": visits_per_year}
).reset_index(drop = True)
total_visits



In [None]:
# Convert the sighting counts to a float dtype so we can run a regression
# (downcast="float" asks pandas for the smallest float dtype that fits)
total_visits['visits']= pd.to_numeric(total_visits['visits'], downcast="float")
total_visits

In [None]:
# Use the iloc to find the data for 1992 to current 
#total_visits.iloc[60]

In [None]:
# Make a DataFrame of the recent UFO data: 1992 through the latest year.
# Select on the year value itself rather than a fixed row position, so the
# subset stays correct even if the number of earlier years in the data changes.
recent_visitis = total_visits.loc[total_visits["years"].astype(float) >= 1992]
recent_visitis

In [None]:
# Plot the number of sightings per year with a line chart

# Update the figure size so that it will be easily readable
plt.figure(figsize = (15, 5))

# Line chart to see if the sightings over the years in the US follow a pattern
# years is our x axis and visits per year is our y axis
plt.plot(years, visits_per_year, marker = "^")

# Label every 10th year. The tick values are positions within the list of
# years, so the upper bound follows the number of years actually present
# instead of a hard-coded count.
plt.xticks(np.arange(0, len(years), 10))

# Set the title and x and y labels of the plot
plt.title("Number of Visits per year")
plt.xlabel("Year")
plt.ylabel("Number of sightings")

# Best practice to use plt.show()
plt.show()

As we can see in the line chart above, there has been an exponential increase in UFO sightings since the mid-90s. I wonder if there were any events around this time that would have caused people to look up at the sky more often?

### Noah Code Ends here

## Deeper Analysis
### Mahnoor Code Starts here

### Regression or hypothesis testing

In [None]:
# Regression over every recorded year: year on x, sightings that year on y
x_values = total_visits['years'].astype(float)
y_values = total_visits['visits']

# Pearson correlation between the year and the yearly sighting count
correlation = st.pearsonr(x_values,y_values)

# Print the answer to above calculation
print(f"""The correlation between year and number of UFO sightings
is {round(correlation[0],2)}.""")

# Calculate the linear regression once and evaluate the fitted line at each year
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept

# Human-readable equation of the fitted line, used to annotate the plot
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

print(line_eq)

In [None]:
# Scatter of the yearly sighting counts with the fitted regression line on top
fig1, ax1 = plt.subplots(figsize=(10, 8))
plt.scatter(x_values,y_values,s=75, color="green")
plt.plot(x_values,regress_values,"r-")

# Add labels and title to plot (the data plotted here spans every recorded
# year, not just the last decade)
plt.title('UFO sightings per year, all recorded years',fontsize =12)
plt.xlabel('Years',fontsize =14)
plt.ylabel('UFO Sightings',fontsize =10)

# Annotate linear regression: the equation text is placed at the axes-fraction
# position given by xytext
ax1.annotate(line_eq, xy=(2010, 1995), xycoords='data',xytext=(0.5, 0.8),
             textcoords='axes fraction',horizontalalignment='right',
             verticalalignment='top',fontsize=10,color="green")

print(f"The r-squared is: {rvalue**2}")

#plt.savefig("../Images/ufo_decade.png", bbox_inches = "tight")

plt.show()

In [None]:
# Regression over the recent years only: year on x, sightings that year on y
x_values = recent_visitis['years'].astype(float)
y_values = recent_visitis['visits']

# Pearson correlation between the year and the yearly sighting count
correlation = st.pearsonr(x_values,y_values)

# Print the answer to above calculation
print(f"""The correlation between year and number of UFO sightings
over the recent years is {round(correlation[0],2)}.""")

# Calculate the linear regression once and evaluate the fitted line at each year
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept

# Human-readable equation of the fitted line, used to annotate the plot
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

print(line_eq)


In [None]:
# Evaluate the fitted line at a chosen year to predict its number of sightings;
# change 2015 to any other year we want a prediction for
prediction = intercept + slope * 2015
prediction

In [None]:
# Scatter of the recent yearly sighting counts with the fitted regression line on top
fig1, ax1 = plt.subplots(figsize=(10, 8))
plt.scatter(x_values,y_values,s=75, color="green")
plt.plot(x_values,regress_values,"r-")

# Add labels and title to plot
plt.title('UFO Sightings 1992 to 2014',fontsize =12)
plt.xlabel('Years',fontsize =14)
plt.ylabel('UFO Sightings',fontsize =10)

# Annotate linear regression: the equation text is placed at the axes-fraction
# position given by xytext
ax1.annotate(line_eq, xy=(2010, 1000), xycoords='data',xytext=(0.5, 0.8),
             textcoords='axes fraction',horizontalalignment='right',
             verticalalignment='top',fontsize=10,color="green")

print(f"The r-squared is: {rvalue**2}")

#plt.savefig("../Images/ufo_1992_2015.png", bbox_inches = "tight")

plt.show()

In [None]:
# Regression possibilities

#    Can we predict the number of total sightings for 2013?
    
#    What city will have the highest visits in 2013?
#    Can we predict what region an alien is most likely to visit on their next vacation to earth?

In [None]:
# Hypothesis testing possibilities (if we had more time we could do this part, but for now we have to skip it :( )

#    We believe that aliens prefer to visit at night, and that sightings will most likely happen between 10pm and 2am
#    We think us excitement around aliens peaks around halloween, so the majority of sightings will be in the fall (Sept - Nov)
#    We believe the midwest, not the southwest (area 51) will have the most sightings (More cows to abduct!!)
#    Is area 51 (Nevada and surrounding area) truly the hub of all recorded alien activity??

In [None]:
## We could potentially look at the 2016 csv we almost used to see how accurate our predictions were??
## Idk if that would be a stretch or like bad to compare (apples to oranges?)

### Mahnoor Code Ends here