In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline
# Apply seaborn's default plot theme; color_codes=True remaps matplotlib's
# shorthand color codes ('b', 'g', 'r', ...) to the seaborn palette.
sns.set(color_codes=True)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    file_paths = [os.path.join(dirname, filename) for filename in filenames]
    for file_path in file_paths:
        print(file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the museum directory. low_memory=False makes pandas parse the file in a single
# pass instead of in chunks, which avoids mixed-dtype inference within a column.
df = pd.read_csv('/kaggle/input/museum-directory/museums.csv', low_memory=False)

### Exploratory Data Analysis

In this section, we are going to perform an Exploratory Data Analysis of the Museums data. In this regard, we will undertake 
different descriptive and statistical analyses of the data. We will try to identify the missing values, when present, clean 
them and try to make sense of the remaining data.

In [None]:
# Preview the first rows in order to understand the column labels of the data.
df.head()

In [None]:
# Print the general information about the data: column dtypes and non-null counts.
# It seems there are a lot of missing values in columns such as "Institution Name".

df.info()

Let's see the statistical values of the numerical columns. 
According to the data, the first-quartile income and revenue are 0, i.e. at least a quarter of the museums or similar institutions report 0. 
These institutions either have not provided their earnings or are completely subsidized by the government, 
which exempts them from publishing their earnings. 

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Let's look at the statistical values from a different perspective: summarize the
# object-dtype columns, transposed so that each column becomes a row.
df.describe(include='O').transpose()

In [None]:
# Let's check the shape (rows, columns) of the dataframe.
df.shape

### Removing the irrelevant columns, duplicates and missing values

In this part, we are going to first remove the irrelevant columns. Depending on the data, the columns to remove can differ. In this case, we have identified that there is overlapping information, such as institution addresses or names. This information is not going to add value to our analysis. Moreover, some of the columns have many missing values. Therefore, we decided to drop those columns. Another column that we drop is the Phone Number column.

In [None]:
# Let's get the column names first.

df.columns

In [None]:
# Drop the ID, alternate-name, physical-address and phone-number columns.

df = df.drop(
    columns=[
        'Museum ID',
        'Legal Name',
        'Alternate Name',
        'Street Address (Physical Location)',
        'City (Physical Location)',
        'State (Physical Location)',
        'Phone Number',
        'Employer ID Number',
    ]
)

# Check the head again to confirm that the columns are gone.

df.head()

In [None]:
# Let's check the shape of the dataframe again after dropping the columns.

df.shape

In [None]:
# Let's remove the rows that are exact duplicates across all remaining columns.

df = df.drop_duplicates()

In [None]:
# Let's inspect the data after removing the duplicates. Note that count() reports the
# number of non-null values per column, not the number of rows.

df.count()

In [None]:
# The next thing is to identify the number of missing values per column.
# According to our dataframe, columns such as "Institution Name" or
# "Zip Code (Physical Location)" have the most missing values.

df.isnull().sum()

In [None]:
# Collect the columns in which more than 50% of the values are missing.

most_missing_cols = {
    column
    for column, null_share in df.isnull().mean().items()
    if null_share > 0.50
}

most_missing_cols

In [None]:
# Drop the "Institution Name" and "Zip Code (Physical Location)" columns; we believe
# that omitting them will not make much difference to our analysis.

df = df.drop(columns=['Institution Name', 'Zip Code (Physical Location)'])

In [None]:
# Let's check the head of our dataframe again to make sure that the two columns have been dropped.

df.head()

In [None]:
# Drop every row that still contains at least one missing value, then show the
# per-column non-null counts of what remains.

df = df.dropna(axis=0, how='any')
df.count()

In [None]:
# Collect the columns that have no missing values at all.

no_nulls = {
    column
    for column, null_share in df.isnull().mean().items()
    if null_share == 0
}
no_nulls

Even though the Revenue column appears in the output above as having no missing values, the code below shows that it 
contains rows with 0s rather than missing values. We believe that the institutions with 0 revenue are the ones
that receive support from the government and are not required to file taxes. These 0s distort the mean of our calculations. 
Therefore, we will filter out the rows with 0s as well.

In [None]:
# When we plot the Revenue column with 0s included, the center line of the box is close
# to zero and almost invisible. Note: the center line of a box plot marks the median,
# not the mean. showfliers=False hides the outlier points.
sns.boxplot(x=df['Revenue'], showfliers=False);

In [None]:
# Filter out the rows whose Revenue is exactly 0 and assign the result to a new variable.
# .copy() makes no_zeros an independent frame, so adding a column to it later does not
# raise pandas' SettingWithCopyWarning and can never write through to df.
no_zeros = df[df['Revenue']!=0].copy()

In [None]:
# Let's look at the spread of the revenue with 0s filtered out. As you can observe, the
# center line of the box (the median, not the mean) has moved, even though slightly,
# to the right.
sns.boxplot(x=no_zeros['Revenue'] , showfliers=False);

In [None]:
# The revenue distribution is right-skewed rather than normal, so the title must not
# call it a normal distribution. Use enough bins for the shape to be visible
# (two bins collapse the whole distribution into two bars) and label both axes.
fig, ax = plt.subplots()
ax.hist(x=no_zeros['Revenue'], bins=50)
ax.set(
    title='Distribution of museum revenues (zero-revenue rows excluded)',
    xlabel='Revenue',
    ylabel='Number of museums',
);

In [None]:
# Count the museums per museum type and plot the (up to) 20 most common types.

type_counts = df['Museum Type'].value_counts().nlargest(20)
type_ax = type_counts.plot(kind='bar', figsize=(10, 5))
type_ax.set_title("Number of museum types in the United States")
type_ax.set_ylabel("Number of museums")
type_ax.set_xlabel("Types of museums");

In [None]:
# Let's see which types of museums bring in the most revenue in total.

type_rev = (
    df.groupby('Museum Type')['Revenue']
    .sum()
    .to_frame()
    .sort_values('Revenue', ascending=False)
)
type_rev.plot.bar(figsize=(10, 5));

In [None]:
# Which states earn the majority of the revenue? Sum Revenue per state and plot the top 20.
# NOTE(review): the values are plotted as stored, without rescaling — confirm that the
# "million $" unit in the axis label matches the unit of the Revenue column.
state_rev = (
    df.groupby('State (Administrative Location)')['Revenue']
    .sum()
    .to_frame()
    .sort_values('Revenue', ascending=False)
)
state_rev.head(20).plot.bar(figsize=(10, 5))
plt.ylabel('Revenue (million $)');

In [None]:
# Let's see the top grossing museum locations by city.
# Group by city AND state: a city name alone is not unique, so grouping by the
# city column only would merge same-named cities from different states into one bar.
# NOTE(review): the values are plotted as stored, without rescaling — confirm that the
# "million $" unit in the axis label matches the unit of the Revenue column.
city_rev = df.groupby(
    ['City (Administrative Location)', 'State (Administrative Location)']
).agg({'Revenue':'sum'})
city_rev = city_rev.sort_values(by='Revenue', ascending=False)
city_rev[:20].plot(kind='bar', figsize=(10,5))
plt.xlabel('City, State (Administrative Location)')
plt.ylabel('Revenue (million $)');

In [None]:
# In this section we create a Rank column based on the Revenue column in order to
# rank the museums in terms of their revenues.
#
# Sorting by revenue in descending order and numbering the groups in order of first
# appearance (sort=False) yields a dense rank: rank 1 is the highest revenue, and
# museums with equal revenue share the same rank.

cols = ['Revenue']

revenue_rank = no_zeros.sort_values(cols, ascending=False).groupby(cols, sort=False).ngroup() + 1

# assign() returns a new frame instead of writing into a filtered slice of df, which
# avoids pandas' SettingWithCopyWarning. The ranks are aligned to the rows by index.
no_zeros = no_zeros.assign(Rank=revenue_rank)
no_zeros.head()

In [None]:
# Show the museums ordered by rank (rank 1 = highest revenue).
no_zeros.sort_values('Rank')

In [None]:
# Here we list the museums sorted by their revenues, highest first. When we compare the
# result with the Rank column we see different museum names; this is probably due to
# the rank method (museums with equal revenue share a rank, so their order may differ).
# NOTE(review): this listing is built from df, which still contains the zero-revenue
# rows, whereas Rank was computed on no_zeros.

museum_rev = df.loc[:, ['Museum Name', 'Revenue']].sort_values('Revenue', ascending=False)
museum_rev.head()

### Conclusion

The aim of this exploratory analysis was to perform a statistical analysis of the Museums, Aquariums and Zoos data. As we have seen, 
the dataset contains many null and zero values. In order to get an accurate result, we have dropped the null values and omitted 
the 0 values from our calculations. The dataset contained duplicate or overlapping values as well. We have identified these values 
and dropped them before starting to analyze the data. With the remaining data, we were able to analyze the revenues, identify the high-earning 
cities and states, and plot the results on bar charts. For the analysis to be more accurate, we need more data to evaluate further. 