# Step 0: Imports and Reading Data


In [None]:
import pandas as pd
import numpy as np
# Use the documented pyplot interface; `matplotlib.pylab` is a legacy
# namespace that Matplotlib discourages. The `plt` alias is unchanged.
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plot style, and show up to 200 columns when a DataFrame is displayed.
plt.style.use('ggplot')
pd.set_option('display.max_columns', 200)

In [None]:
# Load the coaster CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/coaster-practice-dataset/coaster_db.csv')

In [None]:
# Display the DataFrame as loaded, before any cleaning.
df

# Step 1: Data Understanding

* Dataframe shape
* head and tail
* dtypes
* describe



In [None]:
# (number of rows, number of columns) of the raw data
df.shape

In [None]:
# Preview the first rows (head() returns 5 rows by default)
df.head()

In [None]:
# Column labels available in the raw data
df.columns

In [None]:
# Data type pandas inferred for each column
df.dtypes

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

# Step 2: Data Preparation

* Dropping irrelevant columns and rows
* Identifying duplicated columns
* Renaming Columns
* Feature Creation

In [None]:
# List the column labels again to decide which ones to keep
df.columns

In [None]:
# Example of dropping columns
# df.drop(['Opening date'], axis=1)

In [None]:
# Keep only the columns used in the rest of the analysis; every other
# column of the raw file is dropped. `.copy()` gives an independent
# DataFrame rather than a slice of the original.
columns_to_keep = [
    'coaster_name',
    'Location',
    'Status',
    'Manufacturer',
    'year_introduced',
    'latitude',
    'longitude',
    'Type_Main',
    'opening_date_clean',
    'speed_mph',
    'height_ft',
    'Inversions_clean',
    'Gforce_clean',
]
df = df[columns_to_keep].copy()

In [None]:
# Shape after keeping only the selected columns
df.shape

In [None]:
# Data types of the remaining columns
df.dtypes

In [None]:
# The opening date column is stored as object (text); convert it to a
# datetime64[ns] column so it behaves as a real date.
parsed_opening_dates = pd.to_datetime(df['opening_date_clean'])
df['opening_date_clean'] = parsed_opening_dates

In [None]:
# Give the kept columns consistent, capitalised names.
# Columns not listed in the mapping keep their current label.
column_renames = {
    'coaster_name': 'Coaster_Name',
    'year_introduced': 'Year_Introduced',
    'opening_date_clean': 'Opening_Date',
    'speed_mph': 'Speed_mph',
    'height_ft': 'Height_ft',
    'Inversions_clean': 'Inversions',
    'Gforce_clean': 'Gforce',
}
df = df.rename(columns=column_renames)

In [None]:
# Confirm the renamed column labels
df.columns

In [None]:
# Count the missing (NaN) values in each column

df.isna().sum()

In [None]:
# Check for duplicate coaster names: returns a boolean Series that is True
# for every row whose Coaster_Name already appeared in an earlier row

df.duplicated(subset=['Coaster_Name'])


In [None]:
# Show the rows whose coaster name repeats an earlier row

is_repeated_name = df.duplicated(subset=['Coaster_Name'])
df[is_repeated_name]

In [None]:
# Inspect every row recorded for one coaster that appears more than once

is_example_coaster = df['Coaster_Name'] == "Crystal Beach Cyclone"
df[is_example_coaster]

In [None]:
# Number of rows that repeat an earlier row's combination of
# Coaster_Name, Location and Opening_Date

df.duplicated(subset=['Coaster_Name', 'Location', 'Opening_Date']).sum()

In [None]:
# Keep only the first row for each (Coaster_Name, Location, Opening_Date)
# combination, then rebuild the index so it is contiguous again after the
# dropped rows.

dedup_keys = ['Coaster_Name', 'Location', 'Opening_Date']
df = (
    df.drop_duplicates(subset=dedup_keys)
    .reset_index(drop=True)
    .copy()
)

In [None]:
# Shape after removing the duplicate rows
df.shape

# Step 3: Feature Understanding

* Plotting Feature Distributions
  * Histogram
  * KDE
  * Boxplot

In [None]:
# Number of coasters per Year_Introduced value, most frequent year first

df['Year_Introduced'].value_counts()

In [None]:
# Bar chart of the ten years in which the most coasters were introduced

top_years = df['Year_Introduced'].value_counts().head(10)
ax = top_years.plot.bar(title='Top 10 Years Coasters Introduced')
ax.set_xlabel('Year Introduced')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Histogram of coaster speed (mph) using 20 bins

coaster_speed = df['Speed_mph']
ax = coaster_speed.plot.hist(bins=20, title='Coaster Speed (mph)')
ax.set_xlabel('Speed (mph)')
plt.show()

In [None]:
# Kernel density estimate of coaster speed (mph)

coaster_speed = df['Speed_mph']
ax = coaster_speed.plot.kde(title='Coaster Speed (mph)')
ax.set_xlabel('Speed (mph)')
plt.show()

# Step 4: Feature Relationships

* Scatterplot
* Heatmap Correlation
* Pairplot
* Groupby comparisons

In [None]:
# Scatterplot (pandas/Matplotlib) of coaster speed against coaster height

df.plot.scatter(
    x='Speed_mph',
    y='Height_ft',
    title='Coaster Speed vs. Height',
)
plt.show()

In [None]:
# Same speed-vs-height scatterplot drawn with Seaborn, with each point
# coloured by the year the coaster was introduced

sns.scatterplot(
    data=df,
    x='Speed_mph',
    y='Height_ft',
    hue='Year_Introduced',
)
plt.show()

In [None]:
# A pairplot compares more than two features at once: one panel per pair
# of the listed columns, with points coloured by Type_Main.

pairplot_features = [
    'Year_Introduced',
    'Speed_mph',
    'Height_ft',
    'Inversions',
    'Gforce',
]
sns.pairplot(data=df, vars=pairplot_features, hue='Type_Main')
plt.show()

In [None]:
# Pairwise correlations between the numeric features, computed only on
# rows where all five values are present

corr_features = ['Year_Introduced', 'Speed_mph', 'Height_ft', 'Inversions', 'Gforce']
complete_rows = df[corr_features].dropna()
df_corr = complete_rows.corr()
df_corr

In [None]:
# Show the same correlation matrix as a Seaborn heatmap;
# annot=True writes each coefficient inside its cell

sns.heatmap(df_corr, annot=True)
plt.show()

# Step 5: Ask a Question about the data

* Try to answer a question you have about the data using a plot or statistic



**Which locations have the fastest roller coasters on average (among locations with at least 10 coasters)?**

In [None]:
# First, count how many coasters are listed for each Location

df['Location'].value_counts()

In [None]:
# 'Other' appears for many coasters (presumably a catch-all label rather
# than one place), so it is excluded. For every remaining location, compute
# the mean speed and the number of coasters with a recorded speed, keep
# locations with at least 10 such coasters, and plot the means in
# ascending order.

ax = df.query('Location != "Other"') \
    .groupby('Location')['Speed_mph'] \
    .agg(['mean','count']) \
    .query('count >= 10') \
    .sort_values('mean')['mean'] \
    .plot(kind='barh', figsize=(12, 5), title='Average Coaster Speed by Location')
ax.set_xlabel('Average Coaster Speed (mph)')
plt.show()