In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix, roc_curve, auc

# Baseline Model
from sklearn.linear_model import LogisticRegression

# Tree-Based Models
from sklearn.ensemble import RandomForestClassifier

# Gradient Boosting Models
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Scaling (for Logistic Regression)
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the national parks CSV into a DataFrame using pandas' default parsing
# options (no extra read_csv arguments are passed).
df = pd.read_csv(filepath_or_buffer='us_national_parks.csv')

# Preview the first five rows to sanity-check the load.
display(df.head(n=5))

Unnamed: 0,Park Name,Latitude,Longitude,Primary Location,Second Location,Third Location,Date established as park,Area in acres,Area in km squares,Recreation visitors (2021),Description
0,Acadia,﻿44.35°N,68.21°W,Maine,,,"February 26, 1919",49071.4,198.6,4069098,Covering most of Mount Desert Island and other...
1,American Samoa,﻿14.25°S,170.68°W,American Samoa,,,"October 31, 1988",8256.67,33.4,8495,The southernmost national park is on three Sam...
2,Arches,﻿38.68°N,109.57°W,Utah,,,"November 12, 1971",76678.98,310.3,1806865,"This site features more than 2,000 natural san..."
3,Badlands,﻿43.75°N,102.50°W,South Dakota,,,"November 10, 1978",242755.94,982.4,1224226,"The Badlands are a collection of buttes, pinna..."
4,Big Bend,﻿29.25°N,103.25°W,Texas,,,"June 12, 1944",801163.21,3242.2,581220,Named for the prominent bend in the Rio Grande...


In [3]:
# Report the DataFrame's dimensions: row count first, then column count.
n_rows = len(df)
n_cols = len(df.columns)
print(f"The DataFrame has {n_rows} rows and {n_cols} columns.")

The DataFrame has 63 rows and 11 columns.


In [4]:
# Normalize the column names: replace spaces with underscores and lowercase
# everything. Other punctuation (e.g. parentheses) is left as-is.
df.columns = (
    df.columns
    .str.replace(' ', '_')
    .str.lower()
)

In [5]:
# Remove, in place, the columns that are not needed for the analysis.
df.drop(
    labels=[
        'park_name',
        'second_location',
        'third_location',
        'date_established_as_park',
        'description',
    ],
    axis='columns',
    inplace=True,
)

In [6]:
# Print a concise summary of the DataFrame: index range, column names,
# non-null counts, dtypes, and approximate memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63 entries, 0 to 62
Data columns (total 6 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   latitude                    63 non-null     object
 1   longitude                   63 non-null     object
 2   primary_location            63 non-null     object
 3   area_in_acres               63 non-null     object
 4   area_in_km_squares          63 non-null     object
 5   recreation_visitors_(2021)  63 non-null     object
dtypes: object(6)
memory usage: 3.1+ KB


In [7]:
# Display summary statistics for each column. Every remaining column is still
# object dtype at this point (see the df.info() output above), so describe()
# reports count / unique / top / freq rather than mean, std, and quantiles.
display(df.describe())

Unnamed: 0,latitude,longitude,primary_location,area_in_acres,area_in_km_squares,recreation_visitors_(2021)
count,63,63,63,63.0,63.0,63
unique,61,62,30,63.0,62.0,63
top,﻿58.50°N,110.50°W,California,49071.4,108.0,4069098
freq,2,2,9,1.0,2.0,1
