# Data Analysis
This file contains code to explore the "hospital.csv" dataset by looking at items such as column names, data types, value counts, unique values, and null values. Additionally, there are preliminary visualizations to begin understanding relationships between columns. The main purpose of this file is to explore the dataset for the first time in order to begin planning future steps for data cleaning and, eventually, for models.

In [None]:
# Data Processing
import os
import numpy as np
import pandas as pd
import missingno as msno
# Basic Visualization tools
import matplotlib.pyplot as plt
import warnings
import plotly.express as px
# Suppress all warnings from this point on so cell output stays uncluttered
warnings.filterwarnings(action='ignore')
# Render matplotlib figures at 300 DPI
plt.rcParams.update({'figure.dpi': 300})


In [None]:
# Read hospital.csv (path is relative to the working directory) into a DataFrame
data = pd.read_csv(filepath_or_buffer='hospital.csv')

## Analysis

In [None]:
# Print each column name and its respective data type as a [name, dtype] pair.
# (The former bare `np.array(columns_list)` statement built an array and
# discarded it, so it has been removed.)
columns_list = data.columns.tolist()
for column in columns_list:
    print([column, data[column].dtype])

In [None]:
# Print the first 2 rows of data
print("First 2 rows of data:\n")
print(data.head(2))

# Print info about data types.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
data.info()

In [None]:
# Count null values per column.
# isnull() already returns a boolean frame, so summing it directly counts the
# True entries of each column; no "== True" comparison is needed.
print(data.isnull().sum())

In [None]:
# Look specifically at the target column ("Length of Stay"):
# how many times each distinct value occurs
length_of_stay = data["Length of Stay"]
print(length_of_stay.value_counts())

# and the relative frequency of each value
print(length_of_stay.value_counts(normalize=True))

In [None]:
# Show the distinct values found in every column of the dataset
columns_list = data.columns.tolist()
# Lazy generator: each column's unique values are computed right before printing
unique_per_column = (data[name].unique() for name in columns_list)
for column, unique_values in zip(columns_list, unique_per_column):
    print(f"{column} : {unique_values} ")

## Visualizations

In [None]:
# Visualization to see which columns have null/missing values:
# draws missingno's matrix plot over the entire DataFrame.
msno.matrix(data)

In [None]:
# Scatter plot of length of stay for each hospital service area,
# with points coloured by CCSR diagnosis code.
fig = px.scatter(data, x="Hospital Service Area", y="Length of Stay", color="CCSR Diagnosis Code")
# NOTE: the former `fig.update_layout(barmode='group')` call was removed;
# barmode only controls how bar traces are arranged and does nothing for a
# scatter figure.
fig.show()

In [None]:
# Scatter plot of length of stay against total costs,
# with points coloured by age group.
fig = px.scatter(
    data_frame=data,
    x='Total Costs',
    y='Length of Stay',
    color='Age Group',
)
fig.show()