# Marc EDA Notebook

* I am going to take the cleaned-up data that Nick and I ended up with and explore it.

In [None]:
# import libs
import acquire as a
import prepare as p
import pandas as pd
import env
# I may come up with model and explore .py modules

# Data Acquisition

### We acquired the data from New York's Open Data portal using the Socrata API and reviewed our observations/features

In [None]:
# Load the raw dataset through the project's acquire module, then print the
# column names, non-null counts and dtypes of the result.
ny_df = a.acquire_ny()
ny_df.info()

In [None]:
# Summary statistics for the raw frame (describe() defaults)
ny_df.describe()

In [None]:
# Peek at the first rows of the raw frame
ny_df.head()

- We cleaned up the data by handling all null values: most nulls were dropped, and grades were adjusted according to score.
- Nick is currently wanting to accomplish the following:
    - ***"I want to consolidate the data down to one row per inspection. Currently there are multiple rows per violation per inspection. I would like to merge each row per inspection down to one row."***
    - ***"Having some difficulties combining rows. I have managed to have some progress using group by along with some aggregate functions. Need to check if combined data is accurate."***

# Data Preparation

- Changes:
    - Function uses an additional line of code that resets index after clean actions.

In [None]:
# Run the project's cleaning routine (prepare.clean_ny) on the raw frame
ny = p.clean_ny(ny_df)

In [None]:
# Column names, non-null counts and dtypes after cleaning
ny.info()

In [None]:
# Summary statistics for the cleaned frame (describe() defaults)
ny.describe()

### No nulls

In [None]:
# Count of missing values in each column
ny.isna().sum()

In [None]:
# Number of distinct values in each column
ny.nunique()

### concatenating the address features together to have a full address column

In [None]:
# Build one address string per row: "<building> <street> <zipcode>".
# zipcode is cast to str so it can be concatenated with the text columns.
zip_as_text = ny['zipcode'].astype(str)
full_addy = ny['building'] + ' ' + ny['street'] + ' ' + zip_as_text
ny['full_address'] = full_addy

In [None]:
# Preview the first five combined addresses
ny['full_address'][0:5]

- dropped the old address features

In [None]:
# The three source columns are now redundant with full_address
ny = ny.drop(columns=['building', 'street', 'zipcode'])

In [None]:
# Confirm the remaining column names
ny.columns

### Now I am looking at the inspection dates for cleaning

In [None]:
# Number of distinct inspection_date values
ny.inspection_date.nunique()

## Grouped by camis and inspection_date and then aggregated violation_code and violation description to lists for respective camis/inspection date observation

In [None]:
# One row per (camis, inspection_date) pair; each violation column is
# collapsed into a Python list holding every value in that group.
inspection_groups = ny.groupby(['camis', 'inspection_date'])
agg_violations = inspection_groups.agg(
    {
        'violation_code': pd.Series.tolist,
        'violation_description': pd.Series.tolist,
    }
)
agg_violations.head(2)

## Looks good

In [None]:
# Look up the aggregated violation lists for one specific
# (camis, inspection_date) key of the grouped frame
agg_violations.loc[(30075445, '2021-08-10T00:00:00.000')]

## Added the codes and descriptions to lists, then assigned them back to each row by its group (camis/date combination)

In [None]:
# Attach each inspection's full violation lists to every row of that inspection.
#
# A single left join on (camis, inspection_date) replaces the per-row
# `agg_violations.loc[...]` lookups. The join keeps ny's row order and index,
# resolves each key once instead of twice, and avoids positional `[0]` / `[1]`
# access on a label-indexed Series (deprecated in recent pandas).
inspection_keys = ['camis', 'inspection_date']
per_row_lists = ny[inspection_keys].join(agg_violations, on=inspection_keys)

# Kept as plain lists (one entry per row of ny) under their original names,
# since they are notebook globals other cells may inspect.
agg_data_code = per_row_lists['violation_code'].tolist()
agg_data_description = per_row_lists['violation_description'].tolist()

ny['violation_code'] = agg_data_code
ny['violation_description'] = agg_data_description

## Look at first 3 to see changes 

In [None]:
# First three rows, to check the list-valued violation columns
ny.head(3)

## Sorting by inspection date to see whether multiple dates are listed for a specific business: this looks like it worked, and we can see the lists in the violation code/description columns

In [None]:
# Rows whose dba is '$1 PIZZA', ordered by inspection_date; show the first three
ny[ny.dba == '$1 PIZZA'].sort_values('inspection_date').head(3)

In [None]:
# First key of the grouped frame's index
agg_violations.index[0]

In [None]:
# Number of entries collected in agg_data_code
len(agg_data_code)

In [None]:
# Number of distinct camis values under each dba
ny.groupby('dba').camis.nunique()

In [None]:
# Re-display the first three rows
ny.head(3)

In [None]:
# (rows, columns) of the working frame
ny.shape

## We need to ensure datatypes are correct

In [None]:
# Review current dtypes before converting them
ny.info()

## There were 89 NaN values for phone, so I will fill them with a numeric placeholder (-1) meaning 'no phone'

In [None]:
# Shape of the subset of rows whose phone is missing
ny[ny['phone'].isna()].shape

In [None]:
# Replace missing phone numbers with -1, a numeric "no phone" placeholder.
# The result is assigned back to the column rather than using
# `ny['phone'].fillna(..., inplace=True)`: an in-place call on a selected
# column is deprecated in recent pandas and does not update the frame once
# copy-on-write is enabled.
ny['phone'] = ny['phone'].fillna(-1)

# Show the column to verify the NaN values were replaced
print(ny['phone'])

In [None]:
# Parse phone as a number; anything that is not purely numeric becomes NaN.
phone_numeric = pd.to_numeric(ny['phone'], errors='coerce')

# Map the values that failed to parse onto the same -1 placeholder used for
# missing phones. Without this, the integer cast below raises on NaN.
# 'int64' is requested explicitly because a platform-default int can be
# 32-bit, which is too small for a 10-digit phone number.
ny['phone'] = phone_numeric.fillna(-1).astype('int64')

# Check the data type of the column after conversion
print(ny['phone'].dtype)

In [None]:
# Date columns -> datetime64
for date_col in ('inspection_date', 'record_date'):
    ny[date_col] = pd.to_datetime(ny[date_col])

# Label columns -> categorical
for label_col in ('critical_flag', 'grade'):
    ny[label_col] = pd.Categorical(ny[label_col])

# score -> numeric; values that cannot be parsed become NaN
ny['score'] = pd.to_numeric(ny['score'], errors='coerce')

# Show the resulting dtype of every column
print(ny.dtypes)

In [None]:
# Confirm dtypes and non-null counts after the conversions
ny.info()

## Flattened the DF in case Nick wants to use it.

## - This has almost 5 Million observations to explore

In [None]:
# Flatten the list-valued violation columns into one row per violation.
#
# Both columns are exploded in a single call so that the i-th code stays
# paired with the i-th description. Exploding them one after the other
# would instead produce every code x description combination for each row,
# mispairing the values and multiplying the row count.
# (Multi-column explode needs pandas >= 1.3; the two lists in a row must
# have equal length, which holds because both come from the same group.)
#
# explode() returns a new frame, so `ny` itself is left untouched, and
# ignore_index=True gives the result a fresh 0..n-1 index.
flattened_ny = ny.explode(
    ['violation_code', 'violation_description'],
    ignore_index=True,
)

In [None]:
# Column names, non-null counts and dtypes of the flattened frame
flattened_ny.info()

In [None]:
# Print the flattened frame's (rows, columns), then blank lines as a spacer,
# then display its first rows
print(flattened_ny.shape)
print('\n\n\n')
flattened_ny.head()

<!-- ## John Requested Matching Criteria   -->

# Exploratory Data Analysis - Statistical Analysis

In [None]:
## Here I am using the cleaned dataframe

In [None]:
# Starting point for EDA: structure of the cleaned (list-valued) frame
ny.info()

In [None]:
# First rows of the cleaned frame
ny.head()

## Installed Plotly for interactive visuals

In [None]:
# Notebook-only command (not plain Python syntax): installs plotly into the
# environment so it can be imported in the next cell.
pip install plotly

In [None]:
# visual imports
import seaborn as sns
import matplotlib.pyplot as plt
import plotly

## Doing some seaborn exploration

  * Note that I am using the flattened_ny dataframe because list values in the violation code/description columns cannot be visualized directly

In [None]:
# # should drop lat and long; don't need to see them
# flattened_ny = flattened_ny.drop(columns=(['latitude', 'longitude']))

# # Separate columns into numeric and categorical types
# numeric_columns = flattened_ny.select_dtypes(include=['int64', 'float64']).columns
# categorical_columns = flattened_ny.select_dtypes(include=['object']).columns

# # Visualize numeric features (e.g., histograms)
# for column in numeric_columns:
#     plt.figure()
#     sns.histplot(flattened_ny[column])
#     plt.title(f'Distribution of {column}')
#     plt.show()

# # Visualize categorical features (e.g., bar plots)
# for column in categorical_columns:
#     plt.figure(figsize=(10, 5))
#     sns.countplot(data=flattened_ny, x=column)
#     plt.title(f'Distribution of {column}')
#     plt.xticks(rotation=45)  # Rotate x-axis labels for readability
#     plt.show()


## I found that the distributions were readable for some features, but for dba, violation code, violation description, cuisine_description, inspection_type, and full_address either the x labels were too difficult to read or the plot had so many bars that they all looked very short.

In [None]:
# NOTE: latitude and longitude could be dropped from flattened_ny before plotting:
# flattened_ny = flattened_ny.drop(columns=['latitude', 'longitude'])

# Columns to chart as raw frequency counts
categorical_columns = ['score', 'boro', 'cuisine_description', 'action', 'violation_code', 'inspection_type']

# One large vertical count plot per column
for col_name in categorical_columns:
    plt.figure(figsize=(24, 18))
    sns.countplot(x=col_name, data=flattened_ny)
    plt.title(f'Distribution of {col_name}')
    # Vertical tick labels so long category names do not overlap
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# Columns to chart; violation_description is summarised by its first word
categorical_columns = ['score', 'boro', 'cuisine_description', 'action', 'violation_code', 'violation_description', 'inspection_type']

# Visualize categorical features with horizontal bar plots
for column in categorical_columns:
    plt.figure(figsize=(24, 18))

    if column == 'violation_description':
        # Full descriptions are too long to chart, so count the first word of
        # each one. The helper column is stored on the frame.
        flattened_ny['violation_first_word'] = flattened_ny['violation_description'].str.split().str[0]
        counts = flattened_ny['violation_first_word'].value_counts()
        title = f'Distribution of {column} (First Word)'
    else:
        counts = flattened_ny[column].value_counts()
        title = f'Distribution of {column}'

    # value_counts() is computed once and supplies both the bar lengths and
    # their labels, so each label always belongs to its bar. (Overwriting the
    # tick labels with .unique() would list categories in first-appearance
    # order, which does not match the frequency-sorted bars.)
    # Labels are cast to str so numeric categories such as score are still
    # treated as categories on the y-axis, keeping the bars horizontal.
    bar_labels = counts.index.astype(str).tolist()
    sns.barplot(x=counts.tolist(), y=bar_labels)
    plt.xlabel('count')
    plt.title(title)

    plt.tight_layout()  # Automatically adjusts subplot parameters for better spacing
    plt.show()


## Now I will do a pair plot and look at the results for the numerical features

In [None]:
# Re-check the flattened frame's columns and dtypes before plotting
flattened_ny.info()

In [None]:
# Grid of pairwise plots over the flattened frame.
# NOTE(review): the frame is large, so this cell may be slow — consider sampling.
sns.pairplot(flattened_ny)
plt.show()

In [None]:
# Histogram of score using 20 bins
sns.histplot(data=flattened_ny, x='score', bins=20)
plt.show()

In [None]:
# Bar plot relating score (x) to grade (y)
sns.barplot(data=flattened_ny, x ='score', y = 'grade')
plt.show()

In [None]:
# business counts
dba_count = flattened_ny['dba'].value_counts()
dba_count

In [None]:
# Count plot restricted to the most frequent business names.
top_n = 20  # Adjust the number of top values to display

# value_counts() sorts by frequency, so the first top_n index entries are the
# most common dba values; `order` limits the plot to just those bars.
top_dbas = flattened_ny['dba'].value_counts().head(top_n).index

plt.figure(figsize=(24, 18))
sns.countplot(data=flattened_ny, x='dba', order=top_dbas)
plt.xticks(rotation=45)
plt.title(f'Top {top_n} dba Values')
# Render explicitly, like the other plot cells, so the cell output is the
# figure rather than the repr of the title object.
plt.show()

In [None]:
# Summary statistics for the cleaned frame after the dtype conversions
ny.describe()

# Data Modeling