# **Problem Statement**



### **Loading Libraries**

In [None]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/dtype warnings from the libraries
# imported below — consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

from utils.data_loader import load_crime_dataset

from pathlib import Path

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import geopandas as gpd
import plotly.express as px

# Apply seaborn's "ticks" theme with color codes enabled for subsequent plots

sns.set_theme(style="ticks", color_codes=True)

### **Downloading/Locating dataset**

In [None]:
# Expected on-disk location of the crime dataset CSV
dataset_path = Path('../datasets/chicago-crime-data.csv')

# When no local copy exists, delegate to load_crime_dataset()
# (presumably fetches the file — confirm in utils.data_loader);
# otherwise just report the file that was found.
if not dataset_path.exists():
    load_crime_dataset()
else:
    print(f"File found: {dataset_path.name}")

### **Loading Dataset**

In [None]:
# Read the CSV at dataset_path into a DataFrame
crime_df = pd.read_csv(dataset_path)

# Report the (rows, columns) shape followed by the column labels
print(crime_df.shape, crime_df.columns, sep='\n')

# Preview the first four rows
crime_df.head(4)

In [None]:
# Keep only the first 22 columns; every column from position 22 onward
# is treated as redundant and removed in place.
trailing_columns = crime_df.columns[22:]
crime_df.drop(columns=trailing_columns, inplace=True)

In [None]:
# Count rows that are exact duplicates of an earlier row.
# Series.sum() is vectorized; the builtin sum() would walk the boolean
# Series one element at a time in Python, which is slow on a large frame.
duplicate_count = crime_df.duplicated().sum()
print(f"Duplicated rows detected: {duplicate_count}")

In [None]:
# Number of missing (NaN/None) entries in each column
missing_per_column = crime_df.isna().sum()
missing_per_column

In [None]:
# Remove, in place, every row that has at least one missing value,
# then show the resulting (rows, columns) shape.
crime_df.dropna(axis=0, how='any', inplace=True)
crime_df.shape

In [None]:
# Build a scratch copy so crime_df itself is left untouched
temp = crime_df.copy()

# Parse the 'date' column once and derive the calendar parts from it
parsed_dates = pd.to_datetime(crime_df['date'])
temp['date'] = parsed_dates
temp['year'] = parsed_dates.dt.year
temp['month'] = parsed_dates.dt.month

# Number of records per year
temp.groupby('year').size()

## **Segregate Numerical and Categorical features**

In [None]:
# Print a summary of the frame: column dtypes and non-null counts
crime_df.info()

In [None]:
# dtypes used to split the columns into two groups
numeric_dtypes = ['int64', 'float64']
categorical_dtypes = ['object', 'bool']

# Labels of the numerical columns
num_col = crime_df.select_dtypes(include=numeric_dtypes).columns

# Labels of the categorical columns
cat_col = crime_df.select_dtypes(include=categorical_dtypes).columns

# Show both label sets, separated by a blank line
print(num_col, cat_col, sep='\n\n')

In [None]:
# Preview the first five rows of the categorical columns
crime_df.loc[:, cat_col].head()