In [8]:
#import relevent libraries
import pandas as pd
import seaborn as sns
import matplotlib as plt


In [9]:
#load data into dataframe

file = "allegations_202007271729.csv"

complaints_df = pd.read_csv(file)



In [28]:
# Functions for EDA, cleaning and for later QA

def create_copy(dataframe):
    return dataframe.copy()

def get_info(dataframe):
    return dataframe.info()

def data_shape(dataframe):
    if dataframe.shape() != (0,0):
        return dataframe.shape()
    else:
        return "There is an error with your dataframe"
    
def dataframe_preview(dataframe):
    return dataframe.head()

def describe_df(dataframe):
    return dataframe.describe()

def unique_counts(dataframe):
    return dataframe.apply(pd.Series.value_counts)

def check_dups(dataframe):
    duplicates = dataframe[dataframe.duplicated()]
    num_dups = len(duplicates)
    if num_dups > 0:
        return f"There are {num_dups} duplicates in your dataset", duplicates
    else:
        return f"There are {num_dups} duplicates in your dataset"

def check_nulls(dataframe):
    num_nulls = dataframe.isnull().sum()
    return num_nulls

def drop_dups(dataframe):
    dataframe = dataframe.drop_duplicates()
    return dataframe

def drop_nulls(dataframe):
    dataframe = dataframe.dropna()
    return dataframe

def change_data_type(dataframe, column_names, data_type):
    dataframe[column_names].astype(data_type)
    return dataframe

#function to combine month and year
def make_complaint_date(dataframe, year_column, month_column):
    dataframe['complaint_date'] = pd.to_datetime(dataframe[year_column].astype(str) + '-' + dataframe[month_column].astype(str) + '-01')
    return dataframe

def make_resolved_date(dataframe, year_column, month_column):
    dataframe['resolved_date'] = pd.to_datetime(dataframe[year_column].astype(str) + '-' + dataframe[month_column].astype(str) + '-01')
    return dataframe

#check dates are right data type
def check_dates(dateframe, column_name):
    return 
    
#Change M and F to Male and Female respectively
def longform_sex(dataframe, gender_column):
    dataframe[gender_column] = dataframe[gender_column].replace('M', 'Male').replace('F', 'Female')
    return dataframe

    



EDA Findings:
27 columns with 33,357 rows
Appears to be missing data in some rows
Dates need to be made, currently separate columns for month and day
unique_mos_id, shield_no, complaint_id, precinct are ints and would be better in string format
1985 to 1998 had a null rate of 92% or higher for complainant age, gender, and ethnicity
only 4 rows for the year 2020
noticed discrepancy between gender classification for police and complainants - changed to full form
Given the large number of columns, relevent columns will save to a separate file

In [None]:
get_info(complaints_df)
#code below is to see "missing" columns
#get_info(complaints_df.iloc[:,20:27])

In [None]:
#get preview of columns, abbreviations for commands and ranks need to be converted to full form
dataframe_preview(complaints_df)

In [None]:
#unique_mod_id, shield_no, complaint_id, precinct are all ints and would be better as strings, same with year and month columns
#complainant age has a min age of -4301, further analysis will be required
#lots of missing values in complainant age
describe_df(complaints_df)

In [None]:
#check duplicates
#631 duplicates in the data
check_dups(complaints_df)

In [None]:
#check nulls
#lots of nulls in command at incident, complainant ethnicity, age, and gender these are the important ones
check_nulls(complaints_df)

In [None]:
#check rate of nulls by year for complainant ethnicity, age, and gender
#check nulls by year
by_year = complaints_df.groupby('year_received')
null_counts = by_year[['complainant_age_incident', 'complainant_ethnicity', 'complainant_gender']].apply(lambda x: x.isnull().sum())
#null_counts

null_counts['total rows'] = by_year['unique_mos_id'].count()

null_counts['percent of total age'] = null_counts['complainant_age_incident']/null_counts['total rows']
null_counts['percent of total ethnicity'] = null_counts['complainant_ethnicity']/null_counts['total rows']
null_counts['percent of total gender'] = null_counts['complainant_gender']/null_counts['total rows']
null_percentages = null_counts[['percent of total age', 'percent of total ethnicity', 'percent of total gender', 'total rows']]
null_percentages 

In [None]:
#count of rows by age. 
#Remove ages 9 and under as rows limited.
count_age = complaints_df.groupby('complainant_age_incident').count()
count_age.head(20)

Data Cleaning

In [24]:
#make copy to clean
to_clean = create_copy(complaints_df)

#filter age group
to_clean = to_clean[to_clean['complainant_age_incident'] >= 10]

#make dates
to_clean = make_complaint_date(to_clean, 'year_received', 'month_received')
to_clean = make_resolved_date(to_clean, 'year_closed', 'month_closed')

#standardize genders
to_clean = longform_sex(to_clean, 'mos_gender')

#change data types
to_clean = change_data_type(to_clean, column_names=['unique_mos_id', 'shield_no', 'complaint_id', 'precinct'], data_type=str)

#Drop nulls and dups
to_clean = drop_nulls(to_clean)

to_clean = drop_dups(to_clean)

#replace nulls with "no record"
final_df = to_clean.fillna("No Records Available")

#Filter years in final_df contains all columns
final_df = final_df[(final_df['year_received'] >= 1999) & (final_df['year_received'] < 2020)]


In [196]:
#save final_df as csv contains all columns
final_df.to_csv('final_df.csv', index=False)

In [207]:
relevent_data = final_df[['complaint_id', 'complaint_date', 'resolved_date', 'contact_reason', 'fado_type', 'allegation', 'outcome_description', 'board_disposition', 
                        'complainant_age_incident', 'complainant_gender', 'complainant_ethnicity', 'unique_mos_id', 'first_name',
                        'last_name', 'command_at_incident', 'command_now', 'rank_incident', 'rank_now',  'mos_age_incident', 'mos_gender', 'mos_ethnicity', 
                        'precinct']]
relevent_data.to_csv('relevent_data.csv', index=False)

QA

In [25]:
check_dups(final_df)

'There are 0 duplicates in your dataset'

In [26]:
check_nulls(final_df)

unique_mos_id               0
first_name                  0
last_name                   0
command_now                 0
shield_no                   0
complaint_id                0
month_received              0
year_received               0
month_closed                0
year_closed                 0
command_at_incident         0
rank_abbrev_incident        0
rank_abbrev_now             0
rank_now                    0
rank_incident               0
mos_ethnicity               0
mos_gender                  0
mos_age_incident            0
complainant_ethnicity       0
complainant_gender          0
complainant_age_incident    0
fado_type                   0
allegation                  0
precinct                    0
contact_reason              0
outcome_description         0
board_disposition           0
complaint_date              0
resolved_date               0
dtype: int64

In [34]:
#check data types of all data
get_info(final_df)
#get truncated data
#get_info(final_df.iloc[:, 20:29])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27599 entries, 0 to 33357
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   complainant_age_incident  27599 non-null  float64       
 1   fado_type                 27599 non-null  object        
 2   allegation                27599 non-null  object        
 3   precinct                  27599 non-null  float64       
 4   contact_reason            27599 non-null  object        
 5   outcome_description       27599 non-null  object        
 6   board_disposition         27599 non-null  object        
 7   complaint_date            27599 non-null  datetime64[ns]
 8   resolved_date             27599 non-null  datetime64[ns]
dtypes: datetime64[ns](2), float64(2), object(5)
memory usage: 2.1+ MB
