# Police data analysis

In [59]:
import pandas as pd

In [60]:
data = pd.read_csv("Police_Data.csv")
data.head()

Unnamed: 0,stop_date,stop_time,country_name,driver_gender,driver_age_raw,driver_age,driver_race,violation_raw,violation,search_conducted,search_type,stop_outcome,is_arrested,stop_duration,drugs_related_stop
0,1/2/2005,1:55,,M,1985.0,20.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
1,1/18/2005,8:15,,M,1965.0,40.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
2,1/23/2005,23:15,,M,1972.0,33.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False
3,2/20/2005,17:15,,M,1986.0,19.0,White,Call for Service,Other,False,,Arrest Driver,True,16-30 Min,False
4,3/14/2005,10:00,,F,1984.0,21.0,White,Speeding,Speeding,False,,Citation,False,0-15 Min,False


In [61]:
data.dtypes

stop_date              object
stop_time              object
country_name          float64
driver_gender          object
driver_age_raw        float64
driver_age            float64
driver_race            object
violation_raw          object
violation              object
search_conducted         bool
search_type            object
stop_outcome           object
is_arrested            object
stop_duration          object
drugs_related_stop       bool
dtype: object

In [62]:
data.index

RangeIndex(start=0, stop=65535, step=1)

In [63]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65535 entries, 0 to 65534
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   stop_date           65535 non-null  object 
 1   stop_time           65535 non-null  object 
 2   country_name        0 non-null      float64
 3   driver_gender       61474 non-null  object 
 4   driver_age_raw      61481 non-null  float64
 5   driver_age          61228 non-null  float64
 6   driver_race         61475 non-null  object 
 7   violation_raw       61475 non-null  object 
 8   violation           61475 non-null  object 
 9   search_conducted    65535 non-null  bool   
 10  search_type         2479 non-null   object 
 11  stop_outcome        61475 non-null  object 
 12  is_arrested         61475 non-null  object 
 13  stop_duration       61475 non-null  object 
 14  drugs_related_stop  65535 non-null  bool   
dtypes: bool(2), float64(3), object(10)
memory usage: 6.6+

In [64]:
data.isnull().sum()

stop_date                 0
stop_time                 0
country_name          65535
driver_gender          4061
driver_age_raw         4054
driver_age             4307
driver_race            4060
violation_raw          4060
violation              4060
search_conducted          0
search_type           63056
stop_outcome           4060
is_arrested            4060
stop_duration          4060
drugs_related_stop        0
dtype: int64

# Q. 1) Instruction ( For Data Cleaning ) - Remove the column that only contains missing values.


In [65]:
data.drop(columns='country_name',inplace=True)

In [66]:
data.columns

Index(['stop_date', 'stop_time', 'driver_gender', 'driver_age_raw',
       'driver_age', 'driver_race', 'violation_raw', 'violation',
       'search_conducted', 'search_type', 'stop_outcome', 'is_arrested',
       'stop_duration', 'drugs_related_stop'],
      dtype='object')

# Q. 2) Question ( Based on Filtering + Value Counts ) - For Speeding , were Men or Women stopped more often ? 


In [67]:
data[data['violation'] =='Speeding'].driver_gender.value_counts()

M    25517
F    11686
Name: driver_gender, dtype: int64

# Q. 3) Question ( Groupby ) - Does gender affect who gets searched during a stop ?


In [68]:
data.groupby('driver_gender').search_conducted.count()

driver_gender
F    16310
M    45164
Name: search_conducted, dtype: int64

# Q. 4) Question ( mapping + data-type casting ) - What is the mean stop_duration ?


In [69]:
data['stop_duration'].value_counts()

0-15 Min     47379
16-30 Min    11448
30+ Min       2647
2                1
Name: stop_duration, dtype: int64

In [73]:
series = pd.Series({'0-15 Min' : '7.5', '16-30 Min' : '23', '30+ Min' : '45'})
data['stop_duration']=data['stop_duration'].map(series)

In [76]:
data['stop_duration'].value_counts()

7.5    47379
23     11448
45      2647
Name: stop_duration, dtype: int64

In [84]:
stop_duration_mean = data['stop_duration'].astype(float).mean()
print(f"Stop_duration mean is:{stop_duration_mean}")

Stop_duration mean is:12.001195627419722


# Q. 5) Question ( Groupby , Describe ) - Compare the age distributions for each violation.


In [87]:
data.groupby('violation').driver_age.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
violation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Equipment,6507.0,31.682957,11.380671,16.0,23.0,28.0,39.0,81.0
Moving violation,11876.0,36.736443,13.25835,15.0,25.0,35.0,47.0,86.0
Other,3477.0,40.362381,12.754423,16.0,30.0,41.0,50.0,86.0
Registration/plates,2240.0,32.656696,11.15078,16.0,24.0,30.0,40.0,74.0
Seat belt,3.0,30.333333,10.214369,23.0,24.5,26.0,34.0,42.0
Speeding,37120.0,33.262581,12.615781,15.0,23.0,30.0,42.0,88.0


In [88]:
data.groupby('violation').driver_age.value_counts()

violation  driver_age
Equipment  23.0          385
           21.0          374
           22.0          370
           20.0          348
           24.0          329
                        ... 
Speeding   84.0            3
           86.0            3
           88.0            2
           15.0            1
           83.0            1
Name: driver_age, Length: 333, dtype: int64