# Importing libraries

In [212]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)

In [213]:
#loading the Traffic police dataset for Rhode Islands
df=pd.read_csv("datasets/traffic-RI.csv")
df.head()

Unnamed: 0,raw_row_number,date,time,zone,subject_race,subject_sex,department_id,type,arrest_made,citation_issued,warning_issued,outcome,contraband_found,contraband_drugs,contraband_weapons,contraband_alcohol,contraband_other,frisk_performed,search_conducted,search_basis,reason_for_search,reason_for_stop,vehicle_make,vehicle_model,raw_BasisForStop,raw_OperatorRace,raw_OperatorSex,raw_ResultOfStop,raw_SearchResultOne,raw_SearchResultTwo,raw_SearchResultThree
0,1,2005-11-22,11:15:00,X3,white,male,200,vehicular,False,True,False,citation,,,,,,False,False,,,Speeding,,,SP,W,M,M,,,
1,2,2005-10-01,12:20:00,X3,white,male,200,vehicular,False,True,False,citation,,,,,,False,False,,,Speeding,,,SP,W,M,M,,,
2,3,2005-10-01,12:30:00,X3,white,female,200,vehicular,False,True,False,citation,,,,,,False,False,,,Speeding,,,SP,W,F,M,,,
3,4,2005-10-01,12:50:00,X3,white,male,200,vehicular,False,True,False,citation,,,,,,False,False,,,Speeding,,,SP,W,M,M,,,
4,5,2005-10-01,13:10:00,X3,white,female,200,vehicular,False,True,False,citation,,,,,,False,False,,,Speeding,,,SP,W,F,M,,,


# Getting to know the data

In [214]:
df.dtypes

raw_row_number            int64
date                     object
time                     object
zone                     object
subject_race             object
subject_sex              object
department_id            object
type                     object
arrest_made              object
citation_issued          object
outcome                  object
contraband_found         object
contraband_drugs         object
contraband_weapons       object
contraband_alcohol       object
contraband_other         object
frisk_performed          object
search_conducted           bool
search_basis             object
reason_for_search        object
reason_for_stop          object
vehicle_make             object
vehicle_model            object
raw_BasisForStop         object
raw_OperatorRace         object
raw_OperatorSex          object
raw_ResultOfStop         object
raw_SearchResultOne      object
raw_SearchResultTwo      object
raw_SearchResultThree    object
dtype: object

In [215]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 509681 entries, 0 to 509680
Data columns (total 31 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   raw_row_number         509681 non-null  int64 
 1   date                   509671 non-null  object
 2   time                   509671 non-null  object
 3   zone                   509671 non-null  object
 4   subject_race           480608 non-null  object
 5   subject_sex            480584 non-null  object
 6   department_id          509671 non-null  object
 7   type                   509681 non-null  object
 8   arrest_made            480608 non-null  object
 9   citation_issued        480608 non-null  object
 11  outcome                473840 non-null  object
 12  contraband_found       17762 non-null   object
 13  contraband_drugs       15988 non-null   object
 14  contraband_weapons     11795 non-null   object
 15  contraband_alcohol     1217 non-null    object
 16  

In [216]:
df.isnull().sum()

raw_row_number                0
date                         10
time                         10
zone                         10
subject_race              29073
subject_sex               29097
department_id                10
type                          0
arrest_made               29073
citation_issued           29073
outcome                   35841
contraband_found         491919
contraband_drugs         493693
contraband_weapons       497886
contraband_alcohol       508464
contraband_other         491919
frisk_performed              10
search_conducted              0
search_basis             491919
reason_for_search        491919
reason_for_stop           29073
vehicle_make             191564
vehicle_model            279593
raw_BasisForStop          29073
raw_OperatorRace          29073
raw_OperatorSex           29073
raw_ResultOfStop          29073
raw_SearchResultOne      491919
raw_SearchResultTwo      508862
raw_SearchResultThree    509513
dtype: int64

In [217]:
df.describe() # this will not be accurate untill we convert data types

Unnamed: 0,raw_row_number
count,509681.0
mean,254841.0
std,147132.375613
min,1.0
25%,127421.0
50%,254841.0
75%,382261.0
max,509681.0


# Feature Engineering

## Converting the categorical features

In [218]:
continous=[]
categorical=[]
def convert_type(a):
    n=df[a].nunique()
    if n > 20:
        continous.append(a)
    else:
        categorical.append(a)

In [219]:
for index,i in enumerate(df.columns):
    convert_type(i)

In [220]:
continous

['raw_row_number',
 'date',
 'time',
 'department_id',
 'reason_for_search',
 'vehicle_make',
 'vehicle_model']

In [221]:
categorical

['zone',
 'subject_race',
 'subject_sex',
 'type',
 'arrest_made',
 'citation_issued',
 'outcome',
 'contraband_found',
 'contraband_drugs',
 'contraband_weapons',
 'contraband_alcohol',
 'contraband_other',
 'frisk_performed',
 'search_conducted',
 'search_basis',
 'reason_for_stop',
 'raw_BasisForStop',
 'raw_OperatorRace',
 'raw_OperatorSex',
 'raw_ResultOfStop',
 'raw_SearchResultOne',
 'raw_SearchResultTwo',
 'raw_SearchResultThree']

In [222]:
df[categorical].iloc[-20:]

Unnamed: 0,zone,subject_race,subject_sex,type,arrest_made,citation_issued,warning_issued,outcome,contraband_found,contraband_drugs,contraband_weapons,contraband_alcohol,contraband_other,frisk_performed,search_conducted,search_basis,reason_for_stop,raw_BasisForStop,raw_OperatorRace,raw_OperatorSex,raw_ResultOfStop,raw_SearchResultOne,raw_SearchResultTwo,raw_SearchResultThree
509661,X1,,,vehicular,,,,,,,,,,False,False,,,,,,,,,
509662,X1,,,vehicular,,,,,,,,,,False,False,,,,,,,,,
509663,X3,white,female,vehicular,False,True,False,citation,,,,,,False,False,,Other Traffic Violation,OT,W,F,M,,,
509664,K2,white,male,vehicular,False,True,False,citation,,,,,,False,False,,Speeding,SP,W,M,M,,,
509665,K2,white,female,vehicular,False,True,False,citation,,,,,,False,False,,Speeding,SP,W,F,M,,,
509666,K2,hispanic,male,vehicular,False,True,False,citation,,,,,,False,False,,Speeding,SP,H,M,M,,,
509667,X1,,,vehicular,,,,,,,,,,False,False,,,,,,,,,
509668,X4,hispanic,male,vehicular,False,True,False,citation,,,,,,False,False,,Equipment/Inspection Violation,EQ,H,M,M,,,
509669,X4,,,vehicular,,,,,,,,,,False,False,,,,,,,,,
509670,X4,white,male,vehicular,False,True,False,citation,,,,,,False,False,,Speeding,SP,W,M,M,,,


In [223]:
def print_unique(a):
    for i in a:
        print(df[i].unique())

In [224]:
print_unique(categorical)

['X3' 'X4' 'K3' 'K2' 'K1' 'X1' nan]
['white' 'hispanic' 'black' 'other' 'asian/pacific islander' nan]
['male' 'female' nan]
['vehicular']
[False True nan]
[True False nan]
[False True nan]
[nan False True]
[nan False True]
[nan False True]
[nan False True]
[nan False True]
[False True nan]
[False  True]
[nan 'other' 'probable cause' 'plain view']
['Speeding' 'Other Traffic Violation' 'Equipment/Inspection Violation'
 'Motorist Assist/Courtesy' 'Registration Violation' 'Call for Service'
 nan 'Violation of City/Town Ordinance' 'Special Detail/Directed Patrol'
 'APB' 'Warrant' 'Suspicious Person' 'Seatbelt Violation']
['SP' 'OT' 'EQ' 'MO' 'RV' 'CS' nan 'VO' 'SD' 'AP' 'WA' 'SU' 'SB']
['W' 'H' 'B' 'O' 'I' 'N' nan 'L']
['M' 'F' nan 'N' 'U']
['M' 'N' 'D' 'P' 'W' nan 'A']
[nan 'N' 'D' 'M' 'A' 'W' 'O']
[nan 'W' 'D' 'A' 'O' 'M']
[nan 'D' 'M' 'O' 'N' 'A' 'W']


In [225]:
# I will pick out the features having more than 75% NaN values
null_f=[]
def null_features(col):
    for i in col:
        if df[i].isnull().sum()/len(df) > 0.75:
            null_f.append(i)

In [226]:
null_features(df.columns)

In [227]:
null_f

['contraband_found',
 'contraband_drugs',
 'contraband_weapons',
 'contraband_alcohol',
 'contraband_other',
 'search_basis',
 'reason_for_search',
 'raw_SearchResultOne',
 'raw_SearchResultTwo',
 'raw_SearchResultThree']

In [228]:
print_unique(null_f)

[nan False True]
[nan False True]
[nan False True]
[nan False True]
[nan False True]
[nan 'other' 'probable cause' 'plain view']
[nan 'Inventory/Tow' 'Odor of Drugs/Alcohol'
 'Incident to Arrest|Terry Frisk' 'Incident to Arrest|Inventory/Tow'
 'Incident to Arrest' 'Odor of Drugs/Alcohol|Incident to Arrest'
 'Terry Frisk' 'Probable Cause' 'Reasonable Suspicion'
 'Plain View|Incident to Arrest|Inventory/Tow'
 'Incident to Arrest|Probable Cause|Plain View'
 'Probable Cause|Incident to Arrest|Terry Frisk'
 'Probable Cause|Odor of Drugs/Alcohol|Terry Frisk' 'Plain View'
 'Odor of Drugs/Alcohol|Terry Frisk|Incident to Arrest'
 'Incident to Arrest|Plain View|Odor of Drugs/Alcohol'
 'Plain View|Odor of Drugs/Alcohol|Inventory/Tow'
 'Inventory/Tow|Incident to Arrest' 'Odor of Drugs/Alcohol|Terry Frisk'
 'Plain View|Incident to Arrest' 'Odor of Drugs/Alcohol|Probable Cause'
 'Probable Cause|Odor of Drugs/Alcohol' 'Incident to Arrest|Plain View'
 'Incident to Arrest|Odor of Drugs/Alcohol'
 'Incid

In [229]:
df.drop(null_f[-4],axis=1, inplace=True)
null_f.pop(-4)

'reason_for_search'

In [230]:
null_f

['contraband_found',
 'contraband_drugs',
 'contraband_weapons',
 'contraband_alcohol',
 'contraband_other',
 'search_basis',
 'raw_SearchResultOne',
 'raw_SearchResultTwo',
 'raw_SearchResultThree']

In [231]:
print_unique(null_f)

[nan False True]
[nan False True]
[nan False True]
[nan False True]
[nan False True]
[nan 'other' 'probable cause' 'plain view']
[nan 'N' 'D' 'M' 'A' 'W' 'O']
[nan 'W' 'D' 'A' 'O' 'M']
[nan 'D' 'M' 'O' 'N' 'A' 'W']


In [232]:
df[null_f].head()

Unnamed: 0,contraband_found,contraband_drugs,contraband_weapons,contraband_alcohol,contraband_other,search_basis,raw_SearchResultOne,raw_SearchResultTwo,raw_SearchResultThree
0,,,,,,,,,
1,,,,,,,,,
2,,,,,,,,,
3,,,,,,,,,
4,,,,,,,,,


In [233]:
df[null_f].dtypes

contraband_found         object
contraband_drugs         object
contraband_weapons       object
contraband_alcohol       object
contraband_other         object
search_basis             object
raw_SearchResultOne      object
raw_SearchResultTwo      object
raw_SearchResultThree    object
dtype: object

In [234]:
# I will replace the NaNs with 0 and make a new category out of it or using my domain knowedge i will replace some with mode for that particular features

In [237]:
df[null_f].iloc[:,-4]=df[null_f].iloc[:,-4].fillna(value="other")
null_f.pop(-4)

'search_basis'

In [238]:
df[null_f]=df[null_f].fillna(value="missing")

In [239]:
df[null_f]

Unnamed: 0,contraband_found,contraband_drugs,contraband_weapons,contraband_alcohol,contraband_other,raw_SearchResultOne,raw_SearchResultTwo,raw_SearchResultThree
0,missing,missing,missing,missing,missing,missing,missing,missing
1,missing,missing,missing,missing,missing,missing,missing,missing
2,missing,missing,missing,missing,missing,missing,missing,missing
3,missing,missing,missing,missing,missing,missing,missing,missing
4,missing,missing,missing,missing,missing,missing,missing,missing
...,...,...,...,...,...,...,...,...
509676,missing,missing,missing,missing,missing,missing,missing,missing
509677,missing,missing,missing,missing,missing,missing,missing,missing
509678,missing,missing,missing,missing,missing,missing,missing,missing
509679,missing,missing,missing,missing,missing,missing,missing,missing


In [240]:
# changing data type for the categorical variable
df[categorical]= df[categorical].astype("category")
df.dtypes

raw_row_number              int64
date                       object
time                       object
zone                     category
subject_race             category
subject_sex              category
department_id              object
type                     category
arrest_made              category
citation_issued          category
outcome                  category
contraband_found         category
contraband_drugs         category
contraband_weapons       category
contraband_alcohol       category
contraband_other         category
frisk_performed          category
search_conducted         category
search_basis             category
reason_for_stop          category
vehicle_make               object
vehicle_model              object
raw_BasisForStop         category
raw_OperatorRace         category
raw_OperatorSex          category
raw_ResultOfStop         category
raw_SearchResultOne      category
raw_SearchResultTwo      category
raw_SearchResultThree    category
dtype: object

In [241]:
df.isnull().sum()

raw_row_number                0
date                         10
time                         10
zone                         10
subject_race              29073
subject_sex               29097
department_id                10
type                          0
arrest_made               29073
citation_issued           29073
outcome                   35841
contraband_found              0
contraband_drugs              0
contraband_weapons            0
contraband_alcohol            0
contraband_other              0
frisk_performed              10
search_conducted              0
search_basis             491919
reason_for_stop           29073
vehicle_make             191564
vehicle_model            279593
raw_BasisForStop          29073
raw_OperatorRace          29073
raw_OperatorSex           29073
raw_ResultOfStop          29073
raw_SearchResultOne           0
raw_SearchResultTwo           0
raw_SearchResultThree         0
dtype: int64

In [242]:
df[]

SyntaxError: invalid syntax (<ipython-input-242-71f11347ebfc>, line 1)