In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, plot_confusion_matrix
from sklearn.svm import SVC

# PREPROCESSING 

In [2]:
# Load the raw traffic-stop records into a DataFrame.
df = pd.read_csv("NC_policing.csv")

# Column dtypes and non-null counts (printed by .info() itself).
df.info()
# Not the last expression of the cell, so this preview is evaluated but not rendered.
df.head()
# Number of missing values per column; this is the value the cell displays.
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 402087 entries, 0 to 402086
Data columns (total 14 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   state               402087 non-null  object 
 1   stop_date           402087 non-null  object 
 2   driver_gender       402087 non-null  object 
 3   driver_age          401996 non-null  float64
 4   driver_race_raw     402087 non-null  object 
 5   driver_race         402087 non-null  object 
 6   violation           402087 non-null  object 
 7   search_type         402087 non-null  object 
 8   contraband_found    402087 non-null  bool   
 9   stop_outcome        402087 non-null  object 
 10  search_basis        402087 non-null  object 
 11  officer_id          402087 non-null  int64  
 12  drugs_related_stop  4379 non-null    object 
 13  district            402049 non-null  object 
dtypes: bool(1), float64(1), int64(1), object(11)
memory usage: 40.3+ MB


state                      0
stop_date                  0
driver_gender              0
driver_age                91
driver_race_raw            0
driver_race                0
violation                  0
search_type                0
contraband_found           0
stop_outcome               0
search_basis               0
officer_id                 0
drugs_related_stop    397708
district                  38
dtype: int64

In [None]:
# Report how many distinct non-null values each column holds.
for column, n_unique in df.nunique().items():
    print(f"{column}: {n_unique} unique values")

In [None]:
# Print the distinct values of every column, one array per column.
for _, column_values in df.items():
    print(column_values.unique())

## Missing Values

In [None]:
# Compute the mean driver age; it is used afterwards to fill the missing
# 'driver_age' entries (this cell itself does not modify the data).
mean_age = df.loc[:, 'driver_age'].mean()
print("Average age is :", mean_age)

In [None]:
# Impute the missing 'driver_age' entries with the mean age,
# rounded to the nearest whole number.
rounded_mean_age = int(np.rint(mean_age))
df['driver_age'] = df['driver_age'].fillna(rounded_mean_age)

# Treat a missing 'drugs_related_stop' flag as False, on the assumption that a
# vehicle search turning up drugs is less likely than one that does not.
df['drugs_related_stop'] = df['drugs_related_stop'].fillna(value=False)

# After the two fills above only 'district' still has gaps (38 rows), which is
# too few to influence the dataset, so those rows are simply removed.
df.dropna(axis='index', inplace=True)

# Check whether any other missing values remain.
df.isnull().sum()

## Encoding

We cannot work with the object dtype, so we need to convert those columns to numeric values.


In [None]:
# Work on a copy so the cleaned `df` keeps its original string labels.
df_encoded = df.copy()

# Remove columns that will not be used as model inputs:
#   state           : every stop happened in NC, so the column is constant
#   driver_race_raw : near-duplicate of 'driver_race'
#   officer_id      : will not be used for ML
# 'district' is intentionally NOT dropped here: the district_* indicator columns
# are built from df_encoded['district'] further down, and that code drops the
# original column itself once the indicators exist.
df_encoded = df_encoded.drop(['state', 'driver_race_raw', 'officer_id'], axis=1)

# Label encoding.
# 'search_basis' and 'stop_outcome' are deliberately left as strings:
#   - 'search_basis' is matched against reason names with .str.contains when the
#     search_basis_* indicator columns are created;
#   - 'stop_outcome' is compared with a string label when the binary target is
#     built, so encoding it here would make that comparison always False.
encoder = LabelEncoder()
for col in ['contraband_found', 'driver_gender', 'drugs_related_stop', 'violation']:
    # Assign the whole column (instead of .loc[:, col] = ...) so the encoded
    # integers replace the column rather than being written into the existing
    # object/bool storage.
    df_encoded[col] = encoder.fit_transform(df_encoded[col])

In [None]:
# Preview of the frame (not rendered: it is not the last expression of the cell).
df_encoded.head()

# Reasons that may appear in 'search_basis'; each one gets its own 0/1 column.
search_reasons = [
    'No Search Conducted',
    'Observation Suspected Contraband',
    'Erratic Suspicious Behaviour',
    'Other Official Info',
    'Suspicious Movement',
    'Witness Observation',
    'Informant Tip',
]

# One indicator per reason: 1 where the 'search_basis' text contains the reason,
# 0 everywhere else.
for reason in search_reasons:
    reason_mask = df_encoded['search_basis'].str.contains(reason)
    df_encoded[f'search_basis_{reason}'] = reason_mask.astype('int64')

# The indicators replace the original text column.
df_encoded = df_encoded.drop(columns='search_basis')


# District letters; each one gets its own 0/1 column in the same way.
dist = ['A', 'B', 'C', 'D', 'E', 'F']

for district in dist:
    district_mask = df_encoded['district'].str.contains(district)
    df_encoded[f'district_{district}'] = district_mask.astype('int64')

df_encoded = df_encoded.drop(columns='district')

In [None]:
# Parse the 'stop_date' strings into datetimes.
stop_dates = pd.to_datetime(df_encoded['stop_date'])

# Only the weekday is kept as a feature; extraction of year, month and day
# was tried and is currently disabled.
df_encoded['day_of_week'] = stop_dates.dt.weekday  # Monday=0, Sunday=6

# The raw date column is no longer needed once the weekday has been extracted.
df_encoded = df_encoded.drop(columns='stop_date')

df_encoded.head()

In [None]:
# One-hot encoding.
# Columns to expand into indicator columns ('violation' is not included);
# add more names here as needed.
columns_to_encode = ['search_type', 'driver_race']

for col in columns_to_encode:
    # Skip names that are not (or no longer) present in the frame.
    if col not in df_encoded.columns:
        continue

    # Indicator columns named '<col>_<value>'.
    dummies = pd.get_dummies(df_encoded[col], prefix=col)

    # Swap the original column for its indicators, appended on the right.
    df_encoded = pd.concat([df_encoded.drop(columns=col), dummies], axis=1)



In [None]:
# Check which values (and therefore which type) each encoded column holds.
# df.info() could be used instead to list the dtypes.

# Print the distinct values of every column of the encoded frame.
for _, encoded_values in df_encoded.items():
    print(encoded_values.unique())

In [None]:
# First five rows of the fully encoded frame.
df_encoded.head(n=5)

In [None]:
# Summary statistics, transposed so that each feature is shown as one row.
df_encoded.describe().transpose()

### Generate two binary classifiers using two different methods of your liking and compare their performances against a dummy classifier, which always predicts “not arrested”.

In [None]:
# Target variable: the outcome of each stop.
y = df_encoded.loc[:, "stop_outcome"]
# Integer-coded copy of the target. This refits the shared `encoder`, so it no
# longer remembers the classes of any column it was fitted on before.
y_encoded = encoder.fit(y).transform(y)


In [None]:
# Feature matrix: every column except the target, cast to float32.
X = df_encoded.drop(columns='stop_outcome').astype(np.float32)

# Binary target: 1 when the stop outcome equals "Arrested", otherwise 0.
# NOTE(review): this only matches if 'stop_outcome' still holds its string labels
# at this point and "Arrested" is the exact label used in the data -- confirm.
y = df_encoded['stop_outcome'].eq("Arrested").astype(int)

# 70/30 split with a fixed seed so the partition is reproducible.
# A stratified variant would pass stratify=y instead of relying on the seed alone.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=5
)

In [None]:
# Display the feature matrix built in the splitting cell.
X

In [None]:
# Sanity-check the split: feature shapes for both partitions, followed by the
# training target shape (the original printed X_train.shape a second time).
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)

### only standardization
### only normalization
### std and norm ?


In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Feature names, kept for rebuilding DataFrames from the scaled arrays.
col_names = X_train.columns.tolist()

# Standardization: the scaler is fitted on the training data only and then
# applied to both partitions, so no test-set statistics leak into training.
standard_scaler = StandardScaler().fit(X_train)
X_train_standardized = standard_scaler.transform(X_train)
X_test_standardized = standard_scaler.transform(X_test)

# Normalization: min-max scaling applied on top of the standardized data
# (not on the raw features), again fitted on the training partition only.
minmax_scaler = MinMaxScaler().fit(X_train_standardized)
X_train_normalized = minmax_scaler.transform(X_train_standardized)
X_test_normalized = minmax_scaler.transform(X_test_standardized)


In [None]:
# Wrap the standardized NumPy array in a DataFrame again, restoring the
# training feature names as column labels.
X_train_standardized_df = pd.DataFrame(
    data=X_train_standardized,
    columns=X_train.columns,
)

# Show the first rows of the standardized frame.
print(X_train_standardized_df.head(5))


In [None]:
# Pairwise feature correlations, plotted as an annotated heatmap to support
# feature selection.
corr = X.corr()

fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(
    corr,
    annot=True,
    fmt='.2f',
    cmap='YlGnBu',
    square=True,
    linewidths=0.1,
    linecolor="white",
    ax=ax,
)
ax.set_title('Correlation between features')
plt.show()