In [1]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

import pandas as pd

In [2]:
# Load the dataset from 'NC_policing.csv' (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='NC_policing.csv')

In [3]:
# 1. Handling NaN values:
#
# Each column is filled and then assigned back to `df`. Calling
# `df[col].fillna(..., inplace=True)` acts on an intermediate Series: pandas
# warns about this chained-assignment pattern, and with Copy-on-Write enabled
# the parent frame is left unchanged, so the imputation would silently be lost.

# For `driver_age`: impute missing values with the column median
df['driver_age'] = df['driver_age'].fillna(df['driver_age'].median())

# For `drugs_related_stop`: keep missing values as their own 'Unknown' category
df['drugs_related_stop'] = df['drugs_related_stop'].fillna('Unknown')

# For `district`: impute with the most frequent value
# (`mode()` returns a Series; `[0]` takes its first entry)
df['district'] = df['district'].fillna(df['district'].mode()[0])


In [4]:
# 2. Feature Encoding:

# Columns treated as binary: each gets its own LabelEncoder, which replaces the
# column with integer codes. The fitted encoders are kept, keyed by column name,
# so the codes can be mapped back to the original labels later.
binary_columns = ['driver_gender', 'contraband_found']
label_encoders = {name: LabelEncoder() for name in binary_columns}
for name, encoder in label_encoders.items():
    df[name] = encoder.fit_transform(df[name])

# Multi-class columns: expand each into one indicator column per distinct value.
categorical_columns = [
    'state',
    'driver_race_raw',
    'driver_race',
    'violation',
    'search_type',
    'search_basis',
    'district',
    'drugs_related_stop',
]
df = pd.get_dummies(df, columns=categorical_columns)


In [None]:
# 3. Standardization/Normalization:

scaler = StandardScaler()
df[['driver_age', 'officer_id']] = scaler.fit_transform(df[['driver_age', 'officer_id']])


In [None]:
# 4. Splitting Data:

X = df.drop(columns=['stop_outcome', 'stop_date'])  # I assume you don't want to use the 'stop_date' as a feature
y = df['stop_outcome']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# SVM (be aware that training may take a long time on a big dataset)
# The classifier uses SVC's default settings; `fit` returns the fitted
# estimator, so construction and training are written as one expression.
svm = SVC().fit(X_train, y_train)

# The fitted model can now be used for prediction and evaluated with X_test and y_test