In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder

# <center><span style="color: #3498db;">Complete Data Cleaning and Label Encoding</span></center>

# <center> Label Encoding Information</center>

## <span style="color: #3498db;">The following Label Encoding has been applied to convert categorical data to numerical form:</span>

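**subject_sex** <!-- NOTE(review): this heading originally read "stop_cause", but Male/Female/Unknown are not stop causes; confirm the actual column name. -->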
- <span style="color: #3498db;">Male: 0</span>
- <span style="color: #e74c3c;">Female: 1</span>
- <span style="color: #95a5a6;">Unknown: 2</span>

**subject_race**
- <span style="color: #27ae60;">'A': 0</span>
- <span style="color: #f39c12;">'B': 1</span>
- <span style="color: #8e44ad;">'C': 2</span>
- <span style="color: #d35400;">'D': 3</span>
- <span style="color: #c0392b;">'F': 4</span>
- <span style="color: #16a085;">'G': 5</span>
- <span style="color: #f1c40f;">'H': 6</span>
- <span style="color: #bdc3c7;">'I': 7</span>
- <span style="color: #7f8c8d;">'J': 8</span>
- <span style="color: #34495e;">'K': 9</span>
- <span style="color: #2ecc71;">'L': 10</span>
- <span style="color: #3498db;">'O': 11</span>
- <span style="color: #e74c3c;">'P': 12</span>
- <span style="color: #95a5a6;">'S': 13</span>
- <span style="color: #27ae60;">'U': 14</span>
- <span style="color: #f39c12;">'V': 15</span>
- <span style="color: #8e44ad;">'W': 16</span>
- <span style="color: #d35400;">'X': 17</span>
- <span style="color: #c0392b;">'Z': 18</span>

**sd_resident, arrested, searched**
- <span style="color: #27ae60;">'Y': 1</span>
- <span style="color: #e74c3c;">'N': 0</span>

**search_details_type**
- <span style="color: #3498db;">'ActionTaken': 0</span>
- <span style="color: #f39c12;">'ActionTakenOther': 1</span>
- <span style="color: #8e44ad;">'SearchBasis': 2</span>
- <span style="color: #d35400;">'SearchBasisOther': 3</span>
- <span style="color: #c0392b;">'SearchType': 4</span>

In [None]:
def unique_value_feature(df, column):
    """
    Show how often each distinct value occurs in one column of a DataFrame.

    The counts are obtained with ``Series.value_counts`` and printed under a
    ``Feature: <column>`` header, followed by a rule of 30 ``=`` characters.

    Parameters:
    - df (pandas.DataFrame): The frame holding the column to inspect.
    - column (str): Name of the column whose values are counted.

    Returns:
    None

    Raises:
    AssertionError: If ``df`` is not a DataFrame or ``column`` is not a str.

    Example:
    unique_value_feature(my_dataframe, 'example_column')
    """
    assert isinstance(df, pd.DataFrame), "Input 'df' must be a pandas DataFrame."
    assert isinstance(column, str), "Input 'column' must be a string."

    counts = df[column].value_counts()
    separator = '=' * 30
    print(f"Feature: {column}\n{counts}\n{separator}\n")

In [None]:
def clean_features(df, replace_mapping, feature):
    """
    Replace values of one column according to a mapping, modifying ``df`` in place.

    The mapping entries are applied one after another in dictionary order, so a
    value produced by an earlier entry is replaced again if a later entry uses
    it as a key.

    Parameters:
    - df (pandas.DataFrame): The frame to clean; it is modified in place.
    - replace_mapping (dict): ``{old_value: new_value}`` pairs to apply.
    - feature (str): Name of the column whose values are replaced.

    Returns:
    pandas.DataFrame: The same ``df`` object, with the column updated.

    Raises:
    AssertionError: If an argument has the wrong type.
    """
    # Validate argument types up front.
    assert isinstance(df, pd.DataFrame), "Input 'df' must be a pandas DataFrame."
    assert isinstance(replace_mapping, dict), "Input 'replace_mapping' must be a dictionary."
    assert isinstance(feature, str), "Input 'feature' must be a string."

    for source_value in replace_mapping:
        target_value = replace_mapping[source_value]
        # Recompute the mask each pass so earlier replacements are visible.
        matches = df[feature].isin([source_value])
        df.loc[matches, feature] = target_value

    return df

In [None]:
def drop_rows_by_feature_value(df, feature, value):
    """
    Drop rows from a DataFrame where the specified feature has the specified value.

    The input frame is left untouched. The result is an independent copy, so
    later in-place edits (e.g. ``result.loc[mask, col] = x``) neither trigger
    pandas' SettingWithCopyWarning nor affect the original frame.

    Parameters:
    - df (pandas.DataFrame): The DataFrame containing the data.
    - feature (str): The feature/column name based on which rows will be dropped.
    - value: The value of the feature for which rows will be dropped.

    Returns:
    pandas.DataFrame: A new DataFrame containing only the rows whose
    ``feature`` is not equal to ``value``.

    Raises:
    AssertionError: If ``df`` is not a DataFrame or ``feature`` is not a str.
    """
    # Input parameter assertions
    assert isinstance(df, pd.DataFrame), "Input 'df' must be a pandas DataFrame."
    assert isinstance(feature, str), "Input 'feature' must be a string."

    keep_mask = df[feature] != value
    # A boolean-mask selection stays flagged as a slice of `df`; copy it so
    # callers can safely assign into the result.
    return df[keep_mask].copy()

In [None]:
def get_unique_values_as_list(df, feature):
    """
    Return the distinct values found in one column of a DataFrame.

    Parameters:
    - df (pandas.DataFrame): The frame holding the column.
    - feature (str): Name of the column whose distinct values are wanted.

    Returns:
    list: The distinct values of the column, as produced by
    ``Series.unique()`` and converted to a plain Python list.

    Raises:
    AssertionError: If ``df`` is not a DataFrame or ``feature`` is not a str.
    """
    # Validate argument types up front.
    assert isinstance(df, pd.DataFrame), "Input 'df' must be a pandas DataFrame."
    assert isinstance(feature, str), "Input 'feature' must be a string."

    column = df[feature]
    distinct = column.unique()
    return distinct.tolist()

In [None]:
# Load the combined dataset from the Kaggle input directory and display it.
df= pd.read_csv("/kaggle/input/ece-143-group-5/final_combined_dataset_v2.csv")
df

In [None]:
# Summarise columns, dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values per column.
print(df.isnull().sum())

In [None]:
# Number of rows that are exact duplicates of an earlier row.
print(df.duplicated().sum())

In [None]:
# Remove exact duplicate rows.
df = df.drop_duplicates()

In [None]:
# Display the current state of the frame.
df

In [None]:
# Inspect the distinct stop_cause values and their counts.
unique_value_feature(df,'stop_cause')

In [None]:
# Map every raw spelling of a stop cause straight to its final label.
# No value is also a key, so the result does not depend on dictionary order
# or on clean_features applying the replacements sequentially.
stop_cause_mapping = {
    # Unmarked, unlisted and "Other" causes all collapse into a single bucket.
    'NOT MARKED': 'Not Listed/Other',
    'not marked': 'Not Listed/Other',
    'Not Marked': 'Not Listed/Other',
    'not marked  not marked': 'Not Listed/Other',
    'no cause listed': 'Not Listed/Other',
    'none listed': 'Not Listed/Other',
    'not noted': 'Not Listed/Other',
    'not listed': 'Not Listed/Other',
    'NOT SPECIFIED': 'Not Listed/Other',
    'No Cause Specified on a Card': 'Not Listed/Other',
    'Not Listed': 'Not Listed/Other',
    'Other': 'Not Listed/Other',
    # Variant spellings of the remaining causes.
    'Suspect Info': 'Suspect Info (I.S., Bulletin, Log)',
    '&Equipment Violation': 'Equipment Violation',
    'Personal Observ/Knowledge': 'Personal Knowledge/Informant',
    '&Moving Violation': 'Moving Violation',
    '&Radio Call/Citizen Contact': 'Radio Call/Citizen Contact',
    'UNI, &County, H&&S Code': 'Muni, County, H&S Code',
    'MUNI, County, H&S Code': 'Muni, County, H&S Code',
}
df = clean_features(df, stop_cause_mapping,'stop_cause')

In [None]:
# Re-check the stop_cause values after applying the mapping.
unique_value_feature(df,'stop_cause')

In [None]:
# Inspect the distinct subject_race values and their counts.
unique_value_feature(df,'subject_race')

In [None]:
# Remove the date_time column, then display the resulting frame.
columns_to_drop = ['date_time']
df = df.drop(columns=columns_to_drop)
df

In [None]:
# Inspect the distinct sd_resident values and their counts.
unique_value_feature(df,'sd_resident')

In [None]:
# Normalise lowercase y/n flags to uppercase Y/N.
# This mapping is reused below for the 'arrested' and 'searched' columns.
sd_resident_mapping = {
    "y":'Y',
    'n' : 'N',
}
df = clean_features(df, sd_resident_mapping,'sd_resident')

In [None]:
# Re-check sd_resident after normalising the y/n flags.
unique_value_feature(df,'sd_resident')

In [None]:
# Drop rows whose sd_resident value is a single space.
df = drop_rows_by_feature_value(df, 'sd_resident', ' ')

In [None]:
# Re-check sd_resident after dropping the single-space rows.
unique_value_feature(df,'sd_resident')

In [None]:
# Inspect the distinct arrested values and their counts.
unique_value_feature(df,'arrested')

In [None]:
# Drop rows whose arrested value is a single space, normalise y/n to Y/N
# (reusing the sd_resident mapping), then show the resulting value counts.
df = drop_rows_by_feature_value(df, 'arrested', ' ')
df = clean_features(df, sd_resident_mapping,'arrested')
unique_value_feature(df,'arrested')

In [None]:
# Inspect the distinct searched values and their counts.
unique_value_feature(df,'searched')

In [None]:
# Drop rows whose searched value is a single space, normalise y/n to Y/N
# (reusing the sd_resident mapping), then show the resulting value counts.
df = drop_rows_by_feature_value(df, 'searched', ' ')
df = clean_features(df, sd_resident_mapping,'searched')
unique_value_feature(df,'searched')

In [None]:
# List every distinct subject_age value.
unique_subject_age_values = get_unique_values_as_list(df, 'subject_age')
print(unique_subject_age_values)

In [None]:
# (rows, columns) before the subject_age filter below.
df.shape

In [None]:
# subject_age values whose rows are removed from the dataset.
ages_to_drop = ['0', '5', '230', '2_', '2', '211', '8', '234', '185', '13', '12', '9', '153', '7', '6', '4', '221', '5_', '1', '4_',
                '255', '224', '3', 'N', '204', '223', '228', '222', '213', 'No Age', '125', '243', '399', 'f26', '3_', '11', '233',
                '180', '173', '100', '119', '163', '212', '220', '145', '120', '226', '143']

# One vectorised membership test instead of re-filtering the whole frame once
# per value. `.copy()` detaches the result from the parent frame so later
# in-place cleaning does not raise SettingWithCopyWarning.
df = df[~df['subject_age'].isin(ages_to_drop)].copy()

In [None]:
# (rows, columns) after the subject_age filter above.
df.shape

In [None]:
# List the distinct subject_age values that remain.
unique_subject_age_values = get_unique_values_as_list(df, 'subject_age')
print(unique_subject_age_values)

In [None]:
# Replace the 'Unknown' service_area placeholder with the integer 0.
df = clean_features(df, {'Unknown': 0},'service_area')

In [None]:
# Encode the Y/N flags as 1/0; applied here to sd_resident.
yes_no_mapping = {
    "Y": 1,
    'N' : 0,
}
df = clean_features(df, yes_no_mapping,'sd_resident')

In [None]:
# Apply the same Y/N -> 1/0 encoding to arrested and searched.
df = clean_features(df, yes_no_mapping,'arrested')
df = clean_features(df, yes_no_mapping,'searched')

In [None]:
# Single encoder instance, re-fitted for each categorical column below.
label_encoder = LabelEncoder()

In [None]:
# Replace subject_race categories with integer labels.
df['subject_race'] = label_encoder.fit_transform(df['subject_race'])

In [None]:
# Print the class -> integer mapping from the encoder's most recent fit
# (must run before the encoder is re-fitted on another column).
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label Mapping:")
print(label_mapping)

In [None]:
# Replace search_details_type categories with integer labels (re-fits the encoder).
df['search_details_type'] = label_encoder.fit_transform(df['search_details_type'])

In [None]:
# Print the class -> integer mapping from the encoder's most recent fit
# (must run before the encoder is re-fitted on another column).
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label Mapping:")
print(label_mapping)

In [None]:
# Replace stop_cause categories with integer labels (re-fits the encoder).
df['stop_cause'] = label_encoder.fit_transform(df['stop_cause'])

In [None]:
# Print the class -> integer mapping from the encoder's most recent fit.
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label Mapping:")
print(label_mapping)

In [None]:
# Display the current state of the frame.
df

In [None]:
# Summary statistics, transposed so each column of df becomes a row.
df.describe().T

In [None]:
# Write the cleaned, encoded frame to CSV without the index column.
df.to_csv('final_combined_dataset_v3.csv', index=False)