In [114]:
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [115]:
df = pd.read_csv("/content/drive/MyDrive/Datasets/Train.csv")

In [116]:
# Group by road conditions and count the frequency of each
road_condition_counts = df['RDSFCOND'].value_counts().reset_index()
road_condition_counts.columns = ['Road Condition', 'Frequency']

# Create a bar plot using Plotly
fig = px.bar(road_condition_counts, x='Road Condition', y='Frequency',
             title='Frequency of Road Conditions in Traffic Collisions',
             labels={'Frequency': 'Collision Count', 'Road Condition': 'Road Condition'})

# Show the plot
fig.show()

In [117]:
# Group by state and road conditions, and count the frequency of each
state_road_condition_counts = df.groupby(['DISTRICT', 'RDSFCOND']).size().reset_index(name='Frequency')

# Create a bar plot using Plotly
fig = px.bar(state_road_condition_counts, x='DISTRICT', y='Frequency', color='RDSFCOND',
             title='Frequency of Road Conditions by State',
             labels={'Frequency': 'Collision Count', 'DISTRICT': 'State', 'RDSFCOND': 'Road Condition'},
             barmode='group')

# Show the plot
fig.show()

In [118]:
condition_counts = df.groupby(['RDSFCOND', 'VISIBILITY']).size().reset_index(name='Frequency')

# Create a bar plot using Plotly
fig = px.bar(condition_counts, x='RDSFCOND', y='Frequency', color='VISIBILITY',
             title='Frequency of Collisions by Road Conditions and Visibility',
             labels={'Frequency': 'Collision Count', 'RDSFCOND': 'Road Condition', 'VISIBILITY': 'Visibility'},
             barmode='group')

# Show the plot
fig.show()

In [119]:
yearly_collision_counts = df['YEAR'].value_counts().reset_index()
yearly_collision_counts.columns = ['Year', 'Collision Count']

# Sort the DataFrame by year
yearly_collision_counts = yearly_collision_counts.sort_values(by='Year')

# Create a line plot using Plotly
fig = px.bar(yearly_collision_counts, x='Year', y='Collision Count',
              title='Year-wise Occurrence of Collisions',
              labels={'Collision Count': 'Collision Count', 'Year': 'Year'})

# Show the plot
fig.show()

In [120]:
# Group by year and state and count the occurrences of collisions
state_yearly_collision_counts = df.groupby(['DISTRICT', 'YEAR']).size().reset_index(name='Collision Count')

# Sort the DataFrame by year
state_yearly_collision_counts = state_yearly_collision_counts.sort_values(by=['DISTRICT', 'YEAR'])

# Create a grouped bar plot using Plotly
fig = px.bar(state_yearly_collision_counts, x='YEAR', y='Collision Count', color='DISTRICT',
             title='Year-wise Occurrence of Collisions by State',
             labels={'Collision Count': 'Collision Count', 'YEAR': 'Year', 'DISTRICT': 'State'},
             barmode='group')

# Show the plot
fig.show()

In [121]:
# Group by light and visibility conditions and count the occurrences of collisions
light_visibility_counts = df.groupby(['LIGHT', 'VISIBILITY']).size().reset_index(name='Collision Count')

# Create a grouped bar plot using Plotly
fig = px.bar(light_visibility_counts, x='LIGHT', y='Collision Count', color='VISIBILITY',
             title='Comparison of Light and Visibility Conditions in Collisions',
             labels={'Collision Count': 'Collision Count', 'LIGHT': 'Light Condition', 'VISIBILITY': 'Visibility Condition'},
             barmode='group')

# Show the plot
fig.show()

In [122]:
# Filter out rows with null age values and get unique age groups
valid_age_groups = df.dropna(subset=['INVAGE'])['INVAGE'].unique()

# Group by age group, light conditions, and visibility conditions, and count the occurrences of collisions
age_light_visibility_counts = df.groupby(['INVAGE', 'LIGHT', 'VISIBILITY']).size().reset_index(name='Collision Count')

# Filter only the rows with valid age groups
age_light_visibility_counts = age_light_visibility_counts[age_light_visibility_counts['INVAGE'].isin(valid_age_groups)]

# Create a grouped bar plot using Plotly
fig = px.bar(age_light_visibility_counts, x='INVAGE', y='Collision Count', color='VISIBILITY',
             facet_col='LIGHT', facet_col_wrap=3,
             title='Comparison of Light and Visibility Conditions by Age Group',
             labels={'Collision Count': 'Collision Count', 'INVAGE': 'Age Group', 'VISIBILITY': 'Visibility Condition'},
             barmode='group')

# Show the plot
fig.show()

In [123]:
# Filter out rows with null age values and get unique age groups
valid_age_groups = df.dropna(subset=['INVAGE'])['INVAGE'].unique()

# Group by age group and count the occurrences of collisions
age_collision_counts = df['INVAGE'].value_counts().reset_index()
age_collision_counts.columns = ['Age Group', 'Collision Count']

# Filter only the rows with valid age groups
age_collision_counts = age_collision_counts[age_collision_counts['Age Group'].isin(valid_age_groups)]

# Create a bar plot using Plotly
fig = px.bar(age_collision_counts, x='Age Group', y='Collision Count',
             title='Collision Occurrences by Age Group',
             labels={'Collision Count': 'Collision Count', 'Age Group': 'Age Group'})

# Show the plot
fig.show()

In [124]:
# Group by injury type and count the occurrences of collisions
injury_collision_counts = df['INJURY'].value_counts().reset_index()
injury_collision_counts.columns = ['Injury Type', 'Collision Count']

# Create a bar plot using Plotly
fig = px.bar(injury_collision_counts, x='Injury Type', y='Collision Count',
             title='Collision Occurrences by Injury Type',
             labels={'Collision Count': 'Collision Count', 'Injury Type': 'Injury Type'})

# Show the plot
fig.show()

In [125]:
# Group by initial direction of travel and count the occurrences of collisions
initdir_collision_counts = df['INITDIR'].value_counts().reset_index()
initdir_collision_counts.columns = ['Initial Direction', 'Collision Count']

# Create a bar plot using Plotly
fig = px.bar(initdir_collision_counts, x='Initial Direction', y='Collision Count',
             title='Collision Occurrences by Initial Direction of Travel',
             labels={'Collision Count': 'Collision Count', 'Initial Direction': 'Initial Direction'})

# Show the plot
fig.show()

In [126]:
# Group by driver condition and count the occurrences of collisions
drivcond_collision_counts = df['DRIVCOND'].value_counts().reset_index()
drivcond_collision_counts.columns = ['Driver Condition', 'Collision Count']

# Create a bar plot using Plotly
fig = px.bar(drivcond_collision_counts, x='Driver Condition', y='Collision Count',
             title='Collision Occurrences by Driver Condition',
             labels={'Collision Count': 'Collision Count', 'Driver Condition': 'Driver Condition'})

# Show the plot
fig.show()

In [127]:
# Filter out rows with missing vehicle type
df_filtered = df.dropna(subset=['VEHTYPE'])

# Group by vehicle type and count the occurrences
collision_counts = df_filtered['VEHTYPE'].value_counts().reset_index()
collision_counts.columns = ['Vehicle Type', 'Collision Count']

# Create a bar chart using Plotly Express
fig = px.bar(collision_counts, x='Vehicle Type', y='Collision Count',
             title='Total Collision Comparison by Vehicle Type',
             labels={'Vehicle Type': 'Vehicle Type', 'Collision Count': 'Collision Count'})

# Show the plot
fig.show()

In [128]:
# Filter out rows with missing pedestrian condition
df_filtered = df.dropna(subset=['PEDCOND'])

# Group by pedestrian condition and count the occurrences
collision_counts = df_filtered['PEDCOND'].value_counts().reset_index()
collision_counts.columns = ['Pedestrian Condition', 'Collision Count']

# Create a bar chart using Plotly Express
fig = px.bar(collision_counts, x='Pedestrian Condition', y='Collision Count',
             title='Total Collision Comparison by Pedestrian Condition',
             labels={'Pedestrian Condition': 'Pedestrian Condition', 'Collision Count': 'Collision Count'})

# Show the plot
fig.show()

In [129]:
# Count the occurrences of each factor
counts = {
    'Cyclist Involved': df['CYCLIST'].notnull().sum(),
    'Pedestrian Involved': df['PEDESTRIAN'].notnull().sum(),
    'Transit or City Vehicle Involved': df['TRSN_CITY_VEH'].notnull().sum(),
    'Passenger Involved': df['PASSENGER'].notnull().sum(),
    'Speeding Related': df['SPEEDING'].notnull().sum(),
    'Red Light Related': df['REDLIGHT'].notnull().sum(),
    'Alcohol Related': df['ALCOHOL'].notnull().sum(),
}

# Create a DataFrame from the counts
data = pd.DataFrame.from_dict(counts, orient='index', columns=['Count']).reset_index()

# Rename columns
data.columns = ['Factor', 'Count']

# Create a bar chart using Plotly Express
fig = px.bar(data, x='Factor', y='Count',
             title='Total Accidents and Involvement of Different Factors',
             labels={'Factor': 'Factor', 'Count': 'Count'})

# Show the plot
fig.show()

In [130]:
# Count the occurrences of each factor
counts = {
    'Cyclist Involved': df['CYCLIST'].notnull().sum(),
    'Pedestrian Involved': df['PEDESTRIAN'].notnull().sum(),
    'Transit or City Vehicle Involved': df['TRSN_CITY_VEH'].notnull().sum(),
    'Passenger Involved': df['PASSENGER'].notnull().sum(),
    'Speeding Related': df['SPEEDING'].notnull().sum(),
    'Red Light Related': df['REDLIGHT'].notnull().sum(),
    'Alcohol Related': df['ALCOHOL'].notnull().sum(),
}

# Create a DataFrame from the counts
data = pd.DataFrame.from_dict(counts, orient='index', columns=['Count']).reset_index()

# Rename columns
data.columns = ['Factor', 'Count']

# Create a pie chart using Plotly Express
fig = px.pie(data, names='Factor', values='Count',
             title='Distribution of Accidents by Factor')

# Show the plot
fig.show()

In [131]:
# Group by involvement type and count the occurrences
collision_counts = df_filtered['INVTYPE'].value_counts().reset_index()
collision_counts.columns = ['Involvement Type', 'Collision Count']

# Create a bar chart using Plotly Express
fig = px.bar(collision_counts, x='Involvement Type', y='Collision Count',
             title='Total Collisions by Involvement Type',
             labels={'Involvement Type': 'Involvement Type', 'Collision Count': 'Collision Count'})

# Show the plot
fig.show()

In [132]:
# Define a function to map dates to seasons
def get_season(date):
    month = date.month
    if 3 <= month <= 5:
        return 'Spring'
    elif 6 <= month <= 8:
        return 'Summer'
    elif 9 <= month <= 11:
        return 'Autumn'
    else:
        return 'Winter'

# Convert the 'DATE' column to datetime format if it's not already
df['DATE'] = pd.to_datetime(df['DATE'])

# Apply the function to create a new column for seasons
df['SEASON'] = df['DATE'].apply(get_season)

# Filter out rows with missing season information
df_filtered = df.dropna(subset=['SEASON'])

# Group by season and count the occurrences
season_counts = df_filtered['SEASON'].value_counts().reset_index()
season_counts.columns = ['Season', 'Accident Count']

# Create a bar chart using Plotly Express
fig = px.bar(season_counts, x='Season', y='Accident Count',
             title='Accidents by Season',
             labels={'Season': 'Season', 'Accident Count': 'Number of Accidents'})

# Show the plot
fig.show()

In [133]:
# Filter out rows with missing season and district information
df_filtered = df.dropna(subset=['SEASON', 'DISTRICT'])

# Group by season and district and count the occurrences
season_district_counts = df_filtered.groupby(['DISTRICT', 'SEASON']).size().reset_index(name='Accident Count')

# Create a grouped bar chart using Plotly Express
fig = px.bar(season_district_counts, x='DISTRICT', y='Accident Count', color='SEASON',
             title='Accidents by District and Season',
             labels={'DISTRICT': 'District', 'Accident Count': 'Number of Accidents'})

# Show the plot
fig.show()

In [134]:
# First result: Total null values for each feature
null_counts = df.isnull().sum()
null_df = pd.DataFrame(null_counts, columns=["Missing Values"])

# Second result: Total unique values for each feature
unique_values = df.nunique().reset_index()
unique_values.columns = ["Feature", "Unique Values"]

# Combine the two DataFrames
combined_df = pd.merge(unique_values, null_df, left_on="Feature", right_index=True)

# Display the combined DataFrame
print(combined_df)

              Feature  Unique Values  Missing Values
0              INDEX_          15000               0
1              ACCNUM           3822            3698
2                YEAR             13               0
3                DATE           3082               0
4                TIME           1276               0
5             STREET1           1547               0
6             STREET2           2344            1343
7              OFFSET            339           13072
8          ROAD_CLASS              9             357
9            DISTRICT              4              14
10            WARDNUM             71               0
11           LATITUDE           3475               0
12          LONGITUDE           3901               0
13           LOCCOORD              7              90
14             ACCLOC              9            5450
15           TRAFFCTL             10              29
16         VISIBILITY              8              14
17              LIGHT              9          

In [135]:
df['DATE'] = pd.to_datetime(df['DATE'])

# Accessing year, month, and day components
df['Year'] = df['DATE'].dt.year
df['Month'] = df['DATE'].dt.month
df['Day'] = df['DATE'].dt.day

# Displaying the updated DataFrame
print(df[['DATE', 'Year', 'Month', 'Day']].head())

                       DATE  Year  Month  Day
0 2006-03-11 05:00:00+00:00  2006      3   11
1 2006-03-11 05:00:00+00:00  2006      3   11
2 2006-03-11 05:00:00+00:00  2006      3   11
3 2006-01-01 05:00:00+00:00  2006      1    1
4 2006-03-11 05:00:00+00:00  2006      3   11


In [136]:
one_accident = df[df.duplicated(subset=['TIME', 'DIVISION','LATITUDE','LONGITUDE' ,'STREET1', 'STREET2', 'NEIGHBOURHOOD_140','NEIGHBOURHOOD_158' ,'HOOD_140'], keep=False)]

# Group by the condition and assign a unique index ID
one_accident['ACC_NUM'] = one_accident.groupby(['TIME', 'DIVISION','LATITUDE','LONGITUDE' ,'STREET1', 'STREET2', 'NEIGHBOURHOOD_140','NEIGHBOURHOOD_158', 'HOOD_140']).ngroup()

# Merge the unique index IDs back to the original DataFrame
df = pd.merge(df, one_accident[['ACC_NUM']], left_index=True, right_index=True, how='left')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [137]:
df['ACC_NUM'].isna().sum()


1544

In [138]:
!pip install klib




In [139]:
import klib

In [140]:
df.dtypes


INDEX_                             int64
ACCNUM                           float64
YEAR                               int64
DATE                 datetime64[ns, UTC]
TIME                               int64
STREET1                           object
STREET2                           object
OFFSET                            object
ROAD_CLASS                        object
DISTRICT                          object
WARDNUM                            int64
LATITUDE                         float64
LONGITUDE                        float64
LOCCOORD                          object
ACCLOC                            object
TRAFFCTL                          object
VISIBILITY                        object
LIGHT                             object
RDSFCOND                          object
ACCLASS                           object
IMPACTYPE                         object
INVTYPE                           object
INVAGE                            object
INJURY                            object
FATAL_NO        

In [141]:
klib.convert_datatypes(df)


Unnamed: 0,INDEX_,ACCNUM,YEAR,DATE,TIME,STREET1,STREET2,OFFSET,ROAD_CLASS,DISTRICT,...,NEIGHBOURHOOD_158,HOOD_140,NEIGHBOURHOOD_140,DIVISION,ObjectId,SEASON,Year,Month,Day,ACC_NUM
0,3387730,892658.0,2006,2006-03-11 05:00:00+00:00,852,BLOOR ST W,DUNDAS ST W,,Major Arterial,Toronto and East York,...,High Park North,88,High Park North (88),D11,1,Spring,2006,3,11,1038.0
1,3387731,892658.0,2006,2006-03-11 05:00:00+00:00,852,BLOOR ST W,DUNDAS ST W,,Major Arterial,Toronto and East York,...,High Park North,88,High Park North (88),D11,2,Spring,2006,3,11,1038.0
2,3388101,892810.0,2006,2006-03-11 05:00:00+00:00,915,MORNINGSIDE AVE,SHEPPARD AVE E,,Major Arterial,Scarborough,...,Malvern East,132,Malvern (132),D42,3,Spring,2006,3,11,1139.0
3,3389067,893184.0,2006,2006-01-01 05:00:00+00:00,236,WOODBINE AVE,O CONNOR DR,,Major Arterial,Toronto and East York,...,Woodbine-Lumsden,60,Woodbine-Lumsden (60),D55,4,Winter,2006,1,1,308.0
4,3388102,892810.0,2006,2006-03-11 05:00:00+00:00,915,MORNINGSIDE AVE,SHEPPARD AVE E,,Major Arterial,Scarborough,...,Malvern East,132,Malvern (132),D42,5,Spring,2006,3,11,1139.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,81474608,,2018,2018-04-26 04:00:00+00:00,1942,ISLINGTON AVE,ALBION RD,,Major Arterial,Etobicoke York,...,Thistletown-Beaumond Heights,3,Thistletown-Beaumond Heights (3),D23,14996,Spring,2018,4,26,3679.0
14996,81474609,,2018,2018-04-26 04:00:00+00:00,1942,ISLINGTON AVE,ALBION RD,,Major Arterial,Etobicoke York,...,Thistletown-Beaumond Heights,3,Thistletown-Beaumond Heights (3),D23,14997,Spring,2018,4,26,3679.0
14997,81474610,,2018,2018-04-26 04:00:00+00:00,1942,ISLINGTON AVE,ALBION RD,,Major Arterial,Etobicoke York,...,Thistletown-Beaumond Heights,3,Thistletown-Beaumond Heights (3),D23,14998,Spring,2018,4,26,3679.0
14998,81474611,,2018,2018-04-26 04:00:00+00:00,1942,ISLINGTON AVE,ALBION RD,,Major Arterial,Etobicoke York,...,Thistletown-Beaumond Heights,3,Thistletown-Beaumond Heights (3),D23,14999,Spring,2018,4,26,3679.0


In [142]:
df.dtypes


INDEX_                             int64
ACCNUM                           float64
YEAR                               int64
DATE                 datetime64[ns, UTC]
TIME                               int64
STREET1                           object
STREET2                           object
OFFSET                            object
ROAD_CLASS                        object
DISTRICT                          object
WARDNUM                            int64
LATITUDE                         float64
LONGITUDE                        float64
LOCCOORD                          object
ACCLOC                            object
TRAFFCTL                          object
VISIBILITY                        object
LIGHT                             object
RDSFCOND                          object
ACCLASS                           object
IMPACTYPE                         object
INVTYPE                           object
INVAGE                            object
INJURY                            object
FATAL_NO        

In [143]:
df = df.drop(columns=['INDEX_', 'ACCNUM', 'YEAR', 'DATE', 'TIME', 'STREET1', 'STREET2', 'OFFSET',
                   'DISTRICT', 'WARDNUM', 'LATITUDE', 'LONGITUDE', 'LOCCOORD', 'ACCLOC', 'FATAL_NO',
                   'HOOD_158', 'NEIGHBOURHOOD_158', 'HOOD_140', 'NEIGHBOURHOOD_140',
                   'DIVISION', 'ObjectId'], axis=1)

df.head()

Unnamed: 0,ROAD_CLASS,TRAFFCTL,VISIBILITY,LIGHT,RDSFCOND,ACCLASS,IMPACTYPE,INVTYPE,INVAGE,INJURY,...,SPEEDING,AG_DRIV,REDLIGHT,ALCOHOL,DISABILITY,SEASON,Year,Month,Day,ACC_NUM
0,Major Arterial,Traffic Signal,Clear,Daylight,Dry,Fatal,Pedestrian Collisions,Driver,unknown,,...,,Yes,,,,Spring,2006,3,11,1038.0
1,Major Arterial,Traffic Signal,Clear,Daylight,Dry,Fatal,Pedestrian Collisions,Pedestrian,65 to 69,Fatal,...,,Yes,,,,Spring,2006,3,11,1038.0
2,Major Arterial,Traffic Signal,Clear,Daylight,Dry,Fatal,Turning Movement,Motorcycle Driver,45 to 49,Fatal,...,,Yes,Yes,,,Spring,2006,3,11,1139.0
3,Major Arterial,No Control,Clear,Dark,Wet,Non-Fatal Injury,Approaching,Passenger,50 to 54,Major,...,Yes,Yes,,Yes,,Winter,2006,1,1,308.0
4,Major Arterial,Traffic Signal,Clear,Daylight,Dry,Fatal,Turning Movement,Driver,unknown,,...,,Yes,Yes,,,Spring,2006,3,11,1139.0


In [144]:
# Find null values in the 'ACC_NUM' column
null_values_indices = df[df['ACC_NUM'].isnull()].index

# Generate a sequence starting from 4545
sequence = range(4545, 4545 + len(null_values_indices))

# Fill null values with the generated sequence
df.loc[null_values_indices, 'ACC_NUM'] = sequence

# Display the DataFrame with filled null values
df["ACC_NUM"].isna().sum()

0

In [145]:
label_encoder = LabelEncoder()

object_columns = ['ROAD_CLASS', 'TRAFFCTL', 'VISIBILITY', 'LIGHT', 'RDSFCOND',
                  'IMPACTYPE', 'INVTYPE', 'INVAGE','INITDIR', 'VEHTYPE',
                  'MANOEUVER', 'DRIVACT', 'DRIVCOND', 'PEDTYPE', 'PEDACT', 'PEDCOND',
                  'CYCLISTYPE', 'CYCACT', 'CYCCOND', 'PEDESTRIAN', 'CYCLIST', 'AUTOMOBILE',
                  'MOTORCYCLE', 'TRUCK', 'TRSN_CITY_VEH', 'EMERG_VEH', 'PASSENGER', 'SPEEDING',
                  'AG_DRIV', 'REDLIGHT', 'ALCOHOL', 'DISABILITY', 'SEASON']

for column in object_columns:
    df[column] = label_encoder.fit_transform(df[column])

df.head()

Unnamed: 0,ROAD_CLASS,TRAFFCTL,VISIBILITY,LIGHT,RDSFCOND,ACCLASS,IMPACTYPE,INVTYPE,INVAGE,INJURY,...,SPEEDING,AG_DRIV,REDLIGHT,ALCOHOL,DISABILITY,SEASON,Year,Month,Day,ACC_NUM
0,5,8,0,4,0,Fatal,4,2,20,,...,1,0,1,1,1,1,2006,3,11,1038.0
1,5,8,0,4,0,Fatal,4,11,13,Fatal,...,1,0,1,1,1,1,2006,3,11,1038.0
2,5,8,0,4,0,Fatal,9,6,8,Fatal,...,1,0,0,1,1,1,2006,3,11,1139.0
3,5,0,0,0,8,Non-Fatal Injury,1,10,10,Major,...,0,0,1,0,1,3,2006,1,1,308.0
4,5,8,0,4,0,Fatal,9,2,20,,...,1,0,0,1,1,1,2006,3,11,1139.0


In [146]:
df.isna().sum()

ROAD_CLASS          0
TRAFFCTL            0
VISIBILITY          0
LIGHT               0
RDSFCOND            0
ACCLASS             0
IMPACTYPE           0
INVTYPE             0
INVAGE              0
INJURY           1606
INITDIR             0
VEHTYPE             0
MANOEUVER           0
DRIVACT             0
DRIVCOND            0
PEDTYPE             0
PEDACT              0
PEDCOND             0
CYCLISTYPE          0
CYCACT              0
CYCCOND             0
PEDESTRIAN          0
CYCLIST             0
AUTOMOBILE          0
MOTORCYCLE          0
TRUCK               0
TRSN_CITY_VEH       0
EMERG_VEH           0
PASSENGER           0
SPEEDING            0
AG_DRIV             0
REDLIGHT            0
ALCOHOL             0
DISABILITY          0
SEASON              0
Year                0
Month               0
Day                 0
ACC_NUM             0
dtype: int64

In [147]:
df.sort_values('ACC_NUM')

Unnamed: 0,ROAD_CLASS,TRAFFCTL,VISIBILITY,LIGHT,RDSFCOND,ACCLASS,IMPACTYPE,INVTYPE,INVAGE,INJURY,...,SPEEDING,AG_DRIV,REDLIGHT,ALCOHOL,DISABILITY,SEASON,Year,Month,Day,ACC_NUM
6414,5,0,0,0,0,Fatal,4,15,20,,...,1,0,1,1,1,3,2011,1,29,0.0
6417,5,0,0,0,0,Fatal,4,2,6,,...,1,0,1,1,1,3,2011,1,29,0.0
6426,5,0,0,0,0,Fatal,4,11,12,Fatal,...,1,0,1,1,1,3,2011,1,29,0.0
4034,9,0,0,4,0,Non-Fatal Injury,8,2,7,,...,0,0,1,1,1,2,2009,6,22,1.0
4033,9,0,0,4,0,Non-Fatal Injury,8,6,4,Major,...,0,0,1,1,1,2,2009,6,22,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14905,4,0,0,5,0,Non-Fatal Injury,2,8,20,,...,1,1,1,1,1,3,2018,2,3,6084.0
14908,5,0,0,4,0,Non-Fatal Injury,4,2,10,,...,1,1,1,1,1,1,2018,3,2,6085.0
14912,4,0,0,5,0,Non-Fatal Injury,2,0,2,Major,...,1,1,1,1,1,3,2018,2,3,6086.0
14918,5,0,0,0,0,Non-Fatal Injury,4,2,20,,...,1,0,1,1,1,0,2018,9,21,6087.0


In [148]:
df["INJURY"].fillna("No injury")

0         None
1        Fatal
2        Fatal
3        Major
4         None
         ...  
14995     None
14996    Minor
14997     None
14998     None
14999     None
Name: INJURY, Length: 15000, dtype: object

In [149]:
df["INJURY"].unique()

array(['None', 'Fatal', 'Major', 'Minor', nan, 'Minimal'], dtype=object)