##  Imports & Create Synthetic Dataset


In [1]:
import pandas as pd
import numpy as np

# Seed NumPy's global RNG so the sampled dates and severities are the same on every run.
np.random.seed(42)

# Ten consecutive calendar days starting 2023-01-01, converted to strings.
dates = pd.date_range(start='2023-01-01', periods=10, freq='D').astype(str)

# Draw 15 date strings from that pool with replacement, so duplicates are expected.
random_dates = np.random.choice(dates, size=15)

# Assemble the synthetic accident table. The date column is kept as text on purpose;
# the severity draw happens after the date draw, which fixes the seeded sequence.
df = pd.DataFrame(
    {
        'accident_id': range(1, 16),
        'date_str': random_dates,
    }
).assign(
    severity=np.random.randint(1, 5, size=15)  # integers 1..4 (upper bound is exclusive)
)

print("Initial dataset with date column as string:", df.head(), sep="\n")


Initial dataset with date column as string:
   accident_id    date_str  severity
0            1  2023-01-07         1
1            2  2023-01-04         2
2            3  2023-01-08         4
3            4  2023-01-05         4
4            5  2023-01-07         2


## Step 1 — Quick check of datetime column type


In [2]:
# Show every column's dtype before any parsing has been applied to the frame.
print("Column types before conversion:", df.dtypes, sep="\n")


Column types before conversion:
accident_id     int64
date_str       object
severity        int32
dtype: object


## Step 2 — Convert string column to datetime


In [3]:
# Parse the text dates into a new 'date' column, leaving 'date_str' untouched.
# errors='coerce' turns any value that cannot be parsed into NaT instead of raising.
df['date'] = df['date_str'].pipe(pd.to_datetime, errors='coerce')

# Show the first rows and the dtypes again so the new column's type is visible.
print("\nDataset after conversion:", df.head(), sep="\n")
print("\nColumn types after conversion:", df.dtypes, sep="\n")



Dataset after conversion:
   accident_id    date_str  severity       date
0            1  2023-01-07         1 2023-01-07
1            2  2023-01-04         2 2023-01-04
2            3  2023-01-08         4 2023-01-08
3            4  2023-01-05         4 2023-01-05
4            5  2023-01-07         2 2023-01-07

Column types after conversion:
accident_id             int64
date_str               object
severity                int32
date           datetime64[ns]
dtype: object


## Step 3 — Extract datetime features


In [4]:
# Derive calendar features from the parsed 'date' column through the .dt accessor.
# The source strings carry no time-of-day component, so 'hour' comes out as 0 for every row.
df['hour'] = df['date'].dt.hour
# Full English names, e.g. 'Saturday' and 'January'.
df['weekday'] = df['date'].dt.day_name()
df['month'] = df['date'].dt.month_name()

print("\nDataset with extracted datetime features:", df.head(), sep="\n")



Dataset with extracted datetime features:
   accident_id    date_str  severity       date  hour    weekday    month
0            1  2023-01-07         1 2023-01-07     0   Saturday  January
1            2  2023-01-04         2 2023-01-04     0  Wednesday  January
2            3  2023-01-08         4 2023-01-08     0     Sunday  January
3            4  2023-01-05         4 2023-01-05     0   Thursday  January
4            5  2023-01-07         2 2023-01-07     0   Saturday  January


## Step 4 — Introduce missing datetime values (NaT)


In [5]:
# Choose two distinct row labels at random (reproducible thanks to the seed set earlier).
nan_indices = np.random.choice(df.index, size=2, replace=False)

# Blank out only the 'date' value on those rows. The previously derived
# 'hour', 'weekday' and 'month' columns are not recomputed, so they still
# reflect the original date on the affected rows.
df.loc[nan_indices, 'date'] = pd.NaT

print("\nDataset with missing datetime values (NaT):", df.loc[nan_indices], sep="\n")



Dataset with missing datetime values (NaT):
   accident_id    date_str  severity date  hour    weekday    month
1            2  2023-01-04         2  NaT     0  Wednesday  January
5            6  2023-01-10         2  NaT     0    Tuesday  January


## Step 5 — Fill missing datetime values (Forward Fill)


In [6]:
# Fill each missing date with the closest non-missing date from a row above it.
# The result goes into a separate 'date_filled' column so 'date' keeps its NaT values.
# Rows are in random date order, so the borrowed value is simply the preceding row's date.
# Caveat: forward fill has nothing to copy if the very first row is missing; such a row stays NaT.
df['date_filled'] = df['date'].ffill()

print("\nDataset after filling missing datetime values (forward fill):", df.loc[nan_indices], sep="\n")



Dataset after filling missing datetime values (forward fill):
   accident_id    date_str  severity date  hour    weekday    month  \
1            2  2023-01-04         2  NaT     0  Wednesday  January   
5            6  2023-01-10         2  NaT     0    Tuesday  January   

  date_filled  
1  2023-01-07  
5  2023-01-07  


In [7]:
# Alternative to filling: keep only the rows whose 'date' is present.
# The result is a new frame; df itself is left unchanged.
df_dropped = df[df['date'].notna()]

print("\nRows after dropping missing datetime:", df_dropped.head(), sep="\n")



Rows after dropping missing datetime:
   accident_id    date_str  severity       date  hour   weekday    month  \
0            1  2023-01-07         1 2023-01-07     0  Saturday  January   
2            3  2023-01-08         4 2023-01-08     0    Sunday  January   
3            4  2023-01-05         4 2023-01-05     0  Thursday  January   
4            5  2023-01-07         2 2023-01-07     0  Saturday  January   
6            7  2023-01-03         2 2023-01-03     0   Tuesday  January   

  date_filled  
0  2023-01-07  
2  2023-01-08  
3  2023-01-05  
4  2023-01-07  
6  2023-01-03  


In [8]:
# Keep rows strictly later than 2023-01-05. A comparison against NaT is False,
# so rows with a missing 'date' are left out of the result as well.
filtered_df = df.loc[df['date'].gt(pd.Timestamp('2023-01-05'))]

print("\nRows with date after 2023-01-05:", filtered_df.head(), sep="\n")



Rows with date after 2023-01-05:
   accident_id    date_str  severity       date  hour   weekday    month  \
0            1  2023-01-07         1 2023-01-07     0  Saturday  January   
2            3  2023-01-08         4 2023-01-08     0    Sunday  January   
4            5  2023-01-07         2 2023-01-07     0  Saturday  January   
7            8  2023-01-07         4 2023-01-07     0  Saturday  January   
8            9  2023-01-08         4 2023-01-08     0    Sunday  January   

  date_filled  
0  2023-01-07  
2  2023-01-08  
4  2023-01-07  
7  2023-01-07  
8  2023-01-08  


## Step 6 — Quick summary


In [9]:
# Count the NaT entries in the untouched column and in the forward-filled copy.
print(f"Missing datetime values (original): {df['date'].isna().sum()}")
print(f"Missing datetime values (after fill): {df['date_filled'].isna().sum()}")


Missing datetime values (original): 2
Missing datetime values (after fill): 0
