In [13]:
import pandas as pd
import numpy as np

# Create synthetic dataset with string datetime values
np.random.seed(42)
dates = pd.date_range(start='2024-01-01', periods=10, freq='D').astype(str)  # 10 days
random_dates = np.random.choice(dates, size=15)  # 15 records with some repeats

# Create DataFrame with string datetime column and a feature
df = pd.DataFrame({
    'accident_id': range(1, 16),
    'date_str': random_dates,
     'weather': np.random.choice(['Sunny', 'Rainy', 'Cloudy'], size=15),
    'injuries': np.random.randint(0, 3, size=15),
    'Severity': np.random.randint(1, 5, size=15)  # severity levels 1 to 4


})

# Show initial dataset
print("Initial dataset with date column as string:")
print(df.head())

Initial dataset with date column as string:
   accident_id    date_str weather  injuries  Severity
0            1  2024-01-07   Sunny         1         3
1            2  2024-01-04   Rainy         2         2
2            3  2024-01-08   Rainy         1         4
3            4  2024-01-05   Rainy         1         1
4            5  2024-01-07   Rainy         2         4


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   accident_id  15 non-null     int64 
 1   date_str     15 non-null     object
 2   weather      15 non-null     object
 3   injuries     15 non-null     int64 
 4   Severity     15 non-null     int64 
dtypes: int64(3), object(2)
memory usage: 732.0+ bytes


In [15]:
# Convert string column to datetime type
df['date'] = pd.to_datetime(df['date_str'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   accident_id  15 non-null     int64         
 1   date_str     15 non-null     object        
 2   weather      15 non-null     object        
 3   injuries     15 non-null     int64         
 4   Severity     15 non-null     int64         
 5   date         15 non-null     datetime64[ns]
dtypes: datetime64[ns](1), int64(3), object(2)
memory usage: 852.0+ bytes


In [16]:
# Extract datetime features
df['Year'] = df['date'].dt.year
df['Month'] = df['date'].dt.month
df['Day'] = df['date'].dt.day
df

Unnamed: 0,accident_id,date_str,weather,injuries,Severity,date,Year,Month,Day
0,1,2024-01-07,Sunny,1,3,2024-01-07,2024,1,7
1,2,2024-01-04,Rainy,2,2,2024-01-04,2024,1,4
2,3,2024-01-08,Rainy,1,4,2024-01-08,2024,1,8
3,4,2024-01-05,Rainy,1,1,2024-01-05,2024,1,5
4,5,2024-01-07,Rainy,2,4,2024-01-07,2024,1,7
5,6,2024-01-10,Sunny,1,2,2024-01-10,2024,1,10
6,7,2024-01-03,Sunny,2,2,2024-01-03,2024,1,3
7,8,2024-01-07,Rainy,2,2,2024-01-07,2024,1,7
8,9,2024-01-08,Rainy,0,1,2024-01-08,2024,1,8
9,10,2024-01-05,Sunny,2,2,2024-01-05,2024,1,5


In [19]:
# Handling missing datetime values: introduce NaT (missing) for demonstration
nan_indices = np.random.choice(df.index, size=2, replace=False)
df.loc[nan_indices, 'date'] = pd.NaT
print("\nDataset with missing datetime values (NaT)")
print(df.loc[nan_indices])


Dataset with missing datetime values (NaT)
    accident_id    date_str weather  injuries  Severity date  Year  Month  Day
13           14  2024-01-03  Cloudy         0         4  NaT  2024      1    3
2             3  2024-01-08   Rainy         1         4  NaT  2024      1    8


In [20]:
# Fill missig datetime values using forward fill method
df['date'].fillna(method='ffill', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['date'].fillna(method='ffill', inplace=True)
  df['date'].fillna(method='ffill', inplace=True)


In [21]:
print("\nDataset after filling missing datetime values (forward fill):")
print(df.loc[nan_indices])


Dataset after filling missing datetime values (forward fill):
    accident_id    date_str weather  injuries  Severity       date  Year  \
13           14  2024-01-03  Cloudy         0         4 2024-01-08  2024   
2             3  2024-01-08   Rainy         1         4 2024-01-04  2024   

    Month  Day  
13      1    3  
2       1    8  


In [23]:
# Drop rows with missing datetime values
df_dropped = df.dropna(subset=['date'])

# Filter rows where date is after a specific datetime
filtered_df = df[df['date'] > pd.Timestamp('2023-01-05')]

print("\nRows after dropping missing datetime:")
print(df_dropped.head())

print("\nRows with date after 2024-01-05:")
print(filtered_df.head())


Rows after dropping missing datetime:
   accident_id    date_str weather  injuries  Severity       date  Year  \
0            1  2024-01-07   Sunny         1         3 2024-01-07  2024   
1            2  2024-01-04   Rainy         2         2 2024-01-04  2024   
2            3  2024-01-08   Rainy         1         4 2024-01-04  2024   
3            4  2024-01-05   Rainy         1         1 2024-01-05  2024   
4            5  2024-01-07   Rainy         2         4 2024-01-05  2024   

   Month  Day  
0      1    7  
1      1    4  
2      1    8  
3      1    5  
4      1    7  

Rows with date after 2024-01-05:
   accident_id    date_str weather  injuries  Severity       date  Year  \
0            1  2024-01-07   Sunny         1         3 2024-01-07  2024   
1            2  2024-01-04   Rainy         2         2 2024-01-04  2024   
2            3  2024-01-08   Rainy         1         4 2024-01-04  2024   
3            4  2024-01-05   Rainy         1         1 2024-01-05  2024   
4    