<a href="https://colab.research.google.com/github/Ram2005-ui/us_accidents_data_analysis/blob/main/Day_9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

# Build a synthetic dataset whose datetime values are stored as plain strings.
np.random.seed(42)  # fixed seed so the sampled rows are reproducible

# Ten consecutive calendar days starting 2023-01-01, rendered as strings.
dates = pd.date_range(start='2023-01-01', periods=10, freq='D').astype(str)

# Draw 15 records from those ten days, so some dates repeat.
random_dates = np.random.choice(dates, size=15)

# Assemble the frame: an id, the string date, and a severity level from 1 to 4
# (the upper bound of randint is exclusive). The id becomes the index.
df = pd.DataFrame(
    {
        'accident_id': range(1, 16),
        'date_str': random_dates,
        'severity': np.random.randint(1, 5, size=15),
    }
).set_index('accident_id')
df.head()

Unnamed: 0_level_0,date_str,severity
accident_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2023-01-07,1
2,2023-01-04,2
3,2023-01-08,4
4,2023-01-05,4
5,2023-01-07,2


In [2]:
# Summarize columns, non-null counts and dtypes; at this point date_str is
# still a plain object (string) column.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15 entries, 1 to 15
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   date_str  15 non-null     object
 1   severity  15 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 360.0+ bytes


In [3]:
# Convert the string column to a datetime dtype.
# The format is spelled out so parsing does not depend on pandas' format
# inference: every value must match YYYY-MM-DD, and a malformed string raises
# instead of being interpreted by guesswork.
df['date'] = pd.to_datetime(df['date_str'], format='%Y-%m-%d')
df.head()

Unnamed: 0_level_0,date_str,severity,date
accident_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2023-01-07,1,2023-01-07
2,2023-01-04,2,2023-01-04
3,2023-01-08,4,2023-01-08
4,2023-01-05,4,2023-01-05
5,2023-01-07,2,2023-01-07


In [4]:
# Re-inspect the dtypes now that the parsed `date` column has been added
# alongside the original string column.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15 entries, 1 to 15
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   date_str  15 non-null     object        
 1   severity  15 non-null     int64         
 2   date      15 non-null     datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 480.0+ bytes


In [5]:
# Derive calendar features from the parsed column through the .dt accessor.
# The source strings carry no time-of-day component, so `hour` is 0 for
# every row.
df['month'], df['weekday'], df['hour'] = (
    df['date'].dt.month_name(),
    df['date'].dt.day_name(),
    df['date'].dt.hour,
)
df.head()

Unnamed: 0_level_0,date_str,severity,date,month,weekday,hour
accident_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,2023-01-07,1,2023-01-07,January,Saturday,0
2,2023-01-04,2,2023-01-04,January,Wednesday,0
3,2023-01-08,4,2023-01-08,January,Sunday,0
4,2023-01-05,4,2023-01-05,January,Thursday,0
5,2023-01-07,2,2023-01-07,January,Saturday,0


In [6]:
# Simulate missing data: pick 3 distinct index labels (accident_ids) at random
# -- replace=False prevents picking the same row twice -- and blank out their
# parsed date.
nan_indices = np.random.choice(df.index, size=3, replace=False)
# Only `date` is overwritten here; month/weekday/hour were derived in an
# earlier cell and keep their previous values for these rows.
df.loc[nan_indices, 'date'] = pd.NaT
df

Unnamed: 0_level_0,date_str,severity,date,month,weekday,hour
accident_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,2023-01-07,1,2023-01-07,January,Saturday,0
2,2023-01-04,2,NaT,January,Wednesday,0
3,2023-01-08,4,2023-01-08,January,Sunday,0
4,2023-01-05,4,2023-01-05,January,Thursday,0
5,2023-01-07,2,NaT,January,Saturday,0
6,2023-01-10,2,NaT,January,Tuesday,0
7,2023-01-03,2,2023-01-03,January,Tuesday,0
8,2023-01-07,4,2023-01-07,January,Saturday,0
9,2023-01-08,4,2023-01-08,January,Sunday,0
10,2023-01-05,1,2023-01-05,January,Thursday,0


In [7]:
# Forward-fill the missing dates: each NaT takes the most recent non-missing
# value above it in row order.
# Series.ffill() is used instead of fillna(method='ffill'), whose `method`
# argument is deprecated and emits a FutureWarning.
df['date_filled'] = df['date'].ffill()
df

  df['date_filled'] = df['date'].fillna(method='ffill')


Unnamed: 0_level_0,date_str,severity,date,month,weekday,hour,date_filled
accident_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,2023-01-07,1,2023-01-07,January,Saturday,0,2023-01-07
2,2023-01-04,2,NaT,January,Wednesday,0,2023-01-07
3,2023-01-08,4,2023-01-08,January,Sunday,0,2023-01-08
4,2023-01-05,4,2023-01-05,January,Thursday,0,2023-01-05
5,2023-01-07,2,NaT,January,Saturday,0,2023-01-05
6,2023-01-10,2,NaT,January,Tuesday,0,2023-01-05
7,2023-01-03,2,2023-01-03,January,Tuesday,0,2023-01-03
8,2023-01-07,4,2023-01-07,January,Saturday,0,2023-01-07
9,2023-01-08,4,2023-01-08,January,Sunday,0,2023-01-08
10,2023-01-05,1,2023-01-05,January,Thursday,0,2023-01-05


In [8]:
# More datetime handling: removing missing values and filtering by date.

# Keep only the rows whose `date` is present.
df_dropped = df.dropna(subset=['date'])

# Keep only the rows dated strictly after 2023-01-05.
filtered_df = df.loc[df['date'].gt(pd.Timestamp('2023-01-05'))]

# Show a preview of each result under its heading.
print("\nRows after dropping missing datetime:", df_dropped.head(), sep="\n")

print("\nRows with date after 2023-01-05:", filtered_df.head(), sep="\n")


Rows after dropping missing datetime:
               date_str  severity       date    month   weekday  hour  \
accident_id                                                             
1            2023-01-07         1 2023-01-07  January  Saturday     0   
3            2023-01-08         4 2023-01-08  January    Sunday     0   
4            2023-01-05         4 2023-01-05  January  Thursday     0   
7            2023-01-03         2 2023-01-03  January   Tuesday     0   
8            2023-01-07         4 2023-01-07  January  Saturday     0   

            date_filled  
accident_id              
1            2023-01-07  
3            2023-01-08  
4            2023-01-05  
7            2023-01-03  
8            2023-01-07  

Rows with date after 2023-01-05:
               date_str  severity       date    month   weekday  hour  \
accident_id                                                             
1            2023-01-07         1 2023-01-07  January  Saturday     0   
3            2