In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

In [7]:
df =pd.read_csv("data/bank_data_project.csv")

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1041614 entries, 0 to 1041613
Data columns (total 9 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   TransactionID            1041614 non-null  object 
 1   CustomerID               1041614 non-null  object 
 2   CustomerDOB              1041614 non-null  object 
 3   CustGender               1041614 non-null  object 
 4   CustLocation             1041614 non-null  object 
 5   CustAccountBalance       1041614 non-null  float64
 6   TransactionDate          1041614 non-null  object 
 7   TransactionTime          1041614 non-null  int64  
 8   TransactionAmount (INR)  1041614 non-null  float64
dtypes: float64(2), int64(1), object(6)
memory usage: 71.5+ MB


In [10]:
print(df["CustomerDOB"].head(30).tolist())

['10/01/1994', '04/04/1957', '26/11/1996', '14/09/1973', '24/03/1988', '08/10/1972', '26/01/1992', '27/01/1982', '19/04/1988', '22/06/1984', '22/07/1982', '07/07/1988', '13/06/1978', '05/01/1992', '24/03/1978', '10/07/1968', '1/1/1800', '16/07/1989', '11/01/1991', '24/06/1985', '20/04/1993', '31/08/1989', '1/1/1800', '01/10/1986', '17/05/1991', '24/02/1993', '01/02/1986', '01/04/1993', '1/1/1800', '21/01/1967']


In [None]:
df.head()

In [None]:
# Convert to datetime if needed
df["CustomerDOB"] = pd.to_datetime(df["CustomerDOB"], errors='coerce')
unique_dates = df["CustomerDOB"].unique()
print(unique_dates)

In [12]:
 #Check for invalid dates (NaT values)
invalid_dates = df[df["CustomerDOB"].isna()]
print(f"Number of invalid dates: {len(invalid_dates)}")
print("Invalid entries:")
print(invalid_dates)

Number of invalid dates: 560396
Invalid entries:
        TransactionID CustomerID CustomerDOB CustGender   CustLocation  \
2                  T3   C4417068         NaT          F         MUMBAI   
3                  T4   C5342380         NaT          F         MUMBAI   
4                  T5   C9031234         NaT          F    NAVI MUMBAI   
6                  T7   C7126560         NaT          F         MUMBAI   
7                  T8   C1220223         NaT          M         MUMBAI   
...               ...        ...         ...        ...            ...   
1041607      T1048561   C5028150         NaT          M           PUNE   
1041608      T1048562   C1034220         NaT          M      BANGALORE   
1041610      T1048564   C6459278         NaT          M         NASHIK   
1041611      T1048565   C6412354         NaT          M      HYDERABAD   
1041612      T1048566   C6420483         NaT          M  VISAKHAPATNAM   

         CustAccountBalance TransactionDate  TransactionTime  

In [13]:
# Count NaT values
nat_count = df["CustomerDOB"].isna().sum()
total_count = len(df)
print(f"Invalid dates (NaT): {nat_count}")
print(f"Total rows: {total_count}")
print(f"Percentage invalid: {(nat_count/total_count)*100:.2f}%")

Invalid dates (NaT): 560396
Total rows: 1041614
Percentage invalid: 53.80%


In [11]:
# Check what the original date values actually look like
print("Sample of original CustomerDOB values that became NaT:")
original_dob_sample = df.loc[df["CustomerDOB"].isna(), "CustomerDOB"].head(20)
print(original_dob_sample.tolist())

# Or if you have the original data before conversion, check that

Sample of original CustomerDOB values that became NaT:
[]


In [14]:
df["CustomerDOB"] = df["CustomerDOB"].bfill()

In [15]:
# Convert to datetime if needed
df["CustomerDOB"] = pd.to_datetime(df["CustomerDOB"], errors='coerce')
unique_dates = df["CustomerDOB"].unique()
print(unique_dates)

<DatetimeArray>
['1994-10-01 00:00:00', '1957-04-04 00:00:00', '1972-08-10 00:00:00',
 '1988-07-07 00:00:00', '1992-05-01 00:00:00', '1968-10-07 00:00:00',
 '1800-01-01 00:00:00', '1991-11-01 00:00:00', '1986-01-10 00:00:00',
 '1986-01-02 00:00:00',
 ...
 '1998-04-02 00:00:00', '1959-09-03 00:00:00', '1998-02-06 00:00:00',
 '1957-08-05 00:00:00', '1965-10-02 00:00:00', '2004-07-03 00:00:00',
 '1945-07-06 00:00:00', '1954-09-06 00:00:00', '1946-06-04 00:00:00',
 '1950-07-01 00:00:00']
Length: 7061, dtype: datetime64[ns]


In [16]:
# Convert to datetime 
print(df["CustomerDOB"])

0         1994-10-01
1         1957-04-04
2         1972-08-10
3         1972-08-10
4         1972-08-10
             ...    
1041609   1990-08-04
1041610   1984-05-03
1041611   1984-05-03
1041612   1984-05-03
1041613   1984-05-03
Name: CustomerDOB, Length: 1041614, dtype: datetime64[ns]


In [17]:
# Count NaT values
nat_count = df["CustomerDOB"].isna().sum()
total_count = len(df)
print(f"Invalid dates (NaT): {nat_count}")
print(f"Total rows: {total_count}")
print(f"Percentage invalid: {(nat_count/total_count)*100:.2f}%")

Invalid dates (NaT): 0
Total rows: 1041614
Percentage invalid: 0.00%


In [18]:
df.head()

Unnamed: 0,TransactionID,CustomerID,CustomerDOB,CustGender,CustLocation,CustAccountBalance,TransactionDate,TransactionTime,TransactionAmount (INR)
0,T1,C5841053,1994-10-01,F,JAMSHEDPUR,17819.05,02/08/2016,143207,25.0
1,T2,C2142763,1957-04-04,M,JHAJJAR,2270.69,02/08/2016,141858,27999.0
2,T3,C4417068,1972-08-10,F,MUMBAI,17874.44,02/08/2016,142712,459.0
3,T4,C5342380,1972-08-10,F,MUMBAI,866503.21,02/08/2016,142714,2060.0
4,T5,C9031234,1972-08-10,F,NAVI MUMBAI,6714.43,02/08/2016,181156,1762.5
