mean

In [12]:
import pandas as pd

# Load the dataset
df = pd.read_csv('HDFCBANK.csv')

# Display the first few rows and check for missing values
print(df.head())

print("\nbefore cleaning:\n")
print(df.isnull().sum())

# Handling missing values
# Fill missing numeric values with the mean
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# Fill missing categorical values with the mode
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# Verify that there are no more missing values
print("\nafter cleaning:\n")
print(df.isnull().sum())

# Save the cleaned dataset to a new CSV file
df.to_csv('HDFCBANK_cleaned.csv', index=False)

       Date   Open   High    Low  Close  Adj Close  Volume
0  1/1/1996    NaN  3.030  2.925  2.980   2.362861  350000
1  1/2/1996  2.980  3.025  2.950  2.975   2.358896  412000
2  1/3/1996    NaN  2.995  2.950  2.985   2.366825  284000
3  1/4/1996  2.985  2.980  2.940  2.965   2.350966  282000
4       NaN  2.965  2.980  2.950  2.960   2.347003  189000

before cleaning:

Date         1
Open         2
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

after cleaning:

Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64


ffill and bfill

In [15]:
import pandas as pd

# Load the dataset
df = pd.read_csv('HDFCBANK.csv')

# Display the first few rows and check for missing values
print(df.head())

print("\nBefore cleaning:\n")
print(df.isnull().sum())

# Convert 'Date' column to datetime and set as index (assuming there's a Date column)
if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'])
    # df.set_index('Date', inplace=True)

# Handling missing values
# Fill missing numeric values with the previous value
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
# df[numeric_cols] = df[numeric_cols].fillna(method='ffill').fillna(method='bfill')
df[numeric_cols] = df[numeric_cols].ffill().bfill()

# Fill missing categorical values with the mode
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# Verify that there are no more missing values
print("\nAfter cleaning:\n")
print(df.isnull().sum())

# Save the cleaned dataset to a new CSV file
df.to_csv('HDFCBANK_cleaned.csv', index=False)


       Date   Open   High    Low  Close  Adj Close  Volume
0  1/1/1996    NaN  3.030  2.925  2.980   2.362861  350000
1  1/2/1996  2.980  3.025  2.950  2.975   2.358896  412000
2  1/3/1996    NaN  2.995  2.950  2.985   2.366825  284000
3  1/4/1996  2.985  2.980  2.940  2.965   2.350966  282000
4       NaN  2.965  2.980  2.950  2.960   2.347003  189000

Before cleaning:

Date         1
Open         2
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

After cleaning:

Date         1
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64


with date missing but also add date not required

In [17]:
import pandas as pd

# Load the dataset
df = pd.read_csv('HDFCBANK.csv')

# Display the first few rows and check for missing values
print(df.head())

print("\nBefore cleaning:\n")
print(df.isnull().sum())

if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'])

# Fill missing dates by creating a complete date range
if 'Date' in df.columns:
    # Create a complete date range based on the existing dates
    complete_date_range = pd.date_range(start=df['Date'].min(), end=df['Date'].max())
    # Reindex the DataFrame to include all dates in the complete date range
    df = df.set_index('Date').reindex(complete_date_range).rename_axis('Date').reset_index()

# Handling missing values
# Fill missing numeric values with the previous value
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[numeric_cols] = df[numeric_cols].ffill().bfill()

# Fill missing categorical values with the mode
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# Verify that there are no more missing values
print("\nAfter cleaning:\n")
print(df.isnull().sum())

# Save the cleaned dataset to a new CSV file
df.to_csv('HDFCBANK_cleaned.csv', index=False)


       Date   Open   High    Low  Close  Adj Close  Volume
0  1/1/1996    NaN  3.030  2.925  2.980   2.362861  350000
1  1/2/1996  2.980  3.025  2.950  2.975   2.358896  412000
2  1/3/1996    NaN  2.995  2.950  2.985   2.366825  284000
3  1/4/1996  2.985  2.980  2.940  2.965   2.350966  282000
4       NaN  2.965  2.980  2.950  2.960   2.347003  189000

Before cleaning:

Date         1
Open         2
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

After cleaning:

Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64


days

In [22]:
import pandas as pd

# Load the dataset
df = pd.read_csv('HDFCBANK.csv')

# Display the first few rows and check for missing values
print(df.head())
print("\nBefore cleaning:\n")
print(df.isnull().sum())

# Convert 'Date' column to datetime and set as index (assuming there's a Date column)
if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'])
    # Uncomment the line below if you want to set 'Date' as the index
    # df.set_index('Date', inplace=True)

# Add 'Days' column
if 'Date' in df.columns:
    df['Days'] = (df['Date'] - df['Date'].min()).dt.days
    # Add 'Day_Name' column
    df['Day_Name'] = df['Date'].dt.day_name()

# Handling missing values
# Fill missing numeric values with the previous value
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
df[numeric_cols] = df[numeric_cols].ffill().bfill()

# Fill missing categorical values with the mode
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# Verify that there are no more missing values
print("\nAfter cleaning:\n")
print(df.isnull().sum())

# Save the cleaned dataset to a new CSV file
df.to_csv('HDFCBANK_cleaned.csv', index=False)


         Date   Open   High    Low  Close  Adj Close  Volume
0  1996-01-01  3.030  3.030  2.925  2.980   2.362861  350000
1  1996-01-02  2.980  3.025  2.950  2.975   2.358896  412000
2  1996-01-03  2.975  2.995  2.950  2.985   2.366825  284000
3  1996-01-04  2.985  2.980  2.940  2.965   2.350966  282000
4  1996-01-05  2.965  2.980  2.950  2.960   2.347003  189000

Before cleaning:

Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

After cleaning:

Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
Days         0
Day_Name     0
dtype: int64
