In [25]:
import pandas as pd
from datetime import timedelta
import random

# Function to calculate end date based on start date and date gap
def calculate_end_date(start_date, days_gap):
    """Return the date lying ``days_gap`` days after ``start_date``.

    ``start_date`` may be any object that supports adding a
    ``datetime.timedelta``; ``days_gap`` is passed to ``timedelta`` as its
    ``days`` argument.
    """
    offset = timedelta(days=days_gap)
    return start_date + offset

# Function to calculate date gap based on level
def calculate_date_gap(level):
    """Pick a random number of days for a module of the given level.

    Inclusive ranges per level label:
        'Basic'        -> 1 to 4
        'Intermidiate' -> 5 to 7
        'Advance'      -> 8 to 10

    Labels are compared verbatim, so the 'Intermidiate' spelling is the
    one that matches. Any other value yields 0.
    """
    gap_ranges = (
        ('Basic', 1, 4),
        ('Intermidiate', 5, 7),
        ('Advance', 8, 10),
    )
    for label, shortest, longest in gap_ranges:
        if level == label:
            return random.randint(shortest, longest)
    # Unrecognised level: fall back to a gap of 0 days.
    return 0

# Read the CSV file into a pandas DataFrame
df = pd.read_csv('data/module.csv')

# Display the initial DataFrame
print("Initial DataFrame:")
print(df)

# Parse 'start_date'; errors='coerce' turns unparseable values into NaT
# instead of raising, so bad rows can be filtered out below.
df['start_date'] = pd.to_datetime(df['start_date'], errors='coerce')

# Keep only the rows whose 'start_date' parsed to a valid date
df = df.dropna(subset=['start_date'])

# Draw a random gap (in days) for every row from its 'Level'. The gaps are
# kept in a standalone Series, so no temporary column has to be added to
# and then dropped from the DataFrame.
days_gap = df['Level'].apply(calculate_date_gap)

# end_date = start_date + gap, computed on whole columns at once. This
# replaces a row-wise df.apply(axis=1), which runs a Python call per row and
# returns an empty DataFrame (not a Series) when no rows are left.
df['end_date'] = df['start_date'] + pd.to_timedelta(days_gap, unit='D')

# Write the updated DataFrame to a new CSV file
df.to_csv('updated_data.csv', index=False)

print("\nSuccessfully updated end date in 'updated_data.csv'.")


Initial DataFrame:
                          _id            module_name  Trainee_Count  \
0    6617adb7e10537e70a816d52              ML Basics             65   
1    6617adb7e10537e70a816d54       Data Engineering             38   
2    6617adb7e10537e70a816d56              ML Basics             23   
3    6617adb7e10537e70a816d59              ML Basics             90   
4    6617adb7e10537e70a816d5b      DevOps Operations             85   
..                        ...                    ...            ...   
595  6617adb8e10537e8d6b75ff0       Data Engineering             79   
596  6617adb8e10537e8d6b75ff1  Git & Version Control             93   
597  6617adb8e10537e8d6b75ff4       Data Engineering             46   
598  6617adb8e10537e8d6b75ff5                    DBT             60   
599  6617adb8e10537e8d6b75ff6                    ADF             21   

            Level  start_date    end_date                   quiz_id  \
0    Intermidiate  2022-07-22  2022-07-22  6610e2f9e10537