In [None]:
import pandas as pd
import os

# Ensure the 'datasets/new' output directory exists; exist_ok=True makes this a
# no-op when the directory is already there.
# NOTE(review): the cleaned CSV written further down this cell goes to
# 'new_train.csv' in the working directory, not into this folder - confirm
# which location is actually intended.
os.makedirs('datasets/new', exist_ok=True)

def clean_dataset(df, is_train=True):
    """Clean a raw delivery DataFrame and return only its fully populated rows.

    The input frame is left untouched; all work happens on a copy.

    Steps, in order:
      1. Strip surrounding whitespace from every object (string) column.
      2. Convert 'Delivery_person_Age' to nullable integers and remove the
         "conditions " prefix from 'Weatherconditions'.
      3. When ``is_train`` is true, pull the digits out of 'Time_taken(min)'
         and store them as nullable integers.
      4. Cast the identifier/categorical columns to ``str`` while keeping
         missing entries missing.
      5. Convert coordinates, 'Vehicle_condition' and 'multiple_deliveries'
         to numbers, 'Order_Date' to datetimes (``%d-%m-%Y``) and the two
         time columns to ``datetime.time`` objects (``%H:%M:%S``). Values
         that cannot be parsed become missing.
      6. Turn the literal text "NaN" into a real missing value and drop every
         row that has at least one missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing the columns referenced above.
    is_train : bool, default True
        Whether the frame carries the 'Time_taken(min)' target column that
        needs to be parsed.

    Returns
    -------
    pandas.DataFrame
        A new frame with converted dtypes and no missing values. The original
        index labels of the surviving rows are preserved.

    Raises
    ------
    KeyError
        If one of the expected columns is absent.
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    df = df.copy()

    # Strip white spaces
    object_cols = df.select_dtypes(['object']).columns
    df[object_cols] = df[object_cols].apply(lambda col: col.str.strip())

    # Convert columns to appropriate datatypes
    df['Delivery_person_Age'] = pd.to_numeric(df['Delivery_person_Age'], errors='coerce').astype('Int64')
    df['Weatherconditions'] = df['Weatherconditions'].str.replace("conditions ", "", regex=False)

    if is_train:
        # Keep only the digits (drops the "(min)" text) and convert to integer.
        # expand=False yields a Series rather than a one-column DataFrame, and
        # the raw string avoids the invalid "\d" escape warning.
        minutes = df['Time_taken(min)'].str.extract(r'(\d+)', expand=False)
        df['Time_taken(min)'] = pd.to_numeric(minutes, errors='coerce').astype('Int64')

    # String columns
    string_columns = [
        'ID', 'Delivery_person_ID', 'Road_traffic_density', 'Type_of_order',
        'Type_of_vehicle', 'Festival', 'City', 'Order_Date', 'Time_Orderd',
        'Time_Order_picked'
    ]
    # A plain astype(str) would turn missing entries into the text 'nan' or
    # 'None', which dropna() below cannot see. Cast, then blank those cells
    # out again so missing stays missing.
    present = df[string_columns].notna()
    df[string_columns] = df[string_columns].astype(str).where(present)

    # Float columns
    float_columns = [
        'Restaurant_latitude', 'Restaurant_longitude',
        'Delivery_location_latitude', 'Delivery_location_longitude'
    ]
    for col in float_columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Nullable integer columns
    for col in ['Vehicle_condition', 'multiple_deliveries']:
        df[col] = pd.to_numeric(df[col], errors='coerce').astype('Int64')

    # Date and time columns
    df['Order_Date'] = pd.to_datetime(df['Order_Date'], format='%d-%m-%Y', errors='coerce')
    for col in ['Time_Orderd', 'Time_Order_picked']:
        df[col] = pd.to_datetime(df[col], format='%H:%M:%S', errors='coerce').dt.time

    # Drop rows with NaN values. The text "NaN" (left over in string columns
    # after stripping) is treated as a missing-value marker as well.
    df = df.replace("NaN", pd.NA)
    df_cleaned = df.dropna()

    return df_cleaned

# Load the raw train dataset; skipinitialspace drops blanks that follow a delimiter.
train = pd.read_csv('train.csv', skipinitialspace=True)
# Record the size up front so the "before" figure does not depend on what
# clean_dataset does to the frame it receives.
rows_before_cleaning = train.shape[0]

train_cleaned = clean_dataset(train, is_train=True)
# The cleaned data is written to the working directory; index=False keeps the
# row labels out of the file.
train_cleaned.to_csv('new_train.csv', index=False)

# Report the path that was actually written (previously the message named
# 'datasets/new/train.csv', a file this cell never creates).
print("Train dataset cleaned and saved to 'new_train.csv'")
print(f"\nNumber of rows in train dataset before cleaning: {rows_before_cleaning}")
print(f"Number of rows in train dataset after cleaning: {train_cleaned.shape[0]}")

In [None]:
import pandas as pd
from tqdm import tqdm


# Read the cleaned training data back from 'new_train.csv' so this cell starts
# from what is stored on disk.
train = pd.read_csv(filepath_or_buffer='new_train.csv')

# Preview the first five rows as plain text.
print(train.head(n=5))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.preprocessing import LabelEncoder
import time

# Identifier and date/time columns are not used as regression features.
columns_to_drop = ['ID', 'Delivery_person_ID', 'Order_Date', 'Time_Orderd', 'Time_Order_picked']
# Build a new frame instead of dropping in place: `train` keeps exactly what was
# loaded, and re-running this cell always starts from the same input.
# errors='ignore' tolerates any of these columns already being absent.
train_features = train.drop(columns=columns_to_drop, errors='ignore')

# Check for non-numeric columns
non_numeric_columns = train_features.select_dtypes(include=['object']).columns
print(f"Non-numeric columns: {non_numeric_columns}")

# List of categorical columns to encode
categorical_cols = ['Weatherconditions', 'Road_traffic_density',
                    'Type_of_order', 'Type_of_vehicle', 'Festival', 'City']

# One-hot encode the categorical features.
# LabelEncoder is intended for target labels, and the integer codes it yields
# would give these nominal categories an arbitrary order and spacing that a
# linear model interprets as real magnitude. Indicator columns avoid that.
# drop_first=True removes one level per feature so the indicators are not
# perfectly collinear with the intercept; dtype=float keeps the design matrix
# purely numeric.
train_encoded = pd.get_dummies(
    train_features,
    columns=categorical_cols,
    drop_first=True,
    dtype=float,
)

# After encoding, check again for any non-numeric columns
non_numeric_columns_after = train_encoded.select_dtypes(include=['object']).columns
print(f"Remaining non-numeric columns after encoding: {non_numeric_columns_after}")

# Separate features (X) and target variable (y)
X = train_encoded.drop(columns=['Time_taken(min)'])  # Drop target variable
y = train_encoded['Time_taken(min)']  # Target variable

# Split the data into training and testing sets (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a linear regression model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model on the held-out 20%
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"R-squared: {r2}")
print(f"Mean Absolute Error: {mae}")
