# print('hello')

In [None]:
! pip install nltk

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import time

# wordcloud
import random
from wordcloud import WordCloud


In [None]:
# Download the NLTK resources needed by the stopword-removal and tokenization
# steps further down (only needed once per environment)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')


In [None]:
# Load the raw Olist CSV exports (the same datasets as in the last notebook)

raw_path = '../data/olist_datasets/'

df_customer = pd.read_csv(f'{raw_path}olist_customers_dataset.csv')
df_geolocation = pd.read_csv(f'{raw_path}olist_geolocation_dataset.csv')
df_orders = pd.read_csv(f'{raw_path}olist_orders_dataset.csv')
df_order_items = pd.read_csv(f'{raw_path}olist_order_items_dataset.csv')
df_order_payments = pd.read_csv(f'{raw_path}olist_order_payments_dataset.csv')
df_order_reviews = pd.read_csv(f'{raw_path}olist_order_reviews_dataset.csv')
df_products = pd.read_csv(f'{raw_path}olist_products_dataset.csv')
df_sellers = pd.read_csv(f'{raw_path}olist_sellers_dataset.csv')


# I - Sentiment analysis of customer reviews

## A - Reminders on the Review dataframe

In [None]:
# Sanity check: preview the first 10 rows to confirm df_order_reviews is loaded

df_order_reviews.head(10)

In [None]:
# Plot the distribution of review scores with a Seaborn histogram

# Shared colour palette (also read by the word-cloud colour function later on)
palette_colors = ['#2C3E50', '#E74C3C', '#ECF0F1', '#3498DB', '#2ECC71', '#F1C40F', '#9B59B6']

# Set up the figure
fig, ax = plt.subplots(figsize=(10, 6))

# Review scores are integer ratings, so draw one unit-width bar centred on each
# score (discrete=True) rather than 5 equal-width bins whose edges fall between
# the integers and leave the bars misaligned with the axis ticks.
sns.histplot(
    data=df_order_reviews,
    x='review_score',
    discrete=True,
    edgecolor='black',
    stat='count',
    ax=ax
)

# One tick per observed score
ax.set_xticks(sorted(df_order_reviews['review_score'].dropna().unique()))

# Customize the plot
ax.set_title('Distribution of Review Scores', fontsize=16, fontweight='bold')
ax.set_xlabel('Review Score', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)

plt.tight_layout()
plt.show()



In [None]:
# Count the missing values in the two free-text review columns
comment_columns = ['review_comment_title', 'review_comment_message']
missing_comments = df_order_reviews[comment_columns].isnull().sum()
print(missing_comments)

# Keep the reviews that have at least one of the two text fields filled in
df_reviews_clean = df_order_reviews.dropna(subset=comment_columns, how='all')

# Map a numeric review score to a coarse sentiment label
def classify_sentiment(score):
    """Return 'positive' for scores of 4 or more, 'negative' for scores of 2 or less, else 'neutral'."""
    if score >= 4:
        return 'positive'
    if score <= 2:
        return 'negative'
    # Everything that is neither >= 4 nor <= 2 ends up here
    return 'neutral'

# Label every review with its sentiment class, then keep only the reviews that
# have both a comment title and a comment message. `assign` builds a new frame
# instead of adding the column to the loaded one in place.
# NOTE(review): requiring a title as well as a message discards reviews that
# only have a message, although only 'review_comment_message' is used by the
# text analysis below - confirm this is intended.
df_order_reviews = (
    df_order_reviews
    .assign(sentiment=df_order_reviews['review_score'].apply(classify_sentiment))
    .dropna(subset=['review_comment_message', 'review_comment_title'])
)

# Preview the labelled, filtered reviews (a bare expression is only rendered
# when it is the last statement of the cell)
df_order_reviews.head()


In [None]:
## Wordcloud

# Generate a WordCloud from the review comments (all sentiments; no filter is applied)

# Define a custom color function
def random_color(word, font_size, position, orientation=None, random_state=42, **kwargs):
    """Pick the colour of one word-cloud word at random from ``palette_colors``.

    The signature follows the ``color_func`` callback convention of
    ``wordcloud.WordCloud``; only ``random_state`` is actually used.

    Parameters
    ----------
    word, font_size, position, orientation, **kwargs
        Accepted for interface compatibility and ignored.
    random_state : random.Random, optional
        Random generator to draw from. When a ``random.Random`` instance is
        supplied it is used for the draw, so a seeded caller gets reproducible
        colours. Any other value (including the default ``42``) falls back to
        the module-level ``random`` generator.

    Returns
    -------
    str
        One of the colour codes listed in ``palette_colors``.
    """
    rng = random_state if isinstance(random_state, random.Random) else random
    return rng.choice(palette_colors)

# Build the word cloud from every available review comment
# (no sentiment filter is applied to the text)
comments_text = ' '.join(df_order_reviews['review_comment_message'].dropna())

wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    color_func=random_color,
)
wordcloud.generate(comments_text)

# Render the cloud as an image, without axes
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Review Comments', fontsize=16, fontweight='bold')

plt.tight_layout()
plt.show()

## B - Text Preprocessing and conversion to numerical features

### 1 - Cleaning a small sample of the data and looking at the results

In [None]:
# Draw a 10% sample of the reviews that have a comment message, keeping only
# review_id, review_score and review_comment_message.
# random_state pins the draw so that the sample - and every metric computed
# from it further down - is identical on each Restart & Run All.
df_reviews_sample = (
    df_order_reviews
    .loc[
        df_order_reviews['review_comment_message'].notnull(),
        ['review_id', 'review_score', 'review_comment_message'],
    ]
    .sample(frac=0.1, random_state=42)
)

# Step 1 - lower-case the text
df_reviews_sample['comment_clean'] = df_reviews_sample['review_comment_message'].apply(lambda x: x.lower())

# Step 2 - strip punctuation and other special characters
# (the pattern keeps word characters - letters, digits, underscore - and whitespace)
df_reviews_sample['comment_clean'] = df_reviews_sample['comment_clean'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

# Step 3 - tokenize the text into lists of words using NLTK
# (the required NLTK resources are downloaded once, near the top of the notebook)
df_reviews_sample['comment_clean_tokenized'] = df_reviews_sample['comment_clean'].apply(word_tokenize)

# Step 4 - remove Portuguese stopwords
portuguese_stopwords = set(stopwords.words('portuguese'))
df_reviews_sample['comment_clean_tokenized'] = df_reviews_sample['comment_clean_tokenized'].apply(
    lambda tokens: [word for word in tokens if word not in portuguese_stopwords]
)

# Step 5 - stem each token to its root form.
# The reviews are written in Portuguese, so use the Portuguese Snowball stemmer;
# PorterStemmer implements English suffix-stripping rules.
stemmer = nltk.stem.SnowballStemmer('portuguese')

df_reviews_sample['comment_clean_stemmed'] = df_reviews_sample['comment_clean_tokenized'].apply(
    lambda tokens: [stemmer.stem(word) for word in tokens]
)

# Show the result of every step side by side for the first rows
df_reviews_sample.head(20)


### 2 - Text preprocessing

In [None]:
# Define a function to clean and preprocess text data

# Cache for the Portuguese stopword set and stemmer: filled on first use so
# they are built once rather than for every review passed to clean_text().
_TEXT_CLEANING_RESOURCES = {}


def _get_text_cleaning_resources():
    """Return the cached (stopword set, stemmer) pair, creating it on first use."""
    if not _TEXT_CLEANING_RESOURCES:
        _TEXT_CLEANING_RESOURCES['stopwords'] = frozenset(stopwords.words('portuguese'))
        # The reviews are written in Portuguese, so use the Portuguese Snowball
        # stemmer; PorterStemmer implements English suffix-stripping rules.
        _TEXT_CLEANING_RESOURCES['stemmer'] = nltk.stem.SnowballStemmer('portuguese')
    return _TEXT_CLEANING_RESOURCES['stopwords'], _TEXT_CLEANING_RESOURCES['stemmer']


def clean_text(text):
    """Normalise one review comment into a space-separated string of stems.

    Steps: lower-case, strip punctuation/special characters, tokenize, drop
    Portuguese stopwords, stem each remaining token, re-join with spaces.

    Parameters
    ----------
    text : object
        The raw comment. Anything that is not a ``str`` (e.g. NaN or None) is
        treated as an empty comment.

    Returns
    -------
    str
        The cleaned text, or ``''`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''

    portuguese_stopwords, stemmer = _get_text_cleaning_resources()

    # Convert text to lowercase
    text = text.lower()

    # Remove punctuation and special characters; \w keeps letters, digits and
    # underscores, so numbers are NOT removed by this pattern
    text = re.sub(r'[^\w\s]', '', text)

    # Tokenize, drop stopwords, then stem what is left
    stemmed_words = [
        stemmer.stem(word)
        for word in word_tokenize(text)
        if word not in portuguese_stopwords
    ]

    # Rebuild the cleaned text
    return ' '.join(stemmed_words)


In [None]:
# Run the full cleaning function on every sampled review comment
# (this overwrites the intermediate 'comment_clean' column built earlier)
df_reviews_sample['comment_clean'] = df_reviews_sample['review_comment_message'].apply(clean_text)

# Preview a few original comments next to their processed versions
df_reviews_sample.head()



### 3 - Final preparation steps

In [None]:
# Hold out 20% of the sampled reviews for evaluation (fixed seed for the split)
train_data, test_data = train_test_split(
    df_reviews_sample,
    test_size=0.2,
    random_state=42,
)

# Report the (rows, columns) shape of each subset
print(f"Train set dimensions: {train_data.shape}")
print(f"Test set dimensions: {test_data.shape}")


## C - Building a Simple Naive Bayes Model

In [None]:
# Bag-of-words features: token counts limited to the 5000 most frequent terms
vectorizer = CountVectorizer(max_features=5000)

# Learn the vocabulary on the training comments only, then encode the test
# comments with that same fitted vocabulary
X_train_vectorized = vectorizer.fit_transform(train_data['comment_clean'])
X_test_vectorized = vectorizer.transform(test_data['comment_clean'])

print(f"Training features shape: {X_train_vectorized.shape}")
print(f"Testing features shape: {X_test_vectorized.shape}")



In [None]:
# Inspect the bag-of-words matrix that was just built

# Total count of every vocabulary term across the training comments
word_frequencies = X_train_vectorized.sum(axis=0).A1

# Column indices of the 10 most frequent terms, most frequent first
top_word_indices = word_frequencies.argsort()[::-1][:10]

# Look the term names up once instead of once per index
vocabulary_terms = vectorizer.get_feature_names_out()
top_words = [vocabulary_terms[i] for i in top_word_indices]

# Dense view of the first 5 training comments restricted to those 10 terms
bow_extract = X_train_vectorized[:5, top_word_indices].toarray()
df_bow_sample = pd.DataFrame(bow_extract, columns=top_words)

print("Extract from the Bag-of-Words matrix (first 5 comments, 10 most frequent words):")
df_bow_sample


In [None]:
# Fit a multinomial Naive Bayes classifier on the token counts, using the
# review score as the target
naive_bayes_model = MultinomialNB()
naive_bayes_model.fit(X_train_vectorized, train_data['review_score'])

# Predict the review score of the held-out comments
y_pred_nb = naive_bayes_model.predict(X_test_vectorized)

# Report accuracy and the per-class metrics on the test set
nb_test_accuracy = accuracy_score(test_data['review_score'], y_pred_nb)
print(f"Test Set Accuracy: {nb_test_accuracy:.4f}")
print("\nClassification Report:\n")
print(classification_report(test_data['review_score'], y_pred_nb))


In [None]:
# Predict on the test set with the fitted Naive Bayes model
y_pred = naive_bayes_model.predict(X_test_vectorized)
print(f"First 10 predictions on test set: {y_pred[:10]}")

# Compute the evaluation metrics against the true review scores
y_true_nb = test_data['review_score']
test_accuracy = accuracy_score(y_true_nb, y_pred)
classification_rep = classification_report(y_true_nb, y_pred)
confusion_mat = confusion_matrix(y_true_nb, y_pred)

# Print accuracy and the per-class report
print(f"Accuracy Score: {test_accuracy:.4f}\n")
print("Classification Report:\n")
print(classification_rep)

# Print the raw confusion matrix
print("Confusion Matrix:\n")
print(confusion_mat)



## D - Building a More Advanced Model - Logistic Regression with TF-IDF

In [None]:
# TF-IDF features, limited to 5000 terms
tfidf = TfidfVectorizer(max_features=5000)

# Fit on the training comments only; encode the test comments with the same
# fitted vectorizer
X_train_tfidf = tfidf.fit_transform(train_data['comment_clean'])
X_test_tfidf = tfidf.transform(test_data['comment_clean'])

print(f"Training features shape (TF-IDF): {X_train_tfidf.shape}")
print(f"Testing features shape (TF-IDF): {X_test_tfidf.shape}")

# Logistic Regression on the TF-IDF features, with the review score as target
logistic_model = LogisticRegression(max_iter=1000, random_state=42)
logistic_model.fit(X_train_tfidf, train_data['review_score'])

# Predict the held-out comments
y_pred_logistic = logistic_model.predict(X_test_tfidf)

# Evaluation metrics on the test set
y_true_logistic = test_data['review_score']
accuracy_logistic = accuracy_score(y_true_logistic, y_pred_logistic)
classification_logistic = classification_report(y_true_logistic, y_pred_logistic, zero_division=0)
confusion_logistic = confusion_matrix(y_true_logistic, y_pred_logistic)

print(f"Logistic Regression Accuracy: {accuracy_logistic:.4f}")

print("\nClassification Report for Logistic Regression:\n")
print(classification_logistic)



In [None]:
# Confusion matrix of the Logistic Regression predictions, drawn as a heatmap

conf_matrix = confusion_matrix(test_data['review_score'], y_pred_logistic)

# Rows and columns are labelled with the classes known to the model
class_labels_logistic = logistic_model.classes_

plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels_logistic,
    yticklabels=class_labels_logistic,
)

plt.title('Confusion Matrix - Logistic Regression', fontsize=16)
plt.xlabel('Predicted Class', fontsize=14)
plt.ylabel('Actual Class', fontsize=14)
plt.tight_layout()
plt.show()



In [None]:
# Derive a signed per-word weight from the fitted Logistic Regression model
# (`logistic_model`, trained on the features of the `tfidf` vectorizer).
#
# `coef_` holds one row of coefficients per class, in the order of
# `logistic_model.classes_` (ascending review scores). Averaging the rows mixes
# the evidence for low and high scores and largely cancels out, so contrast
# the highest-score row with the lowest-score row instead: a positive value
# means the word pushes towards the top score, a negative value towards the
# bottom score.
coefficients = logistic_model.coef_
if coefficients.shape[0] == 1:
    # Two-class model: scikit-learn stores a single row, already signed
    signed_weights = coefficients[0]
else:
    signed_weights = coefficients[-1] - coefficients[0]

feature_importance = pd.DataFrame({
    'feature': tfidf.get_feature_names_out(),
    'importance': signed_weights
})

# Sort by absolute importance
feature_importance['abs_importance'] = feature_importance['importance'].abs()
feature_importance = feature_importance.sort_values('abs_importance', ascending=False)


In [None]:
# Display the most influential words for positive and negative sentiment.
# The weights live in `feature_importance` (built in the previous cell).

print("Top 10 words associated with positive sentiment:")
top_positive_words = feature_importance.sort_values('importance', ascending=False).head(10)
print(top_positive_words[['feature', 'importance']])

print("\nTop 10 words associated with negative sentiment:")
top_negative_words = feature_importance.sort_values('importance', ascending=True).head(10)
print(top_negative_words[['feature', 'importance']])



In [None]:
# Plot the most influential words as a signed horizontal bar chart

# Stack the strongest positive words on top of the strongest negative ones
top_words = pd.concat([top_positive_words.head(15), top_negative_words.head(15)])

# Green bars for weights above zero, red bars for everything else
bar_colors = top_words['importance'].gt(0).map({True: 'green', False: 'red'}).tolist()

plt.figure(figsize=(12, 8))
plt.barh(top_words['feature'], top_words['importance'], color=bar_colors)

# Dashed reference line at zero separates the two groups
plt.axvline(x=0, color='black', linestyle='--', alpha=0.5)
plt.title('Top Positive and Negative Words Impacting Sentiment', fontsize=16)
plt.xlabel('Importance', fontsize=14)
plt.tight_layout()
plt.show()



## E - Random Forest model

In [None]:
# Train a Random Forest on the TF-IDF features and time the fit

random_forest_model = RandomForestClassifier(
    n_estimators=100,         # number of trees
    max_depth=None,           # no limit on tree depth
    min_samples_split=2,      # minimum samples required to split a node
    random_state=42,
    n_jobs=-1                 # utilize all processors
)

# Time only the fit, so that the reported duration is really the training time
# (predicting on the training set is done afterwards, outside the timed span)
start_time = time.perf_counter()
random_forest_model.fit(X_train_tfidf, train_data['review_score'])
elapsed_time = time.perf_counter() - start_time

# Predictions on the training set
y_pred_rf_train = random_forest_model.predict(X_train_tfidf)

print(f"Training time for Random Forest: {elapsed_time:.2f} seconds")



In [None]:
# Evaluate the Random Forest (`random_forest_model`) on the held-out reviews
# (`test_data`), using the TF-IDF test features

# Predictions on the test set
y_pred_rf = random_forest_model.predict(X_test_tfidf)

# Calculate accuracy on the test set
accuracy_rf = accuracy_score(test_data['review_score'], y_pred_rf)
print(f"Random Forest Test Accuracy: {accuracy_rf:.4f}")

# Classification report
print("\nRandom Forest Classification Report:")
print(classification_report(test_data['review_score'], y_pred_rf))

# Confusion Matrix
cm_rf = confusion_matrix(test_data['review_score'], y_pred_rf)

# Display confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm_rf, annot=True, fmt='d', cmap='Blues',
            xticklabels=random_forest_model.classes_,
            yticklabels=random_forest_model.classes_)
plt.title('Confusion Matrix - Random Forest', fontsize=16)
plt.xlabel('Predicted Label', fontsize=14)
plt.ylabel('True Label', fontsize=14)
plt.tight_layout()
plt.show()


In [None]:
# Test-set accuracy of the Random Forest model

# Predict the held-out comments from their TF-IDF features
y_pred_rf = random_forest_model.predict(X_test_tfidf)

# Share of test reviews whose score was predicted exactly
rf_accuracy = accuracy_score(y_true=test_data['review_score'], y_pred=y_pred_rf)
print(f"Accuracy of Random Forest on Test Set: {rf_accuracy:.4f}")


In [None]:
# Confusion matrix of the Random Forest predictions, drawn as a heatmap

conf_matrix_rf = confusion_matrix(test_data['review_score'], y_pred_rf)

# Rows and columns are labelled with the classes known to the model
class_labels_rf = random_forest_model.classes_

plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix_rf,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels_rf,
    yticklabels=class_labels_rf,
)

plt.title('Confusion Matrix - Random Forest', fontsize=16)
plt.xlabel('Predicted Class', fontsize=14)
plt.ylabel('Actual Class', fontsize=14)
plt.tight_layout()
plt.show()


In [None]:
# Get the feature importances of the Random Forest model

feature_importances = random_forest_model.feature_importances_

# Pair every importance with its feature name. The forest was fitted on the
# TF-IDF matrix, so the names must come from the TF-IDF vectorizer (`tfidf`),
# whose feature order matches the columns of X_train_tfidf.
features_df = pd.DataFrame({
    'Feature': tfidf.get_feature_names_out(),
    'Importance': feature_importances
})

# Sort the features by their importance in descending order
features_df = features_df.sort_values(by='Importance', ascending=False)

# Display the top 20 important features
print(features_df.head(20))

In [None]:
# Visualize the top important features from the Random Forest model

# Select the 20 features with the highest importance from `features_df`
# (columns 'Feature' and 'Importance'); nlargest does not depend on the frame
# having been sorted beforehand
top_rf_features = features_df.nlargest(20, 'Importance')

# Plot the feature importances. `hue` is set to the same column as `y` so the
# palette colours one bar per feature; the redundant legend is switched off.
plt.figure(figsize=(10, 6))
sns.barplot(
    x='Importance',
    y='Feature',
    hue='Feature',
    data=top_rf_features,
    palette='viridis',
    legend=False
)

# Add plot title and axis labels
plt.title('Top 20 Important Features - Random Forest', fontsize=16)
plt.xlabel('Importance Score', fontsize=12)
plt.ylabel('Feature Name', fontsize=12)

plt.tight_layout()
plt.show()


## F - Model comparison

In [None]:

# Compare the test accuracy of the three models

# Recompute every accuracy from the stored test-set predictions, so this cell
# only depends on the prediction arrays and the test labels
y_test_scores = test_data['review_score']
model_predictions = {
    'Naive Bayes': y_pred_nb,
    'Logistic Regression': y_pred_logistic,
    'Random Forest': y_pred_rf,
}
models = list(model_predictions)
accuracies = [
    accuracy_score(y_test_scores, predictions)
    for predictions in model_predictions.values()
]

# Create a DataFrame for easy plotting
performance_df = pd.DataFrame({
    'Model': models,
    'Accuracy': accuracies
})

plt.figure(figsize=(8, 6))
sns.barplot(x='Accuracy', y='Model', data=performance_df, palette='Blues_d')

plt.title('Comparison of Model Performances', fontsize=16)
plt.xlabel('Accuracy', fontsize=12)
plt.ylabel('Model', fontsize=12)

plt.tight_layout()
plt.show()

In [None]:
# Compute the F1 score by class (f1_nb, f1_lr, f1_rf)

y_test_scores = test_data['review_score']

# Pass the class list explicitly so the three per-class arrays always have one
# entry per class, in the same order as the heatmap rows, even when a model
# never predicts one of the classes (its F1 is then reported as 0).
class_labels = random_forest_model.classes_

f1_nb = f1_score(y_test_scores, y_pred_nb, labels=class_labels, average=None, zero_division=0)
f1_lr = f1_score(y_test_scores, y_pred_logistic, labels=class_labels, average=None, zero_division=0)
f1_rf = f1_score(y_test_scores, y_pred_rf, labels=class_labels, average=None, zero_division=0)

# Create a dataframe for the visualization
f1_df = pd.DataFrame({
    'Naive Bayes': f1_nb,
    'Logistic Regression': f1_lr,
    'Random Forest': f1_rf
}, index=class_labels)

# Visualize F1 scores by classes

plt.figure(figsize=(10, 6))

sns.heatmap(f1_df, annot=True, cmap='viridis', fmt='.2f', cbar=True)

plt.title('F1 Scores by Class for Different Models', fontsize=16)
plt.xlabel('Model', fontsize=12)
plt.ylabel('Class', fontsize=12)

plt.tight_layout()
plt.show()

## G - Example use case: predicting new comments

In [None]:
# Create a function to predict sentiment for new reviews
def predict_sentiment(review_text, vectorizer, model):
    """Predict the class of a single raw review text.

    Parameters
    ----------
    review_text : str
        The raw, uncleaned review comment.
    vectorizer : object
        A fitted vectorizer exposing ``transform``; it should be the one the
        model's training features were produced with.
    model : object
        A fitted classifier exposing ``predict`` and ``predict_proba``.

    Returns
    -------
    tuple
        ``(sentiment, proba)``: the predicted class of the review and the
        array of class probabilities returned by ``predict_proba`` for it.
    """
    # Apply the same cleaning as for the training comments (clean_text is the
    # preprocessing function defined earlier in this notebook)
    processed_review = clean_text(review_text)
    # Vectorize the review (a one-element batch)
    review_vector = vectorizer.transform([processed_review])
    # Predicted class and class probabilities of that single review
    sentiment = model.predict(review_vector)[0]
    proba = model.predict_proba(review_vector)[0]
    return sentiment, proba

In [None]:
# Example reviews for sentiment prediction

# Reuse the TF-IDF vectorizer that produced the features `logistic_model` was
# trained on, instead of fitting a second one: the model's coefficients are
# tied to that vectorizer's vocabulary and column order.
example_vectorizer = tfidf

# List of sample reviews to test
sample_reviews = [
    "O produto é excelente, superou minhas expectativas!",  # Positive
    "Entrega foi feita no prazo, mas o produto não é tão bom quanto esperava.",  # Neutral
    "Péssimo produto, chegou com defeito e o atendimento ao cliente foi horrível."  # Negative
]

# Predict sentiment for each example
print("Sentiment prediction for sample reviews:")
for idx, review in enumerate(sample_reviews, start=1):
    pred_sentiment, pred_proba = predict_sentiment(review, example_vectorizer, logistic_model)
    print(f"\nReview {idx}: {review}")
    print(f"Predicted sentiment: {pred_sentiment}")
    print(f"Confidence: {max(pred_proba):.2f}")
    print(f"Class probabilities: {dict(zip(logistic_model.classes_, pred_proba))}")



# II - Delivery prediction (bonus)

## A - Data preparation

In [None]:
# We'll use the merged dataframe from previous parts
# If not already done, we need to merge the necessary dataframes
if 'df' not in globals():
    # Load required datasets
    df_orders = pd.read_csv(raw_path + 'olist_orders_dataset.csv')
    df_customers = pd.read_csv(raw_path + 'olist_customers_dataset.csv')
    df_order_items = pd.read_csv(raw_path + 'olist_order_items_dataset.csv')
    df_products = pd.read_csv(raw_path + 'olist_products_dataset.csv')
    df_sellers = pd.read_csv(raw_path + 'olist_sellers_dataset.csv')

    # Merge datasets on their shared key columns
    df = df_orders.merge(df_customers, on='customer_id')
    df = df.merge(df_order_items, on='order_id')
    df = df.merge(df_products, on='product_id')
    # The seller columns (seller_zip_code_prefix, seller_state) are read by the
    # feature-engineering and categorical-analysis cells further down, so the
    # sellers table has to be part of the merge as well
    df = df.merge(df_sellers, on='seller_id')

df.head()


In [None]:
# Parse the order timestamp columns into datetimes
date_columns = ['order_purchase_timestamp', 'order_approved_at',
                'order_delivered_carrier_date', 'order_delivered_customer_date',
                'order_estimated_delivery_date']

for col in date_columns:
    df[col] = pd.to_datetime(df[col])

# All durations below are measured from the purchase timestamp, in days
seconds_per_day = 24 * 3600
purchase_time = df['order_purchase_timestamp']

# Days between purchase and delivery to the customer
df['actual_delivery_time'] = (
    (df['order_delivered_customer_date'] - purchase_time).dt.total_seconds() / seconds_per_day
)

# Days between purchase and the estimated delivery date
df['estimated_delivery_time'] = (
    (df['order_estimated_delivery_date'] - purchase_time).dt.total_seconds() / seconds_per_day
)

# Delay flag: 1 when the actual delivery took longer than estimated, else 0
df['is_delayed'] = df['actual_delivery_time'].gt(df['estimated_delivery_time']).astype(int)

# Days between purchase and hand-over to the carrier
df['time_to_carrier'] = (
    (df['order_delivered_carrier_date'] - purchase_time).dt.total_seconds() / seconds_per_day
)

# Keep only the rows that have both delivery dates (drops canceled orders, etc.)
delivery_df = df.dropna(subset=['order_delivered_customer_date', 'order_delivered_carrier_date'])

delivery_df.head()


In [None]:
# Visualize how many deliveries were on time vs delayed
plt.figure(figsize=(10, 6))
# order=[0, 1] pins on-time to x-position 0 and delayed to x-position 1, which
# is what the tick labels and the count annotations below assume
sns.countplot(x='is_delayed', data=delivery_df, order=[0, 1], palette='viridis')
plt.title('Distribution of Delayed vs On-time Deliveries', fontsize=16)
plt.xlabel('Is Delayed (1 = Yes, 0 = No)', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.xticks([0, 1], ['On-time', 'Delayed'])

# value_counts() is ordered by frequency, not by class, so re-index it to the
# plotted order before annotating; otherwise the labels can land on the wrong bar
delay_counts = delivery_df['is_delayed'].value_counts().reindex([0, 1], fill_value=0)
for i, count in enumerate(delay_counts):
    plt.text(i, count + 100, f"{count} ({count/len(delivery_df):.1%})",
             ha='center', fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()


## B - Feature Engineering

In [None]:

purchase_time = delivery_df['order_purchase_timestamp']

# Build the modelling frame as a new object; every engineered column is added
# in one `assign` call and infinite values are turned into NaN at the end
model_df = delivery_df.assign(
    # Temporal features extracted from the purchase timestamp
    purchase_hour=purchase_time.dt.hour,
    purchase_day=purchase_time.dt.day,
    purchase_month=purchase_time.dt.month,
    purchase_year=purchase_time.dt.year,
    purchase_dayofweek=purchase_time.dt.dayofweek,
    # 1 when the day-of-week index is 5 or 6, else 0
    purchase_weekend=lambda frame: (frame['purchase_dayofweek'] >= 5).astype(int),
    # Absolute difference of the zip-code prefixes, used as a distance proxy
    zip_distance=lambda frame: (
        frame['customer_zip_code_prefix'] - frame['seller_zip_code_prefix']
    ).abs(),
    # Price per gram; a weight of 0 is replaced by 0.1 before dividing
    price_per_weight=lambda frame: frame['price'] / frame['product_weight_g'].replace(0, 0.1),
    # Product volume from its three dimensions
    product_volume=lambda frame: (
        frame['product_length_cm'] * frame['product_height_cm'] * frame['product_width_cm']
    ),
).replace([np.inf, -np.inf], np.nan)

# Display the first few rows of the engineered features
print("Sample of engineered features:")
engineered_columns = ['purchase_hour', 'purchase_day', 'purchase_month', 'purchase_year',
                      'purchase_dayofweek', 'purchase_weekend', 'zip_distance',
                      'price_per_weight', 'product_volume']
model_df[engineered_columns].head()

## C - Exploratory Data Analysis for Feature Selection

In [None]:
# Analyze correlation between features and delivery time with a correlation matrix

# TODO(exercise): the string below is a placeholder - replace it with the analysis code
'your code here'


In [None]:
# Analyze categorical features
# Names of the categorical columns to analyse
categorical_features = ['customer_state', 'seller_state', 'product_category_name']

# Compute the average delivery time for each categorical feature and show it as a bar chart

# TODO(exercise): the string below is a placeholder - replace it with the analysis code
'your code here'


## D - Feature Selection

In [None]:
# Select features based on correlation analysis and domain knowledge
selected_numeric_features = ['freight_value', 'price', 'product_weight_g',
                            'product_volume', 'zip_distance', 'time_to_carrier',
                            'purchase_month', 'purchase_dayofweek']

# Categorical predictors. This list is read by the next cell when it builds the
# feature matrix, so it has to be defined here alongside the numeric one.
selected_categorical_features = ['customer_state', 'seller_state', 'product_category_name']


In [None]:
# Assemble the feature matrix from the selected numeric and categorical columns
X_numeric = model_df[selected_numeric_features]
X_categorical = model_df[selected_categorical_features]
X_combined = pd.concat([X_numeric, X_categorical], axis='columns')

# Regression target: the actual delivery time
y_regression = model_df['actual_delivery_time']

# Hold out 20% of the rows for testing (fixed seed for the split)
X_train, X_test, y_reg_train, y_reg_test = train_test_split(
    X_combined,
    y_regression,
    test_size=0.2,
    random_state=42,
)

print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

## E - Building the Regression Model (Predicting Delivery Time)

In [None]:
# Define preprocessing for numeric features

# TODO(exercise): the string below is a placeholder - replace it with the preprocessing code
'your code here'


In [None]:
# Create a Linear Regression Pipeline

# TODO(exercise): placeholder - replace with the pipeline definition
'your code here'

# Train and evaluate Linear Regression

# TODO(exercise): placeholder - replace with the training / evaluation code
'your code here'


In [None]:
# Make predictions based on your linear regression

# TODO(exercise): placeholder - replace with the prediction code
'your code here'

# Evaluate the model
# TODO(exercise): replace the three placeholder strings with the computed
# metrics. The ':.2f' format specifiers in the prints below only accept
# numbers, so this cell raises an error until the placeholders are replaced.
mae_lr = 'your code here'
rmse_lr = 'your code here'
r2_lr = 'your code here'

print(f"Linear Regression - MAE: {mae_lr:.2f} days")
print(f"Linear Regression - RMSE: {rmse_lr:.2f} days")
print(f"Linear Regression - R²: {r2_lr:.2f}")


The R² should be about 0.40, which is not a good score. Can you do better with a Random Forest model, or with any other model?

The objective is to reach an R² > 0.45!

## F - Building a Random Forest model

In [None]:
# TODO(exercise): placeholder - replace with the Random Forest regression code
'your code here'

## G - Building a K-Nearest Neighbors model

In [None]:
# TODO(exercise): placeholder - replace with the K-Nearest Neighbors regression code
'your code here'