## Preprocessing part 2:

In [1]:
import pandas as pd
import re

# Load the preprocessed CSV file
data = pd.read_csv('/Users/prathambhonge/AmazonML/final_updated.csv')

# Convert text to lowercase and remove special characters, handling missing or non-string values
def clean_text(text):
    if isinstance(text, str):  # Check if the value is a string
        return re.sub(r'[^\w\s]', '', text.lower())
    return ''  # Return an empty string for non-string values

data['clean_text'] = data['text'].apply(clean_text)

# Example: For 'item_weight', map possible units and normalize values
unit_patterns = {
    'gram': r'\b(\d+\.?\d*)\s*(g|gram|grams)\b',
    'kilogram': r'\b(\d+\.?\d*)\s*(kg|kilogram|kilograms)\b',
    'milligram': r'\b(\d+\.?\d*)\s*(mg|milligram|milligrams)\b',
    'pound': r'\b(\d+\.?\d*)\s*(lb|pound|pounds)\b',
    'ounce': r'\b(\d+\.?\d*)\s*(oz|ounce|ounces)\b',
    'ton': r'\b(\d+\.?\d*)\s*(ton|tons)\b'
}

def extract_value(text, entity_name):
    if entity_name == 'item_weight':
        for unit, pattern in unit_patterns.items():
            match = re.search(pattern, text)
            if match:
                return f"{match.group(1)} {unit}"
    return ''

# Apply the extraction function
data['extracted_value'] = data.apply(lambda row: extract_value(row['clean_text'], row['entity_name']), axis=1)

# Display the dataframe with extracted values
print(data[['filename', 'entity_name', 'extracted_value']])

# Save the processed data to a new CSV file
data.to_csv('processed_data.csv', index=False)


              filename  entity_name extracted_value
0      61Mxgdk7NES.jpg  item_weight        100 gram
1      712Q9pScGsL.jpg  item_weight                
2      717en-V4EDL.jpg  item_weight     14 kilogram
3      613MHCt4UvL.jpg  item_weight                
4      61dphjNagYL.jpg  item_weight                
...                ...          ...             ...
22173  718YXdgUJqL.jpg  item_weight   900 milligram
22174  61U31F+JKJL.jpg  item_weight                
22175  61Ueu26MdmL.jpg  item_weight    20 milligram
22176  51Onc432L8L.jpg  item_weight                
22177  61tjv5nAYgL.jpg  item_weight        170 gram

[22178 rows x 3 columns]


## Split data to train and test

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the processed CSV file
data = pd.read_csv('processed_data.csv')

# Split the data into training and testing sets (80% train, 20% test)
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Save the training data to a CSV file
train_data.to_csv('train_data.csv', index=False)

# Save the testing data to a CSV file
test_data.to_csv('test_data.csv', index=False)

# Output a confirmation message
print(f"Data split completed. Train data: {len(train_data)} samples, Test data: {len(test_data)} samples.")


Data split completed. Train data: 17742 samples, Test data: 4436 samples.


## Training the data

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import re

# Load the training data
train_data = pd.read_csv('train_data.csv')

# Handle missing values in 'text' and 'entity_name'
train_data['text'].fillna('', inplace=True)
train_data['entity_name'].fillna('', inplace=True)

# Combine the text and entity_name columns for better feature representation
train_data['input_text'] = train_data['text'] + ' ' + train_data['entity_name']

# Function to extract numeric values from the 'entity_value' column
def extract_numeric_value(value):
    # Regular expression to extract the first number from the string
    match = re.search(r'(\d+\.?\d*)', str(value))
    if match:
        return float(match.group(1))
    return None

# Apply the function to the 'entity_value' column
train_data['numeric_value'] = train_data['entity_value'].apply(extract_numeric_value)

# Drop rows with missing numeric values
train_data.dropna(subset=['numeric_value'], inplace=True)

# Separate the input features (X) and target variable (y)
X = train_data['input_text']
y = train_data['numeric_value']

# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the pipeline: TF-IDF Vectorizer + Random Forest Regressor
pipeline = make_pipeline(
    TfidfVectorizer(max_features=5000),  # Convert text into numeric features
    RandomForestRegressor(n_estimators=100, random_state=42)  # Random Forest model
)

# Train the model
pipeline.fit(X_train, y_train)

# Make predictions on the validation set
y_pred = pipeline.predict(X_val)

# Evaluate the model
mse = mean_squared_error(y_val, y_pred)
print(f"Mean Squared Error on validation set: {mse}")

# Save the trained model for future predictions
import joblib
joblib.dump(pipeline, 'entity_value_prediction_model_new.pkl')

print("Model training completed and saved to 'entity_value_prediction_model.pkl'.")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['text'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['entity_name'].fillna('', inplace=True)


Mean Squared Error on validation set: 6.02562795792511e+22
Model training completed and saved to 'entity_value_prediction_model.pkl'.


## testing the data

In [5]:
import pandas as pd
import joblib
import numpy as np
import re
from sklearn.metrics import f1_score, classification_report

# Load the trained model
pipeline = joblib.load('entity_value_prediction_model_new.pkl')

# Load the validation data
test_data = pd.read_csv('test_data.csv')

# Handle missing values in 'text' and 'entity_name'
test_data['text'].fillna('', inplace=True)
test_data['entity_name'].fillna('', inplace=True)

# Combine the text and entity_name columns for better feature representation
test_data['input_text'] = test_data['text'] + ' ' + test_data['entity_name']

# Make predictions on the test set
X_test = test_data['input_text']
y_pred = pipeline.predict(X_test)

# Extract numeric values from the entity_value column in the test data
def extract_numeric_value(value):
    match = re.search(r'(\d+\.?\d*)', str(value))
    if match:
        return float(match.group(1))
    return None

test_data['numeric_value'] = test_data['entity_value'].apply(extract_numeric_value)

# Drop rows with missing numeric values in the test set
test_data.dropna(subset=['numeric_value'], inplace=True)

# True values for validation
y_true = test_data['numeric_value']

# Convert numeric values to binary classes (0 or 1) based on a threshold
threshold = 10  # Define a threshold for binarization
y_true_binary = (y_true > threshold).astype(int)
y_pred_binary = (y_pred > threshold).astype(int)

# Calculate F1 score
f1 = f1_score(y_true_binary, y_pred_binary)
print(f"F1 Score: {f1}")

# Print classification report for detailed metrics
print(classification_report(y_true_binary, y_pred_binary))


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['text'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['entity_name'].fillna('', inplace=True)


F1 Score: 0.8212125474750317
              precision    recall  f1-score   support

           0       0.82      0.17      0.28      1464
           1       0.71      0.98      0.82      2972

    accuracy                           0.71      4436
   macro avg       0.76      0.58      0.55      4436
weighted avg       0.74      0.71      0.64      4436



In [10]:
import pandas as pd
import re
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline

# Load the trained model
model = joblib.load('entity_value_prediction_model_new.pkl')

# Define the allowed units
allowed_units = {
    "width": {"centimetre", "foot", "millimetre", "metre", "inch", "yard"},
    "depth": {"centimetre", "foot", "millimetre", "metre", "inch", "yard"},
    "height": {"centimetre", "foot", "millimetre", "metre", "inch", "yard"},
    "item_weight": {"milligram", "kilogram", "microgram", "gram", "ounce", "ton", "pound"},
    "maximum_weight_recommendation": {"milligram", "kilogram", "microgram", "gram", "ounce", "ton", "pound"},
    "voltage": {"millivolt", "kilovolt", "volt"},
    "wattage": {"kilowatt", "watt"},
    "item_volume": {"cubic foot", "microlitre", "cup", "fluid ounce", "centilitre", "imperial gallon", "pint", "decilitre", "litre", "millilitre", "quart", "cubic inch", "gallon"}
}

# Load the test data
test_data = pd.read_csv('test_data.csv')

# Handle missing values in 'text' and 'entity_name'
test_data['text'].fillna('', inplace=True)
test_data['entity_name'].fillna('', inplace=True)

# Combine the text and entity_name columns for feature representation
test_data['input_text'] = test_data['text'] + ' ' + test_data['entity_name']

# Predict the numeric values
predictions = model.predict(test_data['input_text'])

# Map the predictions to the allowed units
# For simplicity, this example will use a default unit 'gram'. You should adapt this as needed.
def format_prediction(value, unit="gram"):
    if unit in allowed_units.get(test_data['entity_name'][0], []):
        return f"{value:.2f} {unit}"
    return ""

# Apply the formatting function
test_data['prediction'] = [format_prediction(pred) for pred in predictions]

# Prepare the output DataFrame
output = pd.DataFrame({
    'index': test_data.index,  # Use the DataFrame index directly
    'prediction': test_data['prediction']
})

# Save the output to a CSV file
output.to_csv('test_predictions.csv', index=False)

print("Predictions saved to 'test_predictions.csv'.")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['text'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['entity_name'].fillna('', inplace=True)


Predictions saved to 'test_predictions.csv'.
