In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
# Change to your working directory in Google Drive
project_path = '/content/drive/MyDrive/Amazon/'
os.chdir(project_path)


In [None]:
!pip install tqdm requests  # Install required libraries if not already done

import os
import pandas as pd
from pathlib import Path
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm  # For progress bar

# Define folder to store downloaded images
download_folder = '/content/drive/MyDrive/Amazon/resource/dataset/images/'

# Ensure the folder exists
if not os.path.exists(download_folder):
    os.makedirs(download_folder)

# Load the full train.csv file from Google Drive
input_csv = '/content/drive/MyDrive/Amazon/resource/dataset/train.csv'  # Replace with your actual file path
df = pd.read_csv(input_csv)


# Define the column that contains image links
image_column = 'image_link'  # Ensure this matches the name of the column in your CSV

# Function to download a single image
def download_image(image_link, download_folder):
    try:
        # Extract image name from the link
        image_name = Path(image_link).name
        image_path = os.path.join(download_folder, image_name)

        # Download the image
        response = requests.get(image_link, stream=True)
        response.raise_for_status()  # Check if the request was successful

        # Save the image in chunks to avoid memory issues
        with open(image_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):  # 8KB chunks
                file.write(chunk)

        return image_link, True
    except Exception as e:
        print(f"Error downloading {image_link}: {e}")
        return image_link, False

# Function to download images in parallel with a progress bar
def download_images_from_links(df, image_column, download_folder, num_threads=8):
    # Get unique image links
    image_links = df[image_column].dropna().unique()

    # Use ThreadPoolExecutor to download images in parallel
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # Submit download tasks
        futures = {executor.submit(download_image, link, download_folder): link for link in image_links}

        # Monitor progress with tqdm
        for future in tqdm(as_completed(futures), total=len(futures), desc="Downloading images"):
            link = futures[future]
            try:
                link, success = future.result()
                if success:
                    pass  # Image downloaded successfully
                else:
                    print(f"Failed to download {link}")
            except Exception as e:
                print(f"Exception occurred for {link}: {e}")

# Download all images from train.csv
download_images_from_links(df, image_column, download_folder)

print("All image downloads complete.")




KeyboardInterrupt: 

In [4]:
import pandas as pd

# Load your original CSV file from Google Drive
csv_file_path = '/content/drive/MyDrive/Amazon/resource/dataset/train.csv'
df = pd.read_csv(csv_file_path)

# Extract the first 20000 rows
sample_df = df.head(20000)
# Save these rows as a new CSV file (sample_train.csv) in Google Drive
sample_csv_path = '/content/drive/MyDrive/Amazon/resource/dataset/sample_trainImage.csv'
sample_df.to_csv(sample_csv_path, index=True)


In [5]:
import pandas as pd

# Load your original CSV file from Google Drive
csv_file_path = '/content/drive/MyDrive/Amazon/resource/dataset/train.csv'
df = pd.read_csv(csv_file_path)

# Extract the first 20000 rows
sample_df = df.head(20000)

sample_df = sample_df.reset_index(drop=True)
# Assuming 'image_url' column contains the image URLs
sample_df['image_name'] = sample_df['image_link'].str.split('/').str[-1].str.replace('.jpg', '')
step_df = sample_df[['image_link','entity_name','entity_value','group_id','image_name']]
step_df['index_2'] = step_df.index
sample_csv_path = '/content/drive/MyDrive/Amazon/resource/dataset/sample_trainImage.csv'
step_df.to_csv(sample_csv_path, index=True, index_label='index') # Set index=True to include the image names as index


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  step_df['index_2'] = step_df.index


In [9]:
!pip install easyocr
import os
import pandas as pd
import easyocr
from tqdm import tqdm

# Initialize EasyOCR with GPU support
ocr_reader = easyocr.Reader(['en'], gpu=True)  # Set gpu=False if not using GPU
# df = pd.read_csv('/content/drive/MyDrive/Amazon/resource/dataset/train.csv')
# sample_df = df.head(20000)
# sample_df.to_csv('/content/drive/MyDrive/Amazon/resource/dataset/sample_train.csv', index=False)
# Load the CSV file with image paths
input_csv = '/content/drive/MyDrive/Amazon/resource/dataset/sample_trainImage.csv'
df = pd.read_csv(input_csv)

# Process only the first 20000 rows
df = df.head(20000)

# Folder where the images are stored
image_folder = '/content/drive/MyDrive/Amazon/resource/dataset/images/'

# Function to extract text from an image
def extract_text_from_image(image_path):
    try:
        # Read text from the image using EasyOCR
        results = ocr_reader.readtext(image_path)
        # Join all detected text into a single string
        text = " ".join([result[1] for result in results])
        return text
    except Exception as e:
        print(f"Error processing image {image_path}: {e}")
        return "Error"

# Add a new column for extracted text in the dataframe
df['extracted_text'] = ""

# Iterate through each image path and extract text with a progress bar
for idx, row in tqdm(df.iterrows(), total=len(df), desc="Extracting text"):
    # Use 'image_link' instead of 'image_path'
    x=row['image_link']
    prefix_to_remove = "https://m.media-amazon.com/images/I/"
    trimmed_x = x.replace(prefix_to_remove, "")
    image_path = os.path.join(image_folder, row['image_name']+'.jpg')

    if os.path.exists(image_path):
        # Extract text from the image
        extracted_text = extract_text_from_image(image_path)
        # Store the extracted text in the dataframe
        df.at[idx, 'extracted_text'] = extracted_text
    else:
        print(extracted_text)
        df.at[idx, 'extracted_text'] = "Image not found"

# Save the updated dataframe with extracted text to a new CSV file
output_csv = '/content/drive/MyDrive/Amazon/resource/dataset/sample_trainExtracted.csv'
df.to_csv(output_csv, index=False)
print(f"Text extraction complete. Updated CSV saved to {output_csv}.")



  net.load_state_dict(copyStateDict(torch.load(trained_model, map_location=device)))
  state_dict = torch.load(model_path, map_location=device)
Extracting text:  10%|█         | 2/20 [01:51<16:42, 55.68s/it]


KeyboardInterrupt: 

In [10]:
import pandas as pd
import re
from tqdm import tqdm

# Load the CSV file with the extracted text
input_csv = '/content/drive/MyDrive/Amazon/resource/dataset/sample_trainExtracted.csv'
df = pd.read_csv(input_csv)

# Function to clean and preprocess text
def preprocess_text(text):
    if not isinstance(text, str):
        return ''
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text)     # Replace multiple spaces with a single space
    text = text.strip()                  # Remove leading and trailing spaces
    return text

tqdm.pandas(desc="Cleaning text")
df['cleaned_text'] = df['extracted_text'].progress_apply(preprocess_text)

# Save the cleaned data to a new CSV file
output_csv = '/content/drive/MyDrive/Amazon/resource/dataset/sample_trainCleaned.csv'
df.to_csv(output_csv, index=False)

print(f"Data cleaning complete. Updated CSV saved to {output_csv}.")

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

# Load the combined CSV file
df_combined = pd.read_csv('/content/drive/MyDrive/Amazon/resource/dataset/sample_trainCleaned.csv')

df_combined['cleaned_text'] = df_combined['cleaned_text'].fillna('')

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform the cleaned text data with progress bar
tqdm.pandas(desc="Fitting TF-IDF Vectorizer")
tfidf_matrix = tfidf_vectorizer.fit_transform(tqdm(df_combined['cleaned_text'], desc="Transforming Text"))

# Convert the TF-IDF matrix to a DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Concatenate the original DataFrame with the TF-IDF features
df_final = pd.concat([df_combined, tfidf_df], axis=1)

# Drop the original 'cleaned_text' column
df_final = df_final.drop(columns=['cleaned_text'])

# Save the final DataFrame with TF-IDF features to a new CSV file
output_csv = '/content/drive/MyDrive/Amazon/resource/dataset/sample_trainTfidf.csv'
df_final.to_csv(output_csv, index=False)
print("TF-IDF features extracted and saved to '/content/drive/MyDrive/Amazon/resource/dataset/sample_trainTfidf.csv'.")


Cleaning text: 100%|██████████| 20/20 [00:00<00:00, 3398.40it/s]


Data cleaning complete. Updated CSV saved to /content/drive/MyDrive/Amazon/resource/dataset/sample_trainCleaned.csv.


Transforming Text: 100%|██████████| 20/20 [00:00<00:00, 4100.61it/s]


TF-IDF features extracted and saved to '/content/drive/MyDrive/Amazon/resource/dataset/sample_trainTfidf.csv'.


In [11]:
import pandas as pd
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
import numpy as np
from tqdm import tqdm

# Load the combined CSV file
df_combined = pd.read_csv('/content/drive/MyDrive/Amazon/resource/dataset/sample_trainCleaned.csv')

df_combined['cleaned_text'] = df_combined['cleaned_text'].fillna('')
# Prepare text data for Word2Vec
def preprocess_text(text):
    return simple_preprocess(text)

# Preprocess the cleaned text data with a progress bar
tqdm.pandas(desc="Preprocessing text")
texts = df_combined['cleaned_text'].progress_apply(preprocess_text)

# Train a Word2Vec model (or load a pre-trained model)
word2vec_model = Word2Vec(sentences=texts, vector_size=100, window=5, min_count=1, sg=0)

def get_avg_word_vector(text):
    vectors = [word2vec_model.wv[word] for word in text if word in word2vec_model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(word2vec_model.vector_size)

# Apply the function to get the average word vectors with a progress bar
tqdm.pandas(desc="Generating Word2Vec features")
word_vectors = np.array([get_avg_word_vector(text) for text in tqdm(texts.tolist(), desc="Extracting vectors")])

# Convert word vectors to a DataFrame
word_vectors_df = pd.DataFrame(word_vectors, columns=[f'word2vec_{i}' for i in range(word_vectors.shape[1])])

# Concatenate the original DataFrame with the Word2Vec features
df_final = pd.concat([df_combined, word_vectors_df], axis=1)

# Drop the original 'cleaned_text' column
df_final = df_final.drop(columns=['cleaned_text'])

# Save the final DataFrame with Word2Vec features to a new CSV file
output_csv = '/content/drive/MyDrive/Amazon/resource/dataset/sample_trainWord2vec.csv'
df_final.to_csv(output_csv, index=False)

print(f"Word2Vec features extracted and saved to '{output_csv}'.")


Preprocessing text: 100%|██████████| 20/20 [00:00<00:00, 4734.78it/s]
Extracting vectors: 100%|██████████| 20/20 [00:00<00:00, 2591.48it/s]


Word2Vec features extracted and saved to '/content/drive/MyDrive/Amazon/resource/dataset/sample_trainWord2vec.csv'.


In [12]:
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import preprocess_input
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

# Load ResNet50 model pre-trained on ImageNet
resnet_model = ResNet50(weights='imagenet', include_top=False, pooling='avg')

# Function to preprocess and extract features from an image
def extract_resnet_features(image_path):
    try:
        img = image.load_img(image_path, target_size=(224, 224))
        img_array = image.img_to_array(img)
        img_array = np.expand_dims(img_array, axis=0)
        img_array = preprocess_input(img_array)
        features = resnet_model.predict(img_array)
        return features.flatten()
    except Exception as e:
        print(f"Error processing image {image_path}: {e}")
        return np.zeros(resnet_model.output_shape[1])

# Load the image paths from CSV
df = pd.read_csv('/content/drive/MyDrive/Amazon/resource/dataset/sample_trainCleaned.csv')
image_folder = '/content/drive/MyDrive/Amazon/resource/dataset/images/'

# Extract features for each image with a progress bar
features_list = []
for idx, row in tqdm(df.iterrows(), total=len(df), desc="Extracting ResNet50 Features"):
    x=row['image_link']
    prefix_to_remove = "https://m.media-amazon.com/images/I/"
    trimmed_x = x.replace(prefix_to_remove, "")
    image_path = os.path.join(image_folder, trimmed_x)
    # image_path = os.path.join(image_folder, row['image_path'])
    features = extract_resnet_features(image_path)
    features_list.append(features)

# Convert features to DataFrame
resnet_features_df = pd.DataFrame(features_list, columns=[f'resnet_{i}' for i in range(features_list[0].shape[0])])

# Combine with the original DataFrame
df_final = pd.concat([df, resnet_features_df], axis=1)

# Save the DataFrame with ResNet50 features
df_final.to_csv('/content/drive/MyDrive/Amazon/resource/dataset/sample_trainResnet.csv', index=False)

print("ResNet50 features extracted and saved to '/content/drive/MyDrive/Amazon/resource/dataset/sample_trainResnet.csv'.")

# print(df_cleaned.columns)
# print(df_tfidf.columns)
# print(df_word2vec.columns)
# print(df_resnet.columns)

import pandas as pd

# Load all feature files and the cleaned text file
df_tfidf = pd.read_csv('/content/drive/MyDrive/Amazon/resource/dataset/sample_trainTfidf.csv')
df_word2vec = pd.read_csv('/content/drive/MyDrive/Amazon/resource/dataset/sample_trainWord2vec.csv')
df_resnet = pd.read_csv('/content/drive/MyDrive/Amazon/resource/dataset/sample_trainResnet.csv')
df_cleaned = pd.read_csv('/content/drive/MyDrive/Amazon/resource/dataset/sample_trainCleaned.csv')

# Inspect columns to identify overlaps
print("Columns in df_tfidf:", df_tfidf.columns)
print("Columns in df_word2vec:", df_word2vec.columns)
print("Columns in df_resnet:", df_resnet.columns)
print("Columns in df_cleaned:", df_cleaned.columns)

# Rename columns in each DataFrame to avoid overlap
def rename_columns(df, prefix):
    return df.rename(columns=lambda x: f'{prefix}_{x}' if x not in ['image_link', 'group_id', 'entity_name', 'entity_value', 'image_path', 'extracted_text', 'cleaned_text'] else x)

df_tfidf = rename_columns(df_tfidf, 'tfidf')
df_word2vec = rename_columns(df_word2vec, 'word2vec')
df_resnet = rename_columns(df_resnet, 'resnet')

# Reset index and set image_path as index for merging
df_cleaned.set_index('image_link', inplace=True)
df_tfidf.set_index('image_link', inplace=True)
df_word2vec.set_index('image_link', inplace=True)
df_resnet.set_index('image_link', inplace=True)

# Perform the join operation while managing overlap
df_combined = df_cleaned.join(df_tfidf, how='inner', lsuffix='_cleaned').join(df_word2vec, how='inner', rsuffix='_word2vec').join(df_resnet, how='inner', rsuffix='_resnet')

# Reset index to get 'image_path' back as a column
df_combined.reset_index(inplace=True)

# Save the combined DataFrame
df_combined.to_csv('/content/drive/MyDrive/Amazon/resource/dataset/sample_trainCombined.csv', index=False)

print("All features combined and saved to 'sample_trainCombined.csv'.")

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 0us/step


Extracting ResNet50 Features:   0%|          | 0/20 [00:00<?, ?it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step


Extracting ResNet50 Features:   5%|▌         | 1/20 [00:02<00:51,  2.72s/it]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 199ms/step


Extracting ResNet50 Features:  10%|█         | 2/20 [00:03<00:24,  1.36s/it]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 194ms/step


Extracting ResNet50 Features:  15%|█▌        | 3/20 [00:03<00:14,  1.16it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 201ms/step


Extracting ResNet50 Features:  20%|██        | 4/20 [00:03<00:10,  1.58it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 199ms/step


Extracting ResNet50 Features:  25%|██▌       | 5/20 [00:04<00:08,  1.82it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 191ms/step


Extracting ResNet50 Features:  30%|███       | 6/20 [00:04<00:06,  2.15it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 200ms/step


Extracting ResNet50 Features:  35%|███▌      | 7/20 [00:04<00:05,  2.35it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 207ms/step


Extracting ResNet50 Features:  40%|████      | 8/20 [00:05<00:04,  2.51it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 200ms/step


Extracting ResNet50 Features:  45%|████▌     | 9/20 [00:05<00:04,  2.64it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 207ms/step


Extracting ResNet50 Features:  50%|█████     | 10/20 [00:05<00:03,  2.84it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 195ms/step


Extracting ResNet50 Features:  55%|█████▌    | 11/20 [00:06<00:03,  2.65it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 192ms/step


Extracting ResNet50 Features:  60%|██████    | 12/20 [00:06<00:02,  2.90it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 205ms/step


Extracting ResNet50 Features:  65%|██████▌   | 13/20 [00:06<00:02,  2.92it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 193ms/step


Extracting ResNet50 Features:  70%|███████   | 14/20 [00:07<00:02,  2.98it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 193ms/step


Extracting ResNet50 Features:  75%|███████▌  | 15/20 [00:07<00:01,  3.03it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 205ms/step


Extracting ResNet50 Features:  80%|████████  | 16/20 [00:07<00:01,  3.01it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 194ms/step


Extracting ResNet50 Features:  85%|████████▌ | 17/20 [00:07<00:00,  3.15it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 197ms/step


Extracting ResNet50 Features:  90%|█████████ | 18/20 [00:08<00:00,  3.25it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 195ms/step


Extracting ResNet50 Features:  95%|█████████▌| 19/20 [00:08<00:00,  3.37it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 204ms/step


Extracting ResNet50 Features: 100%|██████████| 20/20 [00:08<00:00,  2.23it/s]


ResNet50 features extracted and saved to '/content/drive/MyDrive/Amazon/resource/dataset/sample_trainResnet.csv'.
Columns in df_tfidf: Index(['index', 'image_link', 'entity_name', 'entity_value', 'group_id',
       'image_name', 'index_2', 'extracted_text', '00', '001',
       ...
       'xe', 'xi', 'xs', 'ye', 'you', 'youlike', 'your', 'yrda', 'yu', 'zu'],
      dtype='object', length=1118)
Columns in df_word2vec: Index(['index', 'image_link', 'entity_name', 'entity_value', 'group_id',
       'image_name', 'index_2', 'extracted_text', 'word2vec_0', 'word2vec_1',
       ...
       'word2vec_90', 'word2vec_91', 'word2vec_92', 'word2vec_93',
       'word2vec_94', 'word2vec_95', 'word2vec_96', 'word2vec_97',
       'word2vec_98', 'word2vec_99'],
      dtype='object', length=108)
Columns in df_resnet: Index(['index', 'image_link', 'entity_name', 'entity_value', 'group_id',
       'image_name', 'index_2', 'extracted_text', 'cleaned_text', 'resnet_0',
       ...
       'resnet_2038', 'resnet

In [21]:
# temporary
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import re
import joblib

# Define allowed units
allowed_units = {
    'centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard',
    'gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton',
    'kilovolt', 'millivolt', 'volt',
    'kilowatt', 'watt',
    'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre', 'fluid ounce',
    'gallon', 'imperial gallon', 'litre', 'microlitre', 'millilitre', 'pint', 'quart'
}

# Load the combined features dataset
df_combined = pd.read_csv('/content/drive/MyDrive/Amazon/resource/dataset/sample_trainCombined.csv', usecols=['index', 'entity_value_cleaned'])

# Focus on the first 20000 rows
df_combined = df_combined.head(20000)
df_combined = df_combined.rename(columns={'entity_value_cleaned': 'entity_value'})

# Function to extract numeric value and unit
def extract_value_and_unit(entity_value):
    match = re.match(r"([0-9.]+)\s*([a-zA-Z\s]+)", entity_value)
    if match:
        value = float(match.group(1))
        unit = match.group(2).strip()
        return value, unit
    return None, ''

# Apply the extraction function
df_combined[['numeric_value', 'unit']] = df_combined['entity_value'].apply(extract_value_and_unit).apply(pd.Series)

# Replace NaN with empty strings in 'unit'
df_combined['unit'] = df_combined['unit'].fillna('')

# Filter rows where the numeric value is not NaN and unit is in allowed units
# Check the initial and final shape of df_combined
print(f"Initial rows: {df_combined.shape[0]}")
df_combined = df_combined[df_combined['numeric_value'].notna()]
df_combined = df_combined[df_combined['unit'].isin(allowed_units)]
print(f"After filtering rows: {df_combined.shape[0]}")


# Drop the original 'entity_value' column
df_combined = df_combined.drop(columns=['entity_value'])

# Replace NaN values with empty strings in the feature columns
X = df_combined.drop(columns=['numeric_value', 'unit'])
X = X.fillna('')

# Prepare features and target
y_value = df_combined['numeric_value']
y_unit = df_combined['unit']

# Ensure all features are numeric
X = X.apply(pd.to_numeric, errors='ignore')
X = X.select_dtypes(include=['number'])

# Encode the unit labels
unit_encoder = LabelEncoder()
y_unit_encoded = unit_encoder.fit_transform(y_unit.fillna(''))

# Split data into training and test sets
X_train, X_test, y_value_train, y_value_test, y_unit_train, y_unit_test = train_test_split(
    X, y_value, y_unit_encoded, test_size=0.2, random_state=42
)
# Print the shapes of training and testing sets
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_value_train shape: {y_value_train.shape}")
print(f"y_value_test shape: {y_value_test.shape}")
print(f"y_unit_train shape: {y_unit_train.shape}")
print(f"y_unit_test shape: {y_unit_test.shape}")
# Train model for numeric value prediction
tqdm.write("Training numeric value model...")
value_model = RandomForestRegressor(n_estimators=10, random_state=42)
value_model.fit(X_train, y_value_train)

# Predict and evaluate numeric values
y_value_pred = value_model.predict(X_test)
mse = mean_squared_error(y_value_test, y_value_pred)
tqdm.write(f"Mean Squared Error for numeric value prediction: {mse}")

# Train model for unit prediction
tqdm.write("Training unit prediction model...")
unit_model = RandomForestRegressor(n_estimators=10, random_state=42)
unit_model.fit(X_train, y_unit_train)

# Save the test data to a CSV file
test_data = X_test.copy()
test_data['numeric_value'] = y_value_test
test_data['unit'] = y_unit_test

# Ensure index is preserved in test_data
test_data = test_data.reset_index(drop=False)
test_data.rename(columns={'index': 'index'}, inplace=True)

test_data.to_csv('test.csv', index=False)

# Load the test data
test_data = pd.read_csv('test.csv')
test_X = test_data.drop(columns=['numeric_value'])

# Ensure the feature columns in the test data match those used in training
test_X = test_X[X.columns]
# Ensure test_X columns match X columns
print(f"Test_X columns: {test_X.columns}")
print(f"Training_X columns: {X.columns}")
# Ensure test features are numeric
test_X = test_X.apply(pd.to_numeric, errors='ignore')
test_X = test_X.select_dtypes(include=['number'])

# Predict on the test data
test_value_predictions = value_model.predict(test_X)
test_unit_predictions = unit_model.predict(test_X)

# Convert unit predictions back to original labels
test_unit_predictions = unit_encoder.inverse_transform(test_unit_predictions.astype(int))

# Format value predictions based on their type
def format_value(value):
    if value % 1 == 0:  # Check if the value is an integer
        return int(value)
    else:  # Format to 2 decimal places if it's a floating-point number
        return round(value, 2)

test_value_predictions = [format_value(value) for value in test_value_predictions]

# Format predictions to include only allowed units
test_data['entity_value'] = [f"{value} {unit}" for value, unit in zip(test_value_predictions, test_unit_predictions)]

# Handle empty predictions
test_data['entity_value'] = test_data['entity_value'].replace('', '*')

# Save predictions to a CSV file, including the index
test_data[['index', 'entity_value']].to_csv('/content/drive/MyDrive/Amazon/resource/dataset/test_output.csv', index=False)
df = pd.read_csv('/content/drive/MyDrive/Amazon/resource/dataset/test_output.csv')

# Rename the column
df = df.rename(columns={'entity_value': 'prediction'})

# Save the updated CSV
df.to_csv('/content/drive/MyDrive/Amazon/resource/dataset/test_output.csv', index=False)

tqdm.write("Predictions saved to 'test_output.csv'.")

# # Save the models
joblib.dump(value_model, '/content/drive/MyDrive/Amazon/resource/dataset/value_model.pkl')
joblib.dump(unit_model, '/content/drive/MyDrive/Amazon/resource/dataset/unit_model.pkl')


Initial rows: 20
After filtering rows: 19
X_train shape: (15, 1)
X_test shape: (4, 1)
y_value_train shape: (15,)
y_value_test shape: (4,)
y_unit_train shape: (15,)
y_unit_test shape: (4,)
Training numeric value model...
Mean Squared Error for numeric value prediction: 81221.4060483775
Training unit prediction model...
Test_X columns: Index(['index'], dtype='object')
Training_X columns: Index(['index'], dtype='object')
Predictions saved to 'test_output.csv'.
