In [1]:
from google.colab import drive
import os
import zipfile

In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
zip_file_path = '/content/drive/My Drive/ML_Hackathon/68e8d1d70b66d_student_resource.zip'

In [4]:
unzip_path = '/content/'

In [5]:
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(unzip_path)

print("Files have been unzipped successfully.")

Files have been unzipped successfully.


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
import numpy as np

# Base path to your data, based on the unzipped folder structure
base_path = '/content/68e8d1d70b66d_student_resource/student_resource'

In [None]:
train_df = pd.read_csv("/content/student_resource/dataset/train.csv")
test_df = pd.read_csv("/content/student_resource/dataset/test.csv")

In [None]:
train_df.info()

In [None]:
train_df.describe()

In [None]:
train_df.head()

In [None]:
train_df["catalog_content"][0]

In [None]:
test_df.info()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
sns.set_style("whitegrid")

# Create a figure with two subplots
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Plot a histogram of the price
sns.histplot(train_df['price'], bins=50, kde=True, ax=axes[0])
axes[0].set_title('Distribution of Product Prices (Histogram)')
axes[0].set_xlabel('Price')
axes[0].set_ylabel('Frequency')
# Plot a box plot of the price to spot outliers
sns.boxplot(x=train_df['price'], ax=axes[1])
axes[1].set_title('Distribution of Product Prices (Box Plot)')
axes[1].set_xlabel('Price')

plt.tight_layout()
plt.show()


In [None]:
train_df['word_count'] = train_df['catalog_content'].str.split().str.len()

In [None]:
plt.figure(figsize=(10, 6))
sns.histplot(train_df['word_count'], bins=50, kde=True)
plt.title('Distribution of Word Count in catalog_content')
plt.xlabel('Word Count')
plt.ylabel('Frequency')
plt.show()

In [None]:
train_df["word_count"].describe()

In [None]:
from collections import Counter
all_words = ' '.join(train_df['catalog_content'].fillna('')).split()
most_common_words = Counter(all_words).most_common(20)

print("\n--- Most Common Words in catalog_content ---")
print(most_common_words)

In [None]:
import numpy as np

# Apply the log transformation to the 'price' column in the training data
train_df['log_price'] = np.log1p(train_df['price'])
train_df.head()

In [None]:
train_df["log_price"].describe()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

fig, axes = plt.subplots(1, 2, figsize=(15, 6))

sns.histplot(train_df['log_price'], bins=50, kde=True, ax=axes[0])
axes[0].set_title('Distribution of Log-Transformed Prices (Histogram)')
axes[0].set_xlabel('Log(Price)')
axes[0].set_ylabel('Frequency')

sns.boxplot(x=train_df['log_price'], ax=axes[1])
axes[1].set_title('Distribution of Log-Transformed Prices (Box Plot)')
axes[1].set_xlabel('Log(Price)')

plt.tight_layout()
plt.show()

In [None]:
import string
import re
import random

# Get 5 random samples to inspect
sample_indices = random.sample(range(len(train_df)), 5)
print("--- Checking for Punctuation and Special Characters ---")
for i in sample_indices:
    text = train_df['catalog_content'][i]
    # Use a regular expression to find all punctuation characters
    punctuation_found = re.findall(f'[{re.escape(string.punctuation)}]', text)
    print(f"Sample {i}: Punctuation found: {set(punctuation_found)}")

In [None]:
# Check for "Item Name:", "Value:", "Unit:" patterns
print("\n--- Checking for Data Labels ---")
print(f"Contains 'Item Name:' pattern: {train_df['catalog_content'].str.contains('Item Name:', na=False).sum()}")
print(f"Contains 'Value:' pattern: {train_df['catalog_content'].str.contains('Value:', na=False).sum()}")
print(f"Contains 'Unit:' pattern: {train_df['catalog_content'].str.contains('Unit:', na=False).sum()}")

In [None]:
# Check for HTML tags
print("\n--- Checking for HTML Tags ---")
html_tags_present = train_df['catalog_content'].str.contains('<.*?>', na=False).sum()
print(f"HTML tags are present: {html_tags_present}")

# Check for URLs
print("\n--- Checking for URLs ---")
urls_present = train_df['catalog_content'].str.contains('http[s]?://\S+|www\.\S+', na=False).sum()
print(f"URLs are present: {urls_present}")

In [None]:
import re
import string

def clean_text_final(text):
    text = str(text).lower()

    # 1. Handle common punctuation and symbols first
    text = re.sub(r'\'', '', text)  # Remove apostrophes

    # Remove unwanted patterns
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'http[s]?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)

    # Remove specific labels or prefixes
    text = re.sub(r'(item name:|value:|unit:|bullet point:\s*\d+|bullet point)', '', text)

    # Replace newlines and hyphens with a space
    text = re.sub(r'[\n\-]', ' ', text)

    # Remove all unwanted punctuation except '.' in numbers.
    # This regex removes everything that is NOT a lowercase letter, a number, a space, or a decimal point.
    text = re.sub(r'[^a-z0-9\s\.]', ' ', text)

    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [None]:
import pandas as pd
import random

# Assume train_df and test_df are loaded and clean_text_final_fixed is defined
train_df['clean_catalog_content'] = train_df['catalog_content'].apply(clean_text_final)
test_df['clean_catalog_content'] = test_df['catalog_content'].apply(clean_text_final)

# Find samples with numbers to check if the decimal point is handled
samples_with_decimals = train_df[train_df['catalog_content'].str.contains(r'\d+\.\d+', na=False, regex=True)].index.tolist()

if samples_with_decimals:
    random_index = random.choice(samples_with_decimals)
    original_text = train_df.loc[random_index, 'catalog_content']
    cleaned_text = train_df.loc[random_index, 'clean_catalog_content']

    print(f"--- Checking a random sample with a decimal (Index: {random_index}) ---")
    print("Original Text:", original_text)
    print("Cleaned Text:", cleaned_text)
else:
    print("No samples with decimal numbers found for this check.")

In [None]:
train_df.columns

In [None]:
test_df.columns

In [None]:
import string
import random

# Choose a random sample from the dataset
random_index = random.randint(0, len(train_df) - 1)
sample_text = train_df['catalog_content'][random_index]
cleaned_text = clean_text_final(sample_text)

print(f"--- Checking a random sample (Index: {random_index}) ---")
print("Original Text:", sample_text)
print("Cleaned Text:", cleaned_text)

# Check if any punctuation remains in the cleaned text
remaining_punctuation = set(cleaned_text).intersection(set(string.punctuation))
print(f"\nRemaining punctuation in cleaned text: {remaining_punctuation}")

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
import numpy as np
import re
import string

In [None]:
# Prepare the data for the model
X_train_text = train_df['clean_catalog_content'].fillna('')
X_test_text = test_df['clean_catalog_content'].fillna('')
y_train_log = train_df['log_price']

In [None]:
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=15000,
    stop_words='english',
)

In [None]:
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_text)
X_test_tfidf = tfidf_vectorizer.transform(X_test_text)

In [None]:
# Train the Linear Regression model
model = LinearRegression()
model.fit(X_train_tfidf, y_train_log)

In [None]:
# Make predictions and inverse transform them
predictions_log = model.predict(X_test_tfidf)
final_predictions = np.expm1(predictions_log)
final_predictions[final_predictions < 0] = 0.01

In [None]:
submission_df = pd.DataFrame({
    'sample_id': test_df['sample_id'],
    'price': final_predictions
})
submission_df.to_csv('test_out_improved_text.csv', index=False)
print("New submission file created with improved text features!")

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Assuming train_df has 'clean_catalog_content' and 'log_price' columns
X_train_full = train_df['clean_catalog_content']
y_train_log_full = train_df['log_price']

# Split the data (80% for training, 20% for validation)
X_train_subset, X_val, y_train_log_subset, y_val_log = train_test_split(
    X_train_full,
    y_train_log_full,
    test_size=0.2,
    random_state=42
)

print(f"New Training set size: {len(X_train_subset)}")
print(f"Validation set size: {len(X_val)}")

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression

# Create TF-IDF features based on the new subsets
tfidf_vectorizer_val = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=15000,
    stop_words='english',
)

X_train_subset_tfidf = tfidf_vectorizer_val.fit_transform(X_train_subset)
X_val_tfidf = tfidf_vectorizer_val.transform(X_val)

# Train the model
model_val = LinearRegression()
model_val.fit(X_train_subset_tfidf, y_train_log_subset)

# Make predictions on the validation set and inverse transform
predictions_val_log = model_val.predict(X_val_tfidf)
predictions_val = np.expm1(predictions_val_log)
predictions_val[predictions_val < 0] = 0.01

# Inverse transform the actual prices for comparison
y_val = np.expm1(y_val_log)

In [None]:
def smape_score(y_true, y_pred):
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    smape = np.mean(np.abs(y_pred - y_true) / denominator) * 100
    return smape

smape = smape_score(y_val, predictions_val)
print(f"Your model's SMAPE score on the validation set is: {smape:.2f}%")

In [None]:
# Save the preprocessed training data
train_df.to_csv('train_preprocessed.csv', index=False)

# Save the preprocessed test data
test_df.to_csv('test_preprocessed.csv', index=False)

print("Preprocessed DataFrames saved as CSVs!")

In [6]:
import pandas as pd
import numpy as np

# Load the preprocessed dataframes
train_df = pd.read_csv('train_preprocessed.csv')
test_df = pd.read_csv('test_preprocessed.csv')

# Verify the columns are correct
print("Training DataFrame columns:", train_df.columns)
print("Test DataFrame columns:", test_df.columns)

# Check the first few rows to ensure data is loaded correctly
print(train_df.head())

Training DataFrame columns: Index(['sample_id', 'catalog_content', 'image_link', 'price', 'word_count',
       'log_price', 'clean_catalog_content'],
      dtype='object')
Test DataFrame columns: Index(['sample_id', 'catalog_content', 'image_link', 'clean_catalog_content'], dtype='object')
   sample_id                                    catalog_content  \
0      33127  Item Name: La Victoria Green Taco Sauce Mild, ...   
1     198967  Item Name: Salerno Cookies, The Original Butte...   
2     261251  Item Name: Bear Creek Hearty Soup Bowl, Creamy...   
3      55858  Item Name: Judee’s Blue Cheese Powder 11.25 oz...   
4     292686  Item Name: kedem Sherry Cooking Wine, 12.7 Oun...   

                                          image_link  price  word_count  \
0  https://m.media-amazon.com/images/I/51mo8htwTH...   4.89          18   
1  https://m.media-amazon.com/images/I/71YtriIHAA...  13.12          80   
2  https://m.media-amazon.com/images/I/51+PFEe-w-...   1.97          59   
3  htt

In [7]:
train_df.shape

(75000, 7)

In [8]:
test_df.shape

(75000, 4)

In [9]:
import sys
import os
base_resources_path = 'student_resource'
src_path = os.path.join('/content', base_resources_path, 'src')
sys.path.append(src_path)
#/content/student_resource/src/utils.py
from utils import download_images

In [10]:
images_dir = os.path.join('/content', 'product_images')
os.makedirs(images_dir, exist_ok=True)

In [12]:
print("Downloading training images...")
download_images(train_df['image_link'], images_dir)

# Corrected code: Pass the 'image_link' column from the DataFrame
print("Downloading test images...")
download_images(test_df['image_link'], images_dir)

print("Image download complete.")

Downloading training images...


 51%|█████▏    | 38452/75000 [00:03<00:03, 11248.37it/s]

HTTP Error 404: Not Found


100%|██████████| 75000/75000 [03:16<00:00, 382.19it/s]


Downloading test images...


 56%|█████▌    | 41937/75000 [03:25<01:55, 286.64it/s]

HTTP Error 404: Not Found


100%|██████████| 75000/75000 [06:06<00:00, 204.58it/s]


Image download complete.


In [13]:
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
import os
import glob
from tqdm import tqdm

# Set the device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load a pre-trained ResNet50 model
model = models.resnet50(pretrained=True)

# Remove the final classification layer
# The model will now output features, not class probabilities
model = torch.nn.Sequential(*(list(model.children())[:-1]))

# Move the model to the GPU
model.to(device)

# Set the model to evaluation mode
model.eval()

print("ResNet50 model loaded and configured for feature extraction.")

Using device: cuda




Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth


100%|██████████| 97.8M/97.8M [00:00<00:00, 176MB/s]


ResNet50 model loaded and configured for feature extraction.


In [14]:
# Updated code for feature extraction with batches and a custom collate function
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from PIL import Image
import os
import glob
from tqdm import tqdm
import numpy as np

# A simple custom dataset class to handle image loading
class ImageDataset(Dataset):
    def __init__(self, image_paths, transform=None):
        self.image_paths = image_paths
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        try:
            img = Image.open(img_path).convert('RGB')
            if self.transform:
                img = self.transform(img)
            return img, os.path.basename(img_path)
        except Exception as e:
            print(f"Warning: Skipping image {img_path}. Error: {e}")
            return None, None

# Define the custom collate function
def custom_collate_fn(batch):
    # Filter out None values
    batch = list(filter(lambda x: x[0] is not None, batch))
    if not batch:  # Check if the batch is empty after filtering
        return None, None

    images, image_ids = zip(*batch)
    images = torch.stack(images, 0)
    return images, image_ids

# Define the image transformations
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Get all image paths
image_dir = '/content/product_images'
all_image_paths = glob.glob(os.path.join(image_dir, '*.jpg'))

# Create the dataset and dataloader
batch_size = 32
image_dataset = ImageDataset(all_image_paths, transform=preprocess)
# Pass the custom collate function to the DataLoader
image_dataloader = DataLoader(
    image_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=2,
    collate_fn=custom_collate_fn
)

# Dictionary to store features
all_image_features = {}

# Extract features in batches
print("Extracting features from images in batches...")
for images, image_ids in tqdm(image_dataloader, desc="Extracting features"):
    if images is None:  # Check for empty batches
        continue
    images = images.to(device)
    with torch.no_grad():
        features = model(images)

    for i, img_id in enumerate(image_ids):
        feature_vector = features[i].squeeze().cpu().numpy()
        all_image_features[img_id] = feature_vector

print("Feature extraction complete!")

Extracting features from images in batches...


Extracting features:  14%|█▎        | 596/4394 [12:24<1:18:11,  1.24s/it]



Extracting features:  22%|██▏       | 959/4394 [19:42<55:12,  1.04it/s]  



Extracting features:  42%|████▏     | 1835/4394 [37:49<38:45,  1.10it/s]



Extracting features:  42%|████▏     | 1851/4394 [38:11<50:56,  1.20s/it]  



Extracting features:  48%|████▊     | 2097/4394 [43:20<47:38,  1.24s/it]



Extracting features:  48%|████▊     | 2110/4394 [43:35<39:26,  1.04s/it]



Extracting features:  49%|████▉     | 2166/4394 [44:45<45:17,  1.22s/it]  



Extracting features:  50%|████▉     | 2184/4394 [45:06<32:59,  1.12it/s]



Extracting features:  50%|█████     | 2198/4394 [45:24<40:57,  1.12s/it]



Extracting features:  53%|█████▎    | 2314/4394 [47:50<35:48,  1.03s/it]



Extracting features:  56%|█████▌    | 2446/4394 [50:34<44:32,  1.37s/it]



Extracting features:  58%|█████▊    | 2560/4394 [53:00<42:07,  1.38s/it]



Extracting features:  59%|█████▉    | 2604/4394 [53:54<27:51,  1.07it/s]



Extracting features:  79%|███████▉  | 3473/4394 [1:12:11<20:31,  1.34s/it]



Extracting features:  80%|███████▉  | 3496/4394 [1:12:39<20:14,  1.35s/it]



Extracting features:  81%|████████▏ | 3575/4394 [1:14:21<18:28,  1.35s/it]



Extracting features:  84%|████████▎ | 3677/4394 [1:16:26<11:39,  1.02it/s]



Extracting features:  86%|████████▌ | 3757/4394 [1:18:07<12:38,  1.19s/it]



Extracting features:  90%|█████████ | 3958/4394 [1:22:22<10:03,  1.38s/it]



Extracting features:  91%|█████████▏| 4017/4394 [1:23:36<07:21,  1.17s/it]



Extracting features:  93%|█████████▎| 4103/4394 [1:25:22<04:24,  1.10it/s]



Extracting features: 100%|██████████| 4394/4394 [1:31:25<00:00,  1.25s/it]

Feature extraction complete!





In [16]:
import numpy as np

# Assuming your dictionary is named all_image_features
# Save the dictionary to a file in your Colab environment
np.save('all_image_features.npy', all_image_features)

print("Dictionary saved successfully to all_image_features.npy!")

Dictionary saved successfully to all_image_features.npy!


In [17]:
import numpy as np

# Load the dictionary from the saved file
all_image_features = np.load('all_image_features.npy', allow_pickle=True).item()

print("Dictionary loaded successfully!")

Dictionary loaded successfully!


In [18]:
import pandas as pd
import numpy as np
import os

# Load your preprocessed dataframes
train_df = pd.read_csv('train_preprocessed.csv')
test_df = pd.read_csv('test_preprocessed.csv')

# Load the image features dictionary
# Make sure the file path is correct (e.g., in your Google Drive)
all_image_features = np.load('all_image_features.npy', allow_pickle=True).item()

In [1]:
# Convert the image features dictionary to a DataFrame for easier merging
image_features_df = pd.DataFrame.from_dict(all_image_features, orient='index')
image_features_df.index.name = 'filename'

# Extract the filename (unique ID) from the original image_link
train_df['filename'] = train_df['image_link'].apply(lambda x: x.split('/')[-1])
test_df['filename'] = test_df['image_link'].apply(lambda x: x.split('/')[-1])

# Merge the image features into the original dataframes
train_df = train_df.merge(image_features_df, on='filename', how='left')
test_df = test_df.merge(image_features_df, on='filename', how='left')

# The merge might have created NaNs for products without a downloaded image.
# We'll fill these missing values with zeros.
train_df = train_df.fillna(0)
test_df = test_df.fillna(0)

print("Text and image features have been combined.")

NameError: name 'pd' is not defined