In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [3]:

# Read the sample CSV into the working DataFrame
df = pd.read_csv(filepath_or_buffer='dataset/sample_test.csv')

# Quick sanity check: show the first five rows (optional)
print(df.head(n=5))


   index                                         image_link  group_id  \
0      0  https://m.media-amazon.com/images/I/41-NCxNuBx...    658003   
1      1  https://m.media-amazon.com/images/I/41-NCxNuBx...    658003   
2      2  https://m.media-amazon.com/images/I/417NJrPEk+...    939426   
3      3  https://m.media-amazon.com/images/I/417SThj+Sr...    276700   
4      4  https://m.media-amazon.com/images/I/417SThj+Sr...    276700   

                     entity_name  
0                          width  
1                          depth  
2  maximum_weight_recommendation  
3                        voltage  
4                        wattage  


In [4]:

# Preprocess the data
# Builds a purely numeric feature matrix X and an integer-encoded target y for
# entity classification.

# Missing group ids become 0 so the column can be held as plain integers.
df['group_id'] = df['group_id'].fillna(0).astype(int)

# Reduce each URL to its image identifier: the last path component, truncated
# at the first '.'. The vectorised .str accessor propagates missing links as
# NaN instead of raising AttributeError the way a per-row str.split would.
df['image_link'] = (
    df['image_link']
    .str.rsplit('/', n=1).str[-1]
    .str.split('.', n=1).str[0]
)

# scikit-learn estimators need numeric input, so map every distinct image
# identifier to an integer code (missing identifiers get -1).
# NOTE(review): the codes are arbitrary ids with no meaningful ordering -
# acceptable for this simple baseline, but confirm before relying on them.
df['image_code'] = pd.factorize(df['image_link'])[0]

# Encode 'entity_name' target column (classification target)
label_encoder = LabelEncoder()
df['entity_label'] = label_encoder.fit_transform(df['entity_name'])

# Define features (X) and target (y) for entity classification.
# For simplicity only the group id and the encoded image identifier are used.
X = df[['group_id', 'image_code']]
y = df['entity_label']


In [13]:

# Distinct encoded entity labels present in the data
unique_values = pd.unique(df['entity_label'])
print(unique_values)


[6 0 3 4 5 2 1]


In [None]:

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Example model: a 100-tree random forest with a fixed seed
entity_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)

# Fit the classifier on the training portion (entity classification task)
entity_classifier.fit(X_train, y_train)

# Predict encoded entity labels for the held-out rows
y_pred_entity = entity_classifier.predict(X_test)

# Report the fraction of held-out rows that were classified correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_entity)
print(f"Entity Prediction Accuracy: {accuracy * 100:.2f}%")

# Translate the encoded predictions back into their original entity names
y_pred_entity_labels = label_encoder.inverse_transform(y_pred_entity)


In [None]:
# Regression baseline: predict the numeric value of a single entity type
# ('width', as an example). LinearRegression and mean_squared_error come from
# the notebook's import cell.

# Filter the dataset for 'width' predictions
width_df = df[df['entity_name'] == 'width']

# The regression target is assumed to live in a 'data' column (e.g. width in cm).
# Fail early with an actionable message when the loaded CSV does not provide it,
# rather than surfacing a bare KeyError from the column lookup below.
if 'data' not in width_df.columns:
    raise KeyError(
        "Column 'data' (the regression target) is not present in df; "
        f"available columns: {list(width_df.columns)}. "
        "Load a dataset that includes the measured values before running this cell."
    )

# Define X (input features) and y (target value) for regression.
# scikit-learn estimators need numeric input, so the image identifier is mapped
# to an integer code (missing identifiers get -1).
# NOTE(review): the codes are arbitrary ids with no meaningful ordering - a
# linear model will treat them as ordinal; confirm this is acceptable.
X_width = width_df[['group_id']].assign(
    image_code=pd.factorize(width_df['image_link'])[0]
)
y_width = width_df['data']

# Split the data into training and test sets for width prediction
X_train_width, X_test_width, y_train_width, y_test_width = train_test_split(
    X_width, y_width, test_size=0.2, random_state=42
)

# Use a Linear Regression model for simplicity (or any other regression model)
regression_model = LinearRegression()

# Train the regression model
regression_model.fit(X_train_width, y_train_width)

# Make predictions
y_pred_width = regression_model.predict(X_test_width)

# Evaluate the model (regression)
mse = mean_squared_error(y_test_width, y_pred_width)
print(f"Mean Squared Error for Width Prediction: {mse:.2f}")
