In [None]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


# Load the raw training table.
dataset = pd.read_csv("train.csv")

# Apply one-hot encoding on 'entity_name' column (before dropping it).
# remainder='passthrough' keeps every other column in the transformed output,
# so the fitted transformer expects those columns to be present at transform
# time as well.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(handle_unknown='ignore'), ['entity_name'])],
    remainder='passthrough'
)
x_encoded = ct.fit_transform(dataset)

# Convert encoded data to DataFrame (kept for inspection).
x_encoded_df = pd.DataFrame(x_encoded)

# Drop original 'entity_name' column from the dataset
dataset_drop_entity = dataset.drop(columns=['entity_name'])

# Get the one-hot encoded column names
encoded_column_names = ct.transformers_[0][1].get_feature_names_out(['entity_name'])

# The encoder is the first (and only) listed transformer, so its columns are
# the leading columns of x_encoded; the passed-through columns follow.
# x_encoded also carries the passed-through string columns (e.g. image_link),
# so a plain slice yields object-dtype values. Cast the indicators to float so
# the one-hot columns are numeric in memory, not only after a CSV round trip.
onehot_cols = pd.DataFrame(
    x_encoded[:, :len(encoded_column_names)],
    columns=encoded_column_names,
).astype(float)

# Concatenate original columns (excluding 'entity_name') and one-hot encoded columns.
# Both frames share the default positional index, so axis=1 aligns row by row.
dataset_onehot = pd.concat([dataset_drop_entity, onehot_cols], axis=1)

# Move 'entity_value' column to the last position
cols = [col for col in dataset_onehot.columns if col != 'entity_value']
cols.append('entity_value')

# Reorder the DataFrame
dataset_onehot = dataset_onehot[cols]

# Show the first few rows
print(dataset_onehot.head())


                                          image_link  group_id  \
0  https://m.media-amazon.com/images/I/61I9XdN6OF...    748919   
1  https://m.media-amazon.com/images/I/71gSRbyXmo...    916768   
2  https://m.media-amazon.com/images/I/61BZ4zrjZX...    459516   
3  https://m.media-amazon.com/images/I/612mrlqiI4...    459516   
4  https://m.media-amazon.com/images/I/617Tl40LOX...    731432   

  entity_name_depth entity_name_height entity_name_item_volume  \
0               0.0                0.0                     0.0   
1               0.0                0.0                     1.0   
2               0.0                0.0                     0.0   
3               0.0                0.0                     0.0   
4               0.0                0.0                     0.0   

  entity_name_item_weight entity_name_maximum_weight_recommendation  \
0                     1.0                                       0.0   
1                     0.0                                       

In [None]:
# Print the column labels of dataset_onehot to check the final column layout.
print(dataset_onehot.columns)

Index(['image_link', 'group_id', 'entity_name_depth', 'entity_name_height',
       'entity_name_item_volume', 'entity_name_item_weight',
       'entity_name_maximum_weight_recommendation', 'entity_name_voltage',
       'entity_name_wattage', 'entity_name_width', 'entity_value'],
      dtype='object')


In [None]:
# Write the encoded frame to disk without the index column; a later cell
# reloads this file with pd.read_csv.
dataset_onehot.to_csv("rearranged_dataset2.csv", index=False)

In [None]:
import pandas as pd

# Load your dataset
df = pd.read_csv('rearranged_dataset2.csv')

# Split entity_value into numeric and unit.
# The pattern is anchored at both ends so that only strings of the form
# "<number> <unit>" are accepted: the unit must start and end with a character
# that is neither a digit nor whitespace (inner spaces are allowed, e.g. a
# two-word unit). Anything else yields NaN for both parts instead of capturing
# an arbitrary run of non-digit characters as a "unit".
# NOTE(review): assumes well-formed values always look like "<number> <unit>";
# confirm against the raw data before relying on the NaN rows being junk.
extracted = df['entity_value'].str.extract(
    r'^\s*(\d+\.?\d*)\s*([^\d\s](?:\D*[^\d\s])?)\s*$'
)

# Store the magnitude as a real number rather than the matched text.
df['value'] = pd.to_numeric(extracted[0])
df['unit'] = extracted[1]
df = df.drop(columns=['entity_value'])


df.to_csv('updated_file.csv', index=False)

# Show the updated DataFrame
print(df)


                                               image_link  group_id  \
0       https://m.media-amazon.com/images/I/61I9XdN6OF...    748919   
1       https://m.media-amazon.com/images/I/71gSRbyXmo...    916768   
2       https://m.media-amazon.com/images/I/61BZ4zrjZX...    459516   
3       https://m.media-amazon.com/images/I/612mrlqiI4...    459516   
4       https://m.media-amazon.com/images/I/617Tl40LOX...    731432   
...                                                   ...       ...   
263854  https://m.media-amazon.com/images/I/612J1R1xHl...    558806   
263855  https://m.media-amazon.com/images/I/61Blzh2+28...    470067   
263856  https://m.media-amazon.com/images/I/51MsegDL9V...    204245   
263857  https://m.media-amazon.com/images/I/510KhVw4VS...    752266   
263858  https://m.media-amazon.com/images/I/51lzTNLQ-6...    416664   

        entity_name_depth  entity_name_height  entity_name_item_volume  \
0                     0.0                 0.0                      0.0   

In [None]:
# Rich display of df as left by the previous cell (now with 'value' and 'unit' columns).
df


Unnamed: 0,image_link,group_id,entity_name_depth,entity_name_height,entity_name_item_volume,entity_name_item_weight,entity_name_maximum_weight_recommendation,entity_name_voltage,entity_name_wattage,entity_name_width,value,unit
0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,500.0,gram
1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,cup
2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,459516,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.709,gram
3,https://m.media-amazon.com/images/I/612mrlqiI4...,459516,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.709,gram
4,https://m.media-amazon.com/images/I/617Tl40LOX...,731432,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1400,milligram
...,...,...,...,...,...,...,...,...,...,...,...,...
263854,https://m.media-amazon.com/images/I/612J1R1xHl...,558806,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,centimetre
263855,https://m.media-amazon.com/images/I/61Blzh2+28...,470067,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,8.5,inch
263856,https://m.media-amazon.com/images/I/51MsegDL9V...,204245,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,43.2,centimetre
263857,https://m.media-amazon.com/images/I/510KhVw4VS...,752266,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,9.1,centimetre


In [None]:
import pandas as pd

# Load your dataset
df = pd.read_csv('updated_file.csv')

# Rows without a unit have no usable target. pd.factorize codes missing values
# as -1, and a -1 label would later index the mapping from the end and return
# the wrong unit, so drop those rows before labelling.
df = df.dropna(subset=['unit']).reset_index(drop=True)

# Factorize the 'unit' column once: the codes become the numeric labels and the
# uniques are the label -> unit lookup, so both come from the same call.
unit_codes, unit_mapping = pd.factorize(df['unit'])
df['unit_label'] = unit_codes

# Save the modified DataFrame (optional)
df.to_csv('updated_file_with_unit_labels.csv', index=False)

# Save the unit label mapping (position i holds the unit name for label i)
import pickle
with open('unit_mapping.pkl', 'wb') as file:
    pickle.dump(unit_mapping, file)

# Show the updated DataFrame
print(df)


                                               image_link  group_id  \
0       https://m.media-amazon.com/images/I/61I9XdN6OF...    748919   
1       https://m.media-amazon.com/images/I/71gSRbyXmo...    916768   
2       https://m.media-amazon.com/images/I/61BZ4zrjZX...    459516   
3       https://m.media-amazon.com/images/I/612mrlqiI4...    459516   
4       https://m.media-amazon.com/images/I/617Tl40LOX...    731432   
...                                                   ...       ...   
263854  https://m.media-amazon.com/images/I/612J1R1xHl...    558806   
263855  https://m.media-amazon.com/images/I/61Blzh2+28...    470067   
263856  https://m.media-amazon.com/images/I/51MsegDL9V...    204245   
263857  https://m.media-amazon.com/images/I/510KhVw4VS...    752266   
263858  https://m.media-amazon.com/images/I/51lzTNLQ-6...    416664   

        entity_name_depth  entity_name_height  entity_name_item_volume  \
0                     0.0                 0.0                      0.0   

In [None]:
# Rich display of df as left by the previous cell (now with the 'unit_label' column).
df


Unnamed: 0,image_link,group_id,entity_name_depth,entity_name_height,entity_name_item_volume,entity_name_item_weight,entity_name_maximum_weight_recommendation,entity_name_voltage,entity_name_wattage,entity_name_width,value,unit,unit_label
0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,500.000,gram,0
1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.000,cup,1
2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,459516,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.709,gram,0
3,https://m.media-amazon.com/images/I/612mrlqiI4...,459516,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.709,gram,0
4,https://m.media-amazon.com/images/I/617Tl40LOX...,731432,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1400.000,milligram,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
263854,https://m.media-amazon.com/images/I/612J1R1xHl...,558806,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,5.000,centimetre,19
263855,https://m.media-amazon.com/images/I/61Blzh2+28...,470067,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,8.500,inch,31
263856,https://m.media-amazon.com/images/I/51MsegDL9V...,204245,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,43.200,centimetre,19
263857,https://m.media-amazon.com/images/I/510KhVw4VS...,752266,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,9.100,centimetre,19


In [None]:
# Show the column labels of df; the feature/target selection further down
# depends on this layout.
df.columns

Index(['image_link', 'group_id', 'entity_name_depth', 'entity_name_height',
       'entity_name_item_volume', 'entity_name_item_weight',
       'entity_name_maximum_weight_recommendation', 'entity_name_voltage',
       'entity_name_wattage', 'entity_name_width', 'value', 'unit',
       'unit_label'],
      dtype='object')

In [None]:
# Reload the labelled table from disk. Note: this rebinds `dataset`, which the
# first cell used for the raw train.csv frame.
dataset=pd.read_csv('updated_file_with_unit_labels.csv')

In [None]:
# Select features and target by column name instead of by position, so a change
# in the CSV column layout (for example a different number of one-hot columns)
# cannot silently shift the wrong columns into X or y.
# Features: group_id followed by every one-hot 'entity_name_*' indicator, in
# file order. Target: the numeric unit label.
feature_columns = ['group_id'] + [
    column for column in dataset.columns if column.startswith('entity_name_')
]
X = dataset[feature_columns]
y = dataset['unit_label']

In [None]:
# Display the feature frame to check which columns were selected.
X

Unnamed: 0,group_id,entity_name_depth,entity_name_height,entity_name_item_volume,entity_name_item_weight,entity_name_maximum_weight_recommendation,entity_name_voltage,entity_name_wattage,entity_name_width
0,748919,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,916768,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,459516,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,459516,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,731432,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
263854,558806,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
263855,470067,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
263856,204245,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
263857,752266,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Display the target series (numeric unit labels).
y

Unnamed: 0,unit_label
0,0
1,1
2,0
3,0
4,2
...,...
263854,19
263855,31
263856,19
263857,19


In [None]:
# from sklearn.preprocessing import StandardScaler
# import numpy as np

# # Assuming 'X' is your DataFrame and you're standardizing the first column (index 0)
# sc = StandardScaler()

# # Convert the first column to a NumPy array if needed and fit-transform the data
# X.iloc[:, 0] = sc.fit_transform(X.iloc[:, [0]])  # Use double brackets to keep it as a DataFrame for fit_transform

# # Print the standardized column

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree using the entropy split criterion; random_state is fixed so
# repeated fits build the same tree.
classifier = DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
)
# Fit on the training split; as the last expression, the fitted estimator is
# also what the cell displays.
classifier.fit(x_train, y_train)

In [None]:
# Predict unit labels for the held-out rows.
y_pred=classifier.predict(x_test)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Confusion matrix of true vs. predicted unit labels on the held-out split.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Fraction of exactly matching labels; left as the last expression so the cell
# displays it.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[11686     0   764 ...     0     0     0]
 [    0     0     0 ...     0     0     0]
 [  607     0   990 ...     0     0     0]
 ...
 [    5     0     0 ...     0     0     0]
 [    0     0     0 ...     0    29     0]
 [    0     0     0 ...     0     0     0]]


0.6143977867050708

In [None]:
import pickle

# Persist the fitted ColumnTransformer and the trained classifier.
# The classifier is the DecisionTreeClassifier fitted above; the
# 'svm_model.pkl' file name is kept as is because the inference cell loads
# the model from that path.
for artifact_path, artifact in (('encoder.pkl', ct), ('svm_model.pkl', classifier)):
    with open(artifact_path, 'wb') as file:
        pickle.dump(artifact, file)

# Inference: load the saved artifacts and predict a unit

In [None]:
import pandas as pd
import pickle

def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # pickle.load can execute arbitrary code embedded in the file; only load
    # artifacts that this notebook (or another trusted source) produced.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Fitted ColumnTransformer, trained classifier (stored under the
# 'svm_model.pkl' name) and the label -> unit lookup.
ct = _load_pickle('encoder.pkl')
classifier = _load_pickle('svm_model.pkl')
unit_mapping = _load_pickle('unit_mapping.pkl')

# Function to make predictions based on manual input
def predict_unit(group_id_value, entity_name_value):
    """Predict the measurement unit for one (group_id, entity_name) pair.

    Uses the module-level ``ct`` (fitted ColumnTransformer), ``classifier``
    (trained model) and ``unit_mapping`` (label -> unit lookup) objects.

    Parameters
    ----------
    group_id_value : int
        Group identifier, used as a numeric feature.
    entity_name_value : str
        Entity name to one-hot encode (e.g. 'height').

    Returns
    -------
    str or None
        The unit name for the predicted label, or ``None`` when the predicted
        label has no entry in ``unit_mapping`` (negative or past the end).
    """
    # The transformer was fitted with remainder='passthrough', so transform()
    # needs every fit-time column to be present. Take the column list from the
    # fitted transformer when it exposes one instead of guessing; fall back to
    # the previous hard-coded placeholder set otherwise.
    fallback_columns = ['group_id', 'entity_name', 'entity_value', 'image_link', 'product_id']
    expected_columns = list(getattr(ct, 'feature_names_in_', fallback_columns))
    manual_input = {column: [None] for column in expected_columns}
    manual_input['group_id'] = [group_id_value]
    manual_input['entity_name'] = [entity_name_value]
    single_row_df = pd.DataFrame(manual_input)

    # Apply one-hot encoding on the 'entity_name' column of the manual input
    encoded_row = ct.transform(single_row_df)

    # Get the encoded column names (same as training)
    encoded_column_names = ct.transformers_[0][1].get_feature_names_out(['entity_name'])

    # The encoder's columns are the leading columns of the transformed row; the
    # rest are passed-through placeholders. Cast to float so the model receives
    # numeric features rather than object-dtype values.
    encoded_entity_name_df = pd.DataFrame(
        encoded_row[:, :len(encoded_column_names)],
        columns=encoded_column_names,
    ).astype(float)

    # Same feature layout as training: group_id first, then the indicators.
    group_id_df = pd.DataFrame({'group_id': [group_id_value]})
    input_data = pd.concat([group_id_df, encoded_entity_name_df], axis=1)

    # Predict the unit label using the classifier
    predicted_unit_label = int(classifier.predict(input_data)[0])

    # Guard the lookup: a negative label would index the mapping from the end
    # and return an unrelated unit; a too-large label would raise.
    if not 0 <= predicted_unit_label < len(unit_mapping):
        return None

    # Map the predicted numeric label to the original unit name
    return unit_mapping[predicted_unit_label]

# Example usage: runs when the cell/script is executed directly.
if __name__ == "__main__":
    # Replace both values with your own input.
    group_id_input, entity_name_input = 156839, 'height'

    predicted_unit = predict_unit(group_id_input, entity_name_input)
    print(f"Predicted unit is {predicted_unit}")


Predicted unit is centimetre
