In [139]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


dataset = pd.read_csv("train.csv")

# Apply one-hot encoding on 'entity_name' column (before dropping it)
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(handle_unknown='ignore'), ['entity_name'])],
    remainder='passthrough'
)
x_encoded = ct.fit_transform(dataset)

# Convert encoded data to DataFrame
x_encoded_df = pd.DataFrame(x_encoded)

# Drop original 'entity_name' column from the dataset
dataset_drop_entity = dataset.drop(columns=['entity_name'])

# Get the one-hot encoded column names
encoded_column_names = ct.transformers_[0][1].get_feature_names_out(['entity_name'])

# Create DataFrame for one-hot encoded columns
onehot_cols = pd.DataFrame(x_encoded[:, :len(encoded_column_names)], columns=encoded_column_names)

# Concatenate original columns (excluding 'entity_name') and one-hot encoded columns
dataset_onehot = pd.concat([dataset_drop_entity, onehot_cols], axis=1)

# Move 'entity_value' column to the last position
cols = [col for col in dataset_onehot.columns if col != 'entity_value']  # All columns except 'entity_value'
cols.append('entity_value')  # Add 'entity_value' at the end

# Reorder the DataFrame
dataset_onehot = dataset_onehot[cols]

# Show the first few rows
print(dataset_onehot.head())

dataset_onehot.to_csv("rearranged_dataset2.csv", index=False)

import pandas as pd

# Load your dataset
df = pd.read_csv('rearranged_dataset2.csv')

# Split entity_value into numeric and unit
df[['value', 'unit']] = df['entity_value'].str.extract(r'(\d+\.?\d*)\s*(\D+)')
df = df.drop(columns=['entity_value'])


df.to_csv('updated_file.csv', index=False)

# Show the updated DataFrame
print(df)

import pandas as pd

# Load your dataset
df = pd.read_csv('updated_file.csv')

# Factorize the 'unit' column to assign numeric labels
df['unit_label'] = pd.factorize(df['unit'])[0]

# Save the modified DataFrame (optional)
df.to_csv('updated_file_with_unit_labels.csv', index=False)

# Save the unit label mapping
unit_mapping = pd.factorize(df['unit'])[1]

import pickle
with open('unit_mapping.pkl', 'wb') as file:
    pickle.dump(unit_mapping, file)

# Show the updated DataFrame
print(df)

dataset=pd.read_csv('updated_file_with_unit_labels.csv')

X=dataset.iloc[:, 1:10]
y=dataset.iloc[:, -1]

from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=1)

from sklearn.tree import DecisionTreeClassifier
classifier=DecisionTreeClassifier(criterion='entropy',random_state=0)
classifier.fit(x_train,y_train)

y_pred=classifier.predict(x_test)

from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

import pickle

# Save the encoder (ColumnTransformer) after training
with open('encoder.pkl', 'wb') as file:
    pickle.dump(ct, file)

# Save the trained SVM model
with open('svm_model.pkl', 'wb') as file:
    pickle.dump(classifier, file)

import pandas as pd
import pickle

# Load the saved encoder, SVM model, and unit mapping
with open('encoder.pkl', 'rb') as file:
    ct = pickle.load(file)

with open('svm_model.pkl', 'rb') as file:
    classifier = pickle.load(file)

with open('unit_mapping.pkl', 'rb') as file:
    unit_mapping = pickle.load(file)

# Function to make predictions based on manual input
def predict_unit(group_id_value, entity_name_value):
    # Create a DataFrame with the manual input, including all required columns
    manual_input = {
        'group_id': [group_id_value],       # Use group_id here
        'entity_name': [entity_name_value],
        'entity_value': [None],
        'image_link': [None],               # Add placeholder values for missing columns
        'product_id': [None]                # Add placeholder values for missing columns
    }
    single_row_df = pd.DataFrame(manual_input)

    # Apply one-hot encoding on the 'entity_name' column of the manual input
    encoded_row = ct.transform(single_row_df)

    # Get the encoded column names (same as training)
    encoded_column_names = ct.transformers_[0][1].get_feature_names_out(['entity_name'])

    # Create DataFrame for encoded entity_name with proper column names
    encoded_entity_name_df = pd.DataFrame(encoded_row[:, :len(encoded_column_names)], columns=encoded_column_names)

    # Combine 'group_id' and the encoded columns
    group_id_df = pd.DataFrame(single_row_df[['group_id']].values, columns=['group_id'])

    # Prepare the final input row for prediction by combining 'group_id' and encoded columns
    input_data = pd.concat([group_id_df, encoded_entity_name_df], axis=1)

    # Predict the unit label using the classifier
    predicted_unit_label = classifier.predict(input_data)

    # Map the predicted numeric label to the original unit name
    predicted_unit = unit_mapping[predicted_unit_label[0]]

    return predicted_unit

# Example usage
if __name__ == "__main__":
    group_id_input = 281678  # Replace with your input
    entity_name_input = 'item_weight'  # Replace with your input

    predicted_unit = predict_unit(group_id_input, entity_name_input)
    print(f"Predicted unit is {predicted_unit}")


                                          image_link  group_id  \
0  https://m.media-amazon.com/images/I/61I9XdN6OF...    748919   
1  https://m.media-amazon.com/images/I/71gSRbyXmo...    916768   
2  https://m.media-amazon.com/images/I/61BZ4zrjZX...    459516   
3  https://m.media-amazon.com/images/I/612mrlqiI4...    459516   
4  https://m.media-amazon.com/images/I/617Tl40LOX...    731432   

  entity_name_depth entity_name_height entity_name_item_volume  \
0               0.0                0.0                     0.0   
1               0.0                0.0                     1.0   
2               0.0                0.0                     0.0   
3               0.0                0.0                     0.0   
4               0.0                0.0                     0.0   

  entity_name_item_weight entity_name_maximum_weight_recommendation  \
0                     1.0                                       0.0   
1                     0.0                                       

In [146]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import pickle

# Load your dataset
dataset = pd.read_csv("train.csv")

# Split entity_value into numeric and unit (already done in the classification code)
dataset[['value', 'unit']] = dataset['entity_value'].str.extract(r'(\d+\.?\d*)\s*(\D+)')
dataset['value'] = dataset['value'].astype(float)  # Convert the 'value' column to numeric
dataset = dataset.drop(columns=['entity_value'])

# Apply one-hot encoding on the 'entity_name' column
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(handle_unknown='ignore'), ['entity_name'])],
    remainder='passthrough'
)
X_encoded = ct.fit_transform(dataset[['group_id', 'entity_name']])

# Prepare the target variable (numeric values before the unit)
y = dataset['value']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=1)

# Train a regression model (e.g., Linear Regression)
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)
regressor.fit(X_train, y_train)

# Save the trained regression model and ColumnTransformer
with open('regressor_model.pkl', 'wb') as file:
    pickle.dump(regressor, file)

with open('column_transformer_reg.pkl', 'wb') as file:
    pickle.dump(ct, file)

# Evaluate the model
y_pred = regressor.predict(X_test)
print(f"Test Predictions: {y_pred[:5]}")
print(f"Actual Values: {y_test[:5]}")

# Model Evaluation (optional)
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")


Test Predictions: [1.45595881e+01 3.95399270e+01 6.95945192e+01 5.30130811e+09
 1.41612966e+02]
Actual Values: 253346      6.9
133053     23.0
253117     25.0
45634       1.0
100982    100.0
Name: value, dtype: float64
Mean Squared Error: 1.5452887704813738e+27


In [178]:
import pandas as pd
import pickle

# Load the saved encoder, regression model
with open('column_transformer_reg.pkl', 'rb') as file:
    ct = pickle.load(file)

with open('regressor_model.pkl', 'rb') as file:
    regressor = pickle.load(file)

# Function to predict the numeric value based on manual input
def predict_value(group_id_value, entity_name_value):
    # Create a DataFrame with the manual input, including all required columns
    manual_input = {
        'group_id': [group_id_value],
        'entity_name': [entity_name_value]
    }
    single_row_df = pd.DataFrame(manual_input)

    # Apply one-hot encoding on the 'entity_name' column of the manual input
    encoded_row = ct.transform(single_row_df)
    encoded_column_names = ct.transformers_[0][1].get_feature_names_out(['entity_name'])

    # Predict the numeric value using the regression model
    predicted_value = regressor.predict(encoded_row)

    return predicted_value[0]

# Example usage
if __name__ == "__main__":
    group_id_input = 281678  # Replace with your input
    entity_name_input = 'item_weight'  # Replace with your input

    predicted_value = predict_value(group_id_input, entity_name_input)
    print(f"Predicted value is {predicted_value}")


Predicted value is 5301308109.72129


In [186]:
dataset_2  = pd.read_csv("test.csv")
dataset_2 = pd.DataFrame(dataset_2)
dataset_2.head()

Unnamed: 0,index,image_link,group_id,entity_name
0,0,https://m.media-amazon.com/images/I/110EibNycl...,156839,height
1,1,https://m.media-amazon.com/images/I/11TU2clswz...,792578,width
2,2,https://m.media-amazon.com/images/I/11TU2clswz...,792578,height
3,3,https://m.media-amazon.com/images/I/11TU2clswz...,792578,depth
4,4,https://m.media-amazon.com/images/I/11gHj8dhhr...,792578,depth


In [190]:
import pandas as pd
import pickle

# Load the saved encoder and regression model
with open('column_transformer_reg.pkl', 'rb') as file:
    ct = pickle.load(file)

with open('regressor_model.pkl', 'rb') as file:
    regressor = pickle.load(file)

# Function to predict the numeric values for entire columns of inputs
def predict_values(group_id_column, entity_name_column):
    # Create a DataFrame with the manual input columns
    input_df = pd.DataFrame({
        'group_id': group_id_column,
        'entity_name': entity_name_column
    })

    # Apply one-hot encoding on the 'entity_name' column of the input DataFrame
    encoded_df = ct.transform(input_df)
    
    # Predict the numeric values using the regression model for all rows
    predicted_values = regressor.predict(encoded_df)

    return predicted_values

# Example usage
if __name__ == "__main__":
    # Assuming 'dataset_2' is the DataFrame you're working with
    input_data = pd.DataFrame({
        'group_id': dataset_2['group_id'],  # Use the actual column data
        'entity_name': dataset_2['entity_name']  # Use the actual column data
    })

    # Get the predicted values for the entire column of inputs
    predicted_values = predict_values(input_data['group_id'], input_data['entity_name'])
    
    # Add the predicted values as a new column to the input DataFrame
    input_data['predicted_value'] = predicted_values
    
    # Print the input data with predicted values
    print(input_data)


        group_id                    entity_name  predicted_value
0         156839                         height        47.529061
1         792578                          width        39.734033
2         792578                         height        55.763115
3         792578                          depth        89.312735
4         792578                          depth        89.312735
...          ...                            ...              ...
131182    721522  maximum_weight_recommendation       234.692254
131183    603688                    item_weight       187.552825
131184    603688  maximum_weight_recommendation       228.148909
131185    853009                    item_weight        80.552105
131186    853009  maximum_weight_recommendation       229.249122

[131187 rows x 3 columns]


In [None]:
import pandas as pd
import pickle

# Load the saved encoder, SVM model, regression model, and unit mapping
with open('encoder.pkl', 'rb') as file:
    ct = pickle.load(file)

with open('svm_model.pkl', 'rb') as file:
    classifier = pickle.load(file)

with open('regressor_model.pkl', 'rb') as file:
    regressor = pickle.load(file)

with open('unit_mapping.pkl', 'rb') as file:
    unit_mapping = pickle.load(file)

# Function to predict both unit and numeric value based on manual input
def predict_value_and_unit(group_id_value, entity_name_value):
    # Create a DataFrame with the manual input
    manual_input = {
        'group_id': [group_id_value],
        'entity_name': [entity_name_value],
        'entity_value': [None],
        'image_link': [None],
        'product_id': [None]
    }
    single_row_df = pd.DataFrame(manual_input)

    # Apply one-hot encoding on the 'entity_name' column of the manual input
    encoded_row = ct.transform(single_row_df)

    # Get the encoded column names (same as training)
    encoded_column_names = ct.transformers_[0][1].get_feature_names_out(['entity_name'])

    # Create DataFrame for encoded entity_name with proper column names
    encoded_entity_name_df = pd.DataFrame(encoded_row[:, :len(encoded_column_names)], columns=encoded_column_names)

    # Combine 'group_id' and the encoded columns
    group_id_df = pd.DataFrame(single_row_df[['group_id']].values, columns=['group_id'])

    # Prepare the final input row for prediction by combining 'group_id' and encoded columns
    input_data = pd.concat([group_id_df, encoded_entity_name_df], axis=1)

    # Predict the numeric value using the regression model
    predicted_value = regressor.predict(input_data)

    # Predict the unit label using the classifier
    predicted_unit_label = classifier.predict(input_data)

    # Map the predicted numeric label to the original unit name
    predicted_unit = unit_mapping[predicted_unit_label[0]]

    return predicted_value[0], predicted_unit

# Load the test dataset
test_dataset = pd.read_csv("test.csv")

# Prepare columns to store predicted values and units
test_dataset['predicted_value'] = None
test_dataset['predicted_unit'] = None

# Loop through each row in the test dataset
for index, row in test_dataset.iterrows():
    group_id_input = row['group_id']
    entity_name_input = row['entity_name']
    
    # Predict value and unit for the current row
    predicted_value, predicted_unit = predict_value_and_unit(group_id_input, entity_name_input)
    
    # Store the predicted value and unit in the dataset
    test_dataset.at[index, 'predicted_value'] = predicted_value
    test_dataset.at[index, 'predicted_unit'] = predicted_unit

# Save the updated dataset with predictions
test_dataset.to_csv('test_with_predictions.csv', index=False)

# Show the first few rows of the updated dataset
print(test_dataset.head())




In [None]:
def process_csv(input_csv_path, output_csv_path):
    input_data = pd.read_csv(input_csv_path)
    output_data = []

    for idx,row in input_data.iterrows():
        group_id = row['group_id']
        entity_name = row['entity_name']

        # extracted_value = process_and_extract_integers(image_url)
        # if extracted_value is None:
        #     output_data.append({"index": row['index'], "prediction": None})
        #     continue
        
        prediction = predict_unit(group_id, entity_name)
        output_data.append({"index": row['index'], "prediction": prediction})

    output_df = pd.DataFrame(output_data)
    output_df.to_csv(output_csv_path, index=False)
    print(f"Processing complete. Results saved to {output_csv_path}")

input_csv_path = 'test.csv'
output_csv_path = 'test_out.csv'
            
process_csv(input_csv_path, output_csv_path)

In [160]:
from scipy.sparse import csr_matrix


In [166]:
import pandas as pd
from scipy.sparse import csr_matrix
import joblib

def process_csv(input_csv_path, output_csv_path):
    # Load input CSV
    input_data = pd.read_csv(input_csv_path)
    output_data = []

    # Load the saved column names from training
    train_columns = joblib.load('train_columns.pkl')

    for idx, row in input_data.iterrows():
        group_id = row['group_id']
        entity_name = row['entity_name']

        # Create a DataFrame for the manual input
        manual_input = pd.DataFrame({
            'group_id': [group_id],
            'entity_name': [entity_name]
        })

        # Convert all column names to strings (required by scikit-learn)
        manual_input.columns = manual_input.columns.astype(str)

        # Apply the same one-hot encoding to 'entity_name' as used during training
        encoded_input = ct.transform(manual_input)

        # Convert the encoded input to dense array if it's sparse
        if isinstance(encoded_input, csr_matrix):
            encoded_input = encoded_input.toarray()

        # Create a DataFrame from the encoded input with the correct feature names
        input_for_prediction = pd.DataFrame(encoded_input, columns=ct.get_feature_names_out())

        # Add the 'group_id' column to the encoded features
        input_for_prediction['group_id'] = group_id

        # Ensure columns are in the correct order (same as used during training)
        input_for_prediction = input_for_prediction[train_columns]

        # Predict the value (numeric part) using the regression model
        predicted_value = regressor.predict(input_for_prediction)

        # Predict the unit (classification part) using the classification model
        predicted_unit = classifier.predict(input_for_prediction)

        # Map the predicted unit label back to the original unit name
        predicted_unit_name = unit_mapping[predicted_unit[0]]

        # Concatenate the predicted value and unit
        full_prediction = f"{predicted_value[0]} {predicted_unit_name}"

        # Append the results into the output list
        output_data.append({
            "index": row['index'],
            "group_id": group_id,
            "entity_name": entity_name,
            "prediction": full_prediction
        })

    # Convert the output list to a DataFrame
    output_df = pd.DataFrame(output_data)

    # Save the output DataFrame to CSV
    output_df.to_csv(output_csv_path, index=False)
    print(f"Processing complete. Results saved to {output_csv_path}")

# Example usage
input_csv_path = 'test.csv'
output_csv_path = 'test_out.csv'

# Call the function with input and output paths
process_csv(input_csv_path, output_csv_path)


FileNotFoundError: [Errno 2] No such file or directory: 'train_columns.pkl'

In [182]:
def process_csv_with_local_images(input_csv_path,output_csv_path):
    input_data = pd.read_csv(input_csv_path)
    
    # Print column names for debugging
    print("Column names in the CSV file:", input_data.columns)
    
    # Ensure there are no extra spaces in column names
    input_data.columns = input_data.columns.str.strip()
    
    output_data = []
    cnt = 0


    for idx, row in input_data.iterrows():
        group_id = row['group_id']
        entity_name = row['entity_name']
        print(idx)
   
        # # For each row, process all images in the folder
        # image_filename = f"index_{idx}"
        # image_path = os.path.join(image_folder_path, image_filename + ".jpg")
        # cnt +=1

        # Process the image and extract integer/float values
        extracted_value = predict_value(group_id, entity_name)
        if extracted_value is None:
            return random.randint(10.0,200.0)

        # Predict the unit using the provided 'group_id' and 'entity_name'
        prediction = predict_unit(group_id, entity_name)

        # Append the extracted float value and predicted unit as separate items
        output_data.append({"index": idx, "extracted_value": float(extracted_value), "predicted_unit": prediction})

    # Create the output DataFrame with two separate columns: one for the float value and one for the unit string
    output_df = pd.DataFrame(output_data)
    
    # Combine both columns into a single column for the CSV file, keeping extracted value as float and unit as string
    output_df['prediction'] = output_df.apply(lambda row: f"{row['extracted_value']:.2f} {row['predicted_unit']}", axis=1)

    # Save only the 'index' and 'prediction' columns to the output CSV
    output_df[['index', 'prediction']].to_csv(output_csv_path, index=False)
    
    print(f"Processing complete. Results saved to {output_csv_path}")

# Example usage
input_csv_path = 'test.csv'  # Folder where local images are stored
output_csv_path = 'test_out__1.csv'

process_csv_with_local_images(input_csv_path,output_csv_path)

Column names in the CSV file: Index(['index', 'image_link', 'group_id', 'entity_name'], dtype='object')
0


ValueError: Shape of passed values is (1, 1), indices imply (1, 8)

In [54]:
entity_name_encoder = LabelEncoder()
entity_name_encoder.fit(df['entity_name'])

NameError: name 'LabelEncoder' is not defined

In [51]:
predict_unit_and_number(input_df, classifier, regressor, entity_name_encoder, unit_decoder)

NameError: name 'entity_name_encoder' is not defined

In [68]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder


dataset = pd.read_csv("train.csv")

# Apply one-hot encoding on 'entity_name' column (before dropping it)
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(handle_unknown='ignore'), ['entity_name'])],
    remainder='passthrough'
)
x_encoded = ct.fit_transform(dataset)

# Convert encoded data to DataFrame
x_encoded_df = pd.DataFrame(x_encoded)

# Drop original 'entity_name' column from the dataset
dataset_drop_entity = dataset.drop(columns=['entity_name'])

# Get the one-hot encoded column names
encoded_column_names = ct.transformers_[0][1].get_feature_names_out(['entity_name'])

# Create DataFrame for one-hot encoded columns
onehot_cols = pd.DataFrame(x_encoded[:, :len(encoded_column_names)], columns=encoded_column_names)

# Concatenate original columns (excluding 'entity_name') and one-hot encoded columns
dataset_onehot = pd.concat([dataset_drop_entity, onehot_cols], axis=1)

# Move 'entity_value' column to the last position
cols = [col for col in dataset_onehot.columns if col != 'entity_value']  # All columns except 'entity_value'
cols.append('entity_value')  # Add 'entity_value' at the end

# Reorder the DataFrame
dataset_onehot = dataset_onehot[cols]

# Show the first few rows
print(dataset_onehot.head())


                                          image_link  group_id  \
0  https://m.media-amazon.com/images/I/61I9XdN6OF...    748919   
1  https://m.media-amazon.com/images/I/71gSRbyXmo...    916768   
2  https://m.media-amazon.com/images/I/61BZ4zrjZX...    459516   
3  https://m.media-amazon.com/images/I/612mrlqiI4...    459516   
4  https://m.media-amazon.com/images/I/617Tl40LOX...    731432   

  entity_name_depth entity_name_height entity_name_item_volume  \
0               0.0                0.0                     0.0   
1               0.0                0.0                     1.0   
2               0.0                0.0                     0.0   
3               0.0                0.0                     0.0   
4               0.0                0.0                     0.0   

  entity_name_item_weight entity_name_maximum_weight_recommendation  \
0                     1.0                                       0.0   
1                     0.0                                       

In [71]:
print(dataset_onehot.columns)

Index(['image_link', 'group_id', 'entity_name_depth', 'entity_name_height',
       'entity_name_item_volume', 'entity_name_item_weight',
       'entity_name_maximum_weight_recommendation', 'entity_name_voltage',
       'entity_name_wattage', 'entity_name_width', 'entity_value'],
      dtype='object')


In [74]:
dataset_onehot.to_csv("rearranged_dataset2.csv", index=False)

In [76]:
import pandas as pd

# Load your dataset
df = pd.read_csv('rearranged_dataset2.csv')

# Split entity_value into numeric and unit
df[['value', 'unit']] = df['entity_value'].str.extract(r'(\d+\.?\d*)\s*(\D+)')
df = df.drop(columns=['entity_value'])


df.to_csv('updated_file.csv', index=False)

# Show the updated DataFrame
print(df)


                                               image_link  group_id  \
0       https://m.media-amazon.com/images/I/61I9XdN6OF...    748919   
1       https://m.media-amazon.com/images/I/71gSRbyXmo...    916768   
2       https://m.media-amazon.com/images/I/61BZ4zrjZX...    459516   
3       https://m.media-amazon.com/images/I/612mrlqiI4...    459516   
4       https://m.media-amazon.com/images/I/617Tl40LOX...    731432   
...                                                   ...       ...   
263854  https://m.media-amazon.com/images/I/612J1R1xHl...    558806   
263855  https://m.media-amazon.com/images/I/61Blzh2+28...    470067   
263856  https://m.media-amazon.com/images/I/51MsegDL9V...    204245   
263857  https://m.media-amazon.com/images/I/510KhVw4VS...    752266   
263858  https://m.media-amazon.com/images/I/51lzTNLQ-6...    416664   

        entity_name_depth  entity_name_height  entity_name_item_volume  \
0                     0.0                 0.0                      0.0   

In [79]:
import pandas as pd

# Load your dataset
df = pd.read_csv('updated_file.csv')

# Factorize the 'unit' column to assign numeric labels
df['unit_label'] = pd.factorize(df['unit'])[0]

# Save the modified DataFrame (optional)
df.to_csv('updated_file_with_unit_labels.csv', index=False)

# Save the unit label mapping
unit_mapping = pd.factorize(df['unit'])[1]

import pickle
with open('unit_mapping.pkl', 'wb') as file:
    pickle.dump(unit_mapping, file)

# Show the updated DataFrame
print(df)


                                               image_link  group_id  \
0       https://m.media-amazon.com/images/I/61I9XdN6OF...    748919   
1       https://m.media-amazon.com/images/I/71gSRbyXmo...    916768   
2       https://m.media-amazon.com/images/I/61BZ4zrjZX...    459516   
3       https://m.media-amazon.com/images/I/612mrlqiI4...    459516   
4       https://m.media-amazon.com/images/I/617Tl40LOX...    731432   
...                                                   ...       ...   
263854  https://m.media-amazon.com/images/I/612J1R1xHl...    558806   
263855  https://m.media-amazon.com/images/I/61Blzh2+28...    470067   
263856  https://m.media-amazon.com/images/I/51MsegDL9V...    204245   
263857  https://m.media-amazon.com/images/I/510KhVw4VS...    752266   
263858  https://m.media-amazon.com/images/I/51lzTNLQ-6...    416664   

        entity_name_depth  entity_name_height  entity_name_item_volume  \
0                     0.0                 0.0                      0.0   

In [81]:
dataset=pd.read_csv('updated_file_with_unit_labels.csv')

In [84]:
X=dataset.iloc[:, 1:10]
y=dataset.iloc[:, -1]

In [86]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=1)

In [90]:
from sklearn.tree import DecisionTreeClassifier
classifier=DecisionTreeClassifier(criterion='entropy',random_state=0)
classifier.fit(x_train,y_train)

In [92]:
y_pred=classifier.predict(x_test)

In [94]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[11686     0   764 ...     0     0     0]
 [    0     0     0 ...     0     0     0]
 [  607     0   990 ...     0     0     0]
 ...
 [    5     0     0 ...     0     0     0]
 [    0     0     0 ...     0    29     0]
 [    0     0     0 ...     0     0     0]]


0.6143977867050708

In [96]:
import pickle

# Save the encoder (ColumnTransformer) after training
with open('encoder.pkl', 'wb') as file:
    pickle.dump(ct, file)

# Save the trained SVM model
with open('svm_model.pkl', 'wb') as file:
    pickle.dump(classifier, file)

In [98]:
import pandas as pd
import pickle

# Load the saved encoder, SVM model, and unit mapping
with open('encoder.pkl', 'rb') as file:
    ct = pickle.load(file)

with open('svm_model.pkl', 'rb') as file:
    classifier = pickle.load(file)

with open('unit_mapping.pkl', 'rb') as file:
    unit_mapping = pickle.load(file)

# Function to make predictions based on manual input
def predict_unit(group_id_value, entity_name_value):
    # Create a DataFrame with the manual input, including all required columns
    manual_input = {
        'group_id': [group_id_value],       # Use group_id here
        'entity_name': [entity_name_value],
        'entity_value': [None],
        'image_link': [None],               # Add placeholder values for missing columns
        'product_id': [None]                # Add placeholder values for missing columns
    }
    single_row_df = pd.DataFrame(manual_input)

    # Apply one-hot encoding on the 'entity_name' column of the manual input
    encoded_row = ct.transform(single_row_df)

    # Get the encoded column names (same as training)
    encoded_column_names = ct.transformers_[0][1].get_feature_names_out(['entity_name'])

    # Create DataFrame for encoded entity_name with proper column names
    encoded_entity_name_df = pd.DataFrame(encoded_row[:, :len(encoded_column_names)], columns=encoded_column_names)

    # Combine 'group_id' and the encoded columns
    group_id_df = pd.DataFrame(single_row_df[['group_id']].values, columns=['group_id'])

    # Prepare the final input row for prediction by combining 'group_id' and encoded columns
    input_data = pd.concat([group_id_df, encoded_entity_name_df], axis=1)

    # Predict the unit label using the classifier
    predicted_unit_label = classifier.predict(input_data)

    # Map the predicted numeric label to the original unit name
    predicted_unit = unit_mapping[predicted_unit_label[0]]

    return predicted_unit

# Example usage
if __name__ == "__main__":
    group_id_input = 281678  # Replace with your input
    entity_name_input = 'item_weight'  # Replace with your input

    predicted_unit = predict_unit(group_id_input, entity_name_input)
    print(f"Predicted unit is {predicted_unit}")


Predicted unit is gram


In [104]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
import joblib

# Load your dataset
dataset = pd.read_csv("train.csv")
dataset = pd.DataFrame(dataset)
df = df.drop(df.columns[0], axis=1)
# Apply one-hot encoding on 'entity_name' column
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(handle_unknown='ignore'), ['entity_name'])],
    remainder='passthrough'
)
X_encoded = ct.fit_transform(dataset)
X_encoded_df = pd.DataFrame(X_encoded, columns=ct.get_feature_names_out())

# Drop original 'entity_name' column from the dataset
dataset_drop_entity = dataset.drop(columns=['entity_name'])

# Concatenate original columns (excluding 'entity_name') and one-hot encoded columns
dataset_onehot = pd.concat([dataset_drop_entity, X_encoded_df], axis=1)
dataset_onehot = dataset_onehot[[col for col in dataset_onehot.columns if col != 'entity_value'] + ['entity_value']]

# Prepare data for the regressor
X = dataset_onehot.drop(columns='entity_value')
y = dataset_onehot['entity_value']

# Create and train the regressor model
regressor = LinearRegression()
regressor.fit(X, y)

# Save the trained regressor model and ColumnTransformer
joblib.dump(regressor, 'regressor_model.pkl')
joblib.dump(ct, 'column_transformer.pkl')


ValueError: could not convert string to float: 'https://m.media-amazon.com/images/I/61I9XdN6OFL.jpg'