In [29]:
import pandas as pd
import numpy as np
#import nltk
#from nltk.corpus import stopwords
#from nltk.tokenize import word_tokenize
#from nltk.stem import WordNetLemmatizer
#import re

# Download required NLTK data
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')

# Display settings: show every row and column and never truncate cell text.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 500)
pd.set_option('display.max_colwidth', None)

# Input tables (absolute paths on the author's machine).
df_file = '/Users/cyrusaghaee/DS 207/Data/ebay_data/final_df.csv'
desc_file = '/Users/cyrusaghaee/DS 207/Data/ebay_data/desc.csv'
fitment_file = '/Users/cyrusaghaee/DS 207/Data/ebay_data/ftmnt_train.csv'

# Saved row-position arrays that define the train / validation / test splits.
over_train_indices_path = '/Users/cyrusaghaee/DS 207/Final Project/over_train_indices.npy'
val_indices_path = '/Users/cyrusaghaee/DS 207/Final Project/val_indices.npy'
test_indices_path = '/Users/cyrusaghaee/DS 207/Final Project/test_indices.npy'

# Load the listing table, turn the empty-list placeholder strings into NaN
# and discard the 'ManufacturePartNumber' column.
df = (
    pd.read_csv(df_file)
    .replace({"[]": np.nan, "['']": np.nan, '["]': np.nan})
    .drop(columns=['ManufacturePartNumber'])
)

over_train_indices, val_indices, test_indices = (
    np.load(path)
    for path in (over_train_indices_path, val_indices_path, test_indices_path)
)

# DataFrame.take selects rows by position, not by index label.
train_df, val_df, test_df = (
    df.take(positions)
    for positions in (over_train_indices, val_indices, test_indices)
)

# Descriptions (without the 'Decoded_DESC' column) and the fitment table.
desc = pd.read_csv(desc_file).drop(columns=['Decoded_DESC'])
fitment = pd.read_csv(fitment_file)

def merge_data(fitment, df, desc):
    """Join fitment rows and descriptions onto the records in ``df``.

    ``fitment`` is right-merged with ``df`` on 'RECORD_ID', so every row of
    ``df`` is kept (fitment columns are NaN where no fitment row matches).
    ``desc`` is then left-merged onto that result on 'RECORD_ID'.

    Returns the merged DataFrame.
    """
    with_fitment = pd.merge(fitment, df, on='RECORD_ID', how='right')
    return pd.merge(with_fitment, desc, on='RECORD_ID', how='left')

# Attach fitment rows and descriptions to each split.
merged_train_df, merged_val_df, merged_test_df = (
    merge_data(fitment, split, desc) for split in (train_df, val_df, test_df)
)

# Feature frames: everything except the fitment columns and the record key.
target_and_key_columns = ['FTMNT_YEAR', 'FTMNT_MAKE', 'FTMNT_MODEL', 'RECORD_ID']
X_train, X_val, X_test = (
    merged.drop(columns=target_and_key_columns)
    for merged in (merged_train_df, merged_val_df, merged_test_df)
)

# Build one "column: value" text line from a row.
def create_x_string(row):
    """Return the row's non-null fields as 'column: value' pairs joined by ', '."""
    parts = []
    for column, value in row.items():
        if pd.notnull(value):
            parts.append(f"{column}: {value}")
    return ', '.join(parts)

# For each split: store the lower-cased "column: value, ..." text in
# 'processed_x_string' and lower-case the 'FTMNT_MAKE' labels.
merged_train_df = merged_train_df.assign(
    processed_x_string=X_train.apply(create_x_string, axis=1).str.lower(),
    FTMNT_MAKE=merged_train_df['FTMNT_MAKE'].str.lower(),
)
merged_val_df = merged_val_df.assign(
    processed_x_string=X_val.apply(create_x_string, axis=1).str.lower(),
    FTMNT_MAKE=merged_val_df['FTMNT_MAKE'].str.lower(),
)
merged_test_df = merged_test_df.assign(
    processed_x_string=X_test.apply(create_x_string, axis=1).str.lower(),
    FTMNT_MAKE=merged_test_df['FTMNT_MAKE'].str.lower(),
)

# Group the data by description and aggregate the compatible makes into lists
def group_by_make(df):
    """Collapse rows sharing a 'processed_x_string' into one row of unique makes.

    Parameters
    ----------
    df : DataFrame with 'processed_x_string' and 'FTMNT_MAKE' columns.

    Returns
    -------
    DataFrame with columns ['FTMNT_MAKE', 'processed_x_string'], where
    'FTMNT_MAKE' holds a sorted list of the distinct non-null makes per text.
    """
    grouped = (
        df.groupby('processed_x_string')['FTMNT_MAKE']
        # Drop NaN makes (rows without a fitment match) so they do not become
        # labels, and sort so the list order is the same on every run
        # (plain set iteration order for strings varies between processes).
        .apply(lambda makes: sorted(set(makes.dropna())))
        .reset_index()
    )
    return grouped[['FTMNT_MAKE', 'processed_x_string']]

# One row per distinct text, with its list of makes, for every split.
grouped_train, grouped_val, grouped_test = map(
    group_by_make, (merged_train_df, merged_val_df, merged_test_df)
)

# Write the grouped splits to the working directory (row index omitted).
grouped_train.to_csv(path_or_buf='merged_train_df.csv', index=False)
grouped_val.to_csv(path_or_buf='merged_val_df.csv', index=False)
grouped_test.to_csv(path_or_buf='merged_test_df.csv', index=False)


In [33]:
# Load the tabular base-model comparison table (absolute path on the author's machine).
file_path = '/Users/cyrusaghaee/DS 207/Final Project/tabular_base_model_comparison.csv'
tab_preds_df = pd.read_csv(filepath_or_buffer=file_path)

In [34]:
# Name the key column 'RECORD_ID' so it can be joined with the listing table.
tab_preds_df = tab_preds_df.rename(columns={'Index': 'RECORD_ID'})

In [35]:
# Attach each record's listed 'Make' from the listing table.
# A 'Make' column left by a previous run of this cell is dropped first, so
# re-executing the cell does not produce 'Make_x' / 'Make_y' duplicates.
tab_preds_df = (
    tab_preds_df
    .drop(columns=['Make'], errors='ignore')
    .merge(df[['RECORD_ID', 'Make']], on='RECORD_ID', how='left')
)

In [38]:
# Work on a copy so the merged predictions frame stays untouched, then preview it.
tab_pred_df = tab_preds_df.copy(deep=True)
tab_pred_df.head(n=50)

Unnamed: 0.1,Unnamed: 0,RECORD_ID,Predicted,Target,Equal,Make
0,0,2,[['Nissan']],[['Nissan']],True,NISSAN
1,1,3,[['Ford']],[['Ford']],True,FORD
2,2,8,[['International Harvester']],[['International Harvester']],True,
3,3,22,"[['Buick'], ['Chevrolet'], ['GMC'], ['Hummer'], ['Isuzu'], ['Oldsmobile'], ['Saab']]","[['Buick'], ['Chevrolet'], ['GMC'], ['Hummer'], ['Isuzu'], ['Oldsmobile'], ['Saab']]",True,GMC
4,4,34,[['Buick']],[['Buick']],True,BUICK
5,5,37,[['Volkswagen']],[['Volkswagen']],True,VOLKSWAGEN
6,6,60,[['Honda']],[['Honda']],True,
7,7,70,"[['INFINITI'], ['Nissan']]","[['INFINITI'], ['Nissan']]",True,
8,8,71,"[['Buick'], ['Chevrolet']]","[['Buick'], ['Chevrolet']]",True,CHEVROLET
9,9,90,[['Mitsubishi']],[['Mitsubishi']],True,MITSUBISHI


In [40]:
import pandas as pd
import ast

tab_pred_df = tab_preds_df.copy()

# 'Predicted' and 'Target' hold string reprs of nested lists; parse them.
tab_pred_df['Predicted'] = tab_pred_df['Predicted'].apply(ast.literal_eval)
tab_pred_df['Target'] = tab_pred_df['Target'].apply(ast.literal_eval)

# One output row per (record, target make): was that make among the predictions?
new_rows = []
for _, record in tab_pred_df.iterrows():
    predicted_makes = [entry[0] for entry in record['Predicted']]  # flatten
    target_makes = [entry[0] for entry in record['Target']]  # flatten
    new_rows.extend(
        {
            'Target': target_make,
            'Correct': int(target_make in predicted_makes),
            'Make': record['Make'],
        }
        for target_make in target_makes
    )

new_df = pd.DataFrame(new_rows)

# Share of target makes that were predicted: overall, and split by whether
# the listing carried a 'Make' value.
has_make = new_df['Make'].notna()
overall_correct_percentage = new_df['Correct'].mean() * 100
not_nan_correct_percentage = new_df.loc[has_make, 'Correct'].mean() * 100
nan_correct_percentage = new_df.loc[~has_make, 'Correct'].mean() * 100

# Display the results
print(f"Overall percentage of correct predictions: {overall_correct_percentage:.2f}%")
print(f"Percentage of correct predictions when 'Make' is not NaN: {not_nan_correct_percentage:.2f}%")
print(f"Percentage of correct predictions when 'Make' is NaN: {nan_correct_percentage:.2f}%")


Overall percentage of correct predictions: 91.06%
Percentage of correct predictions when 'Make' is not NaN: 94.03%
Percentage of correct predictions when 'Make' is NaN: 85.52%


In [41]:
import pandas as pd
import ast

tab_pred_df = tab_preds_df.copy()

# 'Predicted' and 'Target' are stored as string reprs of nested lists
# (e.g. "[['Buick'], ['Chevrolet']]"); parse them into real Python lists.
tab_pred_df['Predicted'] = tab_pred_df['Predicted'].apply(ast.literal_eval)
tab_pred_df['Target'] = tab_pred_df['Target'].apply(ast.literal_eval)

# Rows shown per sample, and the seed that keeps the samples repeatable.
SAMPLE_SIZE = 5
SAMPLE_SEED = 42

# Initialize an empty list to store the new rows
new_rows = []

# One output row per (record, target make): was that make among the predictions?
for idx, row in tab_pred_df.iterrows():
    # Flatten the nested lists; empty inner lists are skipped instead of
    # raising IndexError on item[0].
    predictions = [item[0] for item in row['Predicted'] if item]
    targets = [item[0] for item in row['Target'] if item]

    for target in targets:
        correct = 1 if target in predictions else 0
        new_rows.append({'Target': target, 'Correct': correct, 'Make': row['Make']})

# Explicit columns keep the frame usable even when no rows were produced.
new_df = pd.DataFrame(new_rows, columns=['Target', 'Correct', 'Make'])

# Calculate the overall percentage of correct predictions
overall_correct_percentage = new_df['Correct'].mean() * 100

# Calculate the percentage of correct predictions when 'Make' is not NaN
not_nan_df = new_df[new_df['Make'].notna()]
not_nan_correct_percentage = not_nan_df['Correct'].mean() * 100

# Calculate the percentage of correct predictions when 'Make' is NaN
nan_df = new_df[new_df['Make'].isna()]
nan_correct_percentage = nan_df['Correct'].mean() * 100

# Count of each group
overall_count = len(new_df)
not_nan_count = len(not_nan_df)
nan_count = len(nan_df)

# Display the results
print(f"Overall percentage of correct predictions: {overall_correct_percentage:.2f}%")
print(f"Percentage of correct predictions when 'Make' is not NaN: {not_nan_correct_percentage:.2f}%")
print(f"Percentage of correct predictions when 'Make' is NaN: {nan_correct_percentage:.2f}%")

print(f"\nOverall count: {overall_count}")
print(f"Count when 'Make' is not NaN: {not_nan_count}")
print(f"Count when 'Make' is NaN: {nan_count}")

# Show a sample of each group. The sample size is capped at the group size
# because DataFrame.sample raises when asked for more rows than exist
# (without replacement); the fixed seed makes the printed rows repeatable.
print("\nSample of overall dataframe:")
print(new_df.sample(n=min(SAMPLE_SIZE, len(new_df)), random_state=SAMPLE_SEED))

print("\nSample when 'Make' is not NaN:")
print(not_nan_df.sample(n=min(SAMPLE_SIZE, len(not_nan_df)), random_state=SAMPLE_SEED))

print("\nSample when 'Make' is NaN:")
print(nan_df.sample(n=min(SAMPLE_SIZE, len(nan_df)), random_state=SAMPLE_SEED))


Overall percentage of correct predictions: 91.06%
Percentage of correct predictions when 'Make' is not NaN: 94.03%
Percentage of correct predictions when 'Make' is NaN: 85.52%

Overall count: 850
Count when 'Make' is not NaN: 553
Count when 'Make' is NaN: 297

Sample of overall dataframe:
       Target  Correct    Make
464      Jeep        1   DODGE
249    Suzuki        1  SUZUKI
629  Cadillac        1     NaN
605    Toyota        1  TOYOTA
104   Mercury        1     NaN

Sample when 'Make' is not NaN:
        Target  Correct                    Make
511        GMC        1                    CASE
497     Datsun        1                  NISSAN
360  Chevrolet        1  CHEVROLET C6500 KODIAK
488      Dodge        1                CHRYSLER
677       Ford        1                    FORD

Sample when 'Make' is NaN:
            Target  Correct Make
518           Ford        1  NaN
184        Pontiac        1  NaN
23       Chevrolet        1  NaN
773           Ford        1  NaN
565  Merced

In [8]:
import pandas as pd
import numpy as np
#import nltk
#from nltk.corpus import stopwords
#from nltk.tokenize import word_tokenize
#from nltk.stem import WordNetLemmatizer
#import re

# Download required NLTK data
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')

# Display settings: show every row and column and never truncate cell text.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 500)
pd.set_option('display.max_colwidth', None)

# Input tables (absolute paths on the author's machine).
df_file = '/Users/cyrusaghaee/DS 207/Data/ebay_data/final_df.csv'
desc_file = '/Users/cyrusaghaee/DS 207/Data/ebay_data/desc.csv'
fitment_file = '/Users/cyrusaghaee/DS 207/Data/ebay_data/ftmnt_train.csv'

# Saved row-position arrays that define the train / validation / test splits.
over_train_indices_path = '/Users/cyrusaghaee/DS 207/Final Project/over_train_indices.npy'
val_indices_path = '/Users/cyrusaghaee/DS 207/Final Project/val_indices.npy'
test_indices_path = '/Users/cyrusaghaee/DS 207/Final Project/test_indices.npy'

# Load the listing table, turn the empty-list placeholder strings into NaN
# and discard the 'ManufacturePartNumber' column.
df = (
    pd.read_csv(df_file)
    .replace({"[]": np.nan, "['']": np.nan, '["]': np.nan})
    .drop(columns=['ManufacturePartNumber'])
)

over_train_indices, val_indices, test_indices = (
    np.load(path)
    for path in (over_train_indices_path, val_indices_path, test_indices_path)
)

# DataFrame.take selects rows by position, not by index label.
train_df, val_df, test_df = (
    df.take(positions)
    for positions in (over_train_indices, val_indices, test_indices)
)

# Descriptions (without the 'Decoded_DESC' column) and the fitment table.
desc = pd.read_csv(desc_file).drop(columns=['Decoded_DESC'])
fitment = pd.read_csv(fitment_file)

def merge_data(fitment, df, desc):
    """Join fitment rows and descriptions onto the records in ``df``.

    ``fitment`` is right-merged with ``df`` on 'RECORD_ID', so every row of
    ``df`` is kept (fitment columns are NaN where no fitment row matches).
    ``desc`` is then left-merged onto that result on 'RECORD_ID'.

    Returns the merged DataFrame.
    """
    with_fitment = pd.merge(fitment, df, on='RECORD_ID', how='right')
    return pd.merge(with_fitment, desc, on='RECORD_ID', how='left')

# Attach fitment rows and descriptions to each split.
merged_train_df, merged_val_df, merged_test_df = (
    merge_data(fitment, split, desc) for split in (train_df, val_df, test_df)
)

# Feature frames: everything except the fitment columns and the record key.
target_and_key_columns = ['FTMNT_YEAR', 'FTMNT_MAKE', 'FTMNT_MODEL', 'RECORD_ID']
X_train, X_val, X_test = (
    merged.drop(columns=target_and_key_columns)
    for merged in (merged_train_df, merged_val_df, merged_test_df)
)

# Build one "column: value" text line from a row.
def create_x_string(row):
    """Return the row's non-null fields as 'column: value' pairs joined by ', '."""
    parts = []
    for column, value in row.items():
        if pd.notnull(value):
            parts.append(f"{column}: {value}")
    return ', '.join(parts)

# For each split: store the lower-cased "column: value, ..." text in
# 'processed_x_string' and lower-case the 'FTMNT_MAKE' labels.
merged_train_df = merged_train_df.assign(
    processed_x_string=X_train.apply(create_x_string, axis=1).str.lower(),
    FTMNT_MAKE=merged_train_df['FTMNT_MAKE'].str.lower(),
)
merged_val_df = merged_val_df.assign(
    processed_x_string=X_val.apply(create_x_string, axis=1).str.lower(),
    FTMNT_MAKE=merged_val_df['FTMNT_MAKE'].str.lower(),
)
merged_test_df = merged_test_df.assign(
    processed_x_string=X_test.apply(create_x_string, axis=1).str.lower(),
    FTMNT_MAKE=merged_test_df['FTMNT_MAKE'].str.lower(),
)

# Group the data by description and aggregate the compatible makes into lists
def group_by_make(df):
    """Collapse rows sharing a 'processed_x_string' into one row of unique makes.

    Parameters
    ----------
    df : DataFrame with 'processed_x_string' and 'FTMNT_MAKE' columns.

    Returns
    -------
    DataFrame with columns ['FTMNT_MAKE', 'processed_x_string'], where
    'FTMNT_MAKE' holds a sorted list of the distinct non-null makes per text.
    """
    grouped = (
        df.groupby('processed_x_string')['FTMNT_MAKE']
        # Drop NaN makes (rows without a fitment match) so they do not become
        # labels, and sort so the list order is the same on every run
        # (plain set iteration order for strings varies between processes).
        .apply(lambda makes: sorted(set(makes.dropna())))
        .reset_index()
    )
    return grouped[['FTMNT_MAKE', 'processed_x_string']]

# One row per distinct text, with its list of makes, for every split.
grouped_train, grouped_val, grouped_test = map(
    group_by_make, (merged_train_df, merged_val_df, merged_test_df)
)


In [15]:
# Test-split records with their cleaned description, written out for prediction.
pred_test_df = pd.merge(test_df, desc, on='RECORD_ID', how='left').loc[
    :, ['RECORD_ID', 'Cleaned_DESC']
]
pred_test_df.to_csv(path_or_buf='pred_test_df.csv', index=False)


In [54]:
import numpy as np
import os
import pandas as pd

over_train_indices_path = '/Users/cyrusaghaee/DS 207/Final Project/over_train_indices.npy'
val_indices_path = '/Users/cyrusaghaee/DS 207/Final Project/val_indices.npy'
test_indices_path = '/Users/cyrusaghaee/DS 207/Final Project/test_indices.npy'


over_train_indices = np.load(over_train_indices_path)
val_indices = np.load(val_indices_path)
test_indices = np.load(test_indices_path)

df_file = '/Users/cyrusaghaee/DS 207/Data/ebay_data/final_df.csv'

# Read the listing table here instead of relying on whatever `df` an earlier
# cell left in the kernel; a separate name keeps that `df` untouched.
raw_df = pd.read_csv(df_file)

# Rows selected by position with the training index array.
X_train = raw_df.take(over_train_indices)

# Size of the selection, and how often each RECORD_ID occurs in it
# (counts above 1 mean the index array repeats that row).
print(X_train.shape)

print(X_train['RECORD_ID'].value_counts())



(7066, 19)
RECORD_ID
3423    68
1727    64
1668    57
3801    57
3009    57
1110    57
3622    56
2284    55
1847    55
3611    54
2423    52
1585    51
3970    51
3939    50
2880    50
768     50
2855    50
2846    49
3121    48
2716    48
2899    43
1732    43
3834    43
928     42
1048    42
1904    41
3004    41
1522    41
645     41
855     41
2931    41
2774    41
1506    40
1169    39
2358    38
3955    38
3727    31
3763    30
2202    29
2056    29
572     28
1372    27
3219    27
3380    26
3914    26
936     25
3930    25
553     25
2176    25
2858    23
749     22
2823    21
96      20
3884    20
651     20
3188    19
2064    19
1667    18
2557    18
3328    17
430     17
3226    16
1252    16
3665    16
210     15
761     15
2950    15
488     14
2702    14
76      14
511     13
2292    13
3187    12
2103    12
2717    11
3566    11
2859    11
1134    11
3133    11
3558    11
3261    10
836     10
3232    10
2802     9
3810     9
1610     9
2379     9
727      9
3995     8


# Pre-processing and EDA

In [48]:
import pandas as pd
import numpy as np

from transformers import RobertaTokenizer, TFRobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
import tensorflow as tf
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import matplotlib.pyplot as plt
import pickle
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from sklearn.metrics import classification_report, hamming_loss
from collections import Counter
import seaborn as sns

# Display settings: show every row and column and never truncate cell text.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 500)
pd.set_option('display.max_colwidth', None)


# Download required NLTK data
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')

# Input tables (absolute paths on the author's machine).
df_file = '/Users/cyrusaghaee/DS 207/Data/ebay_data/final_df.csv'
desc_file = '/Users/cyrusaghaee/DS 207/Data/ebay_data/desc.csv'
fitment_file = '/Users/cyrusaghaee/DS 207/Data/ebay_data/ftmnt_train.csv'

# Load the listing table and print its first 20 rows followed by its shape.
df = pd.read_csv(df_file)

print(df.head(20), df.shape, sep='\n')





    RECORD_ID                                    Brand                   Make           Model                                                                                                                                                                                                                                      Year                               Fitment                                               Type OEM Part Number                                                   Product Name  \
0           0                                ['dayco']                  DODGE     W100 SERIES                                                                                                                                                                                                                          ['1967', '1967']                ['Direct Replacement']                          ['Radiator Coolant Hose']              []                                                             []   
1 

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/cyrusaghaee/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cyrusaghaee/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/cyrusaghaee/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Turn the empty-list placeholder strings into NaN, then discard the
# 'ManufacturePartNumber' column.
df = (
    df.replace(["[]", "['']", '["]'], np.nan)
    .drop(columns=['ManufacturePartNumber'])
)

In [65]:
import pandas as pd
import numpy as np

from transformers import RobertaTokenizer, TFRobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
import tensorflow as tf
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import matplotlib.pyplot as plt
import pickle
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from sklearn.metrics import classification_report, hamming_loss
from collections import Counter
import seaborn as sns

# Display settings: show every row and column and never truncate cell text.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 500)
pd.set_option('display.max_colwidth', None)

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Input tables (absolute paths on the author's machine).
df_file = '/Users/cyrusaghaee/DS 207/Data/ebay_data/final_df.csv'
desc_file = '/Users/cyrusaghaee/DS 207/Data/ebay_data/desc.csv'
fitment_file = '/Users/cyrusaghaee/DS 207/Data/ebay_data/ftmnt_train.csv'

# Saved row-position arrays that define the train / validation / test splits.
over_train_indices_path = '/Users/cyrusaghaee/DS 207/Final Project/over_train_indices.npy'
val_indices_path = '/Users/cyrusaghaee/DS 207/Final Project/val_indices.npy'
test_indices_path = '/Users/cyrusaghaee/DS 207/Final Project/test_indices.npy'

# Load the listing table and show its columns before any cleaning.
df = pd.read_csv(df_file)

print(df.columns)

# Empty-list placeholder strings become NaN; 'ManufacturePartNumber' is dropped.
df = (
    df.replace(["[]", "['']", '["]'], np.nan)
    .drop(columns=['ManufacturePartNumber'])
)

over_train_indices, val_indices, test_indices = (
    np.load(path)
    for path in (over_train_indices_path, val_indices_path, test_indices_path)
)

# DataFrame.take selects rows by position, not by index label.
train_df, val_df, test_df = (
    df.take(positions)
    for positions in (over_train_indices, val_indices, test_indices)
)

# Descriptions (without the 'Decoded_DESC' column) and the fitment table.
desc = pd.read_csv(desc_file).drop(columns=['Decoded_DESC'])
fitment = pd.read_csv(fitment_file)

# Right-merge fitment onto each split (keeps every split row), then
# left-merge the descriptions, both on 'RECORD_ID'.
merged_train_df, merged_val_df, merged_test_df = (
    pd.merge(fitment, split, on='RECORD_ID', how='right').merge(desc, on='RECORD_ID', how='left')
    for split in (train_df, val_df, test_df)
)

print("df: ", df.shape)
print("desc: ", desc.shape)
print("fitment: ", fitment.shape)

print("merged_train_df: ", merged_train_df.shape)
print("merged_val_df: ", merged_val_df.shape)
print("merged_test_df: ", merged_test_df.shape)

# Feature frames: everything except the fitment columns and the record key.
target_and_key_columns = ['FTMNT_YEAR', 'FTMNT_MAKE', 'FTMNT_MODEL', 'RECORD_ID']
X_train, X_val, X_test = (
    merged.drop(columns=target_and_key_columns)
    for merged in (merged_train_df, merged_val_df, merged_test_df)
)

# Build one "column: value" text line from a row.
def create_x_string(row):
    """Return the row's non-null fields as 'column: value' pairs joined by ', '."""
    parts = []
    for column, value in row.items():
        # Null fields are left out of the text entirely.
        if pd.notnull(value):
            parts.append(f"{column}: {value}")
    return ', '.join(parts)

# Build the lower-cased "column: value, ..." text for every row of each split
# and store it in 'processed_x_string'.
merged_train_df['processed_x_string'] = X_train.apply(create_x_string, axis=1).str.lower()
merged_val_df['processed_x_string'] = X_val.apply(create_x_string, axis=1).str.lower()
merged_test_df['processed_x_string'] = X_test.apply(create_x_string, axis=1).str.lower()


# Lower-case the make labels so the same make is not counted twice by case.
merged_train_df['FTMNT_MAKE'] = merged_train_df['FTMNT_MAKE'].str.lower()
merged_val_df['FTMNT_MAKE'] = merged_val_df['FTMNT_MAKE'].str.lower()
merged_test_df['FTMNT_MAKE'] = merged_test_df['FTMNT_MAKE'].str.lower()


def _unique_makes(makes):
    """Return the sorted, de-duplicated, non-null makes of one text group.

    NaN makes (rows without a fitment match after the right merge) are dropped
    so they do not become labels; sorting makes the list order repeatable,
    which plain set iteration over strings is not.
    """
    return sorted(set(makes.dropna()))


# Group the data by description and aggregate the compatible makes into lists
grouped_train = (
    merged_train_df.groupby('processed_x_string')['FTMNT_MAKE']
    .apply(_unique_makes)
    .reset_index()[['FTMNT_MAKE', 'processed_x_string']]
)

grouped_val = (
    merged_val_df.groupby('processed_x_string')['FTMNT_MAKE']
    .apply(_unique_makes)
    .reset_index()[['FTMNT_MAKE', 'processed_x_string']]
)

grouped_test = (
    merged_test_df.groupby('processed_x_string')['FTMNT_MAKE']
    .apply(_unique_makes)
    .reset_index()[['FTMNT_MAKE', 'processed_x_string']]
)


# Write the grouped splits to the working directory (row index omitted).
grouped_train.to_csv('merged_train_df.csv', index=False)
grouped_val.to_csv('merged_val_df.csv', index=False)
grouped_test.to_csv('merged_test_df.csv', index=False)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/cyrusaghaee/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cyrusaghaee/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/cyrusaghaee/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Index(['RECORD_ID', 'Brand', 'Make', 'Model', 'Year', 'Fitment', 'Type', 'OEM Part Number', 'Product Name', 'Manufacturer Part Number', 'Vehicle Identification Number', 'SKU', 'ManufacturePartNumber', 'Part Title', 'Part Brands', 'SubType', 'Part Types', 'CATEGORY', 'ITEM_TITLE'], dtype='object')
df:  (5000, 18)
desc:  (5000, 2)
fitment:  (151106, 4)
merged_train_df:  (203645, 22)
merged_val_df:  (17766, 22)
merged_test_df:  (14732, 22)


: 

: 

In [None]:
# Feature frame: everything except the fitment columns and the record key.
X = merged_df.drop(labels=['FTMNT_YEAR', 'FTMNT_MAKE', 'FTMNT_MODEL', 'RECORD_ID'], axis='columns')

# Build one "column: value" text line from a row.
def create_x_string(row):
    """Return the row's non-null fields as 'column: value' pairs joined by ', '."""
    parts = []
    for column, value in row.items():
        # Null fields are left out of the text entirely.
        if pd.notnull(value):
            parts.append(f"{column}: {value}")
    return ', '.join(parts)

# Apply the function to each row in X and create a new column 'x_string'
merged_df['x_string'] = X.apply(create_x_string, axis=1)

# Lower-case both the text and the make labels
merged_df['processed_x_string'] = merged_df['x_string'].str.lower()
merged_df['FTMNT_MAKE'] = merged_df['FTMNT_MAKE'].str.lower()

# Group the data by description and aggregate the compatible makes into lists.
# NaN makes are dropped so they do not become labels, and each list is sorted
# so its order is the same on every run (set iteration order for strings is not).
grouped = (
    merged_df.groupby('processed_x_string')['FTMNT_MAKE']
    .apply(lambda makes: sorted(set(makes.dropna())))
    .reset_index()
)

grouped.head(50)

In [5]:
# Preview the first ten grouped text strings.
grouped.loc[:, 'processed_x_string'].head(n=10)

0    brand: ["bolt n' go"], make: hyundai, model: v...
1    brand: ["caili's company"], make: mazda, model...
2    brand: ["ton's performance"], make: gmc, model...
3    brand: ['%20used'], make: toyota, model: 4 run...
4    brand: ['1autopowerstore'], make: ford, model:...
5    brand: ['2006 mini mini cooper stock number: 0...
6    brand: ['2mplastic'], year: ['2015', '2016', '...
7    brand: ['4 seasons', 'fan motor', 'four season...
8    brand: ['4 seasons', 'four seasons'], make: ch...
9    brand: ['4 seasons', 'four seasons'], make: fo...
Name: processed_x_string, dtype: object

In [7]:
# Write the grouped frame to the working directory (row index omitted).
grouped.to_csv(path_or_buf='merged_df.csv', index=False)

In [27]:
def preprocess_text(text):
    """Lower-case ``text`` and delete every character that is not a-z, 0-9, whitespace or '-'."""
    lowered = text.lower()
    return re.sub(r'[^a-z0-9\s-]', '', lowered)


# Preprocess the text
merged_df['processed_x_string'] = merged_df['x_string'].apply(preprocess_text)

# Re-aggregate on the freshly cleaned strings before exporting: the existing
# `grouped` frame was built from the earlier 'processed_x_string' values, so
# writing it unchanged would not reflect preprocess_text at all.
# NaN makes are dropped and each list is sorted so the output is repeatable.
grouped = (
    merged_df.groupby('processed_x_string')['FTMNT_MAKE']
    .apply(lambda makes: sorted(set(makes.dropna())))
    .reset_index()
)

grouped.to_csv('preprocessed_merged_df.csv', index=False)