In [1]:
import pandas as pd

In [2]:
class DataProcessor:
    """Load a delimited text file into a DataFrame and apply simple clean-up steps.

    The file is read once in ``__init__`` and the resulting frame is kept on
    ``self.df``. The helper methods update ``self.df``; nothing is written to
    disk until ``save_to_csv`` is called.
    """

    # Ordered (category, keywords) rules used by ``_classify``. The first rule
    # with a keyword contained in the lower-cased product name wins, so the
    # order is significant (e.g. 'Bộ' is tested before 'Phụ kiện').
    # Matching is plain substring containment, so short keywords can also
    # match inside longer words.
    _CATEGORY_RULES = (
        ('Áo', ('áo', 'sơ mi', 'shirt')),
        ('Quần', ('quần', 'pants', 'tights')),
        ('Váy', ('váy', 'skirt', 'dress')),
        ('Túi', ('túi', 'ví', 'bag', 'wallet')),
        ('Giày', ('giày', 'sandal', 'shoes')),
        ('Dép', ('dép', 'slippers')),
        ('Đầm', ('đầm',)),
        ('Nón/Mũ', ('nón', 'mũ', 'hat')),
        ('Balo/Cặp', ('balo', 'cặp', 'ba lô', 'backpack')),
        ('Mắt Kính', ('mắt kính', 'kính', 'glasses')),
        ('Khăn', ('khăn', 'scarf')),
        ('Tất', ('tất', 'socks')),
        ('Găng Tay', ('găng tay', 'gloves')),
        ('Đồng Hồ', ('đồng hồ', 'watch')),
        ('Vòng', ('vòng', 'bracelet')),
        ('Thắt Lưng', ('thắt lưng', 'belt')),
        ('Bộ', ('bộ', 'set')),
        ('Phụ kiện', ('khuyên tai', 'kẹp', 'dây chuyền', 'bông tai', 'gắp',
                      'nhẫn', 'earrings', 'necklace', 'ring')),
    )

    # Category returned when no rule matches or the name is not a string.
    _DEFAULT_CATEGORY = 'Khác'

    def __init__(self, file_path, sep='\t'):
        """Read ``file_path`` with ``pd.read_csv`` using ``sep`` as the delimiter.

        Args:
            file_path: Path of the file to load.
            sep: Field delimiter passed to ``pd.read_csv`` (tab by default).
        """
        self.file_path = file_path
        self.sep = sep
        self.df = pd.read_csv(file_path, sep=sep)

    def remove_columns(self, columns):
        """Drop the given column label(s) from ``self.df``."""
        self.df = self.df.drop(columns=columns)

    def drop_duplicates(self):
        """Remove fully duplicated rows from ``self.df``."""
        self.df = self.df.drop_duplicates()

    def rename_columns(self, columns_dict):
        """Rename columns of ``self.df`` using an ``{old_name: new_name}`` mapping."""
        # Reassign instead of inplace=True, matching the other methods.
        self.df = self.df.rename(columns=columns_dict)

    def save_to_csv(self, output_path):
        """Write ``self.df`` to ``output_path`` as CSV without the row index."""
        self.df.to_csv(output_path, index=False)

    def classify_product(self):
        """Add a 'Category' column derived from the 'Tên sản phẩm' column."""
        self.df['Category'] = self.df['Tên sản phẩm'].apply(self._classify)

    def _classify(self, name):
        """Return the category label for a single product name.

        Args:
            name: Product name. Values that are not strings (for example
                None or NaN from a missing cell) get the default category.

        Returns:
            The label of the first rule in ``_CATEGORY_RULES`` that has a
            keyword contained in the lower-cased name, or 'Khác' if none match.
        """
        # Missing cells reach this method as None/NaN, which have no .lower().
        if not isinstance(name, str):
            return self._DEFAULT_CATEGORY

        lowered = name.lower()
        for category, keywords in self._CATEGORY_RULES:
            if any(keyword in lowered for keyword in keywords):
                return category
        return self._DEFAULT_CATEGORY

# Remove unnecessary features

### 1. Tiki data

In [3]:
# Tiki: load the comma-separated file, add the 'Category' column, save the result.
tiki_input_path = 'Data/tiki_products_modified.csv'
tiki_output_path = 'Data/tiki_products_final.csv'

processor = DataProcessor(tiki_input_path, sep=',')
processor.classify_product()
processor.save_to_csv(tiki_output_path)

### 2. Uniqlo data

In [4]:
# Uniqlo: load the comma-separated file, add the 'Category' column, save the result.
uniqlo_input_path = 'Data/uniqlo_products_modified.csv'
uniqlo_output_path = 'Data/uniqlo_products_final.csv'

processor = DataProcessor(uniqlo_input_path, sep=',')
processor.classify_product()
processor.save_to_csv(uniqlo_output_path)

In [5]:
# adidas / Nike: load the comma-separated file, add the 'Category' column, save the result.
adidas_nike_input_path = 'Data/adidas_nikes_products_modified.csv'
adidas_nike_output_path = 'Data/adidas_nikes_products_final.csv'

processor = DataProcessor(adidas_nike_input_path, sep=',')
processor.classify_product()
processor.save_to_csv(adidas_nike_output_path)

In [6]:
# The per-source files to combine into a single frame.
csv_files = [
    'Data/tiki_products_final.csv',
    'Data/uniqlo_products_final.csv',
    'Data/adidas_nikes_products_final.csv',
]

# Load each file into its own frame, then stack them row-wise with a fresh index.
dfs = list(map(pd.read_csv, csv_files))
merged_df = pd.concat(dfs, ignore_index=True)

In [7]:
# Rows without a store name ('Tên cửa hàng') are given the store name 'UNIQLO'.
missing_store = merged_df['Tên cửa hàng'].isna()
merged_df.loc[missing_store, 'Tên cửa hàng'] = 'UNIQLO'

# Where the store name mentions a known brand (case-insensitive), fill a
# missing 'Thương hiệu' with that brand; brand values already present are kept.
for store_brand in ('UNIQLO', 'adidas', 'Nike'):
    brand_store_rows = merged_df['Tên cửa hàng'].str.contains(store_brand, case=False, na=False)
    merged_df.loc[brand_store_rows, 'Thương hiệu'] = merged_df['Thương hiệu'].fillna(store_brand)

# Any 'Thương hiệu' still missing falls back to 'OEM'.
merged_df['Thương hiệu'] = merged_df['Thương hiệu'].fillna('OEM')

# Missing values in float64/int64 columns become 0.
numerical_columns = merged_df.select_dtypes(include=['float64', 'int64']).columns
merged_df[numerical_columns] = merged_df[numerical_columns].fillna(0)

# Missing 'Currency' defaults to 'VND'.
merged_df['Currency'] = merged_df['Currency'].fillna('VND')

In [8]:
# Keep only rows where both 'Lượt đánh giá' and 'Điểm đánh giá trung bình'
# are strictly greater than 0 (rows with 0 or missing values are removed).
has_reviews = merged_df['Lượt đánh giá'] > 0
has_rating = merged_df['Điểm đánh giá trung bình'] > 0
merged_df = merged_df[has_reviews & has_rating]

# Drop the 'Fit' column, which is not used further.
merged_df = merged_df.drop(columns=['Fit'])

In [9]:
# Save the result to a new CSV file (the row index is not written)
merged_df.to_csv('./Data/Data.csv', index=False)