In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error,mean_squared_error 
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [53]:
data = pd.read_csv("styles.csv", on_bad_lines='skip')
data.head(10)


Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011.0,Casual,Turtle Check Men Navy Blue Shirt
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012.0,Casual,Peter England Men Party Blue Jeans
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,2016.0,Casual,Titan Women Silver Watch
3,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,2011.0,Casual,Manchester United Men Solid Black Track Pants
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,2012.0,Casual,Puma Men Grey T-shirt
5,1855,Men,Apparel,Topwear,Tshirts,Grey,Summer,2011.0,Casual,Inkfruit Mens Chain Reaction T-shirt
6,30805,Men,Apparel,Topwear,Shirts,Green,Summer,2012.0,Ethnic,Fabindia Men Striped Green Shirt
7,26960,Women,Apparel,Topwear,Shirts,Purple,Summer,2012.0,Casual,Jealous 21 Women Purple Shirt
8,29114,Men,Accessories,Socks,Socks,Navy Blue,Summer,2012.0,Casual,Puma Men Pack of 3 Socks
9,30039,Men,Accessories,Watches,Watches,Black,Winter,2016.0,Casual,Skagen Men Black Watch


In [54]:
data.isnull().sum()

id                      0
gender                  0
masterCategory          0
subCategory             0
articleType             0
baseColour             15
season                 21
year                    1
usage                 317
productDisplayName      7
dtype: int64

In [55]:
# Drop the columns
data = data.drop(['productDisplayName','year','subCategory','baseColour'], axis=1)

In [56]:
# Fill missing values in 'gender', 'articletype', and 'colour' with mode

data['season'].fillna('na', inplace=True)

data['usage'].fillna('na', inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['season'].fillna('na', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['usage'].fillna('na', inplace=True)


In [57]:
# Step 1: Define the mapping to group 'Boys' into 'Men' and 'Girls' into 'Women'
gender_mapping = {
    'Men': 'Men',
    'Women': 'Women',
    'Boys': 'Men',
    'Girls': 'Women',
    'Unisex': 'Unisex'
}

# Apply the gender mapping to create a new column 'gender_'
data['gender_'] = data['gender'].replace(gender_mapping)

In [58]:
# Step 2: Identify categories in 'usage' with less than 2000 items
usage_counts = data['usage'].value_counts()
small_usage_categories = usage_counts[usage_counts < 2000].index

# Step 3: Replace categories in 'usage' with fewer than 2000 items with 'Other'
data['usage_'] = data['usage'].replace(small_usage_categories, 'Other')

In [59]:
# Step 4: Count occurrences of each category in 'masterCategory'
category_counts = data['masterCategory'].value_counts()

# Step 5: Identify categories in 'masterCategory' with less than 2000 items
small_categories = category_counts[category_counts < 2000].index

# Step 6: Replace categories with fewer than 2000 items with 'Other'
data['mC_'] = data['masterCategory'].replace(small_categories, 'Other')


In [60]:
columns_to_ohe = ['gender_', 'season', 'usage_', 'mC_']

In [61]:
non_ohe_columns = data[['id', 'articleType']]

In [63]:
# Step 7: Define the column transformer for one-hot encoding, excluding 'article'
transformer = ColumnTransformer(
    transformers=[
        ('new', OneHotEncoder(sparse=False, drop='first'), columns_to_ohe),
    ],
    remainder='drop'  # Drop other columns not specified
)

In [64]:
# Step 8: Apply the transformer to the data
data_transformed = transformer.fit_transform(data[columns_to_ohe])





In [65]:
feature_names = (
    transformer.named_transformers_['new']
    .get_feature_names_out(columns_to_ohe)
)


In [66]:
data_transformed_df = pd.DataFrame(data_transformed, columns=feature_names)


In [67]:
final_data_df = pd.concat([non_ohe_columns.reset_index(drop=True), data_transformed_df], axis=1)


# Verify the transformed data (optional)
print(final_data_df.head())

      id  articleType  gender__Unisex  gender__Women  season_Spring  \
0  15970       Shirts             0.0            0.0            0.0   
1  39386        Jeans             0.0            0.0            0.0   
2  59263      Watches             0.0            1.0            0.0   
3  21379  Track Pants             0.0            0.0            0.0   
4  53759      Tshirts             0.0            0.0            0.0   

   season_Summer  season_Winter  season_na  usage__Ethnic  usage__Formal  \
0            0.0            0.0        0.0            0.0            0.0   
1            1.0            0.0        0.0            0.0            0.0   
2            0.0            1.0        0.0            0.0            0.0   
3            0.0            0.0        0.0            0.0            0.0   
4            1.0            0.0        0.0            0.0            0.0   

   usage__Other  usage__Sports  mC__Apparel  mC__Footwear  mC__Other  \
0           0.0            0.0          1.0 

In [68]:
final_data_df.head()

Unnamed: 0,id,articleType,gender__Unisex,gender__Women,season_Spring,season_Summer,season_Winter,season_na,usage__Ethnic,usage__Formal,usage__Other,usage__Sports,mC__Apparel,mC__Footwear,mC__Other,mC__Personal Care
0,15970,Shirts,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,39386,Jeans,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,59263,Watches,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,21379,Track Pants,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,53759,Tshirts,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [72]:
import pandas as pd

# Define the path to your image features pickle file
image_features_path = 'final_combined_features.pkl'  # Change this to your actual path
image_features_array = pd.read_pickle(image_features_path)

# Convert the NumPy array to a DataFrame
num_features = image_features_array.shape[1]
image_features_df = pd.DataFrame(image_features_array, columns=[f'feature_{i}' for i in range(num_features)])

In [73]:
image_features_df

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_2038,feature_2039,feature_2040,feature_2041,feature_2042,feature_2043,feature_2044,feature_2045,feature_2046,feature_2047
0,0.000000,0.019580,0.000000,0.009986,0.000000,0.000000,0.023176,0.039126,0.015070,0.000000,...,0.008082,0.004744,0.105801,0.000000,0.000000,0.000833,0.076037,0.000000,0.012249,0.004576
1,0.003304,0.014251,0.011516,0.003230,0.000000,0.000000,0.031109,0.009059,0.037253,0.006620,...,0.013074,0.000000,0.049441,0.000000,0.000000,0.000000,0.017049,0.008319,0.019859,0.000000
2,0.014620,0.014922,0.003859,0.000588,0.045892,0.000000,0.009296,0.046186,0.003088,0.000000,...,0.029845,0.012989,0.000000,0.084675,0.010639,0.000000,0.015109,0.005990,0.005688,0.033060
3,0.000481,0.060857,0.003077,0.002457,0.016203,0.008984,0.013755,0.031430,0.002546,0.010184,...,0.010133,0.034288,0.008307,0.063428,0.027466,0.016652,0.037235,0.002308,0.017937,0.038164
4,0.007774,0.067035,0.011014,0.005366,0.019164,0.010067,0.003787,0.022912,0.024118,0.000000,...,0.015097,0.049037,0.016746,0.061395,0.015871,0.004185,0.014783,0.000000,0.029446,0.018527
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44436,0.002533,0.010464,0.000549,0.010782,0.000000,0.000000,0.021608,0.034511,0.023890,0.034610,...,0.000000,0.000000,0.033798,0.007708,0.006122,0.000000,0.022729,0.008162,0.009994,0.005742
44437,0.000339,0.000000,0.002954,0.001694,0.005528,0.000000,0.021429,0.013689,0.014954,0.000000,...,0.000000,0.011651,0.000000,0.016659,0.030053,0.011900,0.017697,0.000000,0.041651,0.009129
44438,0.003864,0.017424,0.011113,0.008296,0.028530,0.018317,0.030619,0.034460,0.012635,0.002025,...,0.017633,0.013523,0.024076,0.004806,0.000000,0.013266,0.015558,0.001715,0.033830,0.001758
44439,0.005003,0.007160,0.003824,0.015751,0.039608,0.004368,0.032553,0.024031,0.010147,0.001832,...,0.007748,0.001031,0.034759,0.000000,0.001230,0.014399,0.003670,0.005859,0.026014,0.010879


In [77]:
# Concatenate the two DataFrames along the columns
merged_data_df = pd.concat([final_data_df.reset_index(drop=True), image_features_df.reset_index(drop=True)], axis=1)

# Verify by printing the first few rows of the merged DataFrame
print(merged_data_df.head())
print(merged_data_df.shape)

        id  articleType  gender__Unisex  gender__Women  season_Spring  \
0  15970.0       Shirts             0.0            0.0            0.0   
1  39386.0        Jeans             0.0            0.0            0.0   
2  59263.0      Watches             0.0            1.0            0.0   
3  21379.0  Track Pants             0.0            0.0            0.0   
4  53759.0      Tshirts             0.0            0.0            0.0   

   season_Summer  season_Winter  season_na  usage__Ethnic  usage__Formal  ...  \
0            0.0            0.0        0.0            0.0            0.0  ...   
1            1.0            0.0        0.0            0.0            0.0  ...   
2            0.0            1.0        0.0            0.0            0.0  ...   
3            0.0            0.0        0.0            0.0            0.0  ...   
4            1.0            0.0        0.0            0.0            0.0  ...   

   feature_2038  feature_2039  feature_2040  feature_2041  feature_2042  \

In [78]:
# Save the merged DataFrame as a pickle file
merged_data_df.to_pickle('final_data.pkl')  # Change to your desired path


In [79]:
merged_data_df.to_csv(
    'final_data.csv',  # File path
    sep=',',                                  # Delimiter (default is ',')
    index=False,                              # No index column
    quoting=1                                 # Quote strings for safety
)
