In [1]:
"""
Process Name            : STEP 6 : PROTOTYPE MODEL
"""
#=======================================================================================================
### Required Imports ###
#============================================================================================================
try:
    import sys  # System-specific parameters and functions
    import logging # for logs
    import socket # network communication
    import warnings  # Warning control
    import os # Interacting with the operating system
    import pandas as pd  # Data handling
    import numpy as np  # Numerical operations
    import difflib  # Sequence matching
    import pycountry  # Country data
    from geopy.geocoders import Nominatim  # Geocoding
    import re  # Regex
    from datetime import datetime  # Date/time
    from matplotlib import colors  # Color utilities
    from spellchecker import SpellChecker  # Spell check
    from sklearn.model_selection import train_test_split  # Data splitting
    from colour import Color  # Color manipulation
    import nltk  # NLP toolkit
    from nltk.corpus import stopwords  # Stopwords
    import seaborn as sns  # Visualization
    import matplotlib.pyplot as plt  # Plotting
    import json  # JSON handling
    from sklearn.ensemble import RandomForestClassifier  # Random Forest
    import lightgbm as lgb  # LightGBM
    from sklearn.metrics import accuracy_score, confusion_matrix, classification_report  # Metrics
    from sklearn.multioutput import MultiOutputClassifier  # Multi-output classifier
    from sklearn.preprocessing import LabelEncoder  # Label encoding
    import xgboost as xgb  # XGBoost
    from joblib import dump, load  # Model saving/loading
    from sklearn.cluster import KMeans  # Clustering
    from sklearn.preprocessing import StandardScaler  # Feature scaling
    from sklearn.preprocessing import MinMaxScaler  # Min-max scaling
    from tensorflow.keras.models import Sequential  # Neural network
    from tensorflow.keras.layers import LSTM, Dense  # LSTM and Dense layers
    from tensorflow.keras.callbacks import EarlyStopping  # Early stopping
    from tensorflow.keras.models import load_model #for model loading
    from selenium import webdriver  # Web browser automation
    from selenium.webdriver.common.by import By  # Locate elements on a page
    from selenium.webdriver.support import expected_conditions  # Wait for conditions to be met
    from selenium.webdriver.support.wait import WebDriverWait  # Explicit wait
    from selenium.webdriver.common.keys import Keys  # Keyboard actions
    from selenium.webdriver.chrome.options import Options  # Chrome browser options
    from selenium.webdriver.support import expected_conditions as EC  # Alias for expected conditions
    from selenium.webdriver.support.ui import Select  # Handle dropdown menus
    from selenium.webdriver.chrome.service import Service  # Manage ChromeDriver service
    from time import sleep  # Pause execution
    import requests  # Send HTTP requests
except Exception as err:
    print("Exception raised while importing the packages")
    print(f'Exception: {err}')
    #input("press Enter to Close")
    sys.exit()


2024-09-05 16:22:26.291519: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
#=========================================================================
### Initialization ###
#=========================================================================
try:
    # Working directory of the notebook and the timestamp of this run.
    path = os.getcwd()
    curr_time = datetime.now()

    ### Log Files declaration ###
    log_folder = os.path.join(path, 'Logs')
    # One strftime call produces the same "YYYY-MM-DD_HH-MM" text that was
    # previously assembled from five separate calls.
    log_date_fmt = curr_time.strftime('%Y-%m-%d_%H-%M')

    audit_log_file = os.path.join(log_folder, "Audit_prototype_model.log")
    error_log_file = os.path.join(log_folder, "Error_prototype_model.log")

    ### Creating log folder ###
    # exist_ok avoids the check-then-create race and is a no-op when present.
    os.makedirs(log_folder, exist_ok=True)

    ### Function: Logger setup ###
    def setup_logger(logger_name, log_file, level=logging.INFO):
        """Configure and return the named logger.

        The logger writes to ``log_file`` (truncated on every call, mode 'w')
        and echoes to stdout, both using a host-prefixed format.

        Parameters
        ----------
        logger_name : str
            Name passed to ``logging.getLogger``.
        log_file : str
            Path of the log file to (re)create.
        level : int, optional
            Logging level for the logger (default ``logging.INFO``).

        Returns
        -------
        logging.Logger
            The configured logger.
        """
        logger = logging.getLogger(logger_name)

        # logging.getLogger returns the same object for the same name, so
        # re-running this cell used to stack a new pair of handlers on top of
        # the old ones and every message was emitted several times.  Drop and
        # close whatever is already attached before adding fresh handlers.
        for stale_handler in list(logger.handlers):
            logger.removeHandler(stale_handler)
            stale_handler.close()

        formatter = logging.Formatter(socket.gethostname()+' : '+'%(asctime)s : %(levelname)s : [%(filename)s:%(lineno)d] : %(message)s')

        fileHandler = logging.FileHandler(log_file, mode='w')
        fileHandler.setFormatter(formatter)

        streamHandler = logging.StreamHandler(sys.stdout)
        streamHandler.setFormatter(formatter)

        logger.setLevel(level)
        logger.addHandler(fileHandler)
        logger.addHandler(streamHandler)
        return logger

    ### Setting up the logger ###
    audit_logger = setup_logger('audit', audit_log_file, level=logging.INFO)
    error_logger = setup_logger('error', error_log_file, level=logging.ERROR)
    audit_logger.info('Process start')

except Exception as err:
    # The loggers may not exist yet, so report on stdout.
    print('Setting up the logger failed')
    print(f'Exception: {err}')
    #input("press Enter to Close")
    sys.exit()

192.168.1.112 : 2024-09-05 16:22:28,446 : INFO : [3048668818.py:45] : Process start


In [3]:
#==================================================================================================
### Ignore Warnings ### 
#==================================================================================================
try:
    audit_logger.info('Ignore Warnings')
    # Install a catch-all "ignore" filter so library warnings do not clutter
    # the notebook output for the rest of the session.
    warnings.simplefilter('ignore')
except Exception as err:
    # Record the failure in both logs, then stop the run.
    audit_logger.info('Ignore Warnings - Failed')
    error_logger.error('Ignore Warnings - Failed')
    error_logger.error('Exception: ', exc_info=True)
    #input("press Enter to Close")
    sys.exit()



In [4]:
#==================================================================================================
### Download NLTK stopwords ###
#==================================================================================================
try:
    audit_logger.info('Downloading NLTK STOPWAORDS')
    # Spell checker instance; custom_to_rgba uses it to correct colour words.
    spell = SpellChecker()
    # Fetch the NLTK stop-word corpus.
    nltk.download('stopwords')
    # English stop words as a set, for fast membership tests in remove_stop_words.
    stop_words = {word for word in stopwords.words('english')}
except Exception as err:
    # Record the failure in both logs, then stop the run.
    audit_logger.info('Downloading NLTK STOPWAORDS - Failed')
    error_logger.error('Downloading NLTK STOPWAORDS - Failed')
    error_logger.error('Exception: ', exc_info=True)
    #input("press Enter to Close")
    sys.exit()

192.168.1.112 : 2024-09-05 16:22:28,455 : INFO : [1589805589.py:5] : Downloading NLTK STOPWAORDS


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shrutipatkar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
#==================================================================================================
### Create folder structure ### 
#==================================================================================================
try:
    audit_logger.info('Create folder structure')

    # The Input folder must already exist: it holds the workbooks read by the
    # next cell, so its absence is treated as a fatal error.
    input_folder = os.path.join(path, 'Input')
    if not os.path.isdir(input_folder):
        audit_logger.info('No input folder')
        error_logger.error('No input folder')
        # A specific exception with a message so the traceback written to the
        # error log says what is missing (still caught by the handler below).
        raise FileNotFoundError(f'No input folder: {input_folder}')

    # Working folders are created on demand.
    excel_folder = os.path.join(path, 'Excel_files')
    saved_folder = os.path.join(path, 'Saved_files')
    output_folder = os.path.join(path, 'Output')
    for working_folder in (excel_folder, saved_folder, output_folder):
        # exist_ok makes the call idempotent on notebook re-runs.
        os.makedirs(working_folder, exist_ok=True)
except Exception as err:
    # Record the failure in both logs, then stop the run.
    audit_logger.info('Create folder structure - Failed')
    error_logger.error('Create folder structure - Failed')
    error_logger.error('Exception: ', exc_info=True)
    #input("press Enter to Close")
    sys.exit()

192.168.1.112 : 2024-09-05 16:22:28,665 : INFO : [300162134.py:5] : Create folder structure


In [6]:
# Audit-log marker written before the helper functions of this cell are defined.
audit_logger.info('Saving Functions') 
# Function to find the best match and its confidence
def get_best_match_with_confidence(country):
    """Map a free-text country name to the closest entry of ``country_names``.

    Relies on two module-level names defined in the preprocessing cell:
    ``alpha_2_mapping`` (title-cased alpha-2 code -> country name) and
    ``country_names`` (candidate names for fuzzy matching).

    Parameters
    ----------
    country : str
        Raw country text.  Non-string values (e.g. NaN) are returned
        unchanged with confidence 0 instead of raising.

    Returns
    -------
    tuple
        ``(best_match, confidence)`` when the similarity ratio is at least
        0.80, otherwise ``(country, 0)`` where ``country`` is the text after
        the hard-coded clean-ups below.
    """
    if not isinstance(country, str):
        return country, 0

    country = country.replace("City", "")  # drop the word "City"
    if "Englan" in country:
        country = "United Kingdom"
    if "rkiye" in country and country.startswith("T"):
        country = "Turkey"

    try:
        try:
            # Expand a two-letter code when one matches; best effort only, so
            # any lookup problem leaves the text unchanged.
            country = alpha_2_mapping[country]
        except Exception:
            pass
        matches = difflib.get_close_matches(country, country_names, n=1)
        if not matches:
            return country, 0
        best_match = matches[0]
        # Similarity ratio in the range 0..1.
        confidence = difflib.SequenceMatcher(None, country, best_match).ratio()
    except Exception:
        # Matching is best effort: fall back to the cleaned input.
        return country, 0

    if confidence >= 0.80:
        return best_match, confidence
    return country, 0
    
# Function to calculate age from birthdate
def calculateAge(birthDate, today=None):
    """Return the age in completed years for ``birthDate``.

    Parameters
    ----------
    birthDate : object with ``year``, ``month`` and ``day`` attributes
        Typically a ``datetime`` or pandas ``Timestamp``.
    today : datetime, optional
        Reference date; defaults to ``datetime.today()``.  Exposed so the
        calculation can be reproduced and tested deterministically.

    Returns
    -------
    int or None
        The age, or ``None`` when ``birthDate`` cannot be interpreted as a
        date (for example a string or ``None``).
    """
    if today is None:
        today = datetime.today()
    try:
        # True (== 1) while this year's birthday has not happened yet.
        birthday_pending = (today.month, today.day) < (birthDate.month, birthDate.day)
        age = today.year - birthDate.year - birthday_pending
    except Exception:
        # Was a bare ``except:``; narrowed so KeyboardInterrupt/SystemExit
        # are no longer swallowed.
        return None
    return age

# Function to normalize sizes like "4XL" into "XXXXL" for XLs and XSs
def normalize_size(size):
    """Normalise one clothing-size token to the upper-case "X...L"/"X...S" form.

    A leading count is expanded (``"4XL"`` -> ``"XXXXL"``, ``"2xs"`` ->
    ``"XXS"``).  Any other token is returned stripped and upper-cased so that
    inputs such as ``"m"`` or ``"xl"`` match the upper-case keys of
    ``size_mapping``; previously they kept their original case and were
    silently dropped by ``convert_size_to_numbers``.

    Parameters
    ----------
    size : str
        A single size token; surrounding whitespace is ignored.

    Returns
    -------
    str
        The normalised token.
    """
    cleaned = size.strip().upper()
    # One pattern covers both the XL and XS families.
    counted = re.match(r"(\d+)X([LS])", cleaned)
    if counted:
        return "X" * int(counted.group(1)) + counted.group(2)
    return cleaned
    
# Function to convert clothes sizes to numerical values
def convert_size_to_numbers(sizes):
    """Translate a comma-separated size string into ``size_mapping`` codes.

    Returns 0 for a missing value, a list of codes for the recognised
    tokens, or ``None`` when no token is recognised.
    """
    if pd.isna(sizes):
        return 0
    codes = []
    for token in sizes.split(','):
        label = token.strip()
        # Unknown labels are skipped rather than raising.
        if label in size_mapping:
            codes.append(size_mapping[label])
    return codes or None

# Function to convert personality types to numerical values
def convert_personality_to_number(personality):
    """Return the ``personality_mapping`` code for ``personality``.

    Labels are compared with ``==``; anything that matches no label yields 4.
    """
    matching_codes = (
        code for label, code in personality_mapping.items() if label == personality
    )
    return next(matching_codes, 4)
    
# Function to split a colour string into words, using capital letters as
# word boundaries when the string contains no spaces
def split_by_capital(s):
    """Split ``s`` into its component words.

    A space-free string that starts with an upper-case letter followed by a
    lower-case one (e.g. ``"DarkBlue"``) is split before each capital letter.
    Every other string is split on single spaces.

    Strings shorter than two characters are handled by the space-split
    branch; previously they raised ``IndexError`` on ``s[0]``/``s[1]``.

    Parameters
    ----------
    s : str

    Returns
    -------
    list of str
    """
    is_single_token = " " not in s
    if is_single_token and len(s) >= 2 and s[0].isupper() and s[1].islower():
        # Each part starts at a capital letter and runs to the next one.
        return re.findall('[A-Z][^A-Z]*', s)
    return s.split(" ")

# Function to extract colour words from free text and convert them to RGBA
def custom_to_rgba(name):
    """Convert a free-text colour description into a list of RGBA lists.

    The text is split into words (via ``split_by_capital``), words of length
    <= 2 are dropped, each word is spell-corrected with the module-level
    ``spell`` object, a chain of prefix/suffix heuristics maps common typos
    and some non-English words to basic colour names, and each resulting
    name is passed to matplotlib's ``colors.to_rgba``.

    Returns
    -------
    list of list
        One 4-element list per resolved colour.  If nothing resolves, or any
        exception is raised along the way, ``[0, 0, 0, 1]`` (black) is
        appended to whatever was collected so far and that list is returned.

    NOTE(review): the feature values produced here are later fed to
    pre-trained models; presumably those were fitted on output of this same
    function, so the quirks flagged below should be fixed together with the
    training pipeline -- TODO confirm before changing behaviour.
    """
    try:
        int32_values = []
        color_name_2=[]
        # Split the string based on capital letters
        color_name = split_by_capital(name)
        # Ignore very short fragments (1-2 characters).
        color_name  = [i for i in color_name if len(i)>2]
        for i in color_name:
            # NOTE(review): str.rstrip removes a *set of characters*, not a
            # suffix.  rstrip('ish') therefore also trims e.g. "peach" to
            # "peac"; the startswith("peac") check further down appears to
            # compensate for that.
            i = i.rstrip('s') #handle words like "blacks"
            i = i.rstrip('ish') #handle words like "blackish"
            if spell.correction(i.lower()) is None:
                # No correction available: keep the raw lower-cased word.
                color_name_2.append(i.lower())
            else:
                # Keep both the corrected and the raw spelling as candidates.
                color_name_2.append(spell.correction(i.lower()))
                color_name_2.append(i.lower().strip())
        # NOTE(review): `Color(i) is True` compares the constructed object
        # with the True singleton, so this list is always empty and the
        # fallback on the next lines is always taken.  Color(i) presumably
        # raises for names it does not recognise (TODO confirm against the
        # `colour` package), which would jump straight to the black fallback
        # in the outer except and bypass the typo handling below.
        color_names_3  = [i for i in color_name_2 if Color(i) is True]
        if len(color_names_3) <= 0:
            color_names_3 = color_name_2
        # De-duplicate candidates; a set does not preserve insertion order.
        color_names_3 = list(set(color_names_3))
        for color_name_3 in color_names_3:
            color_name_3 = color_name_3.lower().replace(" ", "").strip()
            # Handle common typos and some non-English colour words.  The
            # order of the branches matters: the first match wins.
            if color_name_3.startswith("b") and color_name_3.endswith("ck"):
                color_name_3 = "black"
            elif color_name_3.startswith("whit"):
                color_name_3 = "white"
            elif color_name_3.startswith("bl") and color_name_3.endswith("ck") == False:
                color_name_3 = "blue"
            elif color_name_3.startswith("voil"):
                color_name_3 = "violet"
            elif color_name_3.startswith("ros") or color_name_3.endswith("ink"):
                color_name_3 = "pink"
            elif color_name_3.endswith("ojo") or color_name_3.startswith("verm"):
                color_name_3 = "red"
            elif color_name_3.startswith("verd"):
                color_name_3 = "green"
            elif color_name_3.startswith("azu"):
                color_name_3 = "blue"
            # The second test subsumes the first: any word starting with
            # "ne" is mapped to black.
            elif color_name_3.startswith("negr") or color_name_3.startswith("ne"):
                color_name_3 = "black"
            elif color_name_3.startswith("bian"):
                color_name_3 = "white"
            elif color_name_3.startswith("saf") and color_name_3.endswith("on"):
                color_name_3 = "orange"
            elif "denim" in color_name_3 or "jean" in color_name_3:
                color_name_3 = "blue"
            try:
                rgba = colors.to_rgba(color_name_3)
                int32_values.append(list(rgba))
            except:
                # Some common colours not handled by matplotlib.colors.
                # NOTE(review): these fallback tuples use 0-255 channel
                # values, whereas matplotlib's to_rgba returns floats in
                # 0-1 -- the two scales end up mixed in one feature; confirm
                # which scale downstream code expects.
                # Names matching none of the branches add nothing.
                if color_name_3 == "mauve":
                    rgba = (213, 184, 255, 1)
                    int32_values.append(list(rgba))
                elif color_name_3.startswith("lil") :
                    rgba = (157,126,183,1.00)
                    int32_values.append(list(rgba))
                elif color_name_3.startswith("peac"):
                    rgba = (255,176,124,1.00)
                    int32_values.append(list(rgba))
                elif color_name_3 == "cream":
                    rgba = (245,239,214,1.00)
                    int32_values.append(list(rgba))
                elif (color_name_3.startswith("b") and color_name_3.endswith("ge")) or color_name_3.startswith("offw"):
                    rgba = (245, 245, 220, 1)
                    int32_values.append(list(rgba))
                elif color_name_3 == "burgundy":
                    rgba =(144,0,32,1.00)
                    int32_values.append(list(rgba))
                elif color_name_3 == "nude":
                    rgba =(227, 188, 154, 1.00)
                    int32_values.append(list(rgba))
                elif color_name_3 == "khaki" or color_name_3 == "kakhi" or color_name_3 == "kaki" or color_name_3 == "olive":
                    rgba =(181, 179, 92, 1)
                    int32_values.append(list(rgba))
                elif color_name_3.startswith("sand") or color_name_3 == "mustard":
                    rgba =(231,196,150, 1)
                    int32_values.append(list(rgba))
        if len(int32_values) < 1:
            # Nothing resolved: jump to the black fallback below.
            raise Exception
        else:
            return int32_values
    except Exception:
        # Fallback for unresolved input or any error: black.
        rgba = (0, 0, 0, 1)
        int32_values.append(list(rgba))
        return int32_values

# Function to remove bracketed text
def remove_bracketed_text(text):
    """Delete every "(...)" group from ``text`` and trim the result.

    Missing values (as judged by ``pd.isna``) are returned unchanged.
    """
    if not pd.isna(text):
        without_brackets = re.sub(r'\(.*?\)', '', text)
        return without_brackets.strip()
    return text

# Function to remove stop words
def remove_stop_words(text):
    """Drop every word found in ``stop_words`` (case-insensitive) from ``text``.

    The text is split on whitespace and the kept words are re-joined with
    single spaces.  Missing values are returned unchanged.
    """
    if pd.isna(text):
        return text
    kept_words = (word for word in text.split() if word.lower() not in stop_words)
    return ' '.join(kept_words)

# Function to extract the clothing category from the Clothing_type column
def replace_clothing_type(text):
    """Map free-text clothing preferences onto a fixed category label.

    The lower-cased text is checked against the keyword rules below in
    order and the first rule with a matching substring wins.  Text matching
    no rule is returned as-is when it is one of ``allowed_types``, otherwise
    ``"other"`` is returned.
    """
    text = text.lower()
    # (substrings to look for, resulting label) -- order is significant.
    keyword_rules = (
        (('street',), 'streetwear'),
        (('sport',), 'sportswear'),
        (('dress', 'saree', 'sadi', 'traditional', 'religi'), 'traditional'),
        (('offic', 'formal'), 'formal'),
        (('athleisure',), 'casual'),
        (('basic', 'casual', 'smart'), 'casual'),
        (('party',), 'partywear'),
    )
    for keywords, label in keyword_rules:
        if any(keyword in text for keyword in keywords):
            return label
    return text if text in allowed_types else "other"

# Function to convert a colour triple to the nearest named colour
def rgba_to_named_color(rgba):
    """Return the CSS4 colour name closest to the given colour.

    A ``1`` is appended to ``rgba`` before the comparison, so the argument
    is expected to hold the channel values without alpha.  Distance is the
    Euclidean norm between the padded input and each CSS4 colour converted
    with ``colors.to_rgba``; the first colour with the smallest distance
    wins.  ``None`` is returned when no colour compares closer than infinity.
    """
    target = np.append(rgba, [1])
    best_name, best_distance = None, float('inf')
    for candidate_name, candidate_hex in colors.CSS4_COLORS.items():
        # Convert hex to RGBA, then measure how far it is from the target.
        candidate = np.array(colors.to_rgba(candidate_hex))
        distance = np.linalg.norm(candidate - np.array(target))
        if distance < best_distance:
            best_name, best_distance = candidate_name, distance
    return best_name

192.168.1.112 : 2024-09-05 16:22:28,685 : INFO : [3680746.py:1] : Saving Functions


In [7]:
#==================================================================================================
### Load the inputs ### 
#==================================================================================================
try:
    audit_logger.info('Loading inputs')
    # Survey answers and the country -> latitude/longitude lookup table.
    df = pd.read_excel(os.path.join(input_folder, "input.xlsx"))
    country_map = pd.read_excel(os.path.join(input_folder, "country_to_lat_long.xlsx"))

    # Dictionary to map the questionnaire column names to short names
    new_column_names = {'Country of Birth': 'Country_of_Birth',
        'Country of Residence': 'Country_of_Residence',
        'Birthdate': 'Birthdate',
        'Gender': 'Gender',
        'Approximate weight in kilogram': 'Weight_kg',
        'Approximate height in centimetre': 'Height_cm',
        'What type of clothes you prefer to wear': 'Preferred_Clothing_Type',
        'How do you define yourself': 'Personality',
        'Size of clothes you wear': 'Clothing_Size',
        'What is your favourite color.': 'Favourite_Color',
        'Do you think this color looks best on you and boost your confidence?': 'Color_Boost_Confidence',
        'Which color do you think looks best on you and makes you feel confident?': 'Best_Color_Confidence'}

    # Ordinal code for every supported clothing size (smallest to largest)
    size_mapping = {'XXXXXS': 1, 'XXXXS': 2, 'XXXS': 3, 'XXS': 4, 'XS': 5, 'S': 6, 'M': 7, 'L': 8,
                    'XL': 9, 'XXL': 10, 'XXXL': 11, 'XXXXL': 12, 'XXXXXL': 13}

    # Numeric code per questionnaire answer; convert_personality_to_number
    # returns 4 for any answer not listed here.
    personality_mapping = {'Introvert (Are you shy, reticent person?)': 1,
        'Ambivert (Are you able to balance between extrovert and introvert?)': 2,
        'Extrovert (Are you outgoing, socially confident?)': 3,}

    # Category labels that replace_clothing_type passes through unchanged
    allowed_types = ['semiformal', 'casual', 'formal', 'partywear', 'traditional', 'sportswear', 'streetwear']

    # Define the mapping for the 'Preferred_clothing' column
    preferred_clothing_mapping = {'Single Piece Wear': 0, 'Two Piece Wear': 1}

    # Output columns.  This list used to be assigned twice in a row with
    # identical content; the redundant copy has been removed.
    columns_to_process = ['Preferred_Bottoms', 'Bottoms_Material', 'Bottoms_Length', 'Bottoms_Fitting', 'Bottoms_Color',
        'Preferred_Upperwear', 'Upperwear_Material', 'Upperwear_Length', 'Upperwear_Neckline', 'Upperwear_Sleeve_Type',
        'Upperwear_Pattern', 'Upperwear_Color', 'Favourite_Single_Piece', 'Single_Piece_Material',
        'Single_Piece_Fitting', 'Single_Piece_Length', 'Single_Piece_Sleeves', 'Single_Piece_Neckline',
        'Single_Piece_Pattern', 'Single_Piece_Color']

    # Rename the columns in the DataFrame (assignment instead of inplace so
    # re-running the cell starts from the freshly loaded frame)
    df = df.rename(columns=new_column_names)

    # Aliases, abbreviations and city names mapped to the country name used
    # for matching.  Keys are title-cased because the country columns are
    # title-cased before this mapping is applied.
    country_name_dic = {"Uk":"United Kingdom",
                        "Usa": "United States",
                        "Us":"United States",
                        "America":"United States",
                        "United States Of America": "United States",
                        "United State Of America" : "United States",
                        "United States Of American":"United States",
                        "England": "United Kingdom",
                        "Wales":"United Kingdom",
                        "Scotland":"United Kingdom",
                        "Ksa": "Saudi Arabia",
                        "Rsa": "South Africa",
                        "Great Britain":"United Kingdom",
                        "Britain": "United Kingdom",
                        "Uae":"United Arab Emirates",
                        "United Kingdom Of Great Britain And Northern Ireland":"United Kingdom",
                        "Mumbai":"India",
                        "Czechia": "Czech Republic",
                        "Bharat":"India",
                        "Edinburgh":"United Kingdom",
                        "Korea":"South Korea",
                        "Russian Federation": "Russia",
                        "Thane":"India",
                        "Dubai": "United Arab Emirates",
                        "Democratic Republic Of Congo": "Congo",
                        "Europe":"United Kingdom",
                        "Italia":"Italy",
                        "Sssr" : "Russia"}

except Exception as err:
    # Record the failure in both logs, then stop the run.
    audit_logger.info('Loading inputs - Failed')
    error_logger.error('Loading inputs - Failed')
    error_logger.error('Exception: ', exc_info=True)
    #input("press Enter to Close")
    sys.exit()
    

192.168.1.112 : 2024-09-05 16:22:28,695 : INFO : [948062006.py:5] : Loading inputs


In [8]:
#==================================================================================================
### Data Preprocessing ### 
#==================================================================================================
try:
    audit_logger.info('Data Preprocessing')

    # Resolve answers that refer to another column before any cleaning.
    df.loc[df['Country_of_Residence'] == 'Same as Country of Birth', 'Country_of_Residence'] = df['Country_of_Birth']
    df.loc[df['Color_Boost_Confidence'] == 'Yes', 'Best_Color_Confidence'] = df['Favourite_Color']

    # Assign the result back instead of calling an inplace method on the
    # column selection: chained inplace updates are deprecated in pandas and
    # stop modifying the parent frame under Copy-on-Write.
    df["Color_Boost_Confidence"] = df["Color_Boost_Confidence"].replace({"Yes":1, "No":0})

    # Create a list of official country names
    country_names = [country.name for country in pycountry.countries]
    alpha_2_mapping = {country.alpha_2.title(): country.name.title() for country in pycountry.countries}

    # Identical clean-up for both country columns (previously duplicated
    # line by line).  The two columns are processed independently.
    # NOTE(review): astype(str) turns missing values into the text "nan"
    # ("Nan" after title-casing), which is not in the blank list below and
    # so is never cross-filled -- confirm whether that is intended.
    for country_column in ("Country_of_Birth", "Country_of_Residence"):
        cleaned = df[country_column].astype(str).str.strip().str.title()
        cleaned = cleaned.apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x) if isinstance(x, str) else x)
        cleaned = cleaned.apply(lambda x: x.split(",")[-1] if isinstance(x, str) else x)
        cleaned = cleaned.str.title()
        cleaned = cleaned.apply(lambda x:x.replace("The ", ""))
        cleaned = cleaned.apply(lambda x:"Us" if x.find("America") != -1 else x)
        cleaned = cleaned.replace(country_name_dic)
        # Blank or unreadable entries become NaN so they can be cross-filled.
        cleaned = cleaned.apply(lambda x: np.nan if isinstance(x, str) and x.strip() in ['', 'Cannot Read Text'] else x)
        df[country_column] = cleaned

    # Replace missing 'Country_of_Birth' with 'Country_of_Residence' and vice versa
    df["Country_of_Birth"] = df["Country_of_Birth"].fillna(df["Country_of_Residence"])
    df["Country_of_Residence"] = df["Country_of_Residence"].fillna(df["Country_of_Birth"])

    # Get the unique values from both country columns
    unique_countries_1 = list(df['Country_of_Birth'].unique())
    unique_countries_2 = list(df['Country_of_Residence'].unique())
    unique_countries = np.union1d(unique_countries_1, unique_countries_2)

    # Fuzzy-match each distinct value once
    matched_countries = {country: get_best_match_with_confidence(country) for country in unique_countries}

    # Map the matched names back onto both columns
    df['Country_of_Birth'] = df['Country_of_Birth'].apply(lambda x: matched_countries[x][0] if x in matched_countries else x)
    df['Country_of_Residence'] = df['Country_of_Residence'].apply(lambda x: matched_countries[x][0] if x in matched_countries else x)

    # Create a mapping from country names to numerical values
    # (not used further in this cell)
    country_mapping = {country: idx + 1 for idx, country in enumerate(unique_countries)}

    # Recompute the distinct names after matching
    unique_countries_1 = list(df['Country_of_Birth'].unique())
    unique_countries_2 = list(df['Country_of_Residence'].unique())
    unique_countries = np.union1d(unique_countries_1, unique_countries_2)

    # Normalise the lookup-table names: lower case, no spaces, no "(...)" suffix
    country_map['name'] = country_map['name'].str.strip().str.lower().str.replace(' ', '', regex=True).str.replace(r"\s*\([^()]*\)", "", regex=True)

    country_map = country_map.drop_duplicates(subset=['name'])

    # Same normalisation for the names found in the data
    countries_normalized = np.char.replace(np.char.strip(np.char.lower(unique_countries)), ' ', '')

    # Create a dictionary from the DataFrame for quick lookup
    country_dict = dict(zip(country_map['name'], zip(country_map['latitude'], country_map['longitude'])))

    # Map the countries in the array to their coordinates using the dictionary
    coordinates = [country_dict.get(country, (None, None)) for country in countries_normalized]

    # One row per distinct country: name ('index'), Latitude, Longitude
    coordinates_df = pd.DataFrame(coordinates, columns=['Latitude', 'Longitude'], index=unique_countries).reset_index()

    # Attach birthplace coordinates.  Both frames carry an 'index' column at
    # this point, so the merge suffixes them as index_x (original row label)
    # and index_y (country name).
    df.reset_index(inplace=True)
    df = df.merge(coordinates_df, left_on='Country_of_Birth', right_on='index')
    df = df.set_index(df["index_x"])
    df.drop(columns=["index_x","index_y", "Country_of_Birth"], inplace=True)
    df.rename(columns={'Latitude': 'birthplace_lat', 'Longitude': 'birthplace_lon'}, inplace=True)
    # Attach residence coordinates.  The row label column is now called
    # index_x, so only the right-hand 'index' column has to be dropped.
    df.reset_index(inplace=True)
    df = df.merge(coordinates_df, left_on='Country_of_Residence', right_on='index')
    df = df.set_index(df["index_x"])
    df.drop(columns=["index_x","index", "Country_of_Residence"], inplace=True)
    df.rename(columns={'Latitude': 'resi_lat', 'Longitude': 'resi_lon'}, inplace=True)
    # Cross-fill missing coordinates between birthplace and residence
    df['resi_lat'] = df['resi_lat'].fillna(df['birthplace_lat'])
    df['birthplace_lat'] = df['birthplace_lat'].fillna(df['resi_lat'])
    df['resi_lon'] = df['resi_lon'].fillna(df['birthplace_lon'])
    df['birthplace_lon'] = df['birthplace_lon'].fillna(df['resi_lon'])

    df['Birthdate'] = pd.to_datetime(df['Birthdate'], errors='coerce')
    df['Age'] = df['Birthdate'].apply(calculateAge)

    # Rescale implausibly small weights (presumably mistyped units -- TODO confirm)
    df['Weight_kg'] = df['Weight_kg'].apply(lambda x: x * 100 if x < 1 else x)
    df['Weight_kg'] = df['Weight_kg'].apply(lambda x: x * 10 if x < 10 else x)
    df["Weight_kg"] = df["Weight_kg"].round(0)

    # Rescale small heights; the factors suggest metres (x100), feet (x30.48)
    # and inches (x2.54) -- presumably unit corrections, TODO confirm
    df['Height_cm'] = df['Height_cm'].apply(lambda x: x * 100 if x < 2 else x)
    df['Height_cm'] = df['Height_cm'].apply(lambda x: x * 30.48 if x < 15 else x)
    df['Height_cm'] = df['Height_cm'].apply(lambda x: x * 2.54 if x < 100 else x)
    df["Height_cm"] = df["Height_cm"].round(0)

    # Normalize the sizes in the 'Clothing_Size' column
    df['Clothing_Size'] = df['Clothing_Size'].apply(lambda sizes: ','.join([normalize_size(size) for size in sizes.split(',')]))

    # Convert the size labels to their numeric codes
    df['Clothing_Size'] = df['Clothing_Size'].apply(convert_size_to_numbers)

    # Convert the personality answer to its numeric code
    df['Personality'] = df['Personality'].apply(convert_personality_to_number)

    # One row per (favourite colour, size, best colour) combination
    df['Favourite_Color'] = df['Favourite_Color'].str.split(',')
    df['Best_Color_Confidence'] = df['Best_Color_Confidence'].str.split(',')
    df = df.explode('Favourite_Color')
    df = df.explode("Clothing_Size")
    df = df.explode("Best_Color_Confidence")

    # Convert the colour text to RGBA lists; each text may yield several colours
    df["Favourite_Color"] = df["Favourite_Color"].apply(lambda x: re.sub(r'[^a-zA-Z\s]', ' ', x) if isinstance(x, str) else x)
    df['Favourite_Color'] = df['Favourite_Color'].apply(custom_to_rgba)
    df["Best_Color_Confidence"] = df["Best_Color_Confidence"].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x) if isinstance(x, str) else x)
    df['Best_Color_Confidence'] = df['Best_Color_Confidence'].apply(custom_to_rgba)
    df = df.explode('Favourite_Color')
    df = df.explode("Best_Color_Confidence")
    df[['Favourite_Color_r', 'Favourite_Color_g', 'Favourite_Color_b', 'Favourite_Color_a']] = pd.DataFrame(df['Favourite_Color'].tolist(), index=df.index)
    # FIX: these columns were previously built from 'Favourite_Color'
    # (copy-paste slip), so the best-colour features duplicated the
    # favourite-colour ones.
    # NOTE(review): if the saved scaler/k-means were fitted on data prepared
    # with the same slip, refit them -- TODO confirm against the training step.
    df[['Best_Color_Confidence_r', 'Best_Color_Confidence_g', 'Best_Color_Confidence_b', 'Best_Color_Confidence_a']] = pd.DataFrame(df['Best_Color_Confidence'].tolist(), index=df.index)

    # Gender code: male -> 1, female -> 3, anything else -> 2
    df['Gender'] = df['Gender'].apply(lambda x: 1 if x.lower() == 'male' else (3 if x.lower() == 'female' else 2))

    # Clean the clothing preference text and split multi-valued answers
    df['Preferred_Clothing_Type'] = df['Preferred_Clothing_Type'].apply(lambda x:str(x).lower().strip())
    df['Preferred_Clothing_Type'] = df['Preferred_Clothing_Type'].apply(remove_bracketed_text)
    df['Preferred_Clothing_Type'] = df['Preferred_Clothing_Type'].apply(lambda x:x.replace("both", "").replace("combination", "").replace("smart", ""))
    df['Preferred_Clothing_Type'] = df['Preferred_Clothing_Type'].str.split('and')
    df = df.explode("Preferred_Clothing_Type")
    df['Preferred_Clothing_Type'] = df['Preferred_Clothing_Type'].str.split('/')
    df = df.explode("Preferred_Clothing_Type")
    # Remove stop words, then all remaining spaces
    df['Preferred_Clothing_Type'] = df['Preferred_Clothing_Type'].apply(remove_stop_words)
    df['Preferred_Clothing_Type'] = df['Preferred_Clothing_Type'].apply(lambda x:x.replace(" ", ""))
    # Collapse the cleaned text to a category label
    df['Preferred_Clothing_Type'] = df['Preferred_Clothing_Type'].apply(replace_clothing_type)
    df['Preferred_Clothing_Type'] = df['Preferred_Clothing_Type'].str.split(',')
    df = df.explode('Preferred_Clothing_Type')
    # Category label -> numeric code, as stored in the saved JSON mapping
    with open(os.path.join(saved_folder, 'Preferred_Clothing_Type.json'), 'r') as f:
        clothing_type_mapping = json.load(f)
    df['Preferred_Clothing_Type'] = df['Preferred_Clothing_Type'].map(clothing_type_mapping)

    # Keep only the model features; remaining gaps become 0.  fillna is
    # chained on the selection so it no longer mutates a slice in place.
    df = df[['Gender', 'Weight_kg', 'Height_cm','Preferred_Clothing_Type', 'Personality', 'Clothing_Size','Favourite_Color_r',
             'Favourite_Color_g', 'Favourite_Color_b','Best_Color_Confidence_r', 'Best_Color_Confidence_g','Best_Color_Confidence_b',
             'birthplace_lat', 'birthplace_lon', 'resi_lat', 'resi_lon', 'Age',]].fillna(0)
except Exception as err:
    # Record the failure in both logs, then stop the run.
    audit_logger.info('Data Preprocessing - Failed')
    error_logger.error('Data Preprocessing - Failed')
    error_logger.error('Exception: ', exc_info=True)
    #input("press Enter to Close")
    sys.exit()

192.168.1.112 : 2024-09-05 16:22:28,825 : INFO : [406014992.py:5] : Data Preprocessing


In [9]:
#==================================================================================================
### Cluster model ### 
#==================================================================================================
try:
    # Assign every preprocessed row in `df` to a cluster using the persisted
    # scaler + k-means pair; the label is stored in a new "Cluster" column.
    audit_logger.info('Cluster Model')

    # Columns handed to the scaler, in this exact order.
    # NOTE(review): presumably this has to match the order used when the
    # scaler / k-means artefacts were fitted - confirm against training code.
    cluster_feature_columns = [
        'Gender', 'Weight_kg', 'Height_cm', 'Preferred_Clothing_Type', 'Clothing_Size',
        'Favourite_Color_r', 'Favourite_Color_g', 'Favourite_Color_b',
        'Best_Color_Confidence_r', 'Best_Color_Confidence_g', 'Best_Color_Confidence_b',
        'birthplace_lat', 'birthplace_lon', 'resi_lat', 'resi_lon', 'Age', 'Personality',
    ]

    # Restore the fitted artefacts from the saved-models folder.
    scaler = load(os.path.join(saved_folder, 'scaler_1.joblib'))
    kmeans = load(os.path.join(saved_folder, 'kmeans_model.joblib'))

    # Scale the selected features, then let k-means label each row.
    data_for_cluster = df[cluster_feature_columns]
    features_scaled = scaler.transform(data_for_cluster)
    df["Cluster"] = kmeans.predict(features_scaled)

    audit_logger.info(df["Cluster"])
except Exception:
    # Record the failure in both logs (with traceback) and stop the run.
    failure_message = 'Cluster Model - Failed'
    audit_logger.info(failure_message)
    error_logger.error(failure_message)
    error_logger.error('Exception: ', exc_info=True)
    sys.exit()

192.168.1.112 : 2024-09-05 16:22:28,879 : INFO : [98284998.py:5] : Cluster Model


192.168.1.112 : 2024-09-05 16:22:28,937 : INFO : [98284998.py:18] : index_x
0    1
Name: Cluster, dtype: int32


In [10]:
#==================================================================================================
### Model for Preferred_clothing ### 
#==================================================================================================
try:
    # Stage 1: decide between one-piece (0) and two-piece (1) clothing by
    # averaging the class-1 probabilities of a random forest and a LightGBM
    # model, then split `df` into the two subsets used by the next cells.
    audit_logger.info('Model for Preferred_clothing')

    stage_1_feature_columns = [
        'Gender', 'Weight_kg', 'Height_cm', 'Preferred_Clothing_Type', 'Clothing_Size',
        'Favourite_Color_r', 'Favourite_Color_g', 'Favourite_Color_b',
        'Best_Color_Confidence_r', 'Best_Color_Confidence_g', 'Best_Color_Confidence_b',
        'birthplace_lat', 'birthplace_lon', 'Cluster',
        'resi_lat', 'resi_lon', 'Age', 'Personality',
    ]

    # Build the feature frame and cast Clothing_Size in one step. `astype`
    # returns a new frame, so nothing is assigned into a slice of `df`
    # (the original column assignment on the selection could trigger
    # SettingWithCopyWarning).
    X_train_1 = df[stage_1_feature_columns].astype({"Clothing_Size": int})

    # Load the saved models
    rf_model = load(os.path.join(saved_folder, 'random_forest_model_stage_1.joblib'))
    lgb_model = load(os.path.join(saved_folder, 'lightgbm_model_stage_1.joblib'))

    # Column 1 of predict_proba is taken as the probability of class 1.
    rf_probs = rf_model.predict_proba(X_train_1)[:, 1]
    lgb_probs = lgb_model.predict_proba(X_train_1)[:, 1]

    # Simple unweighted blend, thresholded at 0.5 to obtain the 0/1 label.
    blended_probs = (rf_probs + lgb_probs) / 2
    final_predictions = (blended_probs > 0.5).astype(int)

    df["Preferred_Clothing"] = final_predictions
    audit_logger.info(df["Preferred_Clothing"])

    # Rows labelled 1 go to the two-piece cell, rows labelled 0 to the
    # one-piece cell.
    df_2 = df[df["Preferred_Clothing"] == 1]
    df_1 = df[df["Preferred_Clothing"] == 0]
except Exception:
    # Record the failure in both logs (with traceback) and stop the run.
    audit_logger.info('Model for Preferred_clothing - Failed')
    error_logger.error('Model for Preferred_clothing - Failed')
    error_logger.error('Exception: ', exc_info=True)
    #input("press Enter to Close")
    sys.exit()

192.168.1.112 : 2024-09-05 16:22:28,944 : INFO : [1993931470.py:5] : Model for Preferred_clothing


192.168.1.112 : 2024-09-05 16:22:29,263 : INFO : [1993931470.py:20] : index_x
0    0
Name: Preferred_Clothing, dtype: int64


In [11]:
#==================================================================================================
### Model for Two piece clothing ### 
#==================================================================================================
try:
    # Stage 2 (two-piece): for rows classified as two-piece wear, predict the
    # categorical garment attributes with a multi-output random forest,
    # predict garment colours with a saved Keras model, and assemble
    # human-readable "Bottoms" / "Upperwear" descriptions in `result_df_2`.
    if len(df_2) > 0:
        audit_logger.info('Model for Two piece clothing')
        multiclass_model_2 = load(os.path.join(saved_folder, 'multiclass_random_forest_2_stage_2.joblib'))

        # Target columns, in the order the multi-output model returns them.
        column_y = ['Preferred_Bottoms', 'Bottoms_Material', 'Bottoms_Length', 'Bottoms_Fitting',
                    'Preferred_Upperwear', 'Upperwear_Material', 'Upperwear_Length',
                    'Upperwear_Neckline', 'Upperwear_Sleeve_Type', 'Upperwear_Pattern']

        # The same feature selection feeds both the attribute model and the
        # colour model's scaler, so it is built once.
        X_twopiece = df_2[['Gender', 'Weight_kg', 'Height_cm', 'Preferred_Clothing_Type', 'Personality',
                           'Clothing_Size', 'Favourite_Color_r', 'Favourite_Color_g', 'Favourite_Color_b',
                           'Best_Color_Confidence_r', 'Best_Color_Confidence_g', 'Best_Color_Confidence_b',
                           'birthplace_lat', 'birthplace_lon', 'resi_lat', 'resi_lon', 'Age']]

        y_pred = multiclass_model_2.predict(X_twopiece)
        # Keep the predictions on the same index as the inputs so the
        # column-wise concat below lines rows up correctly.
        y_pred_df = pd.DataFrame(y_pred, columns=column_y, index=X_twopiece.index)

        result_df_2 = pd.concat([X_twopiece, y_pred_df], axis=1)
        for col in column_y:
            # Each target has a JSON mapping on disk; it is inverted here so
            # predicted values map back to the mapping's keys.
            with open(os.path.join(saved_folder, col+'.json'), 'r') as f:
                mapping = json.load(f)
            reversed_mapping = {value: key for key, value in mapping.items()}
            result_df_2[col] = result_df_2[col].map(reversed_mapping)
        result_df_2["Preferred_Clothing"] = 'Two Piece Wear'

        scaler_X_twop = load(os.path.join(saved_folder, 'scaler_X_twop.joblib'))
        scaler_y_twop = load(os.path.join(saved_folder, 'scaler_y_twop.joblib'))

        X_twopiece = scaler_X_twop.transform(X_twopiece)

        # Reshape input to be [samples, features, 1] if your model expects 3D input (e.g., CNN, LSTM)
        X_twopiece = X_twopiece.reshape((X_twopiece.shape[0], X_twopiece.shape[1], 1))

        from tensorflow.keras.models import load_model
        color_model_1 = load_model(os.path.join(saved_folder, 'cnn_model_twopiece_standard.h5'))

        y_pred = color_model_1.predict(X_twopiece)

        # Undo the target scaling applied when the colour model was trained.
        y_pred_rescaled = scaler_y_twop.inverse_transform(y_pred)

        # NOTE(review): the slices [1:4] and [5:] presumably pick the colour
        # components for bottoms and upperwear out of each prediction row;
        # the output layout is not visible here - confirm against training.
        bottoms = []
        upperwear = []
        for color_row in y_pred_rescaled:
            bottoms.append(rgba_to_named_color(color_row[1:4]))
            upperwear.append(rgba_to_named_color(color_row[5:]))

        result_df_2["Bottoms_Color"] = bottoms
        result_df_2["Upperwear_Color"] = upperwear

        # Create a sentence for lowerwear
        result_df_2['Bottoms'] = (
            result_df_2["Bottoms_Color"].astype(str) + " "
            + result_df_2['Bottoms_Fitting'].astype(str) + " fit "
            + result_df_2['Bottoms_Length'].astype(str) + " length "
            + result_df_2['Bottoms_Material'].astype(str) + " "
            + result_df_2['Preferred_Bottoms'].astype(str)
        )
        # Drop placeholder attributes ("any" / "nan"), together with their
        # qualifier word, from the middle of the sentence. The look-ahead
        # keeps the following space, so neighbouring words are never fused
        # and consecutive placeholders are all removed in a single pass.
        bottoms_placeholder = r" (?:any|nan)(?: (?:length|fit))?(?= )"
        result_df_2['Bottoms'] = (
            result_df_2['Bottoms'].str.lower().str.replace(bottoms_placeholder, "", regex=True)
        )

        # Create a sentence for upperwear
        result_df_2['Upperwear'] = (
            result_df_2['Upperwear_Color'].astype(str) + " "
            + result_df_2['Upperwear_Sleeve_Type'].astype(str) + " sleeves "
            + result_df_2['Upperwear_Neckline'].astype(str) + " neck "
            + result_df_2['Upperwear_Length'].astype(str) + " length "
            + result_df_2['Upperwear_Pattern'].astype(str) + " "
            + result_df_2['Upperwear_Material'].astype(str) + " "
            + result_df_2['Preferred_Upperwear'].astype(str)
        )
        # Same clean-up as for bottoms. The previous implementation replaced
        # " any sleeves " / " any neck " with an empty string, which glued
        # the surrounding words together (e.g. "blueround neck").
        upperwear_placeholder = r" (?:any|nan)(?: (?:sleeves|neck|length))?(?= )"
        result_df_2['Upperwear'] = (
            result_df_2['Upperwear'].str.lower().str.replace(upperwear_placeholder, "", regex=True)
        )

except Exception:
    # Record the failure in both logs (with traceback) and stop the run.
    audit_logger.info('Model for Two piece clothing - Failed')
    error_logger.error('Model for Two piece clothing - Failed')
    error_logger.error('Exception: ', exc_info=True)
    #input("press Enter to Close")
    sys.exit()


In [12]:
#==================================================================================================
### Model for One piece clothing ### 
#==================================================================================================
try:
    # Stage 2 (one-piece): for rows classified as one-piece wear, predict the
    # categorical garment attributes with a multi-output random forest,
    # predict the garment colour with a saved Keras model, and assemble a
    # human-readable "Singlepiece" description in `result_df_1`.
    if len(df_1) > 0:
        audit_logger.info('Model for One piece clothing')

        # Imported locally, as the two-piece cell does, so this cell does not
        # rely on that cell's conditional branch having executed.
        from tensorflow.keras.models import load_model

        multiclass_model_1 = load(os.path.join(saved_folder, 'multiclass_random_forest_1_stage_2.joblib'))

        # Target columns, in the order the multi-output model returns them.
        column_y = ['Favourite_Single_Piece', 'Single_Piece_Material', 'Single_Piece_Fitting',
                    'Single_Piece_Length', 'Single_Piece_Sleeves', 'Single_Piece_Neckline',
                    'Single_Piece_Pattern']

        # Feature order for the attribute model ('Personality' in fifth place).
        X_onepiece = df_1[['Gender', 'Weight_kg', 'Height_cm', 'Preferred_Clothing_Type', 'Personality',
                           'Clothing_Size', 'Favourite_Color_r', 'Favourite_Color_g', 'Favourite_Color_b',
                           'Best_Color_Confidence_r', 'Best_Color_Confidence_g', 'Best_Color_Confidence_b',
                           'birthplace_lat', 'birthplace_lon', 'resi_lat', 'resi_lon', 'Age']]

        y_pred = multiclass_model_1.predict(X_onepiece)
        # Keep the predictions on the same index as the inputs so the
        # column-wise concat below lines rows up correctly.
        y_pred_df = pd.DataFrame(y_pred, columns=column_y, index=X_onepiece.index)

        result_df_1 = pd.concat([X_onepiece, y_pred_df], axis=1)
        for col in column_y:
            # Each target has a JSON mapping on disk; it is inverted here so
            # predicted values map back to the mapping's keys.
            with open(os.path.join(saved_folder, col+'.json'), 'r') as f:
                mapping = json.load(f)
            reversed_mapping = {value: key for key, value in mapping.items()}
            result_df_1[col] = result_df_1[col].map(reversed_mapping)
        result_df_1["Preferred_Clothing"] = 'One Piece Wear'

        # Feature order for the colour model differs from the one above:
        # 'Personality' is the last column here.
        # NOTE(review): presumably matches the order scaler_X_onep was fitted
        # with - confirm against the training notebook.
        X_onepiece = df_1[['Gender', 'Weight_kg', 'Height_cm', 'Preferred_Clothing_Type', 'Clothing_Size',
                           'Favourite_Color_r', 'Favourite_Color_g', 'Favourite_Color_b',
                           'Best_Color_Confidence_r', 'Best_Color_Confidence_g', 'Best_Color_Confidence_b',
                           'birthplace_lat', 'birthplace_lon', 'resi_lat', 'resi_lon', 'Age', 'Personality']]

        scaler_X_onep = load(os.path.join(saved_folder, 'scaler_X_onep.joblib'))
        scaler_y_onep = load(os.path.join(saved_folder, 'scaler_y_onep.joblib'))

        X_onepiece = scaler_X_onep.transform(X_onepiece)

        # Reshape input to be [samples, features, 1] if your model expects 3D input (e.g., CNN, LSTM)
        X_onepiece = X_onepiece.reshape((X_onepiece.shape[0], X_onepiece.shape[1], 1))

        color_model_1 = load_model(os.path.join(saved_folder, 'cnn_model_onepiece.h5'))

        y_pred = color_model_1.predict(X_onepiece)

        # Undo the target scaling applied when the colour model was trained.
        y_pred_rescaled = scaler_y_onep.inverse_transform(y_pred)

        # NOTE(review): the slice [1:4] presumably picks the colour
        # components out of each prediction row; the output layout is not
        # visible here - confirm against training.
        singlepiece = [rgba_to_named_color(color_row[1:4]) for color_row in y_pred_rescaled]

        result_df_1["Singlepiece_Color"] = singlepiece

        # Create a sentence for the single-piece garment
        result_df_1['Singlepiece'] = (
            result_df_1['Singlepiece_Color'].astype(str) + " "
            + result_df_1['Single_Piece_Sleeves'].astype(str) + " sleeves "
            + result_df_1['Single_Piece_Neckline'].astype(str) + " neck "
            + result_df_1['Single_Piece_Length'].astype(str) + " length "
            + result_df_1['Single_Piece_Fitting'].astype(str) + " fit "
            + result_df_1['Single_Piece_Pattern'].astype(str) + " "
            + result_df_1['Single_Piece_Material'].astype(str) + " "
            + result_df_1['Favourite_Single_Piece'].astype(str)
        )
        # Drop placeholder attributes ("any" / "nan"), together with their
        # qualifier word, from the middle of the sentence. The look-ahead
        # keeps the following space, so neighbouring words are never fused
        # (the previous "" replacements produced e.g. "blueround neck") and
        # consecutive placeholders are all removed in a single pass.
        singlepiece_placeholder = r" (?:any|nan)(?: (?:sleeves|neck|length|fit))?(?= )"
        result_df_1['Singlepiece'] = (
            result_df_1['Singlepiece'].str.lower().str.replace(singlepiece_placeholder, "", regex=True)
        )
except Exception:
    # Record the failure in both logs (with traceback) and stop the run.
    audit_logger.info('Model for One piece clothing - Failed')
    error_logger.error('Model for One piece clothing - Failed')
    error_logger.error('Exception: ', exc_info=True)
    #input("press Enter to Close")
    sys.exit()


192.168.1.112 : 2024-09-05 16:22:29,293 : INFO : [3531442730.py:6] : Model for One piece clothing


2024-09-05 16:22:32.709234: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-09-05 16:22:32.712103: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)






2024-09-05 16:22:33.670699: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


In [13]:
#==================================================================================================
### Final Results ### 
#==================================================================================================
try:
    # Combine the one-piece / two-piece recommendations, join them back onto
    # the original questionnaire answers and write the result to output.xlsx.
    audit_logger.info('Final Results')

    # Fixed output layout, identical regardless of which subsets have rows.
    output_columns = ["Preferred_Clothing", "Singlepiece", "Bottoms", "Upperwear"]

    # Only pick up result frames whose subset actually had rows; the
    # corresponding cell does not create its result frame otherwise.
    prediction_frames = []
    if len(df_1) > 0:
        prediction_frames.append(result_df_1)
    if len(df_2) > 0:
        prediction_frames.append(result_df_2)
    if not prediction_frames:
        raise ValueError('No one-piece or two-piece predictions available to export')

    # concat returns a new frame (no assignment into a slice); reindex adds
    # whichever description columns a subset lacks, and those gaps are
    # written as "NA" in every case, including the mixed one.
    result_df = pd.concat(prediction_frames).reindex(columns=output_columns).fillna("NA")

    # Re-read the raw questionnaire so the output shows the user's original
    # answers rather than the encoded features.
    df = pd.read_excel(os.path.join(input_folder, "input.xlsx"))
    df = df[['Country of Birth', 'Country of Residence', 'Birthdate', 'Gender',
           'Approximate weight in kilogram', 'Approximate height in centimetre',
           'What type of clothes you prefer to wear', 'How do you define yourself',
           'Size of clothes you wear', 'What is your favourite color.',
           'Do you think this color looks best on you and boost your confidence?',
           'Which color do you think looks best on you and makes you feel confident?']]

    # Index-on-index left join: every input row is kept.
    # NOTE(review): assumes the prediction frames still carry the row labels
    # of input.xlsx - confirm against the preprocessing cell.
    result_df = pd.merge(df, result_df, how="left", left_index=True, right_index=True)
    result_df.to_excel(os.path.join(output_folder, "output.xlsx"))
    audit_logger.info(result_df)
except Exception:
    # Record the failure in both logs (with traceback) and stop the run.
    audit_logger.info('Final Results - Failed')
    error_logger.error('Final Results - Failed')
    error_logger.error('Exception: ', exc_info=True)
    #input("press Enter to Close")
    sys.exit()

192.168.1.112 : 2024-09-05 16:22:33,813 : INFO : [2463263403.py:5] : Final Results


192.168.1.112 : 2024-09-05 16:22:33,849 : INFO : [2463263403.py:25] :   Country of Birth Country of Residence  Birthdate  Gender  \
0            India                   UK 1996-07-01  Female   

   Approximate weight in kilogram  Approximate height in centimetre  \
0                              80                               154   

  What type of clothes you prefer to wear  \
0                             Semi Formal   

                          How do you define yourself Size of clothes you wear  \
0  Extrovert (Are you outgoing, socially confident?)                       XL   

  What is your favourite color.  \
0                        Yellow   

  Do you think this color looks best on you and boost your confidence?  \
0                                                 No                     

  Which color do you think looks best on you and makes you feel confident?  \
0                                               Pink                         

  Preferred_Clothing           

In [14]:
# Closing audit entries: a success marker, then the elapsed wall-clock time
# measured from `curr_time` (set earlier in the notebook).
audit_logger.info('Process run success')
total_time = datetime.now() - curr_time
audit_logger.info('Total time taken = %s', total_time)

192.168.1.112 : 2024-09-05 16:22:33,860 : INFO : [2214019538.py:1] : Process run success


192.168.1.112 : 2024-09-05 16:22:33,860 : INFO : [2214019538.py:3] : Total time taken = 0:00:05.415354
