In [74]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MultiLabelBinarizer
import re
from sklearn.feature_extraction.text import CountVectorizer
import tensorflow as tf
import keras
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [75]:
# Load the outfit log and the daily weather records from CSV
# (the weather file is read with cp949 encoding).
df_outfit = pd.read_csv('../data/outfit(male)/outfit(male).csv')
df_weather = pd.read_csv('../data/2022-08-01_to_2024-04-30.csv', encoding='cp949')

# Keep only the columns used downstream, as independent copies of the raw frames.
outfit_columns = ['userId', '상의', '아우터', '하의', '신발', '액세서리', '작성일']
weather_columns = ['일시', '평균기온(°C)', '최저기온(°C)', '최고기온(°C)', '강수 계속시간(hr)', '평균 풍속(m/s)', '평균 상대습도(%)']
df_outfit = df_outfit[outfit_columns].copy()
df_temp = df_weather[weather_columns].copy()

# Parse both date columns to datetime so they can be used as the join key.
df_outfit['작성일'] = pd.to_datetime(df_outfit['작성일'], format='%Y년 %m월 %d일')
df_temp['일시'] = pd.to_datetime(df_temp['일시'])

# Join outfits to weather on the date (default inner join), then drop the
# weather-side date column, which duplicates '작성일' after the merge.
df_merged = df_outfit.merge(df_temp, left_on='작성일', right_on='일시').drop(columns='일시')

df_merged

Unnamed: 0,userId,상의,아우터,하의,신발,액세서리,작성일,평균기온(°C),최저기온(°C),최고기온(°C),강수 계속시간(hr),평균 풍속(m/s),평균 상대습도(%)
0,1,"반팔 티, 셔츠/블라우스",재킷,반바지,구두/로퍼,,2024-04-24,13.2,11.0,16.5,7.00,3.0,80.1
1,1,반팔 티,재킷,반바지,운동화,기타 모자,2024-04-19,17.6,11.5,24.3,,2.1,51.8
2,1,반팔 티,재킷,반바지,구두/로퍼,장목양말,2024-04-15,16.0,11.7,20.0,12.17,2.5,77.4
3,1,반팔 티,,나일론 팬츠,구두/로퍼,,2024-04-09,15.3,10.6,20.8,,3.5,32.4
4,1,반팔 티,집업,면바지,구두/로퍼,,2024-04-05,14.0,10.2,18.8,,2.5,51.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1333,14,반팔 티,,반바지,운동화,"기타 모자, 장목양말",2024-04-23,17.3,13.2,24.2,3.33,3.6,55.0
1334,14,반팔 티,,카고바지,운동화,기타 모자,2024-04-24,13.2,11.0,16.5,7.00,3.0,80.1
1335,14,반팔 티,집업,나일론 팬츠,스니커즈/캔버스,기타 모자,2024-04-25,14.4,10.3,20.8,,1.9,75.6
1336,14,"반팔 티, 셔츠/블라우스",,반바지,구두/로퍼,장목양말,2024-04-26,17.8,11.4,26.0,,1.9,61.1


In [76]:
# Clean the weather features on a copy of the merged frame:
#   1. missing values in every weather column are replaced with 0;
#   2. precipitation duration (hr) is collapsed to a 0/1 flag (1 when duration > 0).
df_notnull = df_merged.copy()
w_columns = ['평균기온(°C)', '최저기온(°C)', '최고기온(°C)', '강수 계속시간(hr)', '평균 풍속(m/s)', '평균 상대습도(%)']
df_notnull[w_columns] = df_notnull[w_columns].fillna(0)
# Vectorised comparison instead of a per-row lambda; the result is an int64 0/1 column.
df_notnull['강수 계속시간(hr)'] = df_notnull['강수 계속시간(hr)'].gt(0).astype('int64')
# A bare `isna().sum()` in the middle of a cell is never displayed, so check explicitly.
assert not df_notnull[w_columns].isna().any().any(), "weather columns still contain NaN"
df_notnull

Unnamed: 0,userId,상의,아우터,하의,신발,액세서리,작성일,평균기온(°C),최저기온(°C),최고기온(°C),강수 계속시간(hr),평균 풍속(m/s),평균 상대습도(%)
0,1,"반팔 티, 셔츠/블라우스",재킷,반바지,구두/로퍼,,2024-04-24,13.2,11.0,16.5,1,3.0,80.1
1,1,반팔 티,재킷,반바지,운동화,기타 모자,2024-04-19,17.6,11.5,24.3,0,2.1,51.8
2,1,반팔 티,재킷,반바지,구두/로퍼,장목양말,2024-04-15,16.0,11.7,20.0,1,2.5,77.4
3,1,반팔 티,,나일론 팬츠,구두/로퍼,,2024-04-09,15.3,10.6,20.8,0,3.5,32.4
4,1,반팔 티,집업,면바지,구두/로퍼,,2024-04-05,14.0,10.2,18.8,0,2.5,51.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1333,14,반팔 티,,반바지,운동화,"기타 모자, 장목양말",2024-04-23,17.3,13.2,24.2,1,3.6,55.0
1334,14,반팔 티,,카고바지,운동화,기타 모자,2024-04-24,13.2,11.0,16.5,1,3.0,80.1
1335,14,반팔 티,집업,나일론 팬츠,스니커즈/캔버스,기타 모자,2024-04-25,14.4,10.3,20.8,0,1.9,75.6
1336,14,"반팔 티, 셔츠/블라우스",,반바지,구두/로퍼,장목양말,2024-04-26,17.8,11.4,26.0,0,1.9,61.1


In [77]:
# Replace missing clothing entries with an explicit "<column name> 없음" placeholder,
# one placeholder per outfit column, then report the remaining NaN count per column.
o_columns = ['상의', '아우터', '하의', '신발', '액세서리']
placeholders = {name: name + ' 없음' for name in o_columns}
df_fill = df_notnull.fillna(value=placeholders)
df_fill.isna().sum()

userId         0
상의             0
아우터            0
하의             0
신발             0
액세서리           0
작성일            0
평균기온(°C)       0
최저기온(°C)       0
최고기온(°C)       0
강수 계속시간(hr)    0
평균 풍속(m/s)     0
평균 상대습도(%)     0
dtype: int64

In [78]:
# Standardise the six weather columns listed in `w_columns`.
# The scaler is fitted here and kept in `scaler` so later cells can apply the same transform.
scaler = StandardScaler()
weather_features = df_notnull[w_columns]
scaler.fit(weather_features)

# Start from the placeholder-filled frame and overwrite only the weather columns.
df_scaled = df_fill.copy()
df_scaled[w_columns] = scaler.transform(weather_features)
df_scaled

Unnamed: 0,userId,상의,아우터,하의,신발,액세서리,작성일,평균기온(°C),최저기온(°C),최고기온(°C),강수 계속시간(hr),평균 풍속(m/s),평균 상대습도(%)
0,1,"반팔 티, 셔츠/블라우스",재킷,반바지,구두/로퍼,액세서리 없음,2024-04-24,-0.105343,0.074868,-0.242682,1.213754,1.061750,0.923333
1,1,반팔 티,재킷,반바지,운동화,기타 모자,2024-04-19,0.335939,0.123847,0.536906,-0.823890,-0.268558,-1.014123
2,1,반팔 티,재킷,반바지,구두/로퍼,장목양말,2024-04-15,0.175472,0.143439,0.107133,1.213754,0.322690,0.738487
3,1,반팔 티,아우터 없음,나일론 팬츠,구두/로퍼,액세서리 없음,2024-04-09,0.105269,0.035684,0.187091,-0.823890,1.800811,-2.342274
4,1,반팔 티,집업,면바지,구두/로퍼,액세서리 없음,2024-04-05,-0.025110,-0.003500,-0.012803,-0.823890,0.322690,-1.048354
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1333,14,반팔 티,아우터 없음,반바지,운동화,"기타 모자, 장목양말",2024-04-23,0.305851,0.290378,0.526911,1.213754,1.948623,-0.795047
1334,14,반팔 티,아우터 없음,카고바지,운동화,기타 모자,2024-04-24,-0.105343,0.074868,-0.242682,1.213754,1.061750,0.923333
1335,14,반팔 티,집업,나일론 팬츠,스니커즈/캔버스,기타 모자,2024-04-25,0.015006,0.006296,0.187091,-0.823890,-0.564183,0.615257
1336,14,"반팔 티, 셔츠/블라우스",아우터 없음,반바지,구두/로퍼,장목양말,2024-04-26,0.355997,0.114051,0.706816,-0.823890,-0.564183,-0.377433


In [79]:
# Describe the current weather as a one-row frame with the same columns, in the same
# order, as the features the scaler was fitted on (`w_columns`), then apply the same
# preprocessing steps as the historical data.
# Values in `w_columns` order: mean temp 13.3, min temp 12.2, max temp 15,
# precipitation duration 16.17, mean wind speed 3, mean relative humidity 77.3.
today_values = [13.3, 12.2, 15, 16.17, 3, 77.3]
# Build the row in one step with an explicit numeric dtype instead of enlarging an
# empty frame, and reuse `w_columns` so the column list cannot drift from the scaler's.
df_today = pd.DataFrame([today_values], columns=w_columns, dtype='float64')
df_today[w_columns] = df_today[w_columns].fillna(0)
# Same 0/1 precipitation flag as the historical frame (1 when duration > 0).
df_today['강수 계속시간(hr)'] = df_today['강수 계속시간(hr)'].gt(0).astype('int64')
# Apply the already-fitted scaler (transform only), selecting columns explicitly.
df_today[w_columns] = scaler.transform(df_today[w_columns])
df_today

Unnamed: 0,평균기온(°C),최저기온(°C),최고기온(°C),강수 계속시간(hr),평균 풍속(m/s),평균 상대습도(%)
0,-0.095314,0.192419,-0.392603,1.213754,1.06175,0.731641


In [80]:
# Display the standardised frame again for reference.
df_scaled

Unnamed: 0,userId,상의,아우터,하의,신발,액세서리,작성일,평균기온(°C),최저기온(°C),최고기온(°C),강수 계속시간(hr),평균 풍속(m/s),평균 상대습도(%)
0,1,"반팔 티, 셔츠/블라우스",재킷,반바지,구두/로퍼,액세서리 없음,2024-04-24,-0.105343,0.074868,-0.242682,1.213754,1.061750,0.923333
1,1,반팔 티,재킷,반바지,운동화,기타 모자,2024-04-19,0.335939,0.123847,0.536906,-0.823890,-0.268558,-1.014123
2,1,반팔 티,재킷,반바지,구두/로퍼,장목양말,2024-04-15,0.175472,0.143439,0.107133,1.213754,0.322690,0.738487
3,1,반팔 티,아우터 없음,나일론 팬츠,구두/로퍼,액세서리 없음,2024-04-09,0.105269,0.035684,0.187091,-0.823890,1.800811,-2.342274
4,1,반팔 티,집업,면바지,구두/로퍼,액세서리 없음,2024-04-05,-0.025110,-0.003500,-0.012803,-0.823890,0.322690,-1.048354
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1333,14,반팔 티,아우터 없음,반바지,운동화,"기타 모자, 장목양말",2024-04-23,0.305851,0.290378,0.526911,1.213754,1.948623,-0.795047
1334,14,반팔 티,아우터 없음,카고바지,운동화,기타 모자,2024-04-24,-0.105343,0.074868,-0.242682,1.213754,1.061750,0.923333
1335,14,반팔 티,집업,나일론 팬츠,스니커즈/캔버스,기타 모자,2024-04-25,0.015006,0.006296,0.187091,-0.823890,-0.564183,0.615257
1336,14,"반팔 티, 셔츠/블라우스",아우터 없음,반바지,구두/로퍼,장목양말,2024-04-26,0.355997,0.114051,0.706816,-0.823890,-0.564183,-0.377433


In [81]:
# For every user, score each of that user's logged rows against the current weather
# with cosine similarity and keep the three most similar rows.
# `indices[i]` holds row positions *within user_ids[i]'s own subset* (not index labels),
# ordered by ascending similarity, so the best match is the last element.
user_ids = df_fill['userId'].unique()
today_vector = df_today[w_columns]
similarities = []
indices = []
for user_id in user_ids:
    user_weather = df_scaled.loc[df_scaled['userId'] == user_id, w_columns]
    # Compare the single query row with the user's rows directly (a 1 x n result)
    # rather than building the full (n+1) x (n+1) pairwise matrix and keeping one row.
    similarity = cosine_similarity(today_vector, user_weather)[0]
    similarities.append(similarity)
    # argsort is ascending, so the last three positions are the top-3 matches
    # (fewer than three when the user has fewer than three rows).
    index = np.argsort(similarity)[-3:]
    indices.append(index)

In [82]:
# Show the selected row positions: one array per user, in `user_ids` order.
indices

[array([68, 76,  0]),
 array([98,  0, 64]),
 array([12,  8,  4]),
 array([ 5, 21, 16]),
 array([4, 3, 6]),
 array([36, 22,  0]),
 array([51, 18, 19]),
 array([39, 59,  4]),
 array([48, 24, 54]),
 array([33, 20, 23]),
 array([20,  8, 13]),
 array([71, 55, 92]),
 array([13, 41, 42]),
 array([ 20, 222, 226])]

In [83]:
# Pull each user's most similar rows out of df_fill (the un-scaled frame) and save them.
# `indices` stores positions within each user's subset, hence the per-user filter + iloc.
# The pieces are collected first and concatenated once: growing a frame with pd.concat
# inside the loop, starting from an empty frame, is quadratic and triggers pandas'
# warning about concatenating empty entries.
recommended_parts = [
    df_fill[df_fill['userId'] == user_id].iloc[index]
    for user_id, index in zip(user_ids, indices)
]
if recommended_parts:
    df_recommend = pd.concat(recommended_parts)
else:
    # pd.concat raises on an empty list; fall back to an empty frame with the same columns.
    df_recommend = pd.DataFrame(columns=df_fill.columns)
df_recommend.to_csv('recommendation.csv', index=False)

  df_recommend = pd.concat([df_recommend, df_user])
