## Import

In [17]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

import random
import os

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import Lasso

#----------------------------------------

## Fixed Random-Seed

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) #Seed 고점

## Load Data

file_path1 = 'C:/Users/human/Downloads/웹 로그 기반 조회수 예측 해커톤/train.csv'

df_train = pd.read_csv(file_path1, encoding='utf8')

print(df_train)

print('*'*80)



             sessionID       userID  TARGET           browser         OS  \
0       SESSION_000000  USER_000000    17.0            Chrome  Macintosh   
1       SESSION_000001  USER_000001     3.0            Chrome    Windows   
2       SESSION_000002  USER_000002     1.0  Samsung Internet    Android   
3       SESSION_000003  USER_000003     1.0            Chrome  Macintosh   
4       SESSION_000004  USER_000004     1.0            Chrome        iOS   
...                ...          ...     ...               ...        ...   
252284  SESSION_252284  USER_206021     1.0            Chrome    Android   
252285  SESSION_252285  USER_206022     1.0            Chrome  Macintosh   
252286  SESSION_252286  USER_164240     5.0            Chrome  Macintosh   
252287  SESSION_252287  USER_206023     1.0   Android Webview    Android   
252288  SESSION_252288  USER_161023     6.0            Chrome  Macintosh   

         device  new  quality  duration  bounced  transaction  \
0       desktop    0  

In [18]:
file_path2 = 'C:/Users/human/Downloads/웹 로그 기반 조회수 예측 해커톤/test.csv'
df_test = pd.read_csv(file_path2, encoding='utf8')
print(df_test)

            sessionID       userID            browser             OS   device  \
0      SESSION_252289  USER_206024             Chrome          Linux  desktop   
1      SESSION_252290  USER_206025             Safari            iOS   tablet   
2      SESSION_252291  USER_206026             Chrome        Windows  desktop   
3      SESSION_252292  USER_206027             Chrome        Android   mobile   
4      SESSION_252293  USER_206028  Internet Explorer  Windows Phone   mobile   
...               ...          ...                ...            ...      ...   
79781  SESSION_332070  USER_268999  Internet Explorer        Windows   tablet   
79782  SESSION_332071  USER_269000             Chrome      Macintosh  desktop   
79783  SESSION_332072  USER_269001             Safari      Macintosh  desktop   
79784  SESSION_332073  USER_269002             Safari            iOS   mobile   
79785  SESSION_332074  USER_269003             Chrome        Android   mobile   

       new  quality  durati

In [19]:
# 모델에게 입력할 학습데이터와 추론데이터에 대해 결측값을 대체해보자.
# 단, 추론 데이터 결측값 대체시에 추론데이터의 통계량을 활용할 경우 Data Leakage에 해당됨.

# train 데이터셋에서 결측값 확인

train_missing_values = df_train.isnull().sum()

print(train_missing_values)

print('*'*80)

print('Train 데이터셋의 결측값 : ')
print(train_missing_values[train_missing_values > 0]) # 결측값이 있는 열만 출력.

print('*'*80)

print(f'Train 데이터셋의 결측값 : \n{train_missing_values} > 0')

sessionID                   0
userID                      0
TARGET                      0
browser                     0
OS                          0
device                      0
new                         0
quality                     0
duration                    0
bounced                     0
transaction                 0
transaction_revenue         0
continent                   0
subcontinent                0
country                     0
traffic_source              0
traffic_medium              0
keyword                137675
referral_path          161107
dtype: int64
********************************************************************************
Train 데이터셋의 결측값 : 
keyword          137675
referral_path    161107
dtype: int64
********************************************************************************
Train 데이터셋의 결측값 : 
sessionID                   0
userID                      0
TARGET                      0
browser                     0
OS                          0
device

In [20]:
# test 데이터의 결측값 대체

#df_train.dropna(subset=

df_test['keyword'].fillna(df_train['keyword'].mode()[0], inplace=True) # train의 최빈값 사용

df_test['referral_path'].fillna(df_train['referral_path'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test['keyword'].fillna(df_train['keyword'].mode()[0], inplace=True) # train의 최빈값 사용
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_test['referral_path'].fillna(df_train['referral_path'].mode()[0], inplace=True)


## 데이터 전처리 2: Label 인코딩 및 변수 제거

In [21]:
# Label인코딩을 위해 범주형 변수와 그 고유값의 개수를 확인해보자.

In [22]:
categorical_features = list(df_train.dtypes[df_train.dtypes == 'object'].index)

for i in categorical_features:
    count = df_train[i].nunique()
    print(f'{i}: {count}')

sessionID: 252289
userID: 206024
browser: 70
OS: 21
device: 3
continent: 6
subcontinent: 23
country: 205
traffic_source: 168
traffic_medium: 7
keyword: 623
referral_path: 1578


In [23]:
df_train = df_train.drop(columns=['sessionID', 'userID'],axis=1)

df_test = df_test.drop(columns=['sessionID', 'userID'], axis=1)

In [24]:
encoding_target = list(df_train.dtypes[df_train.dtypes == 'object'].index)

for i in encoding_target:
    le = LabelEncoder()
    le.fit(df_train[i])
    df_train[i] = le.transform(df_train[i])

    # test 데이터의 새로운 카테고리에 대해 le.classes_ 배열에 추가
    # test 데이터에 대해서 직접적으로 fit을 수행할 경우 Data Leakage
    for case in np.unique(df_test[i]):
        if case not in le.classes_:
            le.classes_ = np.append(le.classes_, case)

    df_test[i] = le.transform(df_test[i])

## X와 Y 설정

In [25]:
df_train_x = df_train.drop(columns=['TARGET'])

df_train_y = df_train['TARGET']

df_test_x = df_test

## Model Learning and Prediction

In [26]:
### Lasso 모델을 사용하여 학습하고, 예측을 수행합니다.

lasso = Lasso(alpha=0.1).fit(df_train_x, df_train_y)
preds = lasso.predict(df_test_x)

## Submission

In [27]:
submission = pd.read_csv('C:/Users/human/Downloads/웹 로그 기반 조회수 예측 해커톤/sample_submission.csv')
print(submission)

            sessionID  TARGET
0      SESSION_252289       0
1      SESSION_252290       0
2      SESSION_252291       0
3      SESSION_252292       0
4      SESSION_252293       0
...               ...     ...
79781  SESSION_332070       0
79782  SESSION_332071       0
79783  SESSION_332072       0
79784  SESSION_332073       0
79785  SESSION_332074       0

[79786 rows x 2 columns]


In [28]:
submission['TARGET'] = preds
print(submission)

            sessionID     TARGET
0      SESSION_252289  22.572787
1      SESSION_252290   0.814696
2      SESSION_252291   3.587189
3      SESSION_252292   3.646200
4      SESSION_252293   5.358456
...               ...        ...
79781  SESSION_332070   0.837404
79782  SESSION_332071   0.873058
79783  SESSION_332072   3.548467
79784  SESSION_332073   3.979518
79785  SESSION_332074   0.809208

[79786 rows x 2 columns]


In [29]:
submission.to_csv('./baseline_submission.csv', index=False)