## 기본코드

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

In [2]:
code = '005930' # 삼성전자 코드
url = f'https://finance.naver.com/item/sise_day.naver?code={code}'
req = requests.get(url, headers={'User-agent': 'Mozilla/5.0'})
html = BeautifulSoup(req.text, "lxml")

In [3]:
headers = {'User-agent': 'Mozilla/5.0'}

In [4]:
pgrr = html.find('td', class_='pgRR')
print(pgrr.a['href'])

/item/sise_day.naver?code=005930&page=667


In [5]:
# .prettify() 통해 getText 속성값을 계층적으로 보기좋게 출력 가능
print(pgrr.prettify())

<td class="pgRR">
 <a href="/item/sise_day.naver?code=005930&amp;page=667">
  맨뒤
  <img alt="" border="0" height="5" src="https://ssl.pstatic.net/static/n/cmn/bu_pgarRR.gif" width="8"/>
 </a>
</td>



In [6]:
print(pgrr.text)


맨뒤
				




In [7]:
s = pgrr.a['href'].split('=')
print(s)

['/item/sise_day.naver?code', '005930&page', '667']


In [8]:
last_page = s[-1]
print(last_page)

667


In [9]:
code = '005930' # 삼성전자 코드
url = f'https://finance.naver.com/item/sise_day.naver?code={code}'
df = None

for page in range(1, int(last_page)+1):
    req = requests.get(f'{url}&page={page}', headers=headers)
    df = pd.concat([df, pd.read_html(req.text, encoding = 'euc-kr')[0]], ignore_index=True)

In [10]:
# 데이터가 없는 행 일괄 삭제
df.dropna(inplace = True)

# 인덱스 재배열
df.reset_index(drop=True, inplace=True)

In [11]:
df # 삼성전자 일별 시세

Unnamed: 0,날짜,종가,전일비,시가,고가,저가,거래량
0,2023.01.20,61800.0,300.0,62100.0,62300.0,61100.0,9598133.0
1,2023.01.19,61500.0,1100.0,60500.0,61500.0,60400.0,12808490.0
2,2023.01.18,60400.0,600.0,60700.0,61000.0,59900.0,11584041.0
3,2023.01.17,61000.0,100.0,61200.0,61500.0,60600.0,9831456.0
4,2023.01.16,61100.0,300.0,61300.0,61600.0,60800.0,10039972.0
...,...,...,...,...,...,...,...
6665,1996.07.01,72000.0,3500.0,69000.0,72300.0,68800.0,147310.0
6666,1996.06.29,68500.0,400.0,68100.0,69100.0,67100.0,96710.0
6667,1996.06.28,68100.0,1200.0,67300.0,68500.0,67200.0,138430.0
6668,1996.06.27,66900.0,800.0,67500.0,67700.0,66700.0,155450.0


In [12]:
# 1년 평균 250거래일
df = df.loc[:751].copy()

In [13]:
df

Unnamed: 0,날짜,종가,전일비,시가,고가,저가,거래량
0,2023.01.20,61800.0,300.0,62100.0,62300.0,61100.0,9598133.0
1,2023.01.19,61500.0,1100.0,60500.0,61500.0,60400.0,12808490.0
2,2023.01.18,60400.0,600.0,60700.0,61000.0,59900.0,11584041.0
3,2023.01.17,61000.0,100.0,61200.0,61500.0,60600.0,9831456.0
4,2023.01.16,61100.0,300.0,61300.0,61600.0,60800.0,10039972.0
...,...,...,...,...,...,...,...
747,2020.01.15,59000.0,1000.0,59500.0,59600.0,58900.0,14300928.0
748,2020.01.14,60000.0,0.0,60400.0,61000.0,59900.0,16906295.0
749,2020.01.13,60000.0,500.0,59600.0,60000.0,59100.0,11359139.0
750,2020.01.10,59500.0,900.0,58800.0,59700.0,58300.0,16000170.0


### 종가 등락률 계산

In [14]:
df['종가_등락률'] = round((df['종가']-df['종가'].shift(-1)) / df['종가'].shift(-1) *100, 2)

In [15]:
df.head()

Unnamed: 0,날짜,종가,전일비,시가,고가,저가,거래량,종가_등락률
0,2023.01.20,61800.0,300.0,62100.0,62300.0,61100.0,9598133.0,0.49
1,2023.01.19,61500.0,1100.0,60500.0,61500.0,60400.0,12808490.0,1.82
2,2023.01.18,60400.0,600.0,60700.0,61000.0,59900.0,11584041.0,-0.98
3,2023.01.17,61000.0,100.0,61200.0,61500.0,60600.0,9831456.0,-0.16
4,2023.01.16,61100.0,300.0,61300.0,61600.0,60800.0,10039972.0,0.49


### 거래량 등락률 계산

In [16]:
df['거래량_등락률'] = round((df['거래량']-df['거래량'].shift(-1)) / df['거래량'].shift(-1) *100, 2)

In [17]:
df.head()

Unnamed: 0,날짜,종가,전일비,시가,고가,저가,거래량,종가_등락률,거래량_등락률
0,2023.01.20,61800.0,300.0,62100.0,62300.0,61100.0,9598133.0,0.49,-25.06
1,2023.01.19,61500.0,1100.0,60500.0,61500.0,60400.0,12808490.0,1.82,10.57
2,2023.01.18,60400.0,600.0,60700.0,61000.0,59900.0,11584041.0,-0.98,17.83
3,2023.01.17,61000.0,100.0,61200.0,61500.0,60600.0,9831456.0,-0.16,-2.08
4,2023.01.16,61100.0,300.0,61300.0,61600.0,60800.0,10039972.0,0.49,-19.75


### 당일 고가-당일 종가의 비율

In [18]:
df['고가-종가_비율'] = round((df['고가']-df['종가']) / df['고가'] * 100, 2)

In [19]:
df.head()

Unnamed: 0,날짜,종가,전일비,시가,고가,저가,거래량,종가_등락률,거래량_등락률,고가-종가_비율
0,2023.01.20,61800.0,300.0,62100.0,62300.0,61100.0,9598133.0,0.49,-25.06,0.8
1,2023.01.19,61500.0,1100.0,60500.0,61500.0,60400.0,12808490.0,1.82,10.57,0.0
2,2023.01.18,60400.0,600.0,60700.0,61000.0,59900.0,11584041.0,-0.98,17.83,0.98
3,2023.01.17,61000.0,100.0,61200.0,61500.0,60600.0,9831456.0,-0.16,-2.08,0.81
4,2023.01.16,61100.0,300.0,61300.0,61600.0,60800.0,10039972.0,0.49,-19.75,0.81


### 당일 종가-당일 저가의 비율

In [20]:
df['종가-저가_비율'] = round((df['종가']-df['저가']) / df['종가'] * 100, 2)

In [21]:
df.head()

Unnamed: 0,날짜,종가,전일비,시가,고가,저가,거래량,종가_등락률,거래량_등락률,고가-종가_비율,종가-저가_비율
0,2023.01.20,61800.0,300.0,62100.0,62300.0,61100.0,9598133.0,0.49,-25.06,0.8,1.13
1,2023.01.19,61500.0,1100.0,60500.0,61500.0,60400.0,12808490.0,1.82,10.57,0.0,1.79
2,2023.01.18,60400.0,600.0,60700.0,61000.0,59900.0,11584041.0,-0.98,17.83,0.98,0.83
3,2023.01.17,61000.0,100.0,61200.0,61500.0,60600.0,9831456.0,-0.16,-2.08,0.81,0.66
4,2023.01.16,61100.0,300.0,61300.0,61600.0,60800.0,10039972.0,0.49,-19.75,0.81,0.49


### 연속 상승 마감, 연속 하락 마감

In [22]:
df['연속상승'] = np.nan

In [23]:
df['연속하락'] = np.nan

In [24]:
for i in range(len(df)-1):
    if df['종가_등락률'][i] > 0 and df['종가_등락률'][i+1] > 0:
        df['연속상승'][i] = 1
    else: df['연속상승'][i] = 0
    
    if df['종가_등락률'][i] < 0 and df['종가_등락률'][i+1] < 0:
        df['연속하락'][i] = 1
    else: df['연속하락'][i] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['연속상승'][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  else: df['연속하락'][i] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  else: df['연속상승'][i] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['연속하락'][i] = 1


In [25]:
df.head()

Unnamed: 0,날짜,종가,전일비,시가,고가,저가,거래량,종가_등락률,거래량_등락률,고가-종가_비율,종가-저가_비율,연속상승,연속하락
0,2023.01.20,61800.0,300.0,62100.0,62300.0,61100.0,9598133.0,0.49,-25.06,0.8,1.13,1.0,0.0
1,2023.01.19,61500.0,1100.0,60500.0,61500.0,60400.0,12808490.0,1.82,10.57,0.0,1.79,0.0,0.0
2,2023.01.18,60400.0,600.0,60700.0,61000.0,59900.0,11584041.0,-0.98,17.83,0.98,0.83,0.0,1.0
3,2023.01.17,61000.0,100.0,61200.0,61500.0,60600.0,9831456.0,-0.16,-2.08,0.81,0.66,0.0,0.0
4,2023.01.16,61100.0,300.0,61300.0,61600.0,60800.0,10039972.0,0.49,-19.75,0.81,0.49,1.0,0.0


### target 값 생성

In [26]:
df['target'] = np.nan

In [27]:
for i in range(1, len(df)-1):
    if df['종가_등락률'][i-1] > 0:
        df['target'][i] = 1
    else:
        df['target'][i] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['target'][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['target'][i] = 0


In [28]:
df

Unnamed: 0,날짜,종가,전일비,시가,고가,저가,거래량,종가_등락률,거래량_등락률,고가-종가_비율,종가-저가_비율,연속상승,연속하락,target
0,2023.01.20,61800.0,300.0,62100.0,62300.0,61100.0,9598133.0,0.49,-25.06,0.80,1.13,1.0,0.0,
1,2023.01.19,61500.0,1100.0,60500.0,61500.0,60400.0,12808490.0,1.82,10.57,0.00,1.79,0.0,0.0,1.0
2,2023.01.18,60400.0,600.0,60700.0,61000.0,59900.0,11584041.0,-0.98,17.83,0.98,0.83,0.0,1.0,1.0
3,2023.01.17,61000.0,100.0,61200.0,61500.0,60600.0,9831456.0,-0.16,-2.08,0.81,0.66,0.0,0.0,0.0
4,2023.01.16,61100.0,300.0,61300.0,61600.0,60800.0,10039972.0,0.49,-19.75,0.81,0.49,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
747,2020.01.15,59000.0,1000.0,59500.0,59600.0,58900.0,14300928.0,-1.67,-15.41,1.01,0.17,0.0,0.0,1.0
748,2020.01.14,60000.0,0.0,60400.0,61000.0,59900.0,16906295.0,0.00,48.83,1.64,0.17,0.0,0.0,0.0
749,2020.01.13,60000.0,500.0,59600.0,60000.0,59100.0,11359139.0,0.84,-29.01,0.00,1.50,1.0,0.0,0.0
750,2020.01.10,59500.0,900.0,58800.0,59700.0,58300.0,16000170.0,1.54,-33.62,0.34,2.02,0.0,0.0,1.0


In [29]:
df_result = df.iloc[1:-1,7:].copy()
df_result['target'] = df_result['target'].astype('int')

In [31]:
df_result

Unnamed: 0,종가_등락률,거래량_등락률,고가-종가_비율,종가-저가_비율,연속상승,연속하락,target
1,1.82,10.57,0.00,1.79,0.0,0.0,1
2,-0.98,17.83,0.98,0.83,0.0,1.0,1
3,-0.16,-2.08,0.81,0.66,0.0,0.0,0
4,0.49,-19.75,0.81,0.49,1.0,0.0,0
5,0.50,-22.31,0.65,0.66,0.0,0.0,1
...,...,...,...,...,...,...,...
746,2.88,0.57,0.00,2.80,0.0,0.0,1
747,-1.67,-15.41,1.01,0.17,0.0,0.0,1
748,0.00,48.83,1.64,0.17,0.0,0.0,0
749,0.84,-29.01,0.00,1.50,1.0,0.0,0


In [32]:
df_result.to_csv('df_result.csv', index=False)