## 텍스트 토큰화

In [1]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import nltk

In [2]:
# 구두점 데이터 다운로드
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
# 텍스트 데이터 생성

string_temp = "The science of today is the technology of tomorrow. Today is great. WTF"

token_temp = word_tokenize(string_temp)
print(token_temp)

sent_temp = sent_tokenize(string_temp)
print(sent_temp)

['The', 'science', 'of', 'today', 'is', 'the', 'technology', 'of', 'tomorrow', '.', 'Today', 'is', 'great', '.', 'WTF']
['The science of today is the technology of tomorrow.', 'Today is great.', 'WTF']


## 불용어 삭제

In [6]:
from nltk.corpus import stopwords
nltk.download('stopwords')    # 불용어 데이터 다운

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
# 단어 토큰 생성
tokenized_words = ['i','am','going','to','go','to','the','store','and','park']

In [14]:
# 불용어 로드
stop_words = stopwords.words('english')
print("불용어 리스트 길이 >>", len(stop_words))
print("불용어 리스트 >>", stop_words)

불용어 리스트 길이 >> 179
불용어 리스트 >> ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', '

In [16]:
# 불용어 삭제
# [word for word in tokenized_words if word not in stop_words]

for word in tokenized_words:
  if word not in stop_words:
    print(word)

going
go
store
park


In [15]:
# 불용어 확인
stop_data = stop_words[:5]
print("불용어 확인", stop_data)

불용어 확인 ['i', 'me', 'my', 'myself', 'we']


## 어간 추출

In [17]:
from nltk.stem.porter import PorterStemmer

In [18]:
# 단어 토큰을 만듭니다.
tokenized_words = ['i','am','humbled','by','this','traditional','meeting']

In [19]:
# 어간 추출기를 만듭니다.
porter = PorterStemmer()

In [20]:
word_list = []

In [21]:
# 어간 추출기를 적용합니다.
for word in tokenized_words:
  word_list.append(porter.stem(word))

print(word_list)

['i', 'am', 'humbl', 'by', 'thi', 'tradit', 'meet']


## 품사 태깅

In [23]:
from nltk import pos_tag
from nltk import word_tokenize

# 태거를 다운로드
nltk.download('averaged_perceptron_tagger')

# 샘플 텍스트 데이터
text_data = "Chris loved outdoor running"

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [24]:
# 사전 훈련된 품사 태깅을 사용합니다.
text_tagged = pos_tag(word_tokenize(text_data))
print(text_tagged)    # 품사확인(단어와 품사 태그로 이루어진 튜플의 리스트)

[('Chris', 'NNP'), ('loved', 'VBD'), ('outdoor', 'RP'), ('running', 'VBG')]


## 단어 중요도에 가중치 부여하기


In [25]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [31]:
text_data = np.array(['I love Korea. Korea!',
                      'Korea is best',
                      'Germany beats both'])

# tf-idf 특성 행렬을 만든다
tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(text_data)
feature_matrix      # tf-idf 특성 행렬 확인
print(feature_matrix)
feature_matrix.toarray()    # tf-idf 특성 행렬을 밀집 배열로 확인

# 특성 이름을 확인
tf = tfidf.vocabulary_
print('\n',tf)

  (0, 5)	0.8355915419449176
  (0, 6)	0.5493512310263033
  (1, 1)	0.6227660078332259
  (1, 4)	0.6227660078332259
  (1, 5)	0.4736296010332684
  (2, 2)	0.5773502691896257
  (2, 0)	0.5773502691896257
  (2, 3)	0.5773502691896257

 {'love': 6, 'korea': 5, 'is': 4, 'best': 1, 'germany': 3, 'beats': 0, 'both': 2}


## word cloud


In [32]:
!pip install pytagcloud

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytagcloud
  Downloading pytagcloud-0.3.5.tar.gz (754 kB)
[K     |████████████████████████████████| 754 kB 7.4 MB/s 
[?25hBuilding wheels for collected packages: pytagcloud
  Building wheel for pytagcloud (setup.py) ... [?25l[?25hdone
  Created wheel for pytagcloud: filename=pytagcloud-0.3.5-py3-none-any.whl size=759868 sha256=766caf15e7c625ed7c3d7eccb1273a516a179469e8af7970ffe7fc5830092086
  Stored in directory: /root/.cache/pip/wheels/be/9b/7d/911eafd2b3a1ab76aafb0d5929e7aede8db74f5436af7a2b9e
Successfully built pytagcloud
Installing collected packages: pytagcloud
Successfully installed pytagcloud-0.3.5


In [33]:
!pip install pygame

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pygame
  Downloading pygame-2.1.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (21.8 MB)
[K     |████████████████████████████████| 21.8 MB 1.3 MB/s 
[?25hInstalling collected packages: pygame
Successfully installed pygame-2.1.2


In [35]:
!pip install simplejson

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting simplejson
  Downloading simplejson-3.18.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (139 kB)
[K     |████████████████████████████████| 139 kB 5.9 MB/s 
[?25hInstalling collected packages: simplejson
Successfully installed simplejson-3.18.0


In [36]:
import pytagcloud
import pygame

tag =[('hello', 100), ('world', 80), ('Python', 120), ('kdb', 70), 
      ('nice', 60), ('Deep Learning', 20), ('DB', 40), ('great', 120), 
      ('MySQL', 110), ('DT', 125), ('SVM', 10), ('Text Data mining', 170), 
    ('kaggle', 45), ('randomForest', 55), ('Regresstion', 160), ('Loss Function', 195 ),
]

In [38]:
tag_list = pytagcloud.make_tags(tag, maxsize=50)  # tag화 시켜줌

# 이미지 생성
pytagcloud.create_tag_image(tag_list, 'word_cloud.jpg',
                            size=(900,600), rectangular=False)

print(tag_list)

[{'color': (121, 133, 21), 'size': 52, 'tag': 'Loss Function'}, {'color': (94, 161, 190), 'size': 46, 'tag': 'Text Data mining'}, {'color': (124, 140, 167), 'size': 44, 'tag': 'Regresstion'}, {'color': (89, 93, 172), 'size': 37, 'tag': 'DT'}, {'color': (195, 113, 125), 'size': 36, 'tag': 'Python'}, {'color': (188, 18, 177), 'size': 36, 'tag': 'great'}, {'color': (169, 214, 54), 'size': 34, 'tag': 'MySQL'}, {'color': (163, 190, 21), 'size': 31, 'tag': 'hello'}, {'color': (71, 61, 114), 'size': 27, 'tag': 'world'}, {'color': (39, 141, 131), 'size': 24, 'tag': 'kdb'}, {'color': (171, 33, 27), 'size': 22, 'tag': 'nice'}, {'color': (186, 161, 141), 'size': 20, 'tag': 'randomForest'}, {'color': (119, 73, 110), 'size': 18, 'tag': 'kaggle'}, {'color': (178, 143, 13), 'size': 16, 'tag': 'DB'}, {'color': (159, 38, 95), 'size': 10, 'tag': 'Deep Learning'}, {'color': (20, 129, 29), 'size': 7, 'tag': 'SVM'}]


In [44]:
import webbrowser
webbrowser.open('word_cloud.jpg')


False

## 넷플릭스 영화 추천 시스템 구현 예제


In [97]:
import pandas as pd
import numpy as np
# 데이터 읽어오기

movies = pd.read_csv("./movies.csv")
ratings = pd.read_csv("./ratings.csv")
print(movies)
print(ratings)
# 아이템 기반 협업 필터링 구현

data =pd.merge(ratings, movies, on="movieId")
column = ['userId', 'movieId', 'rating','title','genres']
data = data[column]
print(data)
moviedata = data.pivot_table(index="movieId", columns='userId')['rating']
print(moviedata)

# NaN 값을 -1로 변경

moviedata.fillna(-1, inplace=True)
print(moviedata)
from math import sqrt

# kdd 유사도 함수

def sim_distance(data, n1, n2):
  sum=0
  # 두 사용자가 모두 본 영화를 기준
  for i in data.loc[n1, data.loc[n1, :] >= 0].index:
    if data.loc[n2,i] >=0:
      # 누적합
      sum+= pow(data.loc[n1,i]-data.loc[n2,i], 2)

  return sqrt(1/(sum+1))   # 유사도 형식으로 출력
# 나와 유사도가 높은 유저 매칭 함수

def top_match(data, name, rank = 5, simf = sim_distance):
  simList = []
  for i in data.index[-10:]:
    if name != i:
      simList.append((simf(data,name, i),i ))
      
  simList.sort()
  simList.reverse()
  return simList[:rank]
# 추천 시스템 함수

def recommendation(data, person, simf = sim_distance):
  result_top = top_match(data, person, len(data))
  score_dic = {}
  sim_dic = {}
  my_list = []
  for sim, name in result_top:
    if sim < 0 :
      continue
    for movie in data.loc[person, data.loc[person, :] < 0].index:
      sim_sum = 0
      if data.loc[name, movie] >= 0:
        sim_sum += data.loc[name, movie]

        score_dic.setdefault(movie, 0)
        score_dic[movie] += sim_sum

        sim_dic.setdefault(movie, 0)
        sim_dic[movie] += sim

    for key in score_dic:
      my_list.append((score_dic[key]/sim_dic[key], key))
    my_list.sort()
    my_list.reverse()
    return my_list
# 20번 user가 안본 영화 중에서 
# 추천 점수가 가장 높은 순으로 예상 평점과 영화 제목을 추천(15개까지)

movie_list = []
for rate, m_id in recommendation(moviedata, 20):
  movie_list.append(
      (rate, movies.loc[movies['movieId']== m_id,'title'].values[0]))
  if len(movie_list) == 15:
    break

print(movie_list[:15])

       movieId                               title  \
0            1                    Toy Story (1995)   
1            2                      Jumanji (1995)   
2            3             Grumpier Old Men (1995)   
3            4            Waiting to Exhale (1995)   
4            5  Father of the Bride Part II (1995)   
...        ...                                 ...   
27273   131254        Kein Bund für's Leben (2007)   
27274   131256       Feuer, Eis & Dosenbier (2002)   
27275   131258                  The Pirates (2014)   
27276   131260                 Rentun Ruusu (2001)   
27277   131262                    Innocence (2014)   

                                            genres  
0      Adventure|Animation|Children|Comedy|Fantasy  
1                       Adventure|Children|Fantasy  
2                                   Comedy|Romance  
3                             Comedy|Drama|Romance  
4                                           Comedy  
...                              

In [72]:
# 데이터 읽어오기

movies = pd.read_csv("./movies.csv")
ratings = pd.read_csv("./ratings.csv")

In [49]:
print(movies)
print(ratings)

       movieId                               title  \
0            1                    Toy Story (1995)   
1            2                      Jumanji (1995)   
2            3             Grumpier Old Men (1995)   
3            4            Waiting to Exhale (1995)   
4            5  Father of the Bride Part II (1995)   
...        ...                                 ...   
27273   131254        Kein Bund für's Leben (2007)   
27274   131256       Feuer, Eis & Dosenbier (2002)   
27275   131258                  The Pirates (2014)   
27276   131260                 Rentun Ruusu (2001)   
27277   131262                    Innocence (2014)   

                                            genres  
0      Adventure|Animation|Children|Comedy|Fantasy  
1                       Adventure|Children|Fantasy  
2                                   Comedy|Romance  
3                             Comedy|Drama|Romance  
4                                           Comedy  
...                              

In [73]:
# 아이템 기반 협업 필터링 구현

data =pd.merge(ratings, movies, on="movieId")
column = ['userId', 'movieId', 'rating','title','genres']
data = data[column]
print(data)

         userId  movieId  rating  \
0             1        2     3.5   
1             5        2     3.0   
2            13        2     3.0   
3            29        2     3.0   
4            34        2     3.0   
...         ...      ...     ...   
1048570    7066    88572     1.5   
1048571    7066   112412     4.5   
1048572    7077    32013     3.5   
1048573    7086   102596     4.5   
1048574    7110    65651     2.0   

                                                     title  \
0                                           Jumanji (1995)   
1                                           Jumanji (1995)   
2                                           Jumanji (1995)   
3                                           Jumanji (1995)   
4                                           Jumanji (1995)   
...                                                    ...   
1048570                             Fred: The Movie (2010)   
1048571                             Perfect Sisters (2014)   
1048572  

In [74]:
moviedata = data.pivot_table(index="movieId", columns='userId')['rating']
print(moviedata)


userId   1     2     3     4     5     6     7     8     9     10    ...  \
movieId                                                              ...   
1         NaN   NaN   4.0   NaN   NaN   5.0   NaN   4.0   NaN   4.0  ...   
2         3.5   NaN   NaN   NaN   3.0   NaN   NaN   NaN   NaN   NaN  ...   
3         NaN   4.0   NaN   NaN   NaN   3.0   3.0   5.0   NaN   NaN  ...   
4         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
5         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
...       ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   
130073    NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
130219    NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
130462    NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
130490    NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
130642    NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   

userId   71

In [75]:
# NaN 값을 -1로 변경

moviedata.fillna(-1, inplace=True)
print(moviedata)

userId   1     2     3     4     5     6     7     8     9     10    ...  \
movieId                                                              ...   
1        -1.0  -1.0   4.0  -1.0  -1.0   5.0  -1.0   4.0  -1.0   4.0  ...   
2         3.5  -1.0  -1.0  -1.0   3.0  -1.0  -1.0  -1.0  -1.0  -1.0  ...   
3        -1.0   4.0  -1.0  -1.0  -1.0   3.0   3.0   5.0  -1.0  -1.0  ...   
4        -1.0  -1.0  -1.0  -1.0  -1.0  -1.0  -1.0  -1.0  -1.0  -1.0  ...   
5        -1.0  -1.0  -1.0  -1.0  -1.0  -1.0  -1.0  -1.0  -1.0  -1.0  ...   
...       ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   
130073   -1.0  -1.0  -1.0  -1.0  -1.0  -1.0  -1.0  -1.0  -1.0  -1.0  ...   
130219   -1.0  -1.0  -1.0  -1.0  -1.0  -1.0  -1.0  -1.0  -1.0  -1.0  ...   
130462   -1.0  -1.0  -1.0  -1.0  -1.0  -1.0  -1.0  -1.0  -1.0  -1.0  ...   
130490   -1.0  -1.0  -1.0  -1.0  -1.0  -1.0  -1.0  -1.0  -1.0  -1.0  ...   
130642   -1.0  -1.0  -1.0  -1.0  -1.0  -1.0  -1.0  -1.0  -1.0  -1.0  ...   

userId   71

In [98]:
from math import sqrt

# kdd 유사도 함수

def sim_distance(data, n1, n2):
  sum=0
  # 두 사용자가 모두 본 영화를 기준
  for i in data.loc[n1, data.loc[n1, :] >= 0].index:
    if data.loc[n2,i] >=0:
      # 누적합
      sum+= pow(data.loc[n1,i]-data.loc[n2,i], 2)

  return sqrt(1/(sum+1))   # 유사도 형식으로 출력

In [99]:
# 나와 유사도가 높은 유저 매칭 함수

def top_match(data, name, rank = 5, simf = sim_distance):
  simList = []
  for i in data.index[-10:]:
    if name != i:
      simList.append((simf(data,name, i),i ))
      
  simList.sort()
  simList.reverse()
  return simList[:rank]

In [100]:
# 추천 시스템 함수

def recommendation(data, person, simf = sim_distance):
  result_top = top_match(data, person, len(data))
  score_dic = {}
  sim_dic = {}
  my_list = []
  for sim, name in result_top:
    if sim < 0 :
      continue
    for movie in data.loc[person, data.loc[person, :] < 0].index:
      sim_sum = 0
      if data.loc[name, movie] >= 0:
        sim_sum += data.loc[name, movie]

        score_dic.setdefault(movie, 0)
        score_dic[movie] += sim_sum

        sim_dic.setdefault(movie, 0)
        sim_dic[movie] += sim

    for key in score_dic:
      my_list.append((score_dic[key]/sim_dic[key], key))
    my_list.sort()
    my_list.reverse()
    return my_list

In [101]:
# 20번 user가 안본 영화 중에서 
# 추천 점수가 가장 높은 순으로 예상 평점과 영화 제목을 추천(15개까지)

movie_list = []
for rate, m_id in recommendation(moviedata, 20):
  movie_list.append(
      (rate, movies.loc[movies['movieId']== m_id,'title'].values[0]))
  if len(movie_list) == 15:
    break

print(movie_list[:15])

[(3.0, 'Cecil B. DeMented (2000)')]


# 날짜데이터

## 문자열을 날짜로 변환

In [108]:
date_strings = np.array(['03-12-2022 04:23 AM',
                        '03-12-2022 06:11 PM',
                        '28-08-2022 08:48 PM'])

# Timestamp 객체로 변환
for date in date_strings:
  pd.to_datetime(date, format='%d-%m-%Y %I:%M %p')

# for date in date_strings:
#   pd.to_datetime(date, format="%d-%m-%Y %I:%M %p", errors="ignore")

data = pd.to_datetime(date_strings)
print(data)

DatetimeIndex(['2022-03-12 04:23:00', '2022-03-12 18:11:00',
               '2022-08-28 20:48:00'],
              dtype='datetime64[ns]', freq=None)


In [109]:
date_strings = np.array(['03-12-2022 04:23 AM',
                        '03-12-2022 06:11 PM',
                        '28-08-2022 08:48 PM'])

# Timestamp 객체로 변환
for date in date_strings:
  pd.to_datetime(date, format='%d-%m-%Y %I:%M %p')

for date in date_strings:
  pd.to_datetime(date, format="%d-%m-%Y %I:%M %p", errors="ignore")

data = pd.to_datetime(date_strings)
print(data)

DatetimeIndex(['2022-03-12 04:23:00', '2022-03-12 18:11:00',
               '2022-08-28 20:48:00'],
              dtype='datetime64[ns]', freq=None)


## 시간대 데이터 처리


In [89]:
# datetime 만들기
pd.Timestamp('2017-05-01 06:00:00', tz='Europe/London')
date = pd.Timestamp('2017-05-01 06:00:00')
print('\n', date)

# 시간대 지정
date_in_london = date.tz_localize('Europe/London')
date_in_london
print('\n', date_in_london)

# 시간대를 변환
date_in_london.tz_convert('Africa/Abidjan')

# 세 개의 날짜를 만듭니다.
dates = pd.Series(pd.date_range('2/2/2002', periods=3, freq='M'))
temp = dates.dt.tz_localize('Africa/Abidjan')     # 시간대 지정

print('\n', temp)


 2017-05-01 06:00:00

 2017-05-01 06:00:00+01:00

 0   2002-02-28 00:00:00+00:00
1   2002-03-31 00:00:00+00:00
2   2002-04-30 00:00:00+00:00
dtype: datetime64[ns, Africa/Abidjan]


In [91]:
!pip install pytz

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [95]:
from pytz import all_timezones
from pytz import timezone

dates = pd.Series(pd.date_range('2/2/2002', periods=3, freq='M'))

# 두 개의 시간대를 확인
all_timezones[0:2]
print(all_timezones[0:2])
temp = dates.dt.tz_localize('dateutil/Asia/Seoul')
print('\n',temp)

tz = timezone('Asia/Seoul')
temp01 = dates.dt.tz_localize(tz)

print('\n',temp01)

['Africa/Abidjan', 'Africa/Accra']

 0   2002-02-28 00:00:00+09:00
1   2002-03-31 00:00:00+09:00
2   2002-04-30 00:00:00+09:00
dtype: datetime64[ns, tzfile('/usr/share/zoneinfo/Asia/Seoul')]

 0   2002-02-28 00:00:00+09:00
1   2002-03-31 00:00:00+09:00
2   2002-04-30 00:00:00+09:00
dtype: datetime64[ns, Asia/Seoul]


## 날짜와 시간 선택하기

In [107]:
dataframe = pd.DataFrame()
# datetime 만들기
# periods 매개변수는 date_range함수에 전달된 시작날짜와 종료날짜를
# periods 매개변수에 전달된 기간만큼 동일하게 나누어 출력해줍니다.
dataframe['date'] = pd.date_range('1/1/2001', periods=100000, freq='H')


# 두 datetime 사이의 샘플을 선택합니다.
dataframe[(dataframe['date'] > '2002-1-1 01:00:00') &
          (dataframe['date'] <= '2002-1-1 06:00:00')]
dataframe = dataframe.set_index(dataframe['date'])    # datetime을 만든다

# 두 datetime 사이 샘플 선택
temp = dataframe.loc['2002-1-1 01:00:00':'2002-1-1 06:00:00']

print(temp)

                                   date
date                                   
2002-01-01 01:00:00 2002-01-01 01:00:00
2002-01-01 02:00:00 2002-01-01 02:00:00
2002-01-01 03:00:00 2002-01-01 03:00:00
2002-01-01 04:00:00 2002-01-01 04:00:00
2002-01-01 05:00:00 2002-01-01 05:00:00
2002-01-01 06:00:00 2002-01-01 06:00:00


## 날짜 데이터를 여러 특성으로 분할

In [115]:
dataframe = pd.DataFrame()
# 다섯개의 날짜 생성
dataframe['date'] = pd.date_range('1/1/2022', periods=150, freq='W')

# data = dataframe['date'] = pd.date_range('1/1/2022', periods=150, freq='W')
# print(data)

# 연, 월, 일, 시, 분에 대한 특성 생성
dataframe['year'] = dataframe['date'].dt.year
dataframe['month'] = dataframe['date'].dt.month
dataframe['day'] = dataframe['date'].dt.day
dataframe['hour'] = dataframe['date'].dt.hour
dataframe['minute'] = dataframe['date'].dt.minute

# data = dataframe['year'] = dataframe['date'].dt.year
# print(data)

print(dataframe.head(7))

        date  year  month  day  hour  minute
0 2022-01-02  2022      1    2     0       0
1 2022-01-09  2022      1    9     0       0
2 2022-01-16  2022      1   16     0       0
3 2022-01-23  2022      1   23     0       0
4 2022-01-30  2022      1   30     0       0
5 2022-02-06  2022      2    6     0       0
6 2022-02-13  2022      2   13     0       0


## 날짜 간의 차이 계산


In [119]:
dataframe = pd.DataFrame()

# 두 datatime 특성 생성
dataframe['Arrived'] = [pd.Timestamp('01-01-2017'), pd.Timestamp('01-04-2017')]
dataframe['Left'] = [pd.Timestamp('01-01-2017'), pd.Timestamp('01-06-2017')]
print(dataframe['Arrived'],'\n',dataframe['Left'])

# 특성 사이의 차이를 계산
dataframe['Left'] - dataframe['Arrived']

data = pd.Series(delta.days for delta in (
    dataframe['Left'] - dataframe['Arrived']))    # 특성 간 기간계산

print(data)

0   2017-01-01
1   2017-01-04
Name: Arrived, dtype: datetime64[ns] 
 0   2017-01-01
1   2017-01-06
Name: Left, dtype: datetime64[ns]
0    0
1    2
dtype: int64


## 시계열 데이터에서 누락된 값 처리


In [124]:
time_index = pd.date_range("01/01/2010", periods=5, freq="M")
dataframe = pd.DataFrame(index=time_index)  # 데이터프레임을 만들고 인덱스를 지정
print(dataframe,'\n')

data1 = dataframe["Sales"] = [1.0, 2.0, np.nan, np.nan, 5.0]     # 누락된 값이 있는 특성 생성
print(data1,'\n')

data2 = dataframe.interpolate()   # 누락된 값을 보간
print(data2,'\n')

data3 = dataframe.ffill()     # 앞쪽으로 채우기(Forward-fill)
print(data3,'\n')

data4 = dataframe.bfill()     # 뒤쪽으로 채우기(Back-fill)
print(data4,'\n')

data5 = dataframe.interpolate(method ="quadratic")    # 비선형일 경우 보간 방법 변경
print(data5,'\n')

print(dataframe.interpolate(limit=1, limit_direction="forward"))     # 보간 방향 지정


Empty DataFrame
Columns: []
Index: [2010-01-31 00:00:00, 2010-02-28 00:00:00, 2010-03-31 00:00:00, 2010-04-30 00:00:00, 2010-05-31 00:00:00] 

[1.0, 2.0, nan, nan, 5.0] 

            Sales
2010-01-31    1.0
2010-02-28    2.0
2010-03-31    3.0
2010-04-30    4.0
2010-05-31    5.0 

            Sales
2010-01-31    1.0
2010-02-28    2.0
2010-03-31    2.0
2010-04-30    2.0
2010-05-31    5.0 

            Sales
2010-01-31    1.0
2010-02-28    2.0
2010-03-31    5.0
2010-04-30    5.0
2010-05-31    5.0 

               Sales
2010-01-31  1.000000
2010-02-28  2.000000
2010-03-31  3.059808
2010-04-30  4.038069
2010-05-31  5.000000 

            Sales
2010-01-31    1.0
2010-02-28    2.0
2010-03-31    3.0
2010-04-30    NaN
2010-05-31    5.0
