In [1]:
import numpy as np
import pandas as pd
import re
import create_regex

In [28]:
# Load the raw CryptoCurrency title CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('../dataset/raw_data/title_data/CryptoCurrency/2020-01-01.csv')

In [3]:
def pre_process(raw_df, column_name):
    """Return a cleaned copy of ``raw_df`` without modifying the input frame.

    The ``selftext`` column is removed when present, every row that still has
    a missing value in any column is dropped, and only the first row for each
    distinct value of ``column_name`` is kept.

    Args:
        raw_df: Source DataFrame. It is left untouched.
        column_name: Column whose duplicated values are collapsed.

    Returns:
        A new DataFrame holding the surviving rows with their original index
        labels.
    """
    # Original note: selftext is mostly missing for title data, so the column
    # is removed up front instead of letting dropna() discard those rows.
    # errors="ignore" makes this a no-op when the column is absent.
    df = raw_df.drop(columns=["selftext"], errors="ignore")
    df = df.dropna()  # drop rows with any missing value
    return df.drop_duplicates(subset=[column_name], keep="first")  # de-duplicate

In [4]:
def find_pattern(df, patterns, column_name):
    """Flag the rows whose ``column_name`` text matches at least one pattern.

    Args:
        df: DataFrame that holds the text column.
        patterns: Iterable of regular-expression strings. Matching is
            case-insensitive.
        column_name: Name of the string column to search.

    Returns:
        numpy.ndarray of bool with one entry per row of ``df``; True where any
        pattern is found. The result is always an ndarray (all False when
        ``patterns`` is empty), so callers can safely negate it with ``~``.
    """
    matched = np.zeros(df.shape[0], dtype=bool)
    text = df[column_name]
    for patt in patterns:
        # na=False treats missing cells as "no match" instead of raising.
        # pandas may emit a UserWarning when a pattern contains capture
        # groups; the boolean result is unaffected.
        hits = text.str.contains(patt, flags=re.IGNORECASE, regex=True, na=False)
        matched |= hits.to_numpy(dtype=bool)
    return matched

In [5]:
def get_valid_rows(df,column_name):
    """Return the rows of ``df`` whose ``column_name`` matches a keyword pattern.

    The patterns come from ``create_regex.get_word_patt(False)``; per the
    original note these are the Bitcoin-related word patterns kept in
    create_regex.py. Rows matching at least one of them are kept.
    """
    keep_mask = find_pattern(
        df,
        create_regex.get_word_patt(False),
        column_name,
    )
    return df.loc[keep_mask]

In [6]:
def remove_invalid_rows(df, column_name):
    """Drop unwanted rows and strip non-letter characters from ``column_name``.

    Rows whose text matches any pattern from ``create_regex.remove_patterns``
    or ``create_regex.get_word_patt()`` are removed. In the remaining rows,
    every character other than an ASCII letter or a space is deleted from
    ``column_name``; rows whose text becomes empty, or that contain any other
    missing value, are dropped.

    Args:
        df: Input DataFrame. It is not modified.
        column_name: Name of the text column to filter and clean.

    Returns:
        A new DataFrame with the cleaned, surviving rows.
    """
    # Copy the shared list before extending it: extending
    # create_regex.remove_patterns in place would grow the module-level list
    # on every call. Original notes: remove_patterns covers URLs, "nan", etc.;
    # get_word_patt() covers words that are common in advertisement posts.
    patterns = list(create_regex.remove_patterns)
    patterns.extend(create_regex.get_word_patt())

    # Coerce to a bool ndarray so `~` works even if the mask arrives as a list.
    invalid_index = np.asarray(find_pattern(df, patterns, column_name), dtype=bool)
    valid_df = df.loc[~invalid_index].copy()

    # Assign the cleaned column back explicitly. Calling replace(inplace=True)
    # on valid_df[column_name] acts on a temporary selection and does not
    # update valid_df under pandas Copy-on-Write.
    cleaned = valid_df[column_name].replace(r"[^a-zA-Z ]", "", regex=True)  # strip emoji, digits, symbols
    valid_df[column_name] = cleaned.replace("", float("NaN"))  # text that became empty -> missing
    return valid_df.dropna()  # drop every row with a missing value

In [29]:
# Show the frame as loaded, before any cleaning.
df

Unnamed: 0.1,Unnamed: 0,id,author,vader,title,created_utc,num_comments,score
0,0,epd4oi,grayfoxlouis,0.0000,The Future of Bitcoin ampamp Cryptos Blockchai...,1579143357,2,1
1,1,epcm30,A_solo_tripper,-0.6597,rCrazyIdea Alt coins band together and file a ...,1579140942,13,1
2,2,epcj7n,Minute-Guava,0.0000,ELI Craig Wright,1579140565,0,1
3,3,epccia,D_willow,-0.5868,Not sure why this isnt causing more excitement,1579139680,3,1
4,4,epc82i,good_data,0.0000,Ethereum Classic ETC Price History July Today,1579139086,0,1
...,...,...,...,...,...,...,...,...
2477,2556,ell3x1,yoyosales,0.5574,Bitcoin Revolution Bitcoin makes people rich,1578445705,1,1
2478,2557,elkwyo,khyzyto,0.0000,Probit Exchange,1578444773,1,1
2479,2558,elksme,thats_so_over,0.0772,Do people want to post memes again,1578444198,2,1
2480,2559,elkn4n,Smithtrades,0.2500,Gold Poised For New ATH DAILY CRYPTO amp FORE...,1578443535,0,1


In [30]:
# Original note: the leading "Unnamed" column ended up in the CSV by mistake, so drop it.
# It is dropped by name and without inplace so the cell is safe to re-run:
# dropping df.columns[0] in place would strip one more column on every execution.
df = df.drop(columns=["Unnamed: 0.1"], errors="ignore")
column_name = 'title'

In [31]:
# Drop selftext (if present), rows with missing values, and rows with a duplicated title.
df = pre_process(df,column_name)

In [32]:
# Show the frame after pre_process.
df

Unnamed: 0,id,author,vader,title,created_utc,num_comments,score
0,epd4oi,grayfoxlouis,0.0000,The Future of Bitcoin ampamp Cryptos Blockchai...,1579143357,2,1
1,epcm30,A_solo_tripper,-0.6597,rCrazyIdea Alt coins band together and file a ...,1579140942,13,1
2,epcj7n,Minute-Guava,0.0000,ELI Craig Wright,1579140565,0,1
3,epccia,D_willow,-0.5868,Not sure why this isnt causing more excitement,1579139680,3,1
4,epc82i,good_data,0.0000,Ethereum Classic ETC Price History July Today,1579139086,0,1
...,...,...,...,...,...,...,...
2477,ell3x1,yoyosales,0.5574,Bitcoin Revolution Bitcoin makes people rich,1578445705,1,1
2478,elkwyo,khyzyto,0.0000,Probit Exchange,1578444773,1,1
2479,elksme,thats_so_over,0.0772,Do people want to post memes again,1578444198,2,1
2480,elkn4n,Smithtrades,0.2500,Gold Poised For New ATH DAILY CRYPTO amp FORE...,1578443535,0,1


In [33]:
df = get_valid_rows(df,column_name) # Keep only the posts that contain Bitcoin-related words.

In [34]:
# Show the frame after the keyword filter.
df

Unnamed: 0,id,author,vader,title,created_utc,num_comments,score
0,epd4oi,grayfoxlouis,0.0000,The Future of Bitcoin ampamp Cryptos Blockchai...,1579143357,2,1
1,epcm30,A_solo_tripper,-0.6597,rCrazyIdea Alt coins band together and file a ...,1579140942,13,1
4,epc82i,good_data,0.0000,Ethereum Classic ETC Price History July Today,1579139086,0,1
5,epbx8d,good_data,0.0000,Ripple XRP Price History January Today,1579137657,2,1
6,epbtpc,woodydeck,0.0000,I Made an Ethereum Based Draw Poker Game and F...,1579137165,0,1
...,...,...,...,...,...,...,...
2474,ellmgx,goldenbzzz,-0.8126,Bitcoin holders oh shit here comes the bear ag...,1578448085,9,1
2475,ellgzk,vincenthint,-0.2732,Earlier Than Bitcoin HalvingBitmain to Further...,1578447379,3,1
2476,ell73x,Mirzaak,0.0000,How do you become expert in crypto,1578446123,1,1
2477,ell3x1,yoyosales,0.5574,Bitcoin Revolution Bitcoin makes people rich,1578445705,1,1


In [35]:
# Remove rows matching the removal patterns, then strip non-letter characters from the titles.
df = remove_invalid_rows(df,column_name)

In [36]:
# Show the final cleaned frame.
df

Unnamed: 0,id,author,vader,title,created_utc,num_comments,score
0,epd4oi,grayfoxlouis,0.0000,The Future of Bitcoin ampamp Cryptos Blockchai...,1579143357,2,1
1,epcm30,A_solo_tripper,-0.6597,rCrazyIdea Alt coins band together and file a ...,1579140942,13,1
4,epc82i,good_data,0.0000,Ethereum Classic ETC Price History July Today,1579139086,0,1
5,epbx8d,good_data,0.0000,Ripple XRP Price History January Today,1579137657,2,1
6,epbtpc,woodydeck,0.0000,I Made an Ethereum Based Draw Poker Game and F...,1579137165,0,1
...,...,...,...,...,...,...,...
2474,ellmgx,goldenbzzz,-0.8126,Bitcoin holders oh shit here comes the bear ag...,1578448085,9,1
2475,ellgzk,vincenthint,-0.2732,Earlier Than Bitcoin HalvingBitmain to Further...,1578447379,3,1
2476,ell73x,Mirzaak,0.0000,How do you become expert in crypto,1578446123,1,1
2477,ell3x1,yoyosales,0.5574,Bitcoin Revolution Bitcoin makes people rich,1578445705,1,1


* 결과 값으로 나온 df가 최종 df입니다. 이를 모델에 넣어주면 됩니다.
