In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

from tqdm import tqdm

from keybert import KeyBERT

import re

from wordcloud import WordCloud
from collections import defaultdict # 데이터프레임 만들때, 딕셔너리형 만들때 사용
import ast # 워드클라우드 전처리에 사용
from sklearn.feature_extraction.text import TfidfVectorizer # idf제

plt.rc("font", family = "Malgun Gothic")
plt.rcParams["axes.unicode_minus"] = False

## 크롤링 - 1960~2020 토론 크롤링하여 데이터 생성

In [None]:
moderators =['QUINCY HOWE, MODERATOR', 'MR. HOWE', 'MODERATOR', 'QUINCY HOWE', 'MR. SINGISER', 'MR. CRONKITE', 
            'MR. EDWARDS', 'MR. CRONKITE','MR. CHANCELLOR','MR. VON FREMD', 'MR. CATER','MR. DRUMMOND',
             'BILL SHADEL, MODERATOR', 'MR. SHADEL', 'MR. McGEE','MR. VON FREMD','FRANK McGEE, MODERATOR',
             'MR. NIVEN', 'MR. MORGAN', 'MR. SPIVAK', 'MR. LEVY', 'HOWARD K. SMITH, MODERATOR', 'MR. SMITH',
             'MR. FLEMING', 'MR. WARREN', 'MR. VANOCUR', 'MR. SM1TH','MR. NOVINS','Mr. SMITH'
             'THE MODERATOR','MR. KRAFT', 'MR. MAYNARD', 'MR. NELSON', 'MR. TREWHITT', 'MR. VALERIANI',
             'MR. FRANKEL', 'MR. REYNOLDS', 'MR. GANNON', 'MS. DREW', 'MS. WALTERS', 'MR. HILLIARD',
             'MRS. HINERFELD', 'MR. STONE', 'MR. ELLIS', 'MR. HILLIARD', 'Ms. Ridings', 'Mr. Newman',
             'Ms. Geyer', 'Mr. Kalb', 'Mr. Kondracke', 'Mr. Trewhitt', 'Ms. Ridings', 'Mr. Wieghart', 
             'Mr. Barnes', 'SHAW', 'COMPTON,', 'WARNER', 'MITCHELL', 'LEHRER','MASHEK', 'GROER', 
             'JENNINGS', 'THE MODERATOR', 'Ms. DREW', 'WELKER', 'WALLACE', 'PARTICIPANTS', 'Contact',
             'MODERATOR', 'WALLACE', 'COOPER', 'QUESTION', 'MODERATORS', 'RADDATZ', 'HOLT', 'Moderator Bob Schieffer',
             'Republican Presidential Nominee W. Mitt Romney', 'Mr. Schieffer', 'Ms. Crowley', 'Q', 'Moderator Candy Crowley',
             'Jim Lehrer', 'Mr. Lehrer', 'Moderator Jim Lehrer', 'SCHIEFFER', 'QUESTION', 'BROKAW', 
             'Mr. Schieffer', 'Bob Schieffer', 'Senator John F. Kerry', 'Cheryl Otis', 'Mr. Gibson', 
             'Charles Gibson', 'Anthony Baldi', 'James Hubb','Linda Grabel', 'Daniel Farley', 'Rob Fowler',
             'Nikki Washington', 'Robin Dahle', 'Jonathan Michaelson', 'John Horstman','Norma-Jean Laurent',
             'Elizabeth Long', 'Mr. Dahle', 'Jane Barrow', 'James Varner', 'Sarah Degenhart','Randee Jacobs',  
             'Ann Bronsing', 'James Hubb', 'Jim Lehrer', 'Mr. Lehrer', 'Jim Lehrer', 'Mr. Lehrer','Senator Bob Dole',
             'MR. SPIVAK', 'Ms. FRANKEL', 'MS. DREW', 'MS. DREW', 'MRS. HINERFELD', 'MS. WALTERS', 'Ms. Geyer',
             'Ms. Ridings', 'Ms. Walters', 'Ms. Sawyer',
            ]

save_dir = r'E:\est\kdt\project\president_prediction\data\Debates'

In [None]:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

def get_soup_para(url):
    req = Request(url, headers={"user-agent": "chrome"})
    response = urlopen(req)
    soup = BeautifulSoup(response, "html.parser")
    article_body = soup.select_one('#block-system-main > div > div > div.col-sm-8 > div.field-docs-content')
    paragraphs = article_body.find_all('p')
    
    return soup, paragraphs

def extract_speaker(soup, tag):
    elements = soup.select(tag)
    unique_elements = list(set(elements))
    text_list = [element.text.strip() for element in unique_elements]
#     print(text_list)
    return text_list

def extract_speech_col(paragraphs, speakers):
    speeches = {speaker: [] for speaker in speakers}
    current_speaker = None

    for para in paragraphs:
        text = para.get_text(strip=True)  #paragraphs가 문자열 리스트라고 가정
        for speaker in speakers:
            if text.startswith(f'{speaker}:'):
                current_speaker = speaker
                speeches[speaker].append(text[len(f'{speaker}:'):].strip()) #발언을 추가할 때에는 발언자 이름 다음의 콜론 이후의 문자열을 추출
                break
        else: #모든 발언자 이름에 대해 text가 {speaker}:로 시작하지 않는 경우 실행
            if current_speaker and not any(text.startswith(f'{moderator}:') for moderator in moderators):
                speeches[current_speaker].append(text) #current_speaker의 발언으로 간주

    return speeches #발언자:[연설문] 형태의 딕셔너리형태

def extract_speech_dot(paragraphs, speakers): # 이름.
    speeches = {speaker: [] for speaker in speakers}
    current_speaker = None

    for para in paragraphs:
        text = para.get_text(strip=True)  #paragraphs가 문자열 리스트라고 가정
        for speaker in speakers:
            if text.startswith(f'{speaker}.'):
                current_speaker = speaker
                speeches[speaker].append(text[len(f'{speaker}.'):].strip()) #발언을 추가할 때에는 발언자 이름 다음의 콜론 이후의 문자열을 추출
                break
        else: #모든 발언자 이름에 대해 text가 {speaker}.로 시작하지 않는 경우 실행
            if current_speaker and not any(text.startswith(moderator) for moderator in moderators):
                speeches[current_speaker].append(text) #current_speaker의 발언으로 간주

    return speeches

In [None]:
# 1960
url_1960_1 = 'https://www.presidency.ucsb.edu/documents/presidential-debate-new-york'
url_1960_2 = 'https://www.presidency.ucsb.edu/documents/presidential-debate-broadcast-from-new-york-and-los-angeles'
url_1960_3 = 'https://www.presidency.ucsb.edu/documents/presidential-debate-washington-dc'
url_1960_4 = 'https://www.presidency.ucsb.edu/documents/presidential-debate-chicago'

soup_1960_1, para_1960_1 = get_soup_para(url_1960_1)
soup_1960_2, para_1960_2 = get_soup_para(url_1960_2)
soup_1960_3, para_1960_3 = get_soup_para(url_1960_3)
soup_1960_4, para_1960_4 = get_soup_para(url_1960_4)

speaker_1960_1 = ['MR. NIXON','MR. KENNEDY']

speech_1960_1 = extract_speech_col(para_1960_1, speaker_1960_1)
speech_1960_2 = extract_speech_col(para_1960_2, speaker_1960_1)
speech_1960_3 = extract_speech_col(para_1960_3, speaker_1960_1)
speech_1960_4 = extract_speech_col(para_1960_4, speaker_1960_1)

In [None]:
year = '1960'
name = ['Richard Nixon', 'John F. Kennedy']
date = ['October 21, 1960', 'October 13, 1960','October 7, 1960','September 26, 1960']
data1 = {
    'name':[name[0]]*4,
    'date':date,
    'speech':[speech_1960_1[speaker_1960_1[0]], speech_1960_2[speaker_1960_1[0]], speech_1960_3[speaker_1960_1[0]], speech_1960_4[speaker_1960_1[0]]]
}
data2 = {
    'name':[name[1]]*4,
    'date':date,
    'speech':[speech_1960_1[speaker_1960_1[1]], speech_1960_2[speaker_1960_1[1]], speech_1960_3[speaker_1960_1[1]], speech_1960_4[speaker_1960_1[1]]]
}


df1_1960 = pd.DataFrame(data1)
df2_1960 = pd.DataFrame(data2)
filename1 = f'{save_dir}/{year}_{name[0]}_debate.csv'
filename2 = f'{save_dir}/{year}_{name[1]}_debate.csv'
# df1_1960.to_csv(filename1, index=False)
# df2_1960.to_csv(filename2, index=False)

In [None]:
df2_1960

Unnamed: 0,name,date,speech
0,John F. Kennedy,"October 21, 1960","[Good evening, Mr. Howe., Mr. Howe, Mr. Vice P..."
1,John F. Kennedy,"October 13, 1960","[Good evening, Mr. Shadel., Mr. McGee, we have..."
2,John F. Kennedy,"October 7, 1960",[In the first place I've never suggested that ...
3,John F. Kennedy,"September 26, 1960","[Well, the Vice President and I came to the Co..."


In [None]:
# 1976
url_1976_1 = 'https://www.presidency.ucsb.edu/documents/presidential-campaign-debate-0'
url_1976_2 = 'https://www.presidency.ucsb.edu/documents/presidential-campaign-debate'
url_1976_3 = 'https://www.presidency.ucsb.edu/documents/presidential-campaign-debate-1'

soup_1976_1, para_1976_1 = get_soup_para(url_1976_1)
soup_1976_2, para_1976_2 = get_soup_para(url_1976_2)
soup_1976_3, para_1976_3 = get_soup_para(url_1976_3)

speaker_1976_1 = ['MR. CARTER', 'THE PRESIDENT']

speech_1976_1 = extract_speech_dot(para_1976_1, speaker_1976_1)
speech_1976_2 = extract_speech_dot(para_1976_2, speaker_1976_1)
speech_1976_3 = extract_speech_dot(para_1976_3, speaker_1976_1)

In [None]:
year = '1976'
name = ['Jimmy Carter', 'Gerald R. Ford']
date = ['October 22, 1976','October 6, 1976','September 23, 1976']
k=3
data1 = {
    'name':[name[0]]*k,
    'date':date,
    'speech':[speech_1976_1[speaker_1976_1[0]], speech_1976_2[speaker_1976_1[0]], speech_1976_3[speaker_1976_1[0]]]
}
data2 = {
    'name':[name[1]]*k,
    'date':date,
    'speech':[speech_1976_1[speaker_1976_1[1]], speech_1976_2[speaker_1976_1[1]], speech_1976_3[speaker_1976_1[1]]]
}


df1_1976 = pd.DataFrame(data1)
df2_1976 = pd.DataFrame(data2)
filename1 = f'{save_dir}/{year}_{name[0]}_debate.csv'
filename2 = f'{save_dir}/{year}_{name[1]}_debate.csv'
# df1_1976.to_csv(filename1, index=False)
# df2_1976.to_csv(filename2, index=False)

In [None]:
df1_1976

Unnamed: 0,name,date,speech
0,Jimmy Carter,"October 22, 1976","[Well, I might say first of all, that I think ..."
1,Jimmy Carter,"October 6, 1976","[Well, I think this Republican administration ..."
2,Jimmy Carter,"September 23, 1976",[Yes. First of all it's to recognize the treme...


In [None]:
# 1980 카터대통령이 2번째 토론참석 x
url_1980_1 = 'https://www.presidency.ucsb.edu/documents/presidential-debate-cleveland'
soup_1980_1, para_1980_1 = get_soup_para(url_1980_1)

speaker_1980_1 = ['GOVERNOR REAGAN', 'THE PRESIDENT']

speech_1980_1 = extract_speech_dot(para_1980_1, speaker_1980_1)

In [None]:
year = '1980'
name = ['Ronald Reagan', 'Jimmy Carter']
date = ['October 28, 1980']
k=1
data1 = {
    'name':[name[0]]*k,
    'date':date,
    'speech':[speech_1980_1[speaker_1980_1[0]]]
}
data2 = {
    'name':[name[1]]*k,
    'date':date,
    'speech':[speech_1980_1[speaker_1980_1[1]]]
}


df1_1980 = pd.DataFrame(data1)
df2_1980 = pd.DataFrame(data2)
filename1 = f'{save_dir}/{year}_{name[0]}_debate.csv'
filename2 = f'{save_dir}/{year}_{name[1]}_debate.csv'
# df1_1980.to_csv(filename1, index=False)
# df2_1980.to_csv(filename2, index=False)

In [None]:
df1_1980

Unnamed: 0,name,date,speech
0,Ronald Reagan,"October 28, 1980","[I don't know what the differences might be, b..."


In [None]:
# 1984
url_1984_1 = 'https://www.presidency.ucsb.edu/documents/debate-between-the-president-and-former-vice-president-walter-f-mondale-kansas-city'
url_1984_2 = 'https://www.presidency.ucsb.edu/documents/debate-between-the-president-and-former-vice-president-walter-f-mondale-louisville'

soup_1984_1, para_1984_1 = get_soup_para(url_1984_1)
soup_1984_2, para_1984_2 = get_soup_para(url_1984_2)


speaker_1984_1 = ['Mr. Mondale', 'The President']

speech_1984_1 = extract_speech_dot(para_1984_1, speaker_1984_1)
speech_1984_2 = extract_speech_dot(para_1984_2, speaker_1984_1)

In [None]:
year = '1984'
name = ['Walter F. Mondale','Ronald Reagan']
date = ['October 21, 1984','October 7, 1984']
k=2
data1 = {
    'name':[name[0]]*k,
    'date':date,
    'speech':[speech_1984_1[speaker_1984_1[0]],speech_1984_2[speaker_1984_1[0]]]
}
data2 = {
    'name':[name[1]]*k,
    'date':date,
    'speech':[speech_1984_1[speaker_1984_1[1]],speech_1984_2[speaker_1984_1[1]]]
}


df1_1984 = pd.DataFrame(data1)
df2_1984 = pd.DataFrame(data2)
filename1 = f'{save_dir}/{year}_{name[0]}_debate.csv'
filename2 = f'{save_dir}/{year}_{name[1]}_debate.csv'
# df1_1984.to_csv(filename1, index=False)
# df2_1984.to_csv(filename2, index=False)

In [None]:
df1_1984

Unnamed: 0,name,date,speech
0,Walter F. Mondale,"October 21, 1984",[I believe that the question oversimplifies th...
1,Walter F. Mondale,"October 7, 1984",[One of the key tests of leadership is whether...


In [None]:
# 1988
url_1988_1 = 'https://www.presidency.ucsb.edu/documents/presidential-debate-the-university-california-los-angeles'
url_1988_2 = 'https://www.presidency.ucsb.edu/documents/presidential-debate-winston-salem-north-carolina'

soup_1988_1, para_1988_1 = get_soup_para(url_1988_1)
soup_1988_2, para_1988_2 = get_soup_para(url_1988_2)

speaker_1988_1 = ['BUSH', 'DUKAKIS']

speech_1988_1 = extract_speech_col(para_1988_1, speaker_1988_1)
speech_1988_2 = extract_speech_col(para_1988_2, speaker_1988_1)

In [None]:
year = '1988'
name = ['George Bush','Michael S. Dukakis']
date = ['October 13, 1988','September 25, 1988']
k=2
data1 = {
    'name':[name[0]]*k,
    'date':date,
    'speech':[speech_1988_1[speaker_1988_1[0]],speech_1988_2[speaker_1988_1[0]]]
}
data2 = {
    'name':[name[1]]*k,
    'date':date,
    'speech':[speech_1988_1[speaker_1988_1[1]],speech_1988_2[speaker_1988_1[1]]]
}


df1_1988 = pd.DataFrame(data1)
df2_1988 = pd.DataFrame(data2)
filename1 = f'{save_dir}/{year}_{name[0]}_debate.csv'
filename2 = f'{save_dir}/{year}_{name[1]}_debate.csv'
# df1_1988.to_csv(filename1, index=False)
# df2_1988.to_csv(filename2, index=False)

In [None]:
df1_1988

Unnamed: 0,name,date,speech
0,George Bush,"October 13, 1988","[Well, a lot of what this campaign is about, i..."
1,George Bush,"September 25, 1988",[I think we've seen a deterioration of values....


In [None]:
# 1996
url_1996_1 = 'https://www.presidency.ucsb.edu/documents/presidential-debate-san-diego'
url_1996_2 = 'https://www.presidency.ucsb.edu/documents/presidential-debate-hartford'

soup_1996_1, para_1996_1 = get_soup_para(url_1996_1)
soup_1996_2, para_1996_2 = get_soup_para(url_1996_2)

extract_speaker(soup_1996_1, 'em')
extract_speaker(soup_1996_2, 'em')

speaker_1996_1 = ['The President', 'Senator Dole']

speech_1996_1 = extract_speech_dot(para_1996_1, speaker_1996_1)
speech_1996_2 = extract_speech_dot(para_1996_2, speaker_1996_1)

In [None]:
year = '1996'
name = ['William J. Clinton','Robert Dole']
date = ['October 16, 1996','October 6, 1996']
k=2
data1 = {
    'name':[name[0]]*k,
    'date':date,
    'speech':[speech_1996_1[speaker_1996_1[0]],speech_1996_2[speaker_1996_1[0]]]
}
data2 = {
    'name':[name[1]]*k,
    'date':date,
    'speech':[speech_1996_1[speaker_1996_1[1]],speech_1996_2[speaker_1996_1[1]]]
}


df1_1996 = pd.DataFrame(data1)
df2_1996 = pd.DataFrame(data2)
filename1 = f'{save_dir}/{year}_{name[0]}_debate.csv'
filename2 = f'{save_dir}/{year}_{name[1]}_debate.csv'
# df1_1996.to_csv(filename1, index=False)
# df2_1996.to_csv(filename2, index=False)

In [None]:
df1_1996

Unnamed: 0,name,date,speech
0,William J. Clinton,"October 16, 1996","[I was going to applaud, too. [Laughter], Well..."
1,William J. Clinton,"October 6, 1996","[Thank you, Jim, and thank you to the people o..."


In [None]:
# 2004
url_2004_1 = 'https://www.presidency.ucsb.edu/documents/presidential-debate-tempe-arizona'
url_2004_2 = 'https://www.presidency.ucsb.edu/documents/presidential-debate-st-louis-missouri'
url_2004_3 = 'https://www.presidency.ucsb.edu/documents/presidential-debate-coral-gables-florida'

soup_2004_1, para_2004_1 = get_soup_para(url_2004_1)
soup_2004_2, para_2004_2 = get_soup_para(url_2004_2)
soup_2004_3, para_2004_3 = get_soup_para(url_2004_3)

extract_speaker(soup_2004_1, 'em')
extract_speaker(soup_2004_2, 'em')
extract_speaker(soup_2004_3, 'em')

speaker_2004_1 = ['President Bush', 'Senator Kerry']

speech_2004_1 = extract_speech_dot(para_2004_1, speaker_2004_1)
speech_2004_2 = extract_speech_dot(para_2004_2, speaker_2004_1)
speech_2004_3 = extract_speech_dot(para_2004_3, speaker_2004_1)

In [None]:
year = '2004'
name = ['George W. Bush', 'John F. Kerry']
date = ['October 13, 2004','October 8, 2004','September 30, 2004']
k=3
data1 = {
    'name':[name[0]]*k,
    'date':date,
    'speech':[speech_2004_1[speaker_2004_1[0]], speech_2004_2[speaker_2004_1[0]], speech_2004_3[speaker_2004_1[0]]]
}
data2 = {
    'name':[name[1]]*k,
    'date':date,
    'speech':[speech_2004_1[speaker_2004_1[1]], speech_2004_2[speaker_2004_1[1]], speech_2004_3[speaker_2004_1[1]]]
}


df1_2004 = pd.DataFrame(data1)
df2_2004 = pd.DataFrame(data2)
filename1 = f'{save_dir}/{year}_{name[0]}_debate.csv'
filename2 = f'{save_dir}/{year}_{name[1]}_debate.csv'
# df1_2004.to_csv(filename1, index=False)
# df2_2004.to_csv(filename2, index=False)

In [None]:
df1_2004

Unnamed: 0,name,date,speech
0,George W. Bush,"October 13, 2004","[Bob, thank you very much. I want to thank Ari..."
1,George W. Bush,"October 8, 2004","[Charlie, thank you, and thank our panelists. ..."
2,George W. Bush,"September 30, 2004",[I too thank the University of Miami and say o...


In [None]:
# 2008
url_2008_1 = 'https://www.presidency.ucsb.edu/documents/presidential-debate-hofstra-university-hempstead-new-york'
url_2008_2 = 'https://www.presidency.ucsb.edu/documents/presidential-debate-belmont-university-nashville-tennessee'
url_2008_3 = 'https://www.presidency.ucsb.edu/documents/presidential-debate-the-university-mississippi-oxford'

soup_2008_1, para_2008_1 = get_soup_para(url_2008_1)
soup_2008_2, para_2008_2 = get_soup_para(url_2008_2)
soup_2008_3, para_2008_3 = get_soup_para(url_2008_3)

extract_speaker(soup_2008_1, 'strong')
extract_speaker(soup_2008_2, 'strong')
extract_speaker(soup_2008_3, 'strong')

speaker_2008_1 = ['OBAMA', 'MCCAIN']

speech_2008_1 = extract_speech_col(para_2008_1, speaker_2008_1)
speech_2008_2 = extract_speech_col(para_2008_2, speaker_2008_1)
speech_2008_3 = extract_speech_col(para_2008_3, speaker_2008_1)

In [None]:
year = '2008'
name = ['Barack Obama', 'John McCain']
date = ['October 15, 2008','October 7, 2008','September 26, 2008']
k=3
data1 = {
    'name':[name[0]]*k,
    'date':date,
    'speech':[speech_2008_1[speaker_2008_1[0]], speech_2008_2[speaker_2008_1[0]], speech_2008_3[speaker_2008_1[0]]]
}
data2 = {
    'name':[name[1]]*k,
    'date':date,
    'speech':[speech_2008_1[speaker_2008_1[1]], speech_2008_2[speaker_2008_1[1]], speech_2008_3[speaker_2008_1[1]]]
}


df1_2008 = pd.DataFrame(data1)
df2_2008 = pd.DataFrame(data2)
filename1 = f'{save_dir}/{year}_{name[0]}_debate.csv'
filename2 = f'{save_dir}/{year}_{name[1]}_debate.csv'
# df1_2008.to_csv(filename1, index=False)
# df2_2008.to_csv(filename2, index=False)

In [None]:
df1_2008

Unnamed: 0,name,date,speech
0,Barack Obama,"October 15, 2008","[Well, first of all, I want to thank Hofstra U..."
1,Barack Obama,"October 7, 2008","[Well, Alan, thank you very much for the quest..."
2,Barack Obama,"September 26, 2008","[Well, thank you very much, Jim, and thanks to..."


In [None]:
# 2012
url_2012_1 = 'https://www.presidency.ucsb.edu/documents/presidential-debate-boca-raton-florida'
url_2012_2 = 'https://www.presidency.ucsb.edu/documents/presidential-debate-hempstead-new-york'
url_2012_3 = 'https://www.presidency.ucsb.edu/documents/presidential-debate-denver-colorado'

soup_2012_1, para_2012_1 = get_soup_para(url_2012_1)
soup_2012_2, para_2012_2 = get_soup_para(url_2012_2)
soup_2012_3, para_2012_3 = get_soup_para(url_2012_3)

extract_speaker(soup_2012_1, '.field-docs-content > p > i > i')
extract_speaker(soup_2012_2, '.field-docs-content > p > i')
extract_speaker(soup_2012_3, '.field-docs-content > p > i')

speaker_2012_1 = ['Gov. Romney', 'The President']

speech_2012_1 = extract_speech_dot(para_2012_1, speaker_2012_1)
speech_2012_2 = extract_speech_dot(para_2012_2, speaker_2012_1)
speech_2012_3 = extract_speech_dot(para_2012_3, speaker_2012_1)

In [None]:
year = '2012'
name = ['Governor Mitt Romney','Barack Obama']
date = ['October 22, 2012','October 16, 2012','October 3, 2012']
k=3
data1 = {
    'name':[name[0]]*k,
    'date':date,
    'speech':[speech_2012_1[speaker_2012_1[0]], speech_2012_2[speaker_2012_1[0]], speech_2012_3[speaker_2012_1[0]]]
}
data2 = {
    'name':[name[1]]*k,
    'date':date,
    'speech':[speech_2012_1[speaker_2012_1[1]], speech_2012_2[speaker_2012_1[1]], speech_2012_3[speaker_2012_1[1]]]
}


df1_2012 = pd.DataFrame(data1)
df2_2012 = pd.DataFrame(data2)
filename1 = f'{save_dir}/{year}_{name[0]}_debate.csv'
filename2 = f'{save_dir}/{year}_{name[1]}_debate.csv'
# df1_2012.to_csv(filename1, index=False)
# df2_2012.to_csv(filename2, index=False)

In [None]:
df1_2012

Unnamed: 0,name,date,speech
0,Governor Mitt Romney,"October 22, 2012","[Well, my strategy is pretty straightforward, ..."
1,Governor Mitt Romney,"October 16, 2012","[2014. When you come out in 2014, I presume I'..."
2,Governor Mitt Romney,"October 3, 2012","[Well, sure, I'd like to clear up the record a..."


In [None]:
# 2016
url_2016_1 = 'https://www.presidency.ucsb.edu/documents/presidential-debate-the-university-nevada-las-vegas'
url_2016_2 = 'https://www.presidency.ucsb.edu/documents/presidential-debate-washington-university-st-louis-missouri'
url_2016_3 = 'https://www.presidency.ucsb.edu/documents/presidential-debate-hofstra-university-hempstead-new-york-0'

soup_2016_1, para_2016_1 = get_soup_para(url_2016_1)
soup_2016_2, para_2016_2 = get_soup_para(url_2016_2)
soup_2016_3, para_2016_3 = get_soup_para(url_2016_3)

extract_speaker(soup_2016_1, 'strong')
extract_speaker(soup_2016_2, 'strong')
extract_speaker(soup_2016_3, 'strong')

speaker_2016_1 = ['TRUMP', 'CLINTON']

speech_2016_1 = extract_speech_col(para_2016_1, speaker_2016_1)
speech_2016_2 = extract_speech_col(para_2016_2, speaker_2016_1)
speech_2016_3 = extract_speech_col(para_2016_2, speaker_2016_1)

In [None]:
year = '2016'
name = ['Donald J. Trump','Hillary Clinton']
date = ['October 19, 2016','October 9, 2016','September 26, 2016']
k=3
data1 = {
    'name':[name[0]]*k,
    'date':date,
    'speech':[speech_2016_1[speaker_2016_1[0]], speech_2016_2[speaker_2016_1[0]], speech_2016_3[speaker_2016_1[0]]]
}
data2 = {
    'name':[name[1]]*k,
    'date':date,
    'speech':[speech_2016_1[speaker_2016_1[1]], speech_2016_2[speaker_2016_1[1]], speech_2016_3[speaker_2016_1[1]]]
}


df1_2016 = pd.DataFrame(data1)
df2_2016 = pd.DataFrame(data2)
filename1 = f'{save_dir}/{year}_{name[0]}_debate.csv'
filename2 = f'{save_dir}/{year}_{name[1]}_debate.csv'
# df1_2016.to_csv(filename1, index=False)
# df2_2016.to_csv(filename2, index=False)

In [None]:
df1_2016

Unnamed: 0,name,date,speech
0,Donald J. Trump,"October 19, 2016","[Well, first of all, it's great to be with you..."
1,Donald J. Trump,"October 9, 2016","[Well, I actually agree with that. I agree wit..."
2,Donald J. Trump,"September 26, 2016","[Well, I actually agree with that. I agree wit..."


In [None]:
# 2020
url_2020_1 = 'https://www.presidency.ucsb.edu/documents/presidential-debate-belmont-university-nashville-tennessee-0'
url_2020_2 = 'https://www.presidency.ucsb.edu/documents/presidential-debate-case-western-reserve-university-cleveland-ohio'

soup_2020_1, para_2020_1 = get_soup_para(url_2020_1)
soup_2020_2, para_2020_2 = get_soup_para(url_2020_2)

extract_speaker(soup_2020_1, 'b')
extract_speaker(soup_2020_2, 'b')

speaker_2020_1 = ['BIDEN', 'TRUMP']

speech_2020_1 = extract_speech_col(para_2020_1, speaker_2020_1)
speech_2020_2 = extract_speech_col(para_2020_2, speaker_2020_1)

In [None]:
year = '2020'
name = ['Joseph R. Biden','Donald J. Trump']
date = ['October 22, 2020','September 29, 2020']
k=2
data1 = {
    'name':[name[0]]*k,
    'date':date,
    'speech':[speech_2020_1[speaker_2020_1[0]],speech_2020_2[speaker_2020_1[0]]]
}
data2 = {
    'name':[name[1]]*k,
    'date':date,
    'speech':[speech_2020_1[speaker_2020_1[1]],speech_2020_2[speaker_2020_1[1]]]
}


df1_2020 = pd.DataFrame(data1)
df2_2020 = pd.DataFrame(data2)
filename1 = f'{save_dir}/{year}_{name[0]}_debate.csv'
filename2 = f'{save_dir}/{year}_{name[1]}_debate.csv'
# df1_2020.to_csv(filename1, index=False)
# df2_2020.to_csv(filename2, index=False)

In [None]:
df1_2020

Unnamed: 0,name,date,speech
0,Joseph R. Biden,"October 22, 2020","[220,000 Americans dead. If you hear nothing e..."
1,Joseph R. Biden,"September 29, 2020","[How you doing, man?, I'm well., Well, first o..."


In [None]:
debates = [df1_1960, df2_1960, df1_1976, df2_1976, df1_1980, df2_1980, df1_1984, df2_1984, df1_1988, df2_1988, df1_1996, 
           df2_1996, df1_2004, df2_2004, df1_2008, df2_2008, df1_2012, df2_2012, df1_2016, df2_2016, df1_2020, df2_2020]

In [None]:
total_debates = pd.concat(debates, axis=0)
total_debates.to_csv(filename1, index=False)

In [None]:
total_debates = total_debates.reset_index(drop=True)
total_debates

Unnamed: 0,name,date,speech
0,Richard Nixon,"October 21, 1960","[Good evening, Mr. Howe., Mr. Howe, Senator Ke..."
1,Richard Nixon,"October 13, 1960","[Good evening, Mr. Shadel., Yes. As a matter o..."
2,Richard Nixon,"October 7, 1960","[Well first of all, I don't agree with Senator..."
3,Richard Nixon,"September 26, 1960","[Mr. Smith, Senator Kennedy. The things that S..."
4,John F. Kennedy,"October 21, 1960","[Good evening, Mr. Howe., Mr. Howe, Mr. Vice P..."
5,John F. Kennedy,"October 13, 1960","[Good evening, Mr. Shadel., Mr. McGee, we have..."
6,John F. Kennedy,"October 7, 1960",[In the first place I've never suggested that ...
7,John F. Kennedy,"September 26, 1960","[Well, the Vice President and I came to the Co..."
8,Jimmy Carter,"October 22, 1976","[Well, I might say first of all, that I think ..."
9,Jimmy Carter,"October 6, 1976","[Well, I think this Republican administration ..."


### 당선자 컬럼 추가

In [None]:
elected_list = [(1836, 'Buren'),
(1840, 'Harrison'),
(1844, 'Polk'),
(1848, 'Taylor'),
(1852, 'Pierce'),
(1856, 'Buchanan'),
(1860, 'Lincoln'),
(1864, 'Lincoln'),
(1868, 'Grant'),
(1872, 'Grant'),
(1876, 'Hayes'),
(1880, 'Garfield'),
(1884, 'Cleveland'),
(1888, 'Harrison'),
(1892, 'Cleveland'),
(1896, 'McKinley'),
(1900, 'McKinley'),
(1904, 'Roosevelt'),
(1908, 'Taft'),
(1912, 'Wilson'),
(1916, 'Wilson'),
(1920, 'Harding'),
(1924, 'Coolidge'),
(1928, 'Hoover'),
(1932, 'Roosevelt'),
(1936, 'Roosevelt'),
(1940, 'Roosevelt'),
(1944, 'Roosevelt'),
(1948, 'Truman'),
(1952, 'Eisenhower'),
(1956, 'Eisenhower'),
(1960, 'Kennedy'),
(1964, 'Johnson'),
(1968, 'Nixon'),
(1972, 'Nixon'),
(1976, 'Carter'),
(1980, 'Reagan'),
(1984, 'Reagan'),
(1988, 'Bush'),
(1992, 'Clinton'),
(1996, 'Clinton'),
(2000, 'Bush'),
(2004, 'Bush'),
(2008, 'Obama'),
(2012, 'Obama'),
(2016, 'Trump'),
(2020, 'Biden')]

df = pd.DataFrame(elected_list)
df = df.rename(columns={0:'Year',1:'Last Name'})
df.head(2)

Unnamed: 0,Year,Last Name
0,1836,Buren
1,1840,Harrison


In [None]:
total_debates["elected"] = 0
total_debates.head(2)

Unnamed: 0,name,date,speech,elected
0,Richard Nixon,"October 21, 1960","[Good evening, Mr. Howe., Mr. Howe, Senator Ke...",0
1,Richard Nixon,"October 13, 1960","[Good evening, Mr. Shadel., Yes. As a matter o...",0


In [None]:
for idx, data in enumerate(total_debates.values):
    name = data[0]
    year = int(data[1].split(' ')[-1])

    if year >= 2000:
        start = 41
    elif year >= 1900:
        start = 16
    else:
        start = 0
    for result in elected_list[start:]:
        if result[0]-year <= 2:
            if str.lower(result[1]) in str.lower(name):
                total_debates["elected"].iloc[idx] = 1
                break
total_debates.loc[15,'elected'] = 0                

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  total_debates["elected"].iloc[idx] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  total_debates["elected"].iloc[idx] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  total_debates["elected"].iloc[idx] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  total_debates["elected"].iloc[idx] = 1
A value is t

In [None]:
years =[]
for idx, data in enumerate(total_debates.values):
    years.append(int(data[1].split(' ')[-1]))
total_debates['Year'] = years
total_debates.head(2)

Unnamed: 0,name,date,speech,elected,Year
0,Richard Nixon,"October 21, 1960","[Good evening, Mr. Howe., Mr. Howe, Senator Ke...",0,1960
1,Richard Nixon,"October 13, 1960","[Good evening, Mr. Shadel., Yes. As a matter o...",0,1960


In [None]:
total_debates.columns

Index(['name', 'date', 'speech', 'elected', 'Year'], dtype='object')

In [None]:
filename = r'E:\est\kdt\project\president_prediction\data\Debates\debate_combined.csv'
# total_debates.to_csv(filename, index=False)

In [None]:
total_debates

Unnamed: 0,name,date,speech,elected,Year
0,Richard Nixon,"October 21, 1960","[Good evening, Mr. Howe., Mr. Howe, Senator Ke...",0,1960
1,Richard Nixon,"October 13, 1960","[Good evening, Mr. Shadel., Yes. As a matter o...",0,1960
2,Richard Nixon,"October 7, 1960","[Well first of all, I don't agree with Senator...",0,1960
3,Richard Nixon,"September 26, 1960","[Mr. Smith, Senator Kennedy. The things that S...",0,1960
4,John F. Kennedy,"October 21, 1960","[Good evening, Mr. Howe., Mr. Howe, Mr. Vice P...",1,1960
5,John F. Kennedy,"October 13, 1960","[Good evening, Mr. Shadel., Mr. McGee, we have...",1,1960
6,John F. Kennedy,"October 7, 1960",[In the first place I've never suggested that ...,1,1960
7,John F. Kennedy,"September 26, 1960","[Well, the Vice President and I came to the Co...",1,1960
8,Jimmy Carter,"October 22, 1976","[Well, I might say first of all, that I think ...",1,1976
9,Jimmy Carter,"October 6, 1976","[Well, I think this Republican administration ...",1,1976


## 원본데이터 생성

- 기존 자료 + 토론자료 합본 생성 및 저장

In [None]:
df_speech = pd.read_csv(r"E:\est\kdt\project\president_prediction\data\speech_combined.csv", index_col=0) # 기존 크롤링자료
df_debate = pd.read_csv(r"E:\est\kdt\project\president_prediction\data\debate_combined_ver1.csv", index_col=0) # 2024 바이든/트럼프 토론 자료

df_list = [df_speech, df_debate]
total_speech = pd.concat(df_list, axis=0)
total_speech = total_speech.reset_index(drop=True)
total_speech.to_csv(r"E:\est\kdt\project\president_prediction\data\debate_combined_ver2.csv")

- 텍스트 자료 + 시계열 데이터 병합

In [None]:
df_data = pd.read_csv(r"E:\est\kdt\project\president_prediction\data\merged_Data_final_2024GDP.csv")

df_1 = total_speech.copy() # 텍스트 데이터
df_2 = df_data.copy() # 정형 데이터

df_2.drop(111, axis=0, inplace=True) # 2024년 데이터 드랍....

# 정형데이터 4년 단위로 변신
years_to_keep = list(range(1916, 2025, 4))
df_2 = df_2[df_2['Year'].isin(years_to_keep)]

# 병합을 위한 신규 컬럼 생성 - 텍스트 데이터
last_name =[] # 리스트로 데이터 만들고 컬럼에 값으로 채우기
for sen in df_1['name']:
    lname = sen.split()[-1]
    last_name.append(lname)
df_1['LastName'] = last_name # 데이터 채우기

# 병합을 위한 신규 컬럼 생성 - 정형 데이터
last_name =[] # 리스트로 데이터 만들고 컬럼에 값으로 채우기
for sen in df_2['Name']:
    lname = sen.split()[-1]
    last_name.append(lname)
df_2['LastName'] = last_name # 데이터 채우기

merge_data = pd.merge(df_1, df_2, on=['LastName', 'Year'], how='left')

- 병합된 자료의 결측치 채우기(Party)

In [None]:
sorted_data = merge_data.sort_values(by='Year')
filled_party = sorted_data[sorted_data['Party'].notnull()]

time_win_party = {}
for time in range(1916, 2024, 4):
    win_party = filled_party[filled_party['Year'] ==time]['Party'].values[0]
    if win_party == 'Republican':
        time_win_party[time] = 'Democratic'
    elif win_party =='Democratic':
        time_win_party[time] = 'Republican'

for idx, row in merge_data.iterrows():
    if pd.isnull(row['Party']):
        nan_year = row['Year']
        merge_data.loc[idx, 'Party'] = time_win_party[nan_year]

merge_data['Party'].isnull().sum() # 혹시 결측 있나 확인 0 이면 정상

- 병합된 자료의 결측치 채우기(Elections_Won)

In [None]:
# 역시 left조인으로 잘 알맞게 달라 붙은 것 같다. -> 결측치는 모두 당선 한번도 안된사람인것을 알 수 있다.
merge_data[merge_data['Name'] =='Dwight D. Eisenhower'] # 여기서 컬럼을 이름만 바꿔서하면 안되는 이유 나옴. name으로는 검색안됨. 신규 컬럼 만들었어야함
merge_data['Elections_Won'].fillna(0, inplace=True)

## 원본에서 IDF 하위 181 제거 컬럼 붙은 데이터 생성

In [None]:
# df_anal IDF 낮은 값 제거 과정

# TF-IDF 벡터화기 생성
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df_anal['speech_whole'])

# 단어와 IDF 값 추출
idf_values = vectorizer.idf_
word_to_idf = dict(zip(vectorizer.get_feature_names_out(), idf_values))

# DataFrame으로 변환
idf_df = pd.DataFrame(list(word_to_idf.items()), columns=['Word', 'IDF'])

# idf 사전 생성
idf_corpus = idf_df.sort_values('IDF')['Word'][:181].tolist()

# ### idf 리스트 저장
# # import pickle
# # with open(r"E:\est\kdt\project\president_prediction\data\idf_corpus","wb") as fp:
# #     pickle.dump(idf_corpus, fp)

## idf 값 불러오기
with open(r"E:\est\kdt\project\president_prediction\data\idf_corpus","rb") as fp:
    idf_corpus = pickle.load(fp)

# idf 제거
def filter_lowidf(text):
    # 단어 단위로 분리
    words = re.findall(r'\b\w+\b', text)
    # 소문자로 변환하여 비교
    filtered_words = [word for word in words if word.lower() not in idf_corpus]
    # 필터링된 단어들을 다시 문장으로 결합
    return ' '.join(filtered_words)

# idf 제거 컬럼 생성 -> 전처리 안한것에서 idf만 제거한것 생성
df_anal["whole_remove_idf"] = df_anal["speech_whole"].apply(lambda x: filter_lowidf(x))

### df_anal 최종본 저장
df_anal.to_csv(r"E:\est\kdt\project\president_prediction\data\speech_analysis.csv")

## 분석을 위한 데이터 - 각 분야 코퍼스 리스트가 과학, 경제, 환경, 교육, 안보에 등장 횟수 카운트 데이터프레임 생성

- 연도/후보의 연설문(전처리x)에서 각 분야의 키워드 출현 횟수 데이터

In [None]:
# 전처리 하지 않은 연설문에서, 해당 분야의 중복되는 단어수를 카운트 
# -> 분야별 빈도수를 알 수 있다.

def corpus_freq(field_keyword):
    keyowrd_freq_1 = []
    speech = df_anal['speech_whole']

    for sen in tqdm(speech):
        cnt=0
        for keyword in field_keyword:
            if keyword in sen.lower():
                cnt+=1
        keyowrd_freq_1.append(cnt)
    return keyowrd_freq_1

In [None]:
# 과학관련 키워드 200

sci_keyword = set([
    'atom', 'molecule', 'cell', 'organism', 'tissue', 
    'organ', 'system', 'ecosystem', 'biosphere', 'species', 
    'genome', 'gene', 'chromosome', 'dna', 'rna', 
    'protein', 'enzyme', 'metabolism', 'photosynthesis', 'respiration', 
    'mitosis', 'meiosis', 'evolution', 'mutation', 'adaptation', 
    'natural', 'selection', 'ecology', 'biome', 'habitat', 
    'niche', 'population', 'community', 'biomass', 'trophic', 
    'food', 'chain', 'web', 'energy', 'matter', 
    'quantum', 'particle', 'atom', 'ion', 'electron', 
    'proton', 'neutron', 'photon', 'neutrino', 'boson', 
    'fermion', 'field', 'force', 'gravity', 'magnetism', 
    'electricity', 'magnet', 'current', 'voltage', 'resistance', 
    'circuit', 'capacitance', 'inductance', 'thermodynamics', 'entropy', 
    'enthalpy', 'kinetics', 'catalysis', 'reaction', 'solution', 
    'solvent', 'solute', 'concentration', 'equilibrium', 'pH', 
    'buffer', 'redox', 'oxidation', 'reduction', 'acidity', 
    'alkalinity', 'isotope', 'radioactivity', 'decay', 'half-life', 
    'radiation', 'nuclear', 'fusion', 'fission', 'cellular', 
    'organelle', 'mitochondria', 'chloroplast', 'ribosome', 'endoplasmic', 
    'reticulum', 'golgi', 'lysosome', 'vacuole', 'cytoskeleton', 
    'membrane', 'phospholipid', 'carbohydrate', 'lipid', 'nucleic', 
    'acid', 'amino', 'peptide', 'polypeptide', 'antibody', 
    'antigen', 'hormone', 'neurotransmitter', 'receptor', 'transcription', 
    'translation', 'genotype', 'phenotype', 'inheritance', 'genetics', 
    'biotechnology', 'genetic', 'engineering', 'cloning', 'stem', 
    'cell', 'virology', 'bacteriology', 'mycology', 'parasitology', 
    'immunology', 'pathogen', 'disease', 'epidemic', 'pandemic', 
    'vaccination', 'antibiotic', 'antiviral', 'epigenetics', 'pharmacology', 
    'toxicology', 'physiology', 'anatomy', 'histology', 'embryology', 
    'evolutionary', 'paleontology', 'astronomy', 'cosmology', 'planet', 
    'star', 'galaxy', 'universe', 'nebula', 'black', 
    'hole', 'supernova', 'quasar', 'dark', 'matter', 
    'dark', 'energy', 'redshift', 'blue', 'shift', 
    'parallax', 'light', 'years', 'astronomical', 'unit', 
    'telescopes', 'satellite', 'probe', 'space', 'time', 
    'relativity', 'string', 'theory', 'quantum', 'mechanics', 
    'thermodynamics', 'special', 'general', 'gravitational', 'waves', 
    'entanglement', 'superconductivity', 'nanotechnology', 'molecular', 'biophysics',
    'isomer', 'catalyst', 'reactant', 'product', 'photonics', 
    'spectroscopy', 'chromatography', 'microscopy', 'astronaut', 'hydrolysis', 
    'thermochemistry', 'stoichiometry', 'biochemistry', 'nanoscale', 'electrodynamics', 
    'optics', 'pharmacogenomics', 'seismology', 'geophysics', 'meteorology', 
    'climatology', 'oceanography'
])

sci_keyword = set(sci_keyword)
len(sci_keyword)

205

In [None]:
sci_cnt = corpus_freq(sci_keyword)

100%|██████████| 3397/3397 [00:19<00:00, 170.65it/s]


In [None]:
# 경제 키워드 200개
econ_keyword = set([
    'inflation', 'deflation', 'stagflation', 'recession', 'depression', 
    'gdp', 'cpi', 'ppi', 'unemployment', 'employment', 
    'investment', 'consumption', 'production', 'distribution', 'allocation', 
    'taxation', 'subsidy', 'budget', 'deficit', 'surplus', 
    'tariff', 'quota', 'trade', 'export', 'import', 
    'currency', 'exchange', 'interest', 'savings', 'loan', 
    'mortgage', 'bond', 'stock', 'equity', 'dividend', 
    'capital', 'labor', 'land', 'entrepreneurship', 'market', 
    'competition', 'monopoly', 'oligopoly', 'monopsony', 'oligopsony', 
    'price', 'wage', 'rent', 'profit', 'revenue', 
    'cost', 'expense', 'income', 'earnings', 'wealth', 
    'poverty', 'inequality', 'welfare', 'insurance', 'risk', 
    'return', 'yield', 'valuation', 'appreciation', 'depreciation', 
    'liquidity', 'solvency', 'leverage', 'hedge', 'arbitrage', 
    'speculation', 'investment', 'capitalism', 'socialism', 'communism', 
    'regulation', 'deregulation', 'privatization', 'nationalization', 'globalization', 
    'localization', 'economics', 'microeconomics', 'macroeconomics', 'monetary', 
    'fiscal', 'policy', 'supply', 'demand', 'equilibrium', 
    'elasticity', 'substitute', 'complement', 'inferior', 'normal', 
    'luxury', 'necessity', 'goods', 'services', 'consumer', 
    'producer', 'firms', 'households', 'government', 'foreign', 
    'sector', 'financial', 'institution', 'bank', 'central', 
    'reserve', 'debt', 'credit', 'debit', 'surplus', 
    'deficit', 'tax', 'subsidy', 'grant', 'bailout', 
    'stimulus', 'reform', 'austerity', 'quota', 'sanction', 
    'blockade', 'protectionism', 'liberalization', 'efficiency', 'productivity', 
    'innovation', 'invention', 'technology', 'automation', 'outsourcing', 
    'insourcing', 'offshoring', 'reshoring', 'migration', 'immigration', 
    'emigration', 'demographics', 'population', 'growth', 'development', 
    'sustainability', 'renewable', 'nonrenewable', 'resource', 'scarcity', 
    'abundance', 'choice', 'opportunity', 'cost', 'tradeoff', 
    'specialization', 'division', 'labor', 'economies', 'scale', 
    'diseconomies', 'scope', 'benchmarking', 'forecasting', 'analysis', 
    'model', 'theory', 'hypothesis', 'data', 'statistics', 
    'correlation', 'causation', 'trend', 'cycle', 'boom', 
    'bust', 'bubble', 'crisis', 'recovery', 'expansion', 
    'contraction', 'peak', 'trough', 'plateau', 'stagnation',
    'liability', 'asset', 'collateral', 'deposit', 'withdrawal', 
    'fund', 'portfolio', 'index', 'benchmark', 'option', 
    'derivative', 'futures', 'forward', 'swap', 'tranche', 
    'synergy', 'merger', 'acquisition', 'consolidation', 'divestiture', 
    'liquidation', 'bankruptcy', 'solvent', 'insolvent', 'overdraft', 
    'audit', 'compliance', 'reporting', 'transparency', 'governance'
])
len(econ_keyword)

208

In [None]:
econ_cnt = corpus_freq(econ_keyword)

100%|██████████| 3397/3397 [00:18<00:00, 183.20it/s]


In [None]:
# 환경 키워드 200개

env_keyword = set(
    [
    'ecosystem', 'biodiversity', 'sustainability', 'pollution', 'conservation',
    'habitat', 'climate', 'carbon', 'emissions', 'renewable',
    'nonrenewable', 'deforestation', 'reforestation', 'afforestation', 'desertification',
    'erosion', 'degradation', 'ecology', 'biome', 'wetland',
    'river', 'lake', 'ocean', 'sea', 'forest',
    'grassland', 'desert', 'tundra', 'mountain', 'valley',
    'island', 'shore', 'coast', 'marsh', 'swamp',
    'pollutant', 'waste', 'recycling', 'compost', 'biodegradable',
    'toxins', 'chemical', 'contamination', 'acidification', 'ozone',
    'greenhouse', 'global', 'warming', 'weather', 'temperature',
    'precipitation', 'humidity', 'wind', 'storm', 'hurricane',
    'typhoon', 'tornado', 'drought', 'flood', 'stormwater',
    'runoff', 'sewage', 'sanitation', 'water', 'air',
    'quality', 'smog', 'particulate', 'green', 'energy',
    'solar', 'wind', 'hydro', 'geothermal', 'biomass',
    'fossil', 'fuel', 'coal', 'oil', 'natural',
    'gas', 'nuclear', 'radiation', 'wastewater', 'treatment',
    'sustainable', 'development', 'policy', 'legislation', 'regulation',
    'carbon', 'footprint', 'offset', 'adaptation', 'mitigation',
    'conservation', 'biodiversity', 'restoration', 'ecoservices', 'ecotourism',
    'organic', 'farming', 'pesticide', 'herbicide', 'fertilizer',
    'irrigation', 'hydrology', 'geomorphology', 'soil', 'landscape',
    'geology', 'tectonics', 'volcano', 'earthquake', 'biogeochemical',
    'cycle', 'nitrogen', 'phosphorus', 'carbon', 'water',
    'habitat', 'fragmentation', 'migration', 'species', 'invasive',
    'endangered', 'extinct', 'threatened', 'protected', 'reserve',
    'national', 'park', 'wildlife', 'refuge', 'sanctuary',
    'environmental', 'impact', 'assessment', 'audit', 'monitoring',
    'restoration', 'reclamation', 'preservation', 'safety', 'risk',
    'hazard', 'emergency', 'disaster', 'management', 'response',
    'mitigation', 'adaptation', 'strategies', 'technologies', 'innovations',
    'education', 'awareness', 'advocacy', 'activism', 'lobbying',
    'socioeconomic', 'effects', 'equity', 'justice', 'policy',
    'framework', 'international', 'agreement', 'protocol', 'convention',
    'sustainable', 'development', 'goals', 'population', 'growth',
    'urban', 'sprawl', 'transportation', 'land-use', 'planning',
    'greenhouse', 'gases', 'carbon', 'dioxide', 'methane',
    'nitrous', 'oxide', 'ozone', 'layer', 'depletion',
    'airborne', 'particles', 'emission', 'control', 'legislation',
    'environment', 'ally', 'ecosystem', 'services', 'bioindicator',
    'climate', 'zone', 'geothermal', 'albedo', 'sustainable',
    'landfill', 'overfishing', 'aquifer', 'recycling', 'wilderness',
    'pollinator', 'biotic', 'abiotic', 'dams', 'siltation',
    'habitat', 'corridor', 'greenwashing', 'overconsumption', 'biomagnification',
    'biofuel', 'aquaculture', 'ecotone'
])

len(env_keyword)

200

In [None]:
env_cnt = corpus_freq(env_keyword)

100%|██████████| 3397/3397 [00:26<00:00, 128.51it/s]


In [None]:
# 교육 키워드 200개

edu_keyword = set(
    [
    'curriculum', 'pedagogy', 'assessment', 'evaluation', 'learning',
    'teaching', 'instruction', 'school', 'student', 'teacher',
    'classroom', 'syllabus', 'textbook', 'lecture', 'seminar',
    'workshop', 'homework', 'assignment', 'exam', 'quiz',
    'grade', 'feedback', 'accreditation', 'diploma', 'degree',
    'certificate', 'course', 'module', 'program', 'major',
    'minor', 'elective', 'prerequisite', 'graduation', 'enrollment',
    'registration', 'attendance', 'participation', 'dissertation', 'thesis',
    'research', 'methodology', 'hypothesis', 'data', 'analysis',
    'review', 'study', 'project', 'presentation', 'collaboration',
    'discussion', 'debate', 'tutorial', 'mentorship', 'guidance',
    'counseling', 'intervention', 'remediation', 'enrichment', 'extra',
    'curricular', 'extracurricular', 'student', 'center', 'library',
    'resources', 'digital', 'technology', 'e-learning', 'online',
    'platform', 'tool', 'software', 'application', 'network',
    'internet', 'web', 'forum', 'blog', 'podcast',
    'video', 'tutorial', 'webinar', 'courseware', 'module',
    'quiz', 'game', 'simulation', 'interactive', 'activity',
    'participatory', 'engagement', 'self-directed', 'autonomous', 'collaborative',
    'cooperative', 'peer', 'group', 'individual', 'personalized',
    'adaptive', 'differentiated', 'inclusive', 'special', 'needs',
    'gifted', 'talented', 'remedial', 'behavioral', 'support',
    'administration', 'policy', 'regulation', 'governance', 'leadership',
    'management', 'strategic', 'planning', 'budget', 'funding',
    'grant', 'scholarship', 'loan', 'tuition', 'fee',
    'payment', 'financial', 'aid', 'resource', 'allocation',
    'infrastructure', 'facility', 'equipment', 'technology', 'software',
    'hardware', 'resource', 'center', 'lab', 'studio',
    'fieldwork', 'practicum', 'internship', 'placement', 'service',
    'community', 'outreach', 'partnership', 'collaboration', 'network',
    'association', 'society', 'conference', 'symposium', 'forum',
    'workshop', 'seminar', 'training', 'certification', 'continuing',
    'education', 'professional', 'development', 'competency', 'skill',
    'literacy', 'numeracy', 'fluency', 'proficiency', 'aptitude',
    'competence', 'knowledge', 'understanding', 'awareness', 'insight',
    'innovation', 'creativity', 'critical', 'thinking', 'problem-solving',
    'analytical', 'evaluation', 'reflection', 'self-assessment', 'peer-assessment',
    'formative', 'summative', 'norm-referenced', 'criterion-referenced', 'standard',
    'benchmark', 'outcome', 'goal', 'objective', 'target',
    'criteria', 'rubric', 'scale', 'metric', 'indicator',
    'report', 'transcript', 'record', 'portfolio', 'evidence',
    'achievement', 'progress', 'performance', 'development', 'growth',
    'expansion', 'advancement', 'success', 'failure', 'challenge',
    'barrier', 'obstacle', 'enabler', 'motivator', 'incentive',
    'reward', 'recognition', 'celebration', 'graduation', 'commencement'
])
len(edu_keyword)

209

In [None]:
edu_cnt = corpus_freq(edu_keyword)

  0%|          | 0/3397 [00:00<?, ?it/s]

100%|██████████| 3397/3397 [00:18<00:00, 183.23it/s]


In [None]:
# 안보 키워드 200개

sec_keyword = set(
    [
    'intelligence', 'surveillance', 'counterterrorism', 'cybersecurity', 'espionage',
    'security', 'defense', 'military', 'conflict', 'terrorism',
    'threat', 'risk', 'border', 'patrol', 'inspection',
    'detection', 'prevention', 'response', 'crisis', 'management',
    'emergency', 'protocol', 'alert', 'warning', 'defense',
    'strategy', 'policy', 'operation', 'taskforce', 'agency',
    'intelligence', 'agency', 'counterintelligence', 'decryption', 'encryption',
    'classified', 'information', 'leak', 'secret', 'confidential',
    'authorization', 'clearance', 'safeguard', 'mitigation', 'protection',
    'vulnerability', 'exposure', 'security', 'audit', 'assessment',
    'incident', 'investigation', 'reconnaissance', 'surveillance', 'monitoring',
    'intervention', 'control', 'engagement', 'sanction', 'embargo',
    'diplomacy', 'negotiation', 'treaty', 'agreement', 'alliance',
    'coalition', 'force', 'unit', 'deployment', 'strategy',
    'militant', 'guerrilla', 'insurgent', 'rebel', 'military',
    'hardware', 'software', 'network', 'infrastructure', 'cyber',
    'security', 'penetration', 'attack', 'malware', 'virus',
    'trojan', 'worm', 'phishing', 'ransomware', 'firewall',
    'vulnerability', 'exploit', 'breach', 'intrusion', 'security',
    'protocol', 'authentication', 'authorization', 'identity', 'credential',
    'access', 'control', 'data', 'privacy', 'safeguard',
    'espionage', 'agent', 'spy', 'surveillance', 'intercept',
    'transmission', 'communications', 'monitor', 'detect', 'track',
    'neutralize', 'countermeasure', 'debrief', 'training', 'drill',
    'exercise', 'readiness', 'alert', 'response', 'action',
    'operation', 'plan', 'contingency', 'recovery', 'reinforcement',
    'deployment', 'strategy', 'tactic', 'engagement', 'collaboration',
    'interagency', 'coordination', 'situation', 'intelligence', 'analysis',
    'briefing', 'report', 'memo', 'directive', 'protocol',
    'order', 'command', 'command', 'control', 'communications',
    'surveillance', 'patrol', 'border', 'check', 'screen',
    'detain', 'arrest', 'detention', 'interrogation', 'custody',
    'prosecution', 'defense', 'judicial', 'legal', 'case',
    'crime', 'criminal', 'offense', 'violation', 'transgression',
    'regulation', 'law', 'legislation', 'statute', 'ordinance',
    'directive', 'mandate', 'enforcement', 'compliance', 'monitoring',
    'audit', 'investigation', 'assessment', 'evaluation', 'review',
    'audit', 'inspection', 'assessment', 'evaluation', 'review',
    'countermeasure', 'deterrent', 'mitigation', 'resilience', 'adaptation',
    'recovery', 'response', 'readiness', 'training', 'preparedness',
    'operational', 'security', 'policy', 'strategy', 'plan',
    'control', 'command', 'strategy', 'policy', 'regulation',
    'procedure', 'guideline', 'protocol', 'framework', 'system',
    'network', 'infrastructure', 'support', 'protection', 'response',
    'incident', 'event', 'situation', 'condition', 'status',
    'operational', 'effectiveness', 'efficiency', 'capability', 'resource',
    'intervention', 'forensics', 'countermeasures', 'disaster', 'recovery',
    'scenario', 'brief', 'tactical', 'strategic', 'counter',
    'intelligence', 'threats', 'vigilance', 'operations', 'surveillance',
    'asset', 'security', 'protection', 'conflict', 'crisis',
    'response', 'preparation', 'mobilization', 'contingency', 'risk',
    'threatening', 'risk', 'hazard', 'situation', 'incident',
    'intercept', 'coordination', 'support', 'asset', 'logistics',
    'safety', 'precaution', 'security', 'compromise', 'protocol',
    'radicalization', 'extremism', 'jihad', 'militant', 'insurgent',
    'terrorist', 'bombing', 'attack', 'hostage', 'extortion',
    'fundamentalism', 'guerrilla', 'clandestine', 'cell', 'network',
    'propaganda', 'recruitment', 'violence', 'sabotage', 'crisis',
    'security', 'threat'
])
len(sec_keyword)

200

In [None]:
sec_cnt = corpus_freq(sec_keyword)

100%|██████████| 3397/3397 [00:18<00:00, 184.42it/s]


In [None]:
# 연도/후보 키워드 출현 횟수 데이터 새로 만듬 df_anal_3

df_word = df_anal[['name', 'elected', 'Year', 'Party']].copy()
df_word['sci_cnt'] = sci_cnt
df_word['econ_cnt'] = econ_cnt
df_word['env_cnt'] = env_cnt
df_word['edu_cnt'] = edu_cnt
df_word['sec_cnt'] = sec_cnt
df_word['Party'] = df_word['Party'].replace({'Republican': 0, 'Democratic': 1})

df_anal_3 = df_word.pivot_table(index=['Year','name'], 
                                aggfunc=np.mean
                                ).reindex(columns=['elected', 'Party', 'sci_cnt', 'econ_cnt', 'env_cnt', 'edu_cnt', 'sec_cnt']).sort_values('Year').round().reset_index()

df_anal_3['Party'] = df_anal_3['Party'].replace({0: 'Republican', 1: 'Democratic'})
df_anal_3['elected'] = df_anal_3['elected'].replace({0.0 : 'loss', 1.0 : 'win'})
df_anal_3.to_csv(r"E:\est\kdt\project\president_prediction\data\df_anal_3.csv")

## 분석을 위한 데이터 - 연설문을 연도별, 후보별, 대선당락별, 정당별 KeyBERT 분석한 데이터프레임 생성

### 1-1) 연도별 연설문 병합 - 아무 처리안한 연설문 keybert 분석 데이터프레임 생성

In [None]:
type(df_anal['speech_whole'][0])

In [None]:
# 연도별로 문장을 모을 딕셔너리 생성
year_speech = defaultdict(str)

# 데이터프레임 순회하면서 연도별로 문장을 합치기
for idx, row in df_anal.iterrows():
    year = row['Year']
    speech = row['speech_whole']
    year_speech[year] += speech

df_year_speech_1 = pd.DataFrame(list(year_speech.items()), columns=['Year', 'Speech'])
df_year_speech_1.sort_values('Year', inplace=True)

### 연도별 keybert 분석 데이터 프레임 생성
kw_model = KeyBERT()
keybert_year_speech_1 = kw_model.extract_keywords(df_year_speech_1['Speech'], keyphrase_ngram_range=(1, 2), stop_words='english', top_n=30)
df_year_speech_1['keybert_year'] = keybert_year_speech_1
df_year_speech_1.to_csv(r'E:\est\kdt\project\president_prediction\data\year_speech.csv')

### 1-2) 연도별 연설문 병합 - IDF 하위 181개 제거 연설문 keybert 분석 데이터프레임 생성

- 연도별로 문장(idf제거)

In [None]:
# 연도별로 문장(idf제거)을 모을 딕셔너리 생성
year_speech_2 = defaultdict(str)

# 데이터프레임 순회하면서 연도별로 문장을 합치기
for idx, row in df_anal.iterrows():
    year = row['Year']
    speech = row['whole_remove_idf']
    year_speech_2[year] += speech

df_year_speech_2 = pd.DataFrame(list(year_speech_2.items()), columns=['Year', 'Speech'])
df_year_speech_2.sort_values('Year', inplace=True)

# 연도별 keybert(idf제거) 분석

kw_model = KeyBERT()
keybert_year_speech_2 = kw_model.extract_keywords(df_year_speech_2['Speech'], keyphrase_ngram_range=(1, 2), stop_words='english', top_n=30)
df_year_speech_2['keybert_year'] = keybert_year_speech_2
df_year_speech_2.to_csv(r'E:\est\kdt\project\president_prediction\data\year_speech_remove_idf.csv')

### 2-1) 후보별 연설문 병합 - 아무 처리안한 연설문 keybert 분석 데이터프레임 생성

In [None]:
name_speech = defaultdict(str)

for idx, row in df_anal.iterrows():
    name = row['name']
    speech = row['speech_whole']
    name_speech[f'{row["Year"]}_{name}'] += speech

df_name_speech_1 = pd.DataFrame(list(name_speech.items()), columns=['name', 'Speech'])

# 후보별 연설문 keybert
df_name_speech_1 = df_name_speech_1.sort_values('name')
keybert_name_speech_1 = kw_model.extract_keywords(df_name_speech_1['Speech'], keyphrase_ngram_range=(1, 2), stop_words='english', top_n=30)
df_name_speech_1['keybert_name'] = keybert_name_speech_1
df_name_speech_1.to_csv(r'E:\est\kdt\project\president_prediction\data\candidate_keywords.csv')

### 2-2) 후보별 연설문 병합 - IDF 하위 181개 제거 연설문 keybert 분석 데이터프레임 생성

In [None]:
# 후보별 문장(idf제거)을 모을 딕셔너리 생성

name_speech_2 = defaultdict(str)

for idx, row in df_anal.iterrows():
    name = row['name']
    speech = row['whole_remove_idf']
    name_speech_2[f'{row["Year"]}_{name}'] += speech

df_name_speech_2 = pd.DataFrame(list(name_speech_2.items()), columns=['name', 'Speech'])

# 후보별 연설문(idf제거) keybert
kw_model = KeyBERT()
df_name_speech_2 = df_name_speech_2.sort_values('name')
keybert_name_speech_2 = kw_model.extract_keywords(df_name_speech_2['Speech'], keyphrase_ngram_range=(1, 2), stop_words='english', top_n=30)
df_name_speech_2['keybert_name'] = keybert_name_speech_2
df_name_speech_2.to_csv('candidate_keywords_remove_idf.csv')

### 3-1) 대선 당락별 연설문 병합 - IDF 하위 181개 제거 연설문 keybert 분석 데이터프레임 생성ㅡ

In [None]:
# 대선 당락별로 문장을 모을 딕셔너리 생성
elected_speech = defaultdict(str)

for idx, row in df_anal.iterrows():
    elected = int(row['elected'])
    speech = row['whole_remove_idf']
    elected_speech[elected] += speech

df_elected_speech_1 = pd.DataFrame(list(elected_speech.items()), columns=['elected', 'Speech'])

# 당락별 연설문 keybert
kw_model = KeyBERT()
keybert_name_speech_1 = kw_model.extract_keywords(df_elected_speech_1['Speech'], keyphrase_ngram_range=(1, 2), stop_words='english', top_n=30)
df_elected_speech_1['keybert_name'] = keybert_name_speech_1
df_elected_speech_1.to_csv(r'E:\est\kdt\project\president_prediction\data\elected_outcome_keywords.csv')

### 4-1) 정당별 연설문 병합 - IDF 하위 181개 제거 연설문 keybert 분석 데이터프레임 생성

In [None]:
# 정당별로 문장을 모을 딕셔너리 생성
party_speech_1 = defaultdict(str)

for idx, row in df_anal.iterrows():
    party = row['Party']
    speech = row['whole_remove_idf']
    party_speech_1[party] += speech

df_party_speech_1 = pd.DataFrame(list(party_speech_1.items()), columns=['Party', 'Speech'])

# 정당별 연설문 keybert
kw_model = KeyBERT()
keybert_party_speech_1 = kw_model.extract_keywords(df_party_speech_1['Speech'], keyphrase_ngram_range=(1, 2), stop_words='english', top_n=30)
df_party_speech_1['keybert_name'] = keybert_party_speech_1
df_party_speech_1.to_csv('party_keywords.csv')