In [1]:
import pandas as pd
import numpy as np

import pandas as pd
import tensorflow
import tensorflow.keras as keras
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from imblearn.over_sampling import SMOTE, RandomOverSampler

from keras.callbacks import ModelCheckpoint

import ast
import nltk

from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

from tqdm import tqdm
tqdm.pandas()

import pickle

In [None]:
# Fix the seeds of Python's, NumPy's and TensorFlow's random generators
import random

for set_seed in (random.seed, np.random.seed, tensorflow.random.set_seed):
  set_seed(42)

In [None]:
# Relative directory prefix prepended to every data file read or written below
path = "../data/"

## Preprocessing

In [2]:
# Load the merged candidate table; the CSV's first column is read as the index
# and then turned back into an ordinary column by reset_index().
social_csv = path + 'candidate_merged_Data_v3.csv'
social_df = pd.read_csv(social_csv, index_col=0).reset_index()

In [None]:
# Two numeric coordinates per state, used to encode the region columns.
# NOTE(review): the values look like (latitude, unsigned longitude), and four
# second-coordinates are shared by two states each (91.8318, 82.9071, 99.9018,
# 85.6024) - confirm against the source they were taken from.
region_coord = {
  "New York": (40.7128, 74.0060),
  "Texas": (31.9686, 99.9018),
  "Massachusetts": (42.4072, 71.3824),
  "California": (36.7783, 119.4179),
  "Illinois": (40.6331, 89.3985),
  "Arkansas": (35.2010, 91.8318),
  "Hawaii": (19.8987, 155.6659),
  "Pennsylvania": (41.2033, 77.1945),
  "Virginia": (37.4316, 78.6569),
  "Ohio": (40.4173, 82.9071),
  "Vermont": (44.5588, 72.5778),
  "Iowa": (41.8780, 93.0977),
  "Missouri": (37.9643, 91.8318),
  "Georgia": (32.1574, 82.9071),
  "Kansas": (39.0119, 98.4842),
  "Arizona": (34.0489, 111.0937),
  "South Dakota": (43.9695, 99.9018),
  "Michigan": (44.3148, 85.6024),
  "West Virginia": (38.5976, 80.4549),
  "Indiana": (40.5512, 85.6024),
  "Minnesota": (46.7296, 94.6859),
  "Tennessee": (35.5175, 86.5804),
}

# region feature: replace each region column by two coordinate columns.
# Missing regions stay missing; a region absent from region_coord raises KeyError.
# Columns are created in the order region_x, region_y, c_region_x, c_region_y.
for source_col, target_prefix in (("Region", "region"), ("c_Region", "c_region")):
  for axis, axis_name in enumerate(("x", "y")):
    social_df[f"{target_prefix}_{axis_name}"] = social_df[source_col].apply(
        lambda region, axis=axis: region_coord[region][axis] if pd.notna(region) else region)

social_df = social_df.drop(columns=["Region", "c_Region"])

# education: ordinal encoding, missing values are passed through unchanged
education = {"X": 0, "Bachelor's": 1, "JD": 2, "Master's": 3, "PhD": 4}
for col in ("Final Education", "c_Final_Education"):
  social_df[col] = social_df[col].apply(lambda level: education[level] if pd.notna(level) else level)

# party: 1 for "Republican", 0 for any other non-missing value
for col in ("Party", "c_Party"):
  social_df[col] = social_df[col].apply(
      lambda party: (1 if party == "Republican" else 0) if pd.notna(party) else party)


# Split the wide table into two frames with identical column names: one keeps
# the un-prefixed personal columns, the other keeps the "c_"-prefixed ones
# (renamed so both frames can be stacked).
pres_df = social_df.drop(["c_Final_Education", "c_region_x", "c_region_y", "c_Party", "c_Age"], axis=1)
cand_df = social_df.drop(["Final Education", "region_x", "region_y", "Party", "Age"], axis=1)
cand_df = cand_df.rename(columns={"c_Final_Education": "Final Education", "c_region_x": "region_x", "c_region_y": "region_y", "c_Party": "Party", "c_Age": "Age"})


# party_republican = old data for old model, replace with candidate elected:
# a row is labelled 1 when its Party value equals Party_Republican, else 0.
pres_df["elected"] = (pres_df["Party_Republican"] == pres_df["Party"]).astype("int64")
cand_df["elected"] = (cand_df["Party_Republican"] == cand_df["Party"]).astype("int64")

pres_df = pres_df.drop(columns="Party_Republican")
cand_df = cand_df.drop(columns="Party_Republican")


# split train and test: the last 4 rows of each frame form the test set
social_train_df = pd.concat([pres_df.iloc[:-4], cand_df.iloc[:-4]])
social_test_df = pd.concat([pres_df.iloc[-4:], cand_df.iloc[-4:]])

# Five 4-year windows held out for validation (picked at random per the
# original note). Each entry is the LAST year of its window.
valid_years = [1960,
1952,
1968,
1984,
2012]

# Derive the windows from valid_years instead of repeating the year lists by
# hand; sorted() keeps the same concatenation order as before (1952 first).
valid_windows = [list(range(last_year - 3, last_year + 1)) for last_year in sorted(valid_years)]

social_valid_df = pd.concat([social_train_df[social_train_df["Year"].isin(window)]
                             for window in valid_windows])

# .copy() makes the filtered frame independent, so later column assignments
# (the scaling step) do not act on a slice of the concatenated frame.
held_out_years = [year for window in valid_windows for year in window]
social_train_df = social_train_df[~social_train_df["Year"].isin(held_out_years)].copy()

In [None]:
# Min-max scale the numeric features. The scaler is fitted on the training
# split only and then applied unchanged to all three splits.

to_scale = ["Approval_Rating", "CPI", "GDP", "Employment status", "Final Education", "Age", "region_x", "region_y"]

social_scaler = MinMaxScaler()
social_scaler.fit(social_train_df[to_scale])

for split_df in (social_train_df, social_valid_df, social_test_df):
  split_df[to_scale] = social_scaler.transform(split_df[to_scale])

In [None]:
def divide_xy(df, window=4):
  """Group consecutive rows of ``df`` into fixed-size windows of features and labels.

  Rows are consumed in their current order, ``window`` at a time. Every column
  except ``"elected"`` is treated as a feature; the label of a window is the
  ``"elected"`` value of its first row.

  Args:
    df: DataFrame containing an ``"elected"`` column. Its row count must be a
      multiple of ``window``.
    window: Number of consecutive rows per sample. Defaults to 4, the value
      that used to be hard-coded.

  Returns:
    A tuple ``(x, y)`` of NumPy arrays. ``x`` has shape
    ``(len(df) // window, window, n_features)`` and ``y`` has shape
    ``(len(df) // window,)``. An empty ``df`` yields zero-length arrays of
    those shapes.

  Raises:
    ValueError: if ``window`` is not a positive integer or the row count is
      not a multiple of ``window`` (which would otherwise produce a ragged
      final group).
  """
  if window <= 0:
    raise ValueError(f"window must be a positive integer, got {window}")

  n_rows = len(df)
  if n_rows % window != 0:
    raise ValueError(
        f"divide_xy needs a row count that is a multiple of {window}, got {n_rows}")

  features = df.drop("elected", axis=1).values
  labels = df["elected"].values

  n_groups = n_rows // window
  # reshape uses logical row order, so group g holds rows g*window .. g*window+window-1
  combined_x = features.reshape(n_groups, window, features.shape[1]).copy()
  # label of each group = label of the group's first row
  combined_y = labels[::window].copy()

  return combined_x, combined_y


# Turn each split into its (features, labels) array pair, in train/valid/test order.
split_arrays = [divide_xy(split_df) for split_df in (social_train_df, social_valid_df, social_test_df)]
(social_train_x, social_train_y), (social_valid_x, social_valid_y), (social_test_x, social_test_y) = split_arrays

In [None]:
# Save Data

# with open(path+'ind_train_x.pkl','wb') as f:
#   pickle.dump(social_train_x,f)

# with open(path+'ind_train_y.pkl','wb') as f:
#   pickle.dump(social_train_y,f)

# with open(path+'ind_valid_x.pkl','wb') as f:
#   pickle.dump(social_valid_x,f)

# with open(path+'ind_valid_y.pkl','wb') as f:
#   pickle.dump(social_valid_y,f)

# with open(path+'ind_test_x.pkl','wb') as f:
#   pickle.dump(social_test_x,f)

# with open(path+'ind_test_y.pkl','wb') as f:
#   pickle.dump(social_test_y,f)

## Create combined data

In [None]:
# Names of the two LDA text-feature CSVs: the main file and its 2024 counterpart
orig_filename, orig_filename_2024 = 'text_features_LDA.csv', '2024_text_features_LDA.csv'

In [None]:
# Load both text-feature tables, using the first CSV column as the index.
text_df, recent_df = (
    pd.read_csv(path + csv_name, index_col=0)
    for csv_name in (orig_filename, orig_filename_2024)
)

In [None]:
def add_year(df):
  """Add a ``speech_year`` column holding the last four characters of ``date``.

  Trailing whitespace is stripped from each ``date`` value before the last
  four characters are taken. The result is left as text; callers convert it
  to an integer afterwards.

  Uses the vectorised ``.str`` accessor, so a missing ``date`` produces a
  missing ``speech_year`` instead of raising ``AttributeError`` on the
  per-row ``rstrip`` call.

  Args:
    df: DataFrame with a text ``date`` column. It is modified in place.

  Returns:
    None.
  """
  df["speech_year"] = df["date"].str.rstrip().str[-4:]
  return None

# Derive the textual year for both speech tables, then store it as int64 so it
# has an integer dtype for the later merge on speech_year.
for speech_table in (text_df, recent_df):
  add_year(speech_table)
  speech_table["speech_year"] = speech_table["speech_year"].astype(np.int64)

### Check Length Distribution

In [None]:
# Word count per document. `speech_processed2` stores a serialized Python
# sequence (presumably the token list - confirm), so the count is the length of
# the parsed value. Previously the parsed lists themselves were kept, which
# made the histogram below operate on list objects rather than on counts.
lengths = text_df["speech_processed2"].apply(lambda x: len(ast.literal_eval(x)))
lengths = lengths.reset_index()

fig, ax = plt.subplots()

lengths["speech_processed2"].hist(ax=ax)
ax.set(xlabel='Word count', ylabel='Documents', title='Document Word Count')
plt.show()

### Merge data

In [None]:
def _chunk_tokens(serialized):
  """Parse a serialized sequence and cut it into consecutive pieces of at most 100 items."""
  tokens = ast.literal_eval(serialized)
  # slicing clamps at the end of the sequence, so the last piece may be shorter
  return [tokens[start:start + 100] for start in range(0, len(tokens), 100)]

text_df["speech_divide_to_10"] = text_df["speech_processed2"].apply(_chunk_tokens)
recent_df["speech_divide_to_10"] = recent_df["speech_processed2"].apply(_chunk_tokens)

In [None]:
# Left-join the social features onto every speech row by year; clashing column
# names keep the speech-side name and the social-side one gets a "_y" suffix.
merge_options = dict(how='left', left_on='speech_year', right_on='Year', suffixes=[None, "_y"])

merged_text_df = text_df.merge(social_df, **merge_options)
merged_test_df = recent_df.merge(social_df, **merge_options)

In [None]:
# Recompute the chunked-speech column on the merged frames: parse each
# serialized sequence and cut it into consecutive pieces of at most 100 items.
for merged_frame in (merged_text_df, merged_test_df):
  parsed_speeches = merged_frame["speech_processed2"].apply(ast.literal_eval)
  merged_frame["speech_divide_to_10"] = parsed_speeches.apply(
      lambda tokens: [tokens[start:start + 100] for start in range(0, len(tokens), 100)])

In [None]:
def name_to_number(name):
  """Map a speaker name to a numeric label.

  The match is a case-insensitive substring test, checked in order:
  a name containing "Biden" gives 0, otherwise one containing "Trump" gives 1,
  and anything else gives -1.
  """
  lowered = name.lower()
  for label, surname in enumerate(('Biden', 'Trump')):
    if surname.lower() in lowered:
      return label
  return -1
# Label each test speech from its speaker name (0 = Biden, 1 = Trump, -1 = other)
merged_test_df["elected"] = merged_test_df["name"].map(name_to_number)

In [None]:
# Random row-level split of the merged speech rows: 80% train / 20% validation,
# seeded with random_state=42 so the split is reproducible.
train_df, valid_df = train_test_split(merged_text_df, test_size=0.2, random_state=42)

In [None]:
# The merged 2024 frame serves as the test set. This binds a second name to the
# same DataFrame object (no copy is made).
test_df = merged_test_df

In [None]:
# Write both speech tables next to their inputs, prefixing the original
# file name with "processed_".
for speech_table, source_name in ((text_df, orig_filename), (recent_df, orig_filename_2024)):
  speech_table.to_csv(path + 'processed_' + source_name)

In [None]:
# Persist the three combined splits as CSV files in the data directory.
combined_outputs = {
  'combined_train.csv': train_df,
  'combined_valid.csv': valid_df,
  'combined_test.csv': test_df,
}
for csv_name, split_df in combined_outputs.items():
  split_df.to_csv(path + csv_name)