In [1]:
import pandas as pd
import spacy
from spacy.tokens.token import Token
import string
from typing import List
nlp = spacy.load("en_core_web_sm")




In [2]:
def convert_to_string(data) -> str:
    return data if isinstance(data, str) else str(data)

def tokenize(text: str) -> List[Token]:
  doc = nlp(text) # spacy converts the given text into a list of tokens
  return [w for w in doc]

def remove_punctuation(tokens: List[Token]) -> List[Token]:
  return [t for t in tokens if t.text not in string.punctuation]

def remove_stop_words(tokens: List[Token]) -> List[Token]:
  return [t for t in tokens if not t.is_stop]

def lemmatize(tokens: List[Token]) -> List[str]:
  return [t.lemma_ for t in tokens]

def pre_process_text(text: str) -> List[str]:
    return lemmatize(remove_stop_words(remove_punctuation(tokenize(convert_to_string(text)))))

In [3]:
conservative_df = pd.read_csv('/Users/anmol/cis6930-Project/cons_comments.csv', header=0, nrows = 10)
liberal_df = pd.read_excel('/Users/anmol/cis6930-Project/lib_comments.xlsx', header=0, nrows = 10)
democrat_df = pd.read_excel('/Users/anmol/cis6930-Project/dem_comments.xlsx', header=0, nrows = 10)

In [4]:
conservative_df['preprocess_body'] = conservative_df['body'].apply(pre_process_text)
liberal_df['preprocess_body'] = liberal_df['body'].apply(pre_process_text)
democrat_df['preprocess_body'] = democrat_df['body'].apply(pre_process_text)

In [16]:
def create_vocab(data):
    #perform case folding
    vocab = {token.lower() for tokens in data for token in tokens}
    # create a vocabulary list sorted alphabetically
    vocab = sorted(list(vocab))
    # assign an index to each word in the vocabulary
    return vocab

In [18]:
conservative_vocab = create_vocab(conservative_df['preprocess_body'])
liberal_vocab = create_vocab(liberal_df['preprocess_body'])
democrat_vocab = create_vocab(democrat_df['preprocess_body'])
print(conservative_vocab[:10])
print(liberal_vocab[:10])
print(democrat_vocab[:10])

['\n\n', ' ', '1.7b', 'administration', 'airliner', 'asset', 'attack', 'away', 'bad', 'beard']
['  ', '30', '9/11', 'administration', 'airport', 'area', 'article', 'ass', 'assign', 'ban']
['\n', '\n\n', ' \n\n', '...', '/u', '30', 'account', 'acknowledge', 'actively', 'actually']
