In [None]:
from glob import glob 
import pandas as pd 
import codecs
from collections import Counter
import re
import string

In [None]:
import nltk

# Download every NLTK resource this notebook uses, not only the tokenizer model:
#   punkt                      -> nltk.word_tokenize
#   stopwords                  -> stopwords.words('english')
#   averaged_perceptron_tagger -> nltk.pos_tag
#   wordnet                    -> WordNetLemmatizer
# NOTE(review): newer NLTK releases may additionally ask for 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' - confirm against the installed version.
for resource in ('punkt', 'stopwords', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(resource)

In [None]:
from nltk.corpus import stopwords

# English stopword list used by the tokenizer-based pipelines.
stop_words = stopwords.words('english')

# Characters stripped from the text: ASCII punctuation plus typographic quotes.
# Both the opening AND the closing curly quotes are listed; with only the
# opening ones, a closing quote stays attached to its word (e.g. "said”")
# and that word is counted separately from the plain form.
exclude = set(string.punctuation)
exclude.update(["‘", "’", "“", "”"])

In [None]:
from nltk import word_tokenize
from nltk.corpus import brown
from nltk.corpus import wordnet as wn

In [None]:
# Stemmer and lemmatizer classes.
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

# One instance of each, created once for the rest of the notebook.
wordnet_lemmatizer = WordNetLemmatizer()
snowball_stemmer = SnowballStemmer("english")

There are several ways to clean the text, and we should decide which method we want to use: naive, tokenizer, lemmatization, or stemming. Below I use a single case to demonstrate the naive, tokenizer, and lemmatizer pipelines. (I couldn't figure out the stemmer yet, but will do so this upcoming week.)

In [None]:
#naive pipeline 
def clean1(x):
    """Clean text with plain string operations and return a list of words.

    Steps, in order: turn paragraph breaks into spaces, lower-case, drop every
    character found in the notebook-level ``exclude`` set, drop digits, split
    on whitespace, and remove English stopwords.

    Parameters
    ----------
    x : str
        The text to clean.

    Returns
    -------
    list of str
        The remaining words, in their original order.
    """
    # Build the stopword set once. Calling stopwords.words('english') inside
    # the comprehension evaluates it again for every single word.
    english_stopwords = set(stopwords.words('english'))

    # Replace paragraph breaks with a space (not ''), otherwise the last word
    # of one paragraph is fused with the first word of the next.
    x = x.replace('\n\n', ' ')
    x = x.lower()  # lower text
    x = ''.join(ch for ch in x if ch not in exclude)  # remove punctuation
    x = re.sub('[0-9]+', '', x)  # remove numbers
    x = x.split()  # split words
    x = [word for word in x if word not in english_stopwords]  # remove stopwords
    # x = " ".join(x)  # you can do this if you want to remove list structure
    return x

In [None]:
#tokenizer 
def nlp_pipeline1(text):
    """Tokenize text with NLTK and return the cleaned, non-stopword tokens.

    Each token produced by ``nltk.word_tokenize`` is cleaned on its own:
    characters in the notebook-level ``exclude`` set and digits are removed,
    and tokens that end up empty or that are stopwords are dropped.

    Cleaning token by token (instead of joining all tokens into one string and
    splitting on apostrophes) means the function works on ordinary text and
    never returns empty strings. Input that is the ``str()`` of a word list
    still works, because the brackets, quotes and commas are stripped as
    punctuation.

    Parameters
    ----------
    text : str
        The text to tokenize and clean.

    Returns
    -------
    list of str
        Cleaned tokens in their original order.
    """
    text = text.lower()

    # tokenize words
    tokens = nltk.word_tokenize(text)

    # set membership is cheaper than scanning the stopword list per token
    stop_set = set(stop_words)

    cleaned = []
    for token in tokens:
        # a stopword such as "don't" would no longer match once its
        # apostrophe is stripped, so test the raw token first
        if token in stop_set:
            continue
        token = ''.join(ch for ch in token if ch not in exclude)  # remove punctuation
        token = re.sub('[0-9]+', '', token)  # remove numbers
        # remove stopwords - be careful with this step
        if token and token not in stop_set:
            cleaned.append(token)

    return cleaned

In [None]:
#lemmatization 
def nlp_lem(text):
    """Tokenize, POS-tag and lemmatize text; return cleaned, non-stopword lemmas.

    Tokens tagged with a tag containing "V" are lemmatized as verbs; all other
    tokens use the lemmatizer's default part of speech. Punctuation (the
    notebook-level ``exclude`` set) and digits are stripped from each token
    before lemmatizing, so a token with a stray quote or bracket attached is
    still looked up in its bare form.

    Parameters
    ----------
    text : str
        The text to process.

    Returns
    -------
    list of str
        Lemmas in their original order, without empty strings or stopwords.
    """
    # tokenize words, then tag each token with its part of speech
    tagged = nltk.pos_tag(nltk.word_tokenize(text))

    # The notebook defines `stop_words`; the name `stop_word_list` used here
    # before does not exist and raised NameError.
    stop_set = set(stop_words)

    lemmas = []
    for token, pos in tagged:
        # remove punctuation and numbers
        word = ''.join(ch for ch in token.lower() if ch not in exclude)
        word = re.sub('[0-9]+', '', word)
        if not word:
            continue
        # lemmatizer
        if "V" in pos:
            lemma = wordnet_lemmatizer.lemmatize(word, "v")
        else:
            lemma = wordnet_lemmatizer.lemmatize(word)
        # remove stopwords - be careful with this step
        if lemma not in stop_set:
            lemmas.append(lemma)
    return lemmas

In [None]:
#stemming
#stem_list1 = [snowball_stemmer.stem(word) for word in list1]
#def nlp_stem(text):
    #tokenize words for each sentence
    #text = nltk.word_tokenize(text)
    # pos tagger
    #text = nltk.pos_tag(text)
    # stemmer 
    #text = [snowball_stemmer.stem(word) for word in text]
    # remove punctuation and numbers
    #text = ''.join(ch for ch in text if ch not in exclude) #remove punctuation
    #text=re.sub('[0-9]+', '', text) 
    #text=text.split("'") #split words 
    # remove stopwords - be careful with this step    
    #text = [token for token in text if token not in stop_word_list]
    #return text

In [None]:
#random case: the file loaded below is D1.Mar26.2002.MAJ
# (the variable names still carry the older D4.Feb23.2001 label)
# `with` closes the file handle as soon as the text has been read.
with codecs.open("/Users/schap/Desktop/TA Data/AC/2002/1/TXT/D1.Mar26.2002.MAJ.txt", "r", "utf-8") as case_file:
    d4feb232001maj = case_file.read().strip().split()
# Join the words back into one plain string; str() on the list would hand the
# cleaner a Python list repr (brackets, quotes, commas, escape sequences).
d4feb232001maj = " ".join(d4feb232001maj)
#cleaning using naive pipeline 
maj = clean1(d4feb232001maj)
print (Counter(maj).most_common())

In [None]:
# `with` closes the file handle as soon as the text has been read.
with codecs.open("/Users/schap/Desktop/TA Data/AC/2002/1/TXT/D1.Mar26.2002.MAJ.txt", "r", "utf-8") as case_file:
    token_d4feb232001maj = case_file.read().strip().split()
# The pipeline receives the string form of the word list
# (brackets, quotes and commas included).
token_d4feb232001maj = str(token_d4feb232001maj)
#cleaning using tokenizer pipeline 
token_maj = nlp_pipeline1(token_d4feb232001maj)
print (Counter(token_maj).most_common())

In [None]:
# `with` closes the file handle as soon as the text has been read.
with codecs.open("/Users/schap/Desktop/TA Data/AC/2002/1/TXT/D1.Mar26.2002.MAJ.txt", "r", "utf-8") as case_file:
    lem_d4feb232001maj = case_file.read().strip().split()
# The pipeline receives the string form of the word list
# (brackets, quotes and commas included).
lem_d4feb232001maj = str(lem_d4feb232001maj)
#cleaning using lemmatizer pipeline 
lem_maj = nlp_lem(lem_d4feb232001maj)
print (Counter(lem_maj).most_common())

# Word Count Using all 2002 documents

This section uses only the naive pipeline (`clean1`).

In [None]:
#using all decisions
# `with` closes the file handle as soon as the text has been read.
with codecs.open("/Users/schap/Desktop/TA Data/All Text Files Combined/ALL/all2002text.txt", "r", "utf-8") as corpus_file:
    all2002 = corpus_file.read().strip().split()
# Join the words back into one plain string; str() on the list would hand the
# cleaner a Python list repr (brackets, quotes, commas, escape sequences).
all2002 = " ".join(all2002)
a2002 = clean1(all2002)
print (Counter(a2002).most_common())

In [None]:
#using only AC 
# `with` closes the file handle as soon as the text has been read.
with codecs.open("/Users/schap/Desktop/TA Data/All Text Files Combined/AC/all2002AC.txt", "r", "utf-8") as corpus_file:
    all2002ac = corpus_file.read().strip().split()
# Join the words back into one plain string; str() on the list would hand the
# cleaner a Python list repr (brackets, quotes, commas, escape sequences).
all2002ac = " ".join(all2002ac)
a2002ac = clean1(all2002ac)
print (Counter(a2002ac).most_common())

In [None]:
#using only dissent 
# `with` closes the file handle as soon as the text has been read.
with codecs.open("/Users/schap/Desktop/TA Data/All Text Files Combined/Dissent/all2002dissent.txt", "r", "utf-8") as corpus_file:
    all2002diss = corpus_file.read().strip().split()
# Join the words back into one plain string; str() on the list would hand the
# cleaner a Python list repr (brackets, quotes, commas, escape sequences).
all2002diss = " ".join(all2002diss)
a2002d = clean1(all2002diss)
print (Counter(a2002d).most_common())

In [None]:
#using only majority
# `with` closes the file handle as soon as the text has been read.
with codecs.open("/Users/schap/Desktop/TA Data/All Text Files Combined/Majority/all2002majority.txt", "r", "utf-8") as corpus_file:
    all2002maj = corpus_file.read().strip().split()
# Join the words back into one plain string; str() on the list would hand the
# cleaner a Python list repr (brackets, quotes, commas, escape sequences).
all2002maj = " ".join(all2002maj)
a2002m = clean1(all2002maj)
print (Counter(a2002m).most_common())