In [276]:
import pandas as pd

import numpy as np
import urllib
import string
import nltk
nltk.download('punkt')
nltk.download('words')
from nltk.tokenize import word_tokenize
import os
import requests
import textstat
import time

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [277]:
# link = 'https://www.sec.gov/Archives/edgar/data/3662/0000950170-98-000413.txt'
def loading(link):
    lst = []
    content = urllib.request.urlopen(link)
    for line in content:
        lst.append(line.decode('utf-8'))
    return lst

In [278]:
df = pd.read_csv('LoughranMcDonald_MasterDictionary_2018.csv')

In [279]:
positive_dict = {}
negative_dict = {}

for i in range(len(df)):
    if df['Positive'][i] != 0:
        positive_dict[df['Word'][i].lower()] = df['Positive'][i]
    if df['Negative'][i] != 0:
        negative_dict[df['Word'][i].lower()] = df['Negative'][i]

In [280]:
with open('StopWords_Generic.txt', 'r') as file:
    stop_words = []
    for words in file:
        stop_words.append(words)
stop_words = [words.strip('\n').lower() for words in stop_words]

In [281]:
# Wrap this around a function and apply it for entire data

def text_preprocessing(lst, stop_words):
    words = set(nltk.corpus.words.words())
    new_list, sentence_length = [], []
    for i in lst:
        i = i.strip('\n').strip('\t')
        i = i.replace('\n', '').replace('\t', '')
        i = ''.join([j.lower() for j in i if j not in string.punctuation])
        i = ''.join([j for j in i if not j.isdigit()])
        i = i.strip(' ')
        i = ' '.join([j for j in i.split()])
        i = ' '.join(w for w in nltk.wordpunct_tokenize(i) if w.lower() in words or not w.isalpha())
        
        sent_len = len(i)
        
        sentence_length.append(sent_len)

        if len(i) >= 1:
            new_list.append(i)
        
    text =  ' '.join(new_list)
    cleaned_text  = ' '.join([words for words in text.split() if words not in stop_words])
    tokens = word_tokenize(cleaned_text)
    
    
    avg_sentence_len = np.array(sentence_length).sum()/len(sentence_length)

    return tokens, text, avg_sentence_len

In [282]:
def senti_score (polarity_score):
    if  polarity_score <= -0.5:
        return 'Most_Negative'
    elif polarity_score > -0.5 and polarity_score < 0:
        return 'Negative'
    elif polarity_score == 0:
        return 'Neutral'
    elif polarity_score > 0 and polarity_score < 0.5:
        return 'Positive'
    elif polarity_score >= 0.5:
        return 'Very_Positive'

In [283]:
def score(tokens, positive_dict, negative_dict):

    positive_score = 0
    negative_score = 0

    for word in tokens:
        if word in positive_dict.keys():
            positive_score += 1
        if word in negative_dict.keys():
            negative_score += 1
    
    polarity_score = (positive_score - negative_score)/ ((positive_score + negative_score) + 0.000001)
    subjectivity_score = (positive_score + negative_score)/ ((len(tokens)) + 0.000001)
    sentiment_score = senti_score(polarity_score)
    
    return positive_score, negative_score, polarity_score,subjectivity_score,sentiment_score

In [284]:
# import textstat
def analysis_of_readability(lst, stop_words):
    tokens , text, avg_sent_len = text_preprocessing(lst, stop_words)
    syllabus_in_text = textstat.syllable_count(text)
    words_in_text = textstat.lexicon_count(text, removepunct=True)
    sentence_in_text = textstat.sentence_count(text)
    complex_words_len = textstat.difficult_words(text)
    fog_index = textstat.gunning_fog(text)
    
    perc_complex_word = (complex_words_len/len(tokens)) * 100 
    
    avg_words_per_sentence = words_in_text // sentence_in_text
    per_complex_words = round((complex_words_len / words_in_text)*100,2)
    fog_index = 0.4 * (avg_words_per_sentence + per_complex_words)
    
    return avg_words_per_sentence, per_complex_words, fog_index , perc_complex_word, words_in_text, avg_sent_len, syllabus_in_text

In [285]:

def text_results(links, stop_words):
    
    positive_score, negative_score, polarity_score,subjectivity_score,sentiment_score = [], [], [], [], []
    avg_words_per_sentence, per_complex_words, fog_index,  perc_complex_word, words_in_text, avg_sent_len, syllabus_count, word_counts = [], [], [], [], [], [], [], []
    
    
    pre_tag = 'https://www.sec.gov/Archives/'
    for i in range(len(links)):
        
        
        link = links[i]
        link = os.path.join(pre_tag, link)
        lst = loading(link)
        tokens,cleaned_text, _ = text_preprocessing(lst, stop_words)
        word_count = len(tokens)
        pos_score, neg_score, pol_score,sub_score,sent_score = score(tokens, positive_dict, negative_dict)
        avg_words, per_complex, fog_idx, perc_complex, words_text, avg_len, syll_cnt = analysis_of_readability(lst, stop_words)
        
        positive_score.append(pos_score)
        negative_score.append(neg_score)
        polarity_score.append(pol_score)
        subjectivity_score.append(sub_score)
        sentiment_score.append(sent_score)
        avg_words_per_sentence.append(avg_words)
        per_complex_words.append(per_complex)
        fog_index.append(fog_idx)
        perc_complex_word.append(perc_complex)
        words_in_text.append(words_text)
        avg_sent_len.append(avg_len)
        word_counts.append(word_count)
        syllabus_count.append(syll_cnt)
        
        time.sleep(10)
        
    
    
    pos_ser = pd.Series(positive_score)
    neg_ser = pd.Series(negative_score)
    pol_ser = pd.Series(polarity_score)
    sub_ser = pd.Series(subjectivity_score)
    sent_ser = pd.Series(sentiment_score)
    avg_ser = pd.Series(avg_words_per_sentence)
    complex_ser = pd.Series(per_complex_words)
    fog_ser = pd.Series(fog_index)
    perc_comp_ser = pd.Series(perc_complex_word)
    word_text_ser = pd.Series(words_in_text)
    avg_len_ser = pd.Series(avg_len)
    word_cnt_ser = pd.Series(word_counts)
    syll_ser = pd.Series(syllabus_count)
        
    return pos_ser, neg_ser, pol_ser, sub_ser, sent_ser, avg_ser, complex_ser, fog_ser, perc_comp_ser, word_text_ser, avg_len_ser, word_cnt_ser, syll_ser

In [286]:
cik = pd.read_excel('cik_list.xlsx',engine ='openpyxl')
cik.columns

Index(['CIK', 'CONAME', 'FYRMO', 'FDATE', 'FORM', 'SECFNAME'], dtype='object')

In [287]:
pos_ser, neg_ser, pol_ser, sub_ser, sent_ser, avg_ser, complex_ser, fog_ser, perc_comp_ser, word_text_ser, avg_len_ser, word_cnt_ser, syll_ser = text_results(cik['SECFNAME'][:2], stop_words)

In [288]:
score_df = pd.DataFrame({'positive_score':pos_ser, 'negative_score': neg_ser, 'polarity_score':pol_ser,'average_sentence_length': avg_len_ser, 'percentage_of_complex_words': perc_comp_ser,
                        'fog_index':fog_ser, 'complex_word_count':complex_ser, 'word_count': word_cnt_ser, 'subjectivity_score':sub_ser, 'sentiment_score':sent_ser,
                        'syllabus_count':syll_ser})

In [294]:
score_df

Unnamed: 0,positive_score,negative_score,polarity_score,average_sentence_length,percentage_of_complex_words,fog_index,complex_word_count,word_count,subjectivity_score,sentiment_score,syllabus_count
0,1001,2191,-0.372807,38.015603,2.745692,56200.548,1.37,70037,0.045576,Negative,224070
1,534,1129,-0.357787,,3.40686,37921.08,1.7,47375,0.035103,Negative,148404


In [293]:
score_df.head()

Unnamed: 0,positive_score,negative_score,polarity_score,average_sentence_length,percentage_of_complex_words,fog_index,complex_word_count,word_count,subjectivity_score,sentiment_score,syllabus_count
0,1001,2191,-0.372807,38.015603,2.745692,56200.548,1.37,70037,0.045576,Negative,224070
1,534,1129,-0.357787,,3.40686,37921.08,1.7,47375,0.035103,Negative,148404


In [307]:
my_csv = pd.concat([cik,score_df],ignore_index = True,axis =1)

In [308]:
my_csv

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,3662.0,SUNBEAM CORP/FL/,199803.0,1998-03-06,10-K405,edgar/data/3662/0000950170-98-000413.txt,1001.0,2191.0,-0.372807,38.015603,2.745692,56200.548,1.37,70037.0,0.045576,Negative,224070.0
1,3662.0,SUNBEAM CORP/FL/,199805.0,1998-05-15,10-Q,edgar/data/3662/0000950170-98-001001.txt,534.0,1129.0,-0.357787,,3.406860,37921.080,1.70,47375.0,0.035103,Negative,148404.0
2,3662.0,SUNBEAM CORP/FL/,199808.0,1998-08-13,NT 10-Q,edgar/data/3662/0000950172-98-000783.txt,,,,,,,,,,,
3,3662.0,SUNBEAM CORP/FL/,199811.0,1998-11-12,10-K/A,edgar/data/3662/0000950170-98-002145.txt,,,,,,,,,,,
4,3662.0,SUNBEAM CORP/FL/,199811.0,1998-11-16,NT 10-Q,edgar/data/3662/0000950172-98-001203.txt,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8757,,,,NaT,,,,,,,,,,,,,
8758,,,,NaT,,,,,,,,,,,,,
8759,,,,NaT,,,,,,,,,,,,,
8760,,,,NaT,,,,,,,,,,,,,


In [289]:
# my_csv = pd.concat(cik, score_df)
# pd.to_csv('output.csv')

In [290]:
# df2 = pd.read_excel('irregular verbs.xlsx')

In [291]:
# ls = df2['Base Form']
#     matches = []
#     for match in ls:
#     if "Hello" in match:
#         matches.append(match)

# print(matches)