Text analysis with SpaCy

In [1]:
import spacy
import functions as fct
import time
import re
import pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def get_text(file_path, i, type):
    if type == 'txt':
        with open(file_path + str(i) + '.txt', 'r', encoding="utf-8") as f:
            return f.read()
    elif type == 'json':
        json = fct.open_file(file_path, 'json')
        return json[i]["raw_source"]
    
def process_text(train_path_txt, i, type, nlp):
    start_time = time.time()
    
    text = get_text(train_path_txt, i, type)

    doc = nlp(text)
    
    word_count = len([token.text for token in doc if not token.is_punct])
    sentence_count = len(list(doc.sents))

    end_time = time.time()
    execution_time = end_time - start_time
    
    a_tag = re.findall(r'<a\b[^>]*>', text)
    em_tag = re.findall(r'<em\b[^>]*>', text)
    blockquote_tag = re.findall(r'<blockquote\b[^>]*>', text)
        
    overall_stats = {
        "word_count": word_count,
        "sentence_count": sentence_count,
        "a_tag": len(a_tag),
        "em_tag": len(em_tag),
        "blockquote_tag": len(blockquote_tag),
        "execution_time": execution_time
    }
    
    return overall_stats, doc

def words_count(doc):
    words = [token.text.lower() for token in doc if not token.is_stop and not token.is_punct]
    word_freq = Counter(words)
    print(word_freq.most_common())
    return word_freq.most_common()

def preprocess(doc):
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(tokens) 

def words_tf_idf(docs, tfidf_vectorizer):
    processed_documents = [preprocess(doc) for doc in docs]

    tfidf_matrix = tfidf_vectorizer.fit_transform(processed_documents)

    feature_names = tfidf_vectorizer.get_feature_names_out()

    tfidf_scores = tfidf_matrix.sum(axis=0).A1  
    tfidf_dict = dict(zip(feature_names, tfidf_scores))

    return sorted(tfidf_dict.items(), key=lambda x: x[1], reverse=True)


HTLM text

In [3]:
nlp = spacy.load("en_core_web_sm")
tfidf_vectorizer = TfidfVectorizer()

train_path_txt = 'SCOTUS/train.json'
type = 'json'
# train_path_txt = 'data_txt_save/train_'
# type = 'txt'

stats = {}
docs = []

# Loop to process each text and store the result in the dictionary
for i in range(0, 100):
    stats[f"text_{i}"], doc = process_text(train_path_txt, i, type, nlp)
    docs.append(doc)

results_df = pd.DataFrame.from_dict(stats, orient='index')
   
averages = results_df.mean()
results_df.loc['Average'] = averages

# print(results_df.tail(1)) # n'afficher que la dernière ligne
results_df

Unnamed: 0,word_count,sentence_count,a_tag,em_tag,blockquote_tag,execution_time
text_0,11411.00,199.00,163.0,77.00,6.00,4.042055
text_1,8498.00,77.00,152.0,68.00,2.00,3.537663
text_2,11738.00,192.00,174.0,15.00,0.00,3.807845
text_3,11108.00,172.00,171.0,84.00,0.00,3.746665
text_4,8296.00,71.00,149.0,48.00,0.00,3.430441
...,...,...,...,...,...,...
text_96,51768.00,1319.00,469.0,890.00,0.00,9.036714
text_97,26226.00,690.00,231.0,371.00,13.00,6.005268
text_98,52240.00,1315.00,870.0,716.00,0.00,8.976322
text_99,33878.00,1009.00,464.0,310.00,0.00,7.105345


In [4]:
word_count = []

for doc in docs :
    word_count.append(words_count(doc))
    
word_tf_idf = words_tf_idf(docs, tfidf_vectorizer)

word_count
word_tf_idf

[('>', 914), ('<', 776), ('data', 572), ('gtm', 509), ('\n', 404), ('\n                        ', 132), ('type="click', 126), ('court', 116), ('div', 113), ('category="navigation', 111), ('/div', 107), ('action="footer', 85), ('resources', 81), ('li><a', 75), ('em', 72), ('\n                    ', 67), ('href="https://www.justia.com', 62), ('\n\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t', 62), ('law', 48), ('jury', 44), ('federal', 41), ('state', 41), ('\n                ', 38), ('\n                            ', 38), ('\n                                ', 36), ('supreme', 34), ('ninth', 34), ('v/20240521113135', 33), ('shared', 33), ('judge', 33), ('/p', 31), ('circuit', 31), ('images', 30), ('span', 29), ('navigation', 28), ('\n\t\t\t', 27), ('v.', 26), ('action="header', 26), ('block', 26), ('class="footer', 24), ('\n\t\t\t\t', 23), ('decision', 23), ('directive="j', 22), ('list', 22), ('\n        ', 21), ('icon', 21), ('\n            ', 21), ('img', 21), ('/span', 21), ('\n\t\t\t\t\t', 21), (

[('data', 49.63555256538572),
 ('gtm', 43.633884149402796),
 ('div', 19.25269408915274),
 ('class', 19.091593232793112),
 ('em', 17.329879150666272),
 ('href', 16.105411162616466),
 ('li', 15.983623009379997),
 ('justia', 15.544775304548486),
 ('com', 14.630094110431392),
 ('type', 12.784264875992179),
 ('navigation', 11.639940984193398),
 ('https', 11.559625327098274),
 ('footer', 11.26922798836329),
 ('action', 11.216093743163714),
 ('label', 11.21046635880573),
 ('category', 10.908469371928048),
 ('click', 10.76645904827503),
 ('court', 8.355113593541281),
 ('law', 7.527256167271706),
 ('span', 7.247221534660191),
 ('resources', 6.748188452137383),
 ('www', 6.675507700040376),
 ('supreme', 5.341324700873667),
 ('id', 5.031778141236198),
 ('title', 4.5661808922229135),
 ('nbsp', 4.490209765481511),
 ('case', 4.316297405373939),
 ('us', 3.8556603335037307),
 ('federal', 3.7355943541259364),
 ('image', 3.6873159774419033),
 ('icon', 3.6031691730370787),
 ('ul', 3.219497791065067),
 ('j

Text with some tags

In [5]:
#nlp = spacy.load("models/custom_spacy_model")
#tfidf_vectorizer = TfidfVectorizer()

# train_path_txt = 'SCOTUS/train.json'
# type = 'json'
train_path_txt = 'data_txt_save/train_'
type = 'txt'

stats_2 = {}
docs_2 = []

# Loop to process each text and store the result in the dictionary
for i in range(0, 100):
    stats_2[f"text_{i}"], doc = process_text(train_path_txt, i, type, nlp)
    docs_2.append(doc)

results_df_2 = pd.DataFrame.from_dict(stats_2, orient='index')
   
averages_2 = results_df_2.mean()
results_df_2.loc['Average'] = averages_2

# print(results_df.tail(1)) # n'afficher que la dernière ligne
results_df_2

Unnamed: 0,word_count,sentence_count,a_tag,em_tag,blockquote_tag,execution_time
text_0,3473.00,159.00,17.00,58.00,6.00,0.573929
text_1,644.00,38.00,3.00,34.00,1.00,0.093895
text_2,3153.00,135.00,23.00,8.00,0.00,0.461718
text_3,2833.00,114.00,22.00,65.00,0.00,0.387561
text_4,546.00,33.00,2.00,24.00,0.00,0.077968
...,...,...,...,...,...,...
text_96,14086.00,637.00,66.00,332.00,0.00,1.690084
text_97,16703.00,774.00,79.00,346.00,11.00,1.933477
text_98,36571.00,1569.00,221.00,687.00,0.00,4.218707
text_99,21722.00,1086.00,144.00,293.00,0.00,2.502549


In [6]:
word_count_2 = []

for doc in docs_2 :
    word_count_2.append(words_count(doc))
    
word_tf_idf_2 = words_tf_idf(docs_2, tfidf_vectorizer)

word_count_2
word_tf_idf_2

[('>', 162), ('<', 161), ('court', 60), ('/em', 58), ('em', 57), ('jury', 35), ('\n  ', 31), ('state', 29), ('ninth', 26), ('circuit', 26), ('judge', 24), ('law', 22), (' ', 19), ('/a', 17), ('v.', 16), ('appeal', 16), ('decision', 16), ('states', 15), ('united', 14), ('facts', 14), ('federal', 13), ('established', 13), ('verdict', 12), ('u.', 12), ('s.', 12), ('radcliff', 12), ('respondent', 11), ('murder', 11), ('contrary', 11), ('clearly', 11), ('supreme', 11), ('claim', 10), ('california', 9), ('habeas', 9), ('291', 9), ('consider', 9), ('\n ', 8), ('  ', 8), ('3d', 8), ('ibid', 8), ('foreman', 8), ('unreasonable', 7), ('based', 7), ('determination', 7), ('cases', 7), ('jenkins', 7), ('curiam', 7), ('f.', 7), ('\n', 7), ('failed', 7), ('attempted', 6), ('deliberations', 6), ('district', 6), ('2254(d', 6), ('proceedings', 6), ('gypsum', 6), ('co.', 6), ('count', 6), ('jurors', 6), ('right', 6), ('blockquote', 6), ('/blockquote', 6), ('particular', 6), ('packer', 5), ('petition', 5),

[('em', 43.50492656601537),
 ('court', 11.322136703642204),
 ('state', 4.319858526055203),
 ('respondent', 3.8646612818541852),
 ('petitioner', 3.789438019992543),
 ('case', 3.6507940797390646),
 ('jury', 3.5893847473097646),
 ('officer', 2.673595575526704),
 ('claim', 2.669464467605981),
 ('states', 2.6406260834163366),
 ('united', 2.5322983729591066),
 ('appeals', 2.4402656239875937),
 ('evidence', 2.4202622926491784),
 ('federal', 2.3372663709708665),
 ('law', 2.276307393620139),
 ('district', 2.2145839184394807),
 ('search', 2.210182488203316),
 ('amendment', 2.204633903647701),
 ('hold', 2.100091293843762),
 ('rule', 2.0359473001817348),
 ('decision', 2.003768036236634),
 ('footnote', 1.9540332659319724),
 ('act', 1.9040762696729778),
 ('trial', 1.8446499771451248),
 ('statute', 1.8382354319637753),
 ('instruction', 1.80901680760802),
 ('review', 1.7733202330800717),
 ('police', 1.7425893566529544),
 ('circuit', 1.7153673570788281),
 ('fourth', 1.6846298718561932),
 ('right', 1.66

In [7]:
train_path_txt = 'clean_data_txt_save/train_'
type = 'txt'

stats_3 = {}
docs_3 = []

# Loop to process each text and store the result in the dictionary
for i in range(0, 100):
    stats_3[f"text_{i}"], doc = process_text(train_path_txt, i, type, nlp)
    docs_3.append(doc)

results_df_3 = pd.DataFrame.from_dict(stats_3, orient='index')
   
averages_3 = results_df_3.mean()
results_df_3.loc['Average'] = averages_3

# print(results_df.tail(1)) # n'afficher que la dernière ligne
results_df_3

Unnamed: 0,word_count,sentence_count,a_tag,em_tag,blockquote_tag,execution_time
text_0,2970.00,123.00,0.0,0.0,0.0,0.386928
text_1,456.00,17.00,0.0,0.0,0.0,0.054001
text_2,2888.00,131.00,0.0,0.0,0.0,0.353414
text_3,2350.00,95.00,0.0,0.0,0.0,0.319823
text_4,417.00,17.00,0.0,0.0,0.0,0.076925
...,...,...,...,...,...,...
text_96,11525.00,512.00,0.0,0.0,0.0,1.296403
text_97,14181.00,583.00,0.0,0.0,0.0,1.601177
text_98,31508.00,1220.00,0.0,0.0,0.0,3.446085
text_99,19039.00,926.00,0.0,0.0,0.0,2.082409


In [8]:
word_count_3 = []

for doc in docs_3 :
    word_count_3.append(words_count(doc))
    
word_tf_idf_3 = words_tf_idf(docs_3, tfidf_vectorizer)

word_count_3
word_tf_idf_3

[(' ', 99), ('court', 56), ('\n  ', 45), ('jury', 35), ('ninth', 22), ('state', 22), ('law', 22), ('circuit', 21), ('judge', 21), ('v.', 16), ('decision', 16), ('united', 14), ('states', 14), ('facts', 14), ('appeal', 13), ('s.', 12), ('established', 12), ('murder', 11), ('federal', 11), ('u.', 11), ('radcliff', 11), ('verdict', 10), ('contrary', 10), ('clearly', 10), ('supreme', 10), ('california', 9), ('respondent', 9), ('claim', 9), ('habeas', 9), ('  ', 9), ('291', 9), ('consider', 9), ('3d', 8), ('ibid', 8), ('foreman', 8), ('determination', 7), ('curiam', 7), ('f.', 7), ('failed', 7), ('attempted', 6), ('2254(d', 6), ('unreasonable', 6), ('jenkins', 6), ('count', 6), ('jurors', 6), ('right', 6), ('cases', 6), ('packer', 5), ('deliberations', 5), ('district', 5), ('application', 5), ('second', 5), ('10', 5), ('note', 5), ('deliberate', 5), ('based', 5), ('instructions', 5), ('said', 5), ('opinion', 5), ('particular', 5), ('579', 5), ('petition', 4), ('writ', 4), ('certiorari', 4),

[('court', 12.779751944472496),
 ('state', 4.824400553276699),
 ('respondent', 4.353856546663805),
 ('petitioner', 4.292606391204436),
 ('case', 4.146231823046751),
 ('jury', 4.093981032959904),
 ('claim', 3.022757163815854),
 ('officer', 2.928995125338087),
 ('states', 2.8848868271922683),
 ('united', 2.746687975865418),
 ('appeals', 2.697097015177386),
 ('evidence', 2.6159510315129864),
 ('law', 2.6055260035239205),
 ('federal', 2.5774796081580122),
 ('search', 2.5445397050848677),
 ('district', 2.479890134072126),
 ('amendment', 2.397627935939971),
 ('rule', 2.3318206259895455),
 ('hold', 2.3176445791368896),
 ('footnote', 2.1919772619742783),
 ('decision', 2.187294951070883),
 ('trial', 2.126552330103679),
 ('act', 2.1213949571773973),
 ('review', 2.0715733320164467),
 ('police', 2.014079895915657),
 ('instruction', 2.0049041930169618),
 ('statute', 1.9653057289596945),
 ('2d', 1.9340241872394808),
 ('brief', 1.8957062246395846),
 ('right', 1.8808104271735027),
 ('circuit', 1.87870