<a href="https://colab.research.google.com/github/RobertBWeidlich/Colab_Notebooks/blob/main/CorpusAnalysisWSpaCy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Corpus Analysis with spaCy
#
# Thu Nov  9 18:57:00 EST 2023
#
# Lesson by Megan S. Kane
#
# This lesson demonstrates how to use the Python library spaCy for analysis of
# large collections of texts. This lesson details the process of using spaCy
# to enrich a corpus via lemmatization, part-of-speech tagging, dependency
# parsing, and named entity recognition. Readers will learn how the
# linguistic annotations produced by spaCy can be analyzed to help
# researchers explore meaningful trends in language patterns across a set
# of texts.
#
# https://programminghistorian.org/en/lessons/corpus-analysis-with-spacy
#

In [2]:
import spacy
from spacy import displacy
import os
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from google.colab import files


In [3]:
# 1. upload files from local system - select manually
import json

# Prompt for a manual file selection; the result is indexed by file name
# below, and len() of each entry is reported next to the name.
uploaded_files = files.upload()
for fn in uploaded_files:
  print(f"fn: \"{fn}\"", f"len: {len(uploaded_files[fn])}", sep="\n")


Saving cn_rss_proc-20230816.json to cn_rss_proc-20230816.json
Saving cn_rss_proc-20230817.json to cn_rss_proc-20230817.json
Saving cn_rss_proc-20230818.json to cn_rss_proc-20230818.json
fn: "cn_rss_proc-20230816.json"
len: 6300316
fn: "cn_rss_proc-20230817.json"
len: 6432850
fn: "cn_rss_proc-20230818.json"
len: 6253551


In [4]:
import tempfile

def clean_cn_rss_json_file(filename: str) -> bool:
  """Strip '#' comment lines from a data file so it can be parsed as JSON.

  The file is first moved to "<filename>-orig", which is kept as a backup.
  A filtered copy is then written back to `filename`. A line counts as a
  comment only when '#' is its very first character: "# ..." is dropped,
  while " #..." (leading space) is kept. Kept lines retain their original
  trailing newline.

  Args:
    filename: path of the file to clean in place.

  Returns:
    True once the cleaned file has been written.

  Raises:
    Any error raised while moving, reading or writing the file is
    propagated (e.g. FileNotFoundError when `filename` does not exist).
    If filtering fails after the move, the backup is moved back to
    `filename` before the error is re-raised, so the data is not left
    stranded under the "-orig" name.
  """
  import shutil

  # 1. e.g. cn_rss_proc-20230817.json -> cn_rss_proc-20230817.json-orig
  # NOTE(review): calling this a second time on the same file presumably
  # replaces the backup with the already-cleaned file - confirm whether the
  # untouched original needs to survive re-runs.
  orig_filename = f"{filename}-orig"
  print(f"moving \"{filename}\" --> \"{orig_filename}\"")
  shutil.move(filename, orig_filename)

  # 2. cn_rss_proc-20230817.json-orig -> [filter] -> cn_rss_proc-20230817.json
  try:
    with open(orig_filename) as ifp, open(filename, "w") as ofp:
      ofp.writelines(line for line in ifp if not line.startswith("#"))
  except Exception:
    # Put the original back so a failed run does not leave `filename`
    # missing or half-written.
    shutil.move(orig_filename, filename)
    raise

  # 3. TODO: decide whether the "-orig" backup should be deleted here.
  return True


###clean_cn_rss_json_file("cn_rss_proc-20230817.json")
##clean_cn_rss_json_file("c:\\\cn_rss_proc-20231105.json")
##clean_cn_rss_json_file("cn_rss_proc-20231105.json")
#clean_cn_rss_json_file("cn_rss_proc-20230817.json")

In [None]:
# Step 2: strip the non-JSON '#' comment lines from every uploaded file,
# reporting each file's name and len() before it is cleaned.
for fn in uploaded_files:
  print(f"fn: \"{fn}\"", f"len: {len(uploaded_files[fn])}", sep="\n")
  clean_cn_rss_json_file(fn)