<a href="https://colab.research.google.com/github/RobertBWeidlich/Colab_Notebooks/blob/main/CorpusAnalysisWSpaCy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Corpus Analysis with spaCy
#
# Sat Nov  4 14:21:09 EDT 2023
#
# Lesson author: Megan S. Kane
#
# This lesson demonstrates how to use the Python library spaCy for analysis of
# large collections of texts. This lesson details the process of using spaCy
# to enrich a corpus via lemmatization, part-of-speech tagging, dependency
# parsing, and named entity recognition. Readers will learn how the
# linguistic annotations produced by spaCy can be analyzed to help
# researchers explore meaningful trends in language patterns across a set
# of texts.
#
# https://programminghistorian.org/en/lessons/corpus-analysis-with-spacy
#

In [5]:
import spacy
from spacy import displacy
import os
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from google.colab import files


In [7]:
# 1. upload files from local system - select manually
import json

# Open Colab's manual upload dialog, then report each uploaded file's name
# and the length of its uploaded content.
uploaded_files = files.upload()
for fn, contents in uploaded_files.items():
  print(f"fn: \"{fn}\"")
  print(f"len: {len(contents)}")


Saving cn_rss_proc-20231101.json to cn_rss_proc-20231101.json
Saving cn_rss_proc-20231102.json to cn_rss_proc-20231102.json
Saving cn_rss_proc-20231103.json to cn_rss_proc-20231103.json
Saving cn_rss_proc-20231104.json to cn_rss_proc-20231104.json
Saving cn_rss_proc-20231105.json to cn_rss_proc-20231105.json
fn: "cn_rss_proc-20231101.json"
len: 6146364
fn: "cn_rss_proc-20231102.json"
len: 5004431
fn: "cn_rss_proc-20231103.json"
len: 5078244
fn: "cn_rss_proc-20231104.json"
len: 5159433
fn: "cn_rss_proc-20231105.json"
len: 4276951


In [11]:
import tempfile

def clean_cn_rss_json_file_1(file: str) -> bool:
  """First draft of the cleaner: only explores what `tempfile` hands back.

  Nothing is read from or written to `file` yet. The function prints the
  name and repr of an anonymous temporary file and the path of a scratch
  directory, and lists the steps still to be implemented.

  Args:
    file: path of the JSON file that is eventually to be cleaned (unused).

  Returns:
    False, always - no cleaning is performed by this draft.
  """
  with tempfile.TemporaryFile() as fp:
    # delete non-JSON comment lines beginning with '#' characters
    print(fp.name)  # NOTE: on POSIX this is usually a file descriptor number, not a path
    print(fp)
    # TemporaryDirectory removes the directory when the block exits; a bare
    # mkdtemp() here left a new directory behind on every call.
    with tempfile.TemporaryDirectory() as tmp_dir:
      print(tmp_dir)  # a temporary directory such as "/tmp/tmp5d98bc69"
      # Planned steps (not implemented):
      # 1. copy original file to temp directory
      # 2. open temp file, write to new file with original name in temp directory
      # 3. iterate line by line
      # 4. if #3 is successful, move new file to original file
      # 5. delete temp directory (handled by TemporaryDirectory on exit)
  return False


def clean_cn_rss_json_file_2(orig_filename: str) -> bool:
  """Write a copy of `orig_filename` with its '#' comment lines removed.

  The copy is created next to the source as "<orig_filename>-<uuid4>", and
  that name is printed so it can be located afterwards. The source file is
  left untouched.

  A line is a comment only when '#' is its very first character: "# ..." is
  dropped, " #..." (leading space) is kept. Kept lines are copied verbatim,
  including their trailing newline.

  Args:
    orig_filename: path of the text file to filter.

  Returns:
    True once the filtered copy has been written. Failures to open, read or
    write the files propagate as exceptions.
  """
  import uuid

  new_filename = f"{orig_filename}-{uuid.uuid4()}"
  print(new_filename)

  # The feeds are JSON, which is UTF-8 by specification, so do not rely on
  # the platform's default text encoding.
  with open(orig_filename, encoding="utf-8") as ifp, \
       open(new_filename, "w", encoding="utf-8") as ofp:
    # no per-line progress printing: the inputs are several MB each
    ofp.writelines(line for line in ifp if not line.startswith('#'))

  # TODO: save the original under an "-orig" extension, then move the
  #       filtered copy over the original name
  return True

def clean_cn_rss_json_file(filename: str) -> bool:
  """Strip '#' comment lines from `filename` in place, keeping a backup.

  Steps:
    1. Preserve the unmodified file as "<filename>-orig". An existing backup
       is never overwritten, so re-running the cell cannot replace the true
       original with an already-cleaned copy.
    2. Filter the file into a scratch file beside it, then swap the scratch
       file over `filename` in one step so a failure mid-way never leaves a
       half-written file under the real name.
    3. The "-orig" backup is deliberately left in place.

  A line is a comment only when '#' is its very first character; " #..." is
  kept. Kept lines are copied verbatim.

  Args:
    filename: path of the JSON-with-comments file to clean, e.g.
      "cn_rss_proc-20230817.json".

  Returns:
    True if the file was cleaned, False if `filename` is not an existing
    file. I/O errors during copying or filtering propagate as exceptions.
  """
  import os
  import shutil
  import uuid

  orig_filename = f"{filename}-orig"
  print(filename)
  print(orig_filename)

  if not os.path.isfile(filename):
    return False

  # 1. cn_rss_proc-20230817.json -> cn_rss_proc-20230817.json-orig (first run only)
  if not os.path.exists(orig_filename):
    shutil.copy2(filename, orig_filename)

  # 2. cn_rss_proc-20230817.json -> [filter] -> scratch -> cn_rss_proc-20230817.json
  scratch_filename = f"{filename}-{uuid.uuid4()}"
  try:
    # JSON is UTF-8 by specification; do not rely on the platform default
    with open(filename, encoding="utf-8") as ifp, \
         open(scratch_filename, "w", encoding="utf-8") as ofp:
      ofp.writelines(line for line in ifp if not line.startswith('#'))
    os.replace(scratch_filename, filename)
  finally:
    # only still present if filtering failed before the swap
    if os.path.exists(scratch_filename):
      os.remove(scratch_filename)

  # 3. cn_rss_proc-20230817.json-orig is kept as the backup
  return True

##clean_cn_rss_json_file("cn_rss_proc-20230817.json")
#clean_cn_rss_json_file("c:\\\cn_rss_proc-20231105.json")
# Run the cleaner on the 2023-11-05 upload.
clean_cn_rss_json_file(filename="cn_rss_proc-20231105.json")

cn_rss_proc-20231105.json
cn_rss_proc-20231105.json-orig
