# Processing Releases Dataset
The releases are compiled from multiple files. It was updated under the file `revisions.txt` but it was purged from time to time so I have to reconstruct it using versions I found with `git blame`.

Later revisions are obtained directly from Github, which includes extra useful information such as downloads.


In [2]:
!pip install chardet
!pip install requests

import os
import re
import pandas as pd
import chardet
import requests
import json

Collecting chardet
  Downloading chardet-5.2.0-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.4/199.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: chardet
Successfully installed chardet-5.2.0


# Revisions.txt data

In [3]:
folder = "releases"

In [27]:
# Download and concatenate data files
def read_file_with_unknown_encoding(filepath):
    with open(filepath, 'rb') as f:
        content = f.read()
        
    encodings_to_try = ['utf-8', 'latin1', 'ascii', 'cp1252']  # Add more encodings to try if you wish
    for encoding in encodings_to_try:
        try:
            return content.decode(encoding)
        except UnicodeDecodeError:
            continue
            
    return None  # or raise an exception if you wish

# Define folder and URLs
releases_1 = "https://raw.githubusercontent.com/processing/processing/5f289537698f504499bfda0084b4c5f3f1ac0912/build/shared/revisions.txt"
releases_2 = "https://raw.githubusercontent.com/processing/processing/16dd0e77e70810c87d9f7b4a195ef72fe6e048f6/build/shared/revisions.txt"
releases_3 = "https://raw.githubusercontent.com/processing/processing/master/build/shared/revisions.txt"

# Create folder and download files
os.system(f'mkdir -p {folder}')
os.system(f'wget {releases_1} -O {folder}/1.txt')
os.system(f'wget {releases_2} -O {folder}/2.txt')
os.system(f'wget {releases_3} -O {folder}/3.txt')

# Read files with unknown encoding
file_contents = []
for i in range(1, 4):  # For files 1.txt, 2.txt, and 3.txt
    filepath = os.path.join(folder, f"{i}.txt")
    content = read_file_with_unknown_encoding(filepath)
    if content:
        file_contents.append(content)

# Combine and write to a new file
combined_content = ''.join(file_contents)
with open(os.path.join(folder, 'combined.txt'), 'w', encoding='utf-8') as f:
    f.write(combined_content)

--2023-09-25 10:24:15--  https://raw.githubusercontent.com/processing/processing/5f289537698f504499bfda0084b4c5f3f1ac0912/build/shared/revisions.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 82164 (80K) [text/plain]
Saving to: ‘releases/1.txt’

     0K .......... .......... .......... .......... .......... 62% 2.28M 0s
    50K .......... .......... ..........                      100% 9.34M=0.02s

2023-09-25 10:24:15 (3.19 MB/s) - ‘releases/1.txt’ saved [82164/82164]

--2023-09-25 10:24:15--  https://raw.githubusercontent.com/processing/processing/16dd0e77e70810c87d9f7b4a195ef72fe6e048f6/build/shared/revisions.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.

In [10]:
with open(folder + os.sep + "combined.txt", "r") as f:
    file_content = f.read()

In [34]:
pattern = r"(?P<entry>(?:ABOUT )?(?:PROCESSING )?(?P<version>\d.\w+(?:.\w+)*)?(?: \()*REV (?P<rev>\d{3,4})\)?(?: \S+? -)?(?: - )?(?P<date>\d+ \w+ \d+)?(?:[ \S]*))\n\n(?P<content>.+?)(?=^PROCESSING|^ABOUT|(?:. ){15,})"

data_list = []

for match in re.finditer(pattern, file_content, re.DOTALL | re.MULTILINE):
    data_dict = {
        'entry': match.group('entry'),
        'version': match.group('version') if match.group('version') else None,
        'revision': int(match.group('rev')) if match.group('rev') else None,
        'published_at': match.group('date') if match.group('date') else None,
        'body': match.group('content') if match.group('content') else None
    }
    
    data_list.append(data_dict)

df = pd.DataFrame(data_list)

def parse_date(date_str):
    if pd.isna(date_str) or not date_str:
        return pd.NaT
    try:
        return pd.to_datetime(date_str, errors='coerce', format='%d %B %Y')
    except Exception:
        return date_str  # return the original string if parsing fails

df['published_at'] = df['published_at'].apply(parse_date)
df['published_at'] = pd.to_datetime(df['published_at'], errors='coerce')
df['published_at'] = df['published_at'].dt.tz_localize('America/New_York')
df = df.drop_duplicates(subset=['revision'])

# versions 21 through 23 dealt with getting the beast to work smoothly for the numer workshop
# PROCESSING REV 0172 through 0175 are grouped together

df.sort_values(by='revision', ascending=True, inplace=True)
revisionstxt_df_final = df
#df.to_csv('temp2.csv', index=False)

# Github Releases

In [26]:
# Function to convert the list of asset dictionaries to a flat dictionary
def flatten_assets(assets_list):
    flattened = {}
    for i, asset in enumerate(assets_list):
        for key, value in asset.items():
            flattened[f'asset_{i+1}_{key}'] = value
    return flattened

def get_all_releases(owner, repo):
    url = f"https://api.github.com/repos/{owner}/{repo}/releases"
    page = 1
    all_releases = []

    while True:
        params = {'page': page}
        response = requests.get(url, params=params)

        if response.status_code != 200:
            print(f"Failed to get data: {response.content}")
            break

        releases = json.loads(response.text)
        if not releases:  # Exit the loop if no more releases
            break

        all_releases.extend(releases)
        page += 1  # Increment the page number for the next iteration

    return all_releases

all_releases = get_all_releases("processing", "processing") +  get_all_releases("processing", "processing4")

In [37]:
github_df = pd.DataFrame(pd.json_normalize(all_releases))

# Apply the function to the 'assets' column
flattened_assets_df = github_df['assets'].apply(flatten_assets).apply(pd.Series)

# Concatenate the original DataFrame with the new DataFrame
github_df_final = pd.concat([github_df, flattened_assets_df], axis=1)
github_df_final = github_df_final.sort_values(by='published_at', ascending=True)
github_df_final['published_at'] = pd.to_datetime(github_df_final['published_at'], errors='coerce')

pattern = r'processing-(?:0*(\d+)-)?([\w.]+)'

github_df_final[['revision', 'version']] = github_df_final['tag_name'].str.extract(pattern)
github_df_final['revision'] = github_df_final['revision'].astype('Int64')

github_df_final.to_csv('temp-github.csv', index=False)

In [38]:
concat_df_vertical = pd.concat([revisionstxt_df_final, github_df_final], axis=0)
concat_df_vertical['revision'] = concat_df_vertical['revision'].astype('Int64') #This supports NaN
concat_df_vertical = concat_df_vertical.dropna(subset=['revision'])

concat_df_vertical = concat_df_vertical.sort_values(by=['revision', 'published_at'], ascending=[True, True])

concat_df_vertical = concat_df_vertical.drop_duplicates(subset=['revision'])
concat_df_vertical.to_csv('releases-data.csv', index=False)

In [11]:
with open(folder + os.sep + "1.txt", "r") as f:
    content = f.read()

In [48]:
pattern = r"ABOUT REV (\d+)(?: - (\d+ \w+ \d+)(?: \w+)?)?\n\n(.+?)(?=\n\nABOUT REV|\Z)"
matches = re.findall(pattern, content, re.DOTALL)

# Create DataFrame
df = pd.DataFrame(matches, columns=["Release_Number", "Date", "Content"])

# Custom date parsing function
def parse_date(date_str):
    if pd.isna(date_str):
        return pd.NaT
    try:
        return pd.to_datetime(date_str, errors='raise', format='%d %B %Y')
    except Exception:
        return date_str  # return the original string if parsing fails

# Apply custom date parsing
df['Date'] = df['Date'].apply(parse_date)
df['Release_Number'] = df['Release_Number'].astype(int)

df.head()

Unnamed: 0,Release_Number,Date,Content
0,69,2004-03-28,"this is yet another bug fix release, which rep..."
1,68,2004-02-02,"this is a bug fix release, not the planned ver..."
2,67,2003-10-28,another bug fix release. high importance for p...
3,66,2003-10-19,"several important bug fixes in this release, s..."
4,65,2003-09-30,one major bug fix that repairs nastiness in dr...


In [60]:
with open(folder + os.sep + "2.txt", "r") as f:
    content2 = f.read()

In [63]:
pattern = r"ABOUT REV (\d+)(?: - (\d+ \w+ \d+)(?: \w+)?)?\n\n(.+?)(?=\n\nABOUT REV|\Z)"
matches = re.findall(pattern, content2, re.DOTALL)

# Create DataFrame
df = pd.DataFrame(matches, columns=["Release_Number", "Date", "Content"])

# Custom date parsing function
def parse_date(date_str):
    if pd.isna(date_str):
        return pd.NaT
    try:
        return pd.to_datetime(date_str, errors='raise', format='%d %B %Y')
    except Exception:
        return date_str  # return the original string if parsing fails

# Apply custom date parsing
df['Date'] = df['Date'].apply(parse_date)
df['Release_Number'] = df['Release_Number'].astype(int)

df.to_csv('dataframe_output.csv', index=False)
# last one 161

In [None]:
#Missing the following
# ABOUT REV 0070 - MEGABUCKET - 29 September 2004
# ABOUT REV 0071 
# ABOUT REV 0137 - 30 May 2008 
