In [None]:
# Open Anaconda Prompt and create a new environment
#conda create --name nlp_env python=3.9
#conda activate nlp_env

# Install necessary packages
#conda install -c conda-forge spacy
#pip install pydantic
#conda install pandas

# Download Spacy model
#python -m spacy download en_core_web_sm

# Import Libraries

In [2]:
import pandas as pd
import re
import spacy
from collections import Counter
import gensim
import gensim.corpora as corpora
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
import matplotlib.pyplot as plt

# Definitions

In [45]:
# Load Spacy model
nlp = spacy.load("en_core_web_sm")

# Add "not" to the stop words list
nlp.Defaults.stop_words.add("not")
nlp.vocab["not"].is_stop = True

# Coordination terms
coord_terms = ["reuse", "copy", "clone", "common", "common developers", "license", "open source", "governance", "copyright", "open source software", "open-source", "cloning", "forking", "coordination", "collaboration",
               "bitcoin", "btc", "bch", "bitcoin cash"]

# Read CSV file
def read_csv(file_path):
    df = pd.read_csv(file_path)
    return df
    
# Preprocess text function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(r"[^\w\s]", "", text)  # Remove special characters
    text = re.sub(r"[\r\n]", "", text)    # Remove \r\n characters
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    return text

# Function to preprocess the comments in the DataFrame
def preprocess_dataframe(df):
    
    # Preprocess the comments
    df['preprocessed_comments'] = df['body'].dropna().apply(preprocess_text)
    return df

# Extract coordination terms function with lemmatization
def extract_coordination_terms(text, coord_terms):
    term_count = Counter()
    doc = nlp(text)
    lemmas = [token.lemma_ for token in doc]
    for lemma in lemmas:
        if lemma in coord_terms:
            term_count[lemma] += 1
    return term_count

# Function to get top comments based on keyword matches
def get_top_comments(preprocessed_df, coord_terms, top_n=5):
    # Initialize a list to store the comment and its keyword match count
    comments_with_matches = []
    
    for index, comment in preprocessed_df['preprocessed_comments'].items():
        terms = extract_coordination_terms(comment, coord_terms)
        total_matches = sum(terms.values())
        
        # Append the comment, index and total matches to the list
        comments_with_matches.append((index, preprocessed_df['preprocessed_comments'][index], total_matches))
    
    # Sort the list by total matches in descending order and get the top N comments
    top_comments = sorted(comments_with_matches, key=lambda x: x[2], reverse=True)[:top_n]
    
    return top_comments

# Tokenize and lemmatize the comments
def tokenize_and_lemmatize(text):
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct and token.lemma_ != 'not']
    return tokens

# Function to prepare data for LDA
def prepare_lda_data(texts):
    data_words = [tokenize_and_lemmatize(text) for text in texts]
    id2word = corpora.Dictionary(data_words)
    corpus = [id2word.doc2bow(text) for text in data_words]
    return id2word, corpus, data_words

# Function to perform LDA and visualize the topics
def perform_lda(id2word, corpus, num_topics=5):
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=num_topics, 
                                                random_state=100,
                                                update_every=1,
                                                chunksize=10,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)
    return lda_model

# Visualize the topics
def visualize_topics(lda_model, corpus, id2word):
    vis = gensimvis.prepare(lda_model, corpus, id2word)
    pyLDAvis.display(vis)
    pyLDAvis.save_html(vis, 'lda_topics.html')

# Bitcoin Cash (BCH) Analysis

In [4]:
# File read
file_path = 'interesting_comments_bitcoin-abc.csv'
df = read_csv(file_path)

In [5]:
print(df.shape)
df[['body', 'keywords']].head()

(39, 5)


Unnamed: 0,body,keywords
0,GA is used for minimal usage data collection a...,open source
1,same error here..\r\r\nstarted with a fresh gi...,reuse
2,GA is used for minimal usage data collection a...,open source
3,same error here..\r\r\nstarted with a fresh gi...,reuse
4,@jasonbcox \r\r\n\r\r\n```\r\r\nlsb_release -a...,"copy,license"


In [46]:
preprocessed_df = preprocess_dataframe(df)
preprocessed_df.head()

Unnamed: 0.1,Unnamed: 0,created_at,body,developer,keywords,preprocessed_comments
0,46,2022-08-31T19:19:39Z,GA is used for minimal usage data collection a...,BytesOfMan,open source,ga is used for minimal usage data collection a...
1,235,2020-10-23T13:55:36Z,same error here..\r\r\nstarted with a fresh gi...,M8BWNN,reuse,same error herestarted with a fresh git clone ...
2,295,2022-08-31T19:19:39Z,GA is used for minimal usage data collection a...,BytesOfMan,open source,ga is used for minimal usage data collection a...
3,484,2020-10-23T13:55:36Z,same error here..\r\r\nstarted with a fresh gi...,M8BWNN,reuse,same error herestarted with a fresh git clone ...
4,614,2019-12-19T10:10:07Z,@jasonbcox \r\r\n\r\r\n```\r\r\nlsb_release -a...,gituser,"copy,license",jasonbcox lsb_release ano lsb modules are avai...


# Keyword Matching

In [47]:
# Lemmatized keyword matching
coordination_terms_count = Counter()
for comment in preprocessed_df['preprocessed_comments']:
    #processed_text = preprocess_text(comment)
    terms = extract_coordination_terms(comment, coord_terms)
    coordination_terms_count.update(terms)

print(coordination_terms_count)

Counter({'bitcoin': 30, 'license': 16, 'copyright': 16, 'copy': 7, 'bch': 4, 'btc': 3, 'clone': 2, 'governance': 1, 'coordination': 1})


In [48]:
top_comments = get_top_comments(preprocessed_df, coord_terms, top_n=5)

for index, comment, matches in top_comments:
    print(f"Comment Index: {index}\nComment: {comment}\nKeyword Matches: {matches}\n")

Comment Index: 18
Comment: whether it applies to a nonentity or not is a more esoteric question and im not a lawyer however regardless of the legal implications of a copyright stamp the license says basically you can use my stuff as long as you keep that little thing up there it does not say as long as that little thing up there is legally sound so the legality of the stamp and the requirements to keep it are completely separate concernssatoshi appears to have called him a bitcoin developer thats what he wrote personally and no one is allowed to change it the core team called themselves the bitcoin core developers and they claimed copyright from 2009 onwards and no one is allowed to change it if the 2009 is challenged in court and loses doesnt matter thats what they wrote and a condition of the license they gave us is that we dont change itnow your question does apply to the bitcoin abc developers since we are editing this and we have copyright ownership of our contributions however ev

# Topic Modelling - BCH

In [53]:
# Prepare LDA data
id2word, corpus, data_words = prepare_lda_data(preprocessed_df['preprocessed_comments'])

# Perform LDA
lda_model = perform_lda(id2word, corpus, num_topics=2)

# Print the topics
topics = lda_model.print_topics(num_words=10)
for topic in topics:
    print(topic)

# Visualize the topics
visualize_topics(lda_model, corpus, id2word)

(0, '0.013*"bitcoin" + 0.010*"code" + 0.008*"copyright" + 0.007*"license" + 0.007*"fix" + 0.007*"core" + 0.007*"user" + 0.007*"change" + 0.006*"release" + 0.006*"need"')
(1, '0.097*"test" + 0.096*"perform" + 0.047*"look" + 0.040*"success" + 0.036*"find" + 0.011*"include" + 0.010*"file" + 0.009*"fail" + 0.008*"compiler" + 0.006*"c"')


# Bitcoin (BTC) Analysis

In [9]:
# File read
file_path_btc = 'interesting_comments_bitcoin.csv'
btc_df = read_csv(file_path_btc)
print(btc_df.shape)
btc_df[['body', 'keywords']].head()

(3312, 5)


Unnamed: 0,body,keywords
0,> but we're changing all the types now so migh...,stuff
1,It would be good for the PR description to out...,copy
2,"Thanks, @1440000bytes for your review. Interes...",stuff
3,This error affected my btcpay server apparentl...,copy
4,> I don't see how reimplementing BDB is better...,stuff


In [56]:
preprocessed_btc_df = preprocess_dataframe(btc_df)
preprocessed_btc_df.head()

Unnamed: 0.1,Unnamed: 0,created_at,body,developer,keywords,preprocessed_comments
0,60,2022-12-07T15:33:53Z,> but we're changing all the types now so migh...,LarryRuane,stuff,but were changing all the types now so might a...
1,104,2022-12-20T10:55:38Z,It would be good for the PR description to out...,fanquake,copy,it would be good for the pr description to out...
2,122,2022-12-01T13:55:30Z,"Thanks, @1440000bytes for your review. Interes...",brunoerg,stuff,thanks 1440000bytes for your review interestin...
3,152,2022-12-11T00:41:12Z,This error affected my btcpay server apparentl...,cpleonardo,copy,this error affected my btcpay server apparentl...
4,183,2022-11-29T21:27:56Z,> I don't see how reimplementing BDB is better...,achow101,stuff,i dont see how reimplementing bdb is better th...


# Keyword Matching

In [57]:
# Lemmatized keyword matching
coordination_terms_count_btc = Counter()
for comment in preprocessed_btc_df['preprocessed_comments']:
    #processed_text = preprocess_text(comment)
    terms = extract_coordination_terms(comment, coord_terms)
    coordination_terms_count_btc.update(terms)

print(coordination_terms_count_btc)

Counter({'copy': 955, 'bitcoin': 872, 'reuse': 321, 'copyright': 290, 'license': 227, 'clone': 161, 'btc': 78, 'common': 54, 'coordination': 30, 'collaboration': 18, 'cloning': 6, 'forking': 5, 'governance': 1})


In [58]:
top_comments_btc = get_top_comments(preprocessed_btc_df, coord_terms, top_n=5)

#for index, comment, matches in top_comments_btc:
#    print(f"Comment Index: {index}\nComment: {comment}\nKeyword Matches: {matches}\n")

In [59]:
# Prepare LDA data
id2word_btc, corpus_btc, data_words_btc = prepare_lda_data(preprocessed_btc_df['preprocessed_comments'])

# Perform LDA
lda_model_btc = perform_lda(id2word_btc, corpus_btc, num_topics=2)

# Print the topics
topics_btc = lda_model_btc.print_topics(num_words=10)
for topic in topics_btc:
    print(topic)

# Visualize the topics
visualize_topics(lda_model_btc, corpus_btc, id2word_btc)

(0, '0.015*"stuff" + 0.007*"copy" + 0.007*"need" + 0.006*"like" + 0.006*"change" + 0.006*"file" + 0.005*"code" + 0.005*"m" + 0.005*"add" + 0.005*"think"')
(1, '0.023*"0" + 0.014*"0x0" + 0.008*"libsystembdylib" + 0.004*"const" + 0.003*"extradata" + 0.003*"superdata" + 0.003*"stringdata" + 0.003*"static" + 0.003*"void" + 0.003*"int"')
