# Projecte github

# Imports

In [None]:
# Draw the plots immediately after the current cell
%matplotlib inline

import pandas as pd
# Uncomment the following line to install pymysql
#!pip install PyMySQL
import pymysql
import warnings

# POLARS # <- teacher said that this one is 1000% faster than pandas.

# Project configuration

The file that we are going to use to set up the basic structure:

GitHubScraper/JupyterNotebook/Structure/githubProjectStructure.sql

from the github repository.

### Step 1, import the structure
In workbench: 
"Server" -> "Data Import", then select the path to "githubProjectStructure.sql" using the three-dots button and choose "Import from Self-Contained File", as in the following image:

![Javatpoint](./Images/Project_Setup/data_import.png)  

Then scroll down, and create a new Schema by clicking on the "New..." button. Name it: "githubProject".

![Javatpoint](./Images/Project_Setup/create_new_schema.png)  

Scroll to the bottom and click "Start Import" in the bottom-right corner. If there are no errors, it will switch to "Import Progress" and indicate success.

Refresh the schemas:

![Javatpoint](./Images/Project_Setup/refresh.png)  

### Step 2, importing the data from the csv files

Download the csv files [here](https://drive.google.com/drive/folders/1NWhfFss0_M9V_clkcE9TH-Fy3oIEndWV?usp=sharing).

Right-click the "githubProject" schema and choose "Table Data Import Wizard". With it, we are going to import each csv file into its matching table.

You have to select the .csv you want to import, for example in this case, the trendVisits.csv

![Javatpoint](./Images/Project_Setup/file_import.png) 

Then, since you already have the structure, you have to use the existing table that matches with the correct csv file:

![Javatpoint](./Images/Project_Setup/use_existing.png) 

After that, check that the column types are correct and import it. Do this for each csv file.

You can also verify the data by doing this:

![Javatpoint](./Images/Project_Setup/verify.png) 



# Introduction

TODO index or sections indicating what are we doing...

In [None]:
#https://www.geeksforgeeks.org/connect-to-mysql-using-pymysql-in-python/
# Connect to MySQL database
def tryToConnectMySQL(db_name, db_host, db_port, db_username, db_password):
    """Open a PyMySQL connection to a MySQL database.

    :param db_name: name of the schema to use
    :param db_host: server host name or address
    :param db_port: server TCP port
    :param db_username: MySQL user name
    :param db_password: password for ``db_username``
    :return: the open connection, or ``None`` when ``pymysql.Error`` is raised
             (a warning carrying the error text is issued in that case)
    """
    try:
        conn = pymysql.connect(host=db_host,
                               port=db_port,
                               user=db_username,
                               password=db_password,
                               db=db_name)
        if conn:
            print("Connection successful")
        else:
            print("Error")

        return conn

    except pymysql.Error as e:
        # The second positional argument of warnings.warn is the warning
        # *category*, so the exception must be folded into the message itself;
        # passing it separately raised TypeError instead of warning.
        warnings.warn(f"Error connecting to MySQL: {e}")
        return None

# Connection settings for the local MySQL server that holds the imported schema.
# The password is read with getpass so it is neither echoed while typing nor
# stored in the notebook's cell output.
from getpass import getpass

db_name = "githubProject"
db_host = "localhost"
db_port = 3306
db_username = "root"
db_password = getpass("Input your password")

dataBaseConnection = tryToConnectMySQL(db_name, db_host, db_port, db_username, db_password)

In [None]:
# Execute query and return data in pandas dataframe 
def execute_select_query(cursor, query):
    """Run ``query`` on ``cursor`` and return the rows plus a DataFrame of them.

    :param cursor: DB-API style cursor providing ``execute``, ``fetchall`` and ``description``
    :param query: SQL text, passed to ``cursor.execute`` unchanged
    :return: ``(rows, df)`` where ``rows`` is whatever ``fetchall`` returned and
             ``df`` is a DataFrame whose columns are named after the cursor
             description (empty when there are no rows). When ``pymysql.Error``
             is raised, a warning is issued and ``(None, None)`` is returned.
    """
    try:
        cursor.execute(query)
        output = cursor.fetchall()
        # Column names come from the cursor description; it can be None when
        # the statement produced no result set.
        columns = [desc[0] for desc in (cursor.description or ())]

        # Build the frame even for zero rows. Previously `df` was only bound
        # when rows existed, so an empty result raised UnboundLocalError.
        df = pd.DataFrame(list(output), columns=columns)
        print("Query executed successfully!")
        return output, df

    except pymysql.Error as e:
        print("Error executing query:")
        warnings.warn(str(e))
        return None, None

In [None]:
# Obtain the cursor used to interact with the database
cursor = dataBaseConnection.cursor()

# Repositoris "Open Source" & dades per a la pàgina web
Quant a l'objectiu de destacar repositoris open-source (i no només repositoris populars, com fa GitHub), volem exportar dades d'aquests repositoris en format `.json` perquè puguin ser usades per la pàgina web.

Considerem que repositoris són projectes open-source si tenen més de 5 contribuïdors i més de 50 issues en total - un criteri simple però efectiu.

Recollir les dades necessàries involucra fer JOINs amb múltiples taules.

In [None]:
# One row per repository that passes the "open source" filter in the HAVING
# clause (more than 5 contributors and more than 50 issues). Each metric is the
# MAX over that repository's RepositoryVisits rows. All joins are inner joins,
# so a repository with no row in RepositoryTopics (or Owners) is not returned.
open_source_projects_query = """
SELECT r.owner, r.name, r.description, r.mainLanguage,
MAX(stars) as total_stars, MAX(contributors) as total_contributors, MAX(openIssues + closedIssues) as total_issues,
o.avatar_url, GROUP_CONCAT(DISTINCT t.topic) as topics,
MAX(watchers) as total_watchers FROM Repositories r
-- Join with RepositoryVisits to get metrics like contributors, stars, issues amount
JOIN RepositoryVisits v
ON r.owner = v.owner AND r.name = v.name
-- Join with Owners to get the avatar URL
JOIN Owners o
ON r.owner = o.username
-- Join with RepositoryTopics to get a list of the tags/topics of the repos
JOIN RepositoryTopics t
ON t.repo = r.name AND t.owner = r.owner
GROUP BY r.owner, r.name, r.mainLanguage, r.license -- Must group by mainLanguage and license as well as they're non-aggregated
HAVING total_contributors > 5 AND total_issues > 50
ORDER BY total_contributors DESC; -- Show repos with most contributors first
"""

# df_repo is the frame the export cell below iterates over.
output_repo, df_repo = execute_select_query(cursor, open_source_projects_query)
print(len(df_repo), "repositories matching criteria")
df_repo.head(100)

Les dades d'aquests repositoris s'exporten a .json perquè puguin ser usades per la pàgina web.

En aquest procés també categoritzem els repositoris segons les seves temàtiques generals:
- Desenvolupament web
- Data science
- Aplicacions
- Eines de desenvolupament
- Repositoris de recursos (ex. una col·lecció d'algorismes)

La categorització es fa segons els "topics" (tags) que tenen els repositoris. Els repositoris també es categoritzen pel seu llenguatge de programació principal.

In [None]:
from collections import defaultdict
import json

used_languages = set()
used_tags = set()

# Renames or discards languages.
# Used to group up frameworks, variants and transpilers.
# Keys are the mainLanguage values found in the data; values are the language
# group shown instead. An empty string means "discard": the export loop below
# leaves the repository's language list empty for it.
LANGUAGE_REMAP = {
    "TypeScript": "JavaScript",
    "Vue": "JavaScript",
    "HTML": "JavaScript", # HTML repositories are grouped with JavaScript
    "Jupyter Notebook": "Python",
    "Kotlin": "Java & Kotlin",
    "Java": "Java & Kotlin",
    "C": "C/C++",
    "C++": "C/C++",
    "Ruby": "Others",
    "Go": "Others",
    "Swift": "Others",
    "Clojure": "Others",
    "Haskell": "Others",
    "Dart": "Others",
    "Shell": "Others",
    "PowerShell": "Others",
    "Scala": "Others",
    "Svelte": "Others",
    "Vim Script": "",
    "CSS": "",
    "SCSS": "",
    "MDX": "",
}

# Remaps GitHub topics to the tags that are used in the website.
TAG_MAP = {}
# Maps tags used by the website to a list of Github "topics" that will be considered as the same tag.
# Ex. repos with topics "react", "vue" become tagged as "Web"
TAG_ALIASES = {
    "Web": ["react", "vue", "web", "reactjs", "css", "chrome-extension", "react-grid", "react-table", "php", "http", "nodejs", "typescript", "electron", "search-engine", "webgl", "rest", "rest-api", "swagger", "static-site-generator", "blog-engine", "router", "webview", "jquery", "http-client", "website", "reactive-templates", "nuxt", "nat", "javascript", "http2", "nginx", "apache", "aws", "api-gateway", "jekyll", "bootstrap"],
    "Modding": ["mod", "minecraft", "emulation", "emulator", "forge", "minecraft-launcher", "modrinth", "minecraft-api", "minecraft-server", "bepinex", "unity3d", "unreal", "unity-mono", "craftbukkit", "valheim", "minecraft-mod", "gta5", "fabric", "gamedev", "retroarch", "game"],
    "Data Science": ["math", "numpy", "data-science", "graphql", "data-visualization", "jupyter-notebook", "big-data"],
    "Machine Learning": ["ml", "pytorch", "deep-learning", "machine-learning", "deep-neural-networks", "tensorflow", "neural-network", "tensor", "computer-vision", "reinforcement-learning", "hyperparameter-tuning", "ai", "artificial-intelligence", "llama", "llms", "llm", "openai"],
    "Tool": ["containers", "zsh", "docker", "github", "cli", "searchengine", "postgrest", "devtool", "cloudstorage", "git", "npm", "database", "postgresql", "backend", "shell-scripting", "websocket", "collaboration", "developer-tools", "promise", "api", "testing", "translation", "i18n", "language", "golang-library", "algorithm", "firmware", "style-linter", "linting", "converter", "blockchain", "wordpress", "static-site-generator", "blog-engine", "material", "material-design", "framework", "argument-parser", "command-line-parser", "readme-generator", "ssh", "backup", "reverse-engineering", "animation", "sdk", "devops", "jenkins", "documentation", "terminal", "encryption", "scrapers", "3d-printing", "reactive-templates", "image-optimization", "file-server", "nat", "proxy", "shell", "linters", "git-client", "raspberry-pi", "blogging", "npm-cli", "aws", "api-gateway", "decompiler", "kubernetes", "tools"],
    "App": ["note-taking", "productivity", "prest", "download", "latex", "text-editor", "curl", "ftp", "bot", "synchronization", "sqlite", "mattermost", "messaging", "conferencing", "remote-desktop", "emacs", "color-picker", "cli-app", "subtitle-downloader", "decompiler", "mobile-app"],

    "Resource": ["learn-to-code", "freecodecamp", "curriculum", "certification", "learnopengl", "lists", "resources", "resource", "dataset", "public-api", "public-apis", "practice", "interview", "styleguide", "list", "interview-questions", "awesome-list", "principles", "design-patterns"],
}
# Tags manually added to some repositories (which otherwise lack descriptive ones)
MANUAL_TAGS = {
    "minio/minio": ["Machine Learning"],
    "Aliucord/Aliucord": ["Modding"],
    "cli-guidelines/cli-guidelines": ["Resource"],
    "yjs/yjs": ["Tool"],
    "TigerVNC/tigervnc": ["Tool"],
    "ollama/ollama": ["App"],
    "micropython/micropython": ["Tool"], # Python implementation
    "raspberrypi/linux": ["App"],
    "rust-lang/rust": ["Tool"],
    "home-assistant/core": ["Tool"],
    "vuejs/vue-cli": ["Tool", "Web"],
    "remix-run/remix": ["Tool", "Web"],
}
# Same thing as above, but mapping tag to list of repos with it instead, as I realized at the end this would've been more convenient.
MANUAL_TAGS2 = {
    "Tool": ["pytorch/tutorials", "vuejs/core", "google/googletest", "google/guava", "ReactiveX/RxJava"],
    "Web": ["vuejs/core"],
    "App": ["square/retrofit"],
}
# Invert TAG_ALIASES into TAG_MAP (GitHub topic -> website tag). A topic listed
# under several tags ends up mapped to the tag that appears last in TAG_ALIASES.
for tag,aliases in TAG_ALIASES.items():
    for alias in aliases:
        TAG_MAP[alias] = tag
# Fold MANUAL_TAGS2 (tag -> repos) into MANUAL_TAGS (repo -> tags).
# Note: the loop variable `tag` stays bound at module level after these loops.
for tag,repos in MANUAL_TAGS2.items():
    for repo in repos:
        if repo in MANUAL_TAGS:
            MANUAL_TAGS[repo].append(tag)
        else:
            MANUAL_TAGS[repo] = [tag]

# GitHub topic -> number of repositories in which it was seen but had no
# entry in TAG_MAP (useful for deciding which aliases to add next).
IGNORED_TAGS = defaultdict(int)

# "owner/name" -> data exported for the website.
oss_repos = {}
for index, row in df_repo.iterrows():
    key = row["owner"] + "/" + row["name"]
    main_language = row["mainLanguage"]
    if main_language in LANGUAGE_REMAP:
        main_language = LANGUAGE_REMAP[main_language]
    repo = {
        "topics": set(),
        # An empty string marks a discarded language (see LANGUAGE_REMAP).
        "languages": set([main_language] if main_language != "" else []),
        "stars": row["total_stars"],
        "contributors": row["total_contributors"],
        "icon": row["avatar_url"],
        "description": row["description"],
    }
    # `topics` is the comma-separated GROUP_CONCAT column of the query.
    for topic in str.split(row["topics"], ","):
        if topic in TAG_MAP:
            repo["topics"].add(TAG_MAP[topic])
        else:
            # Count the unmapped *topic*. The previous code indexed with `tag`,
            # a variable left over from an unrelated loop, so every miss was
            # counted under the wrong key.
            IGNORED_TAGS[topic] += 1

    # Manual tags are merged once per repository rather than once per topic.
    repo["topics"].update(MANUAL_TAGS.get(key, []))

    oss_repos[key] = repo
    used_tags = used_tags.union(repo["topics"])
    used_languages.add(main_language)

# Exclude mirrors and other projects that are not contributable projects or unsuitable
BLACKLISTED_REPOS = [
    "gitlabhq/gitlabhq", # Read-only mirror.
    "qemu/qemu", # Read-only mirror.
    "xasset/xasset",# Not english.
    "jynew/jynew", # Unity RPG game framework, documentation in chinese-only though.
    "doocs/advanced-java", # Chinese-only Java interview questions.
    "CyC2018/CS-Notes", # Chinese computer science course resources.
    "apache/kafka", # Read-only mirror.
]
for blacklisted_repo in BLACKLISTED_REPOS:
    # pop() with a default: a blacklisted repository that is not in the query
    # result (different dataset, changed thresholds) must not raise KeyError.
    oss_repos.pop(blacklisted_repo, None)

# Make each entry JSON-serializable (sets become lists) and add the
# "owner" / "repo" keys that the site expects.
for key, repo in oss_repos.items():
    parts = key.split("/")
    repo.update(
        owner=parts[0],
        repo=parts[1],
        topics=list(repo["topics"]),
        languages=list(repo["languages"]),
    )

# Write the export file
with open("repositories_output.json", "w") as f:
    json.dump(oss_repos, f, indent=2)

# Summary of what was exported
print("Valid repositories:", len(oss_repos))
print("Languages used:", used_languages)
print("Tags used:", used_tags)

Com que els repositoris de la base de dades van ser trobats principalment per scraping de les pàgines "trending" de GitHub, podem concloure que un 50% dels repositoris que GitHub destaca són només repositoris populars de projectes individuals o d'equips petits i no projectes contribuïbles; només uns 1000 repositoris dels 2000 de la base de dades compleixen els requisits que hem imposat.

# Tables
Shows the tables for the different entities of the database.

## Repositories

In [None]:
# Load every row of the Repositories table and preview the last five.
# Note: this rebinds output_repo / df_repo, which until now held the result
# of the open-source projects query.
query = "SELECT * FROM Repositories;"

output_repo, df_repo = execute_select_query(cursor, query)
df_repo.tail(5)

In [None]:
# Load every row of the RepositoryVisits table and preview the first five.
query = "SELECT * FROM RepositoryVisits;"

output_repo_visists, df_repo_visists = execute_select_query(cursor, query)
df_repo_visists.head(5)

In [None]:
# Load every row of the RepositoryTopics table and preview the last five.
query = "SELECT * FROM RepositoryTopics;"

output_repo_topics, df_repo_topics = execute_select_query(cursor, query)
df_repo_topics.tail(5)

## Owners

In [None]:
# Load every row of the Owners table and preview the first five.
query = "SELECT * FROM Owners;"
# The raw rows are not needed here, so they are bound to the throwaway name `_`
_, df_owners = execute_select_query(cursor, query)
df_owners.head(5)

In [None]:
# Load every row of the OwnerVisits table and preview the first five.
query = "SELECT * FROM OwnerVisits;"

output_owner_visits, df_owner_visits = execute_select_query(cursor, query)
df_owner_visits.head(5)

## Commits

In [None]:
# Load every row of the Commits table and preview the first five.
# df_commits is the frame used by the commit analysis cells further down.
query = "SELECT * FROM Commits;"

output_commits, df_commits = execute_select_query(cursor, query)
df_commits.head(5)

## Topics

In [None]:
# Load every row of the Topics table and preview the last five.
query = "SELECT * FROM Topics;"

output_topics, df_topics = execute_select_query(cursor, query)
df_topics.tail(5)

In [None]:
# Load every row of the TopicVisits table and preview the first five.
query = "SELECT * FROM TopicVisits;"

_, df_topic_visits = execute_select_query(cursor, query)
df_topic_visits.head(5)

## Trend

In [None]:
# Load every row of the TrendVisits table and preview the first five.
query = "SELECT * FROM TrendVisits;"

_, df_trend_visits = execute_select_query(cursor, query)
df_trend_visits.head(5)

In [None]:
# Close the database connection; the cells visible after this point work on
# the DataFrames that were already loaded.
dataBaseConnection.close()

# Analysis

### Basic statistics
* Distribution of main topics (bar or pie chart), i.e., what proportion of the studied repositories covers each topic.
* Distribution of languages (bar or pie chart). Like above.
* Time evolution of interest in topics: repositories per topic, followers per topic.

### Dimensional reduction for an overview on repositories
What we mean here is to perform a PCA (principal component analysis) in order to gain insight into the structure of the whole dataset. We would then color the data points depending on, for instance, the language used, to see whether these groups have similar characteristics and lie close together in the reduced space.
* Create a dataframe containing all RepositoryVisits data for a certain date for all the studied repositories.
* Standardize the data (i.e., to prevent some variables, such as commits, from weighing far more than others, such as forks).
* Perform a PCA into 2 components on it.
* Plot the results while clustering the points depending on different criteria:
    + mainLanguage
    + topic
* Supervised machine learning using stars.
### Open questions
* How to use stars and trends?
* How to use contributions by owners?


# Commit analysis

In [None]:
import numpy as np
from os import path
from PIL import Image
"""
If 
!pip install wordcloud 
doesn't work:
import sys
print(sys.executable) # use the path

path -m pip install wordcloud
"""
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
# https://www.datacamp.com/tutorial/wordcloud-python

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
def plot_word_cloud(word_cloud, text, save_image = False, image_name = "none"):
    """Generate a word cloud from ``text``, show it, and optionally save it.

    :param word_cloud: object whose ``generate(text)`` returns the rendered cloud
    :param text: text the cloud is generated from
    :param save_image: when true, the rendered cloud is also written to disk
    :param image_name: file name prefix used when saving
    """
    rendered = word_cloud.generate(text)

    # Show the image without axes
    plt.imshow(rendered, interpolation='bilinear')
    plt.axis("off")
    plt.show()

    if not save_image:
        return
    target_path = f"./Images/wordclouds/{image_name}_word_cloud.png"
    rendered.to_file(target_path)
        
def transform_format(val):
    """Return 255 when ``val`` equals 0; otherwise return ``val`` unchanged."""
    return 255 if val == 0 else val
        
def obtain_mask(mask):
    """Return an int32 copy of ``mask`` in which every 0 is replaced by 255.

    The replacement is vectorised, so it no longer loops over pixels in Python
    and accepts arrays of any rank (e.g. RGB/RGBA images as well as 2-D ones);
    the previous row-by-row version failed on inputs with more than two axes.

    :param mask: NumPy array with the image data
    :return: new ``np.int32`` array with the same shape as ``mask``
    """
    trans_mask = np.array(mask, dtype=np.int32)
    # Compare against the original values so the zero test is not affected by
    # the cast to int32.
    trans_mask[np.asarray(mask) == 0] = 255
    return trans_mask

## Data cleaning

### Remove stop words

In [None]:
# Start from the stop-word list shipped with the wordcloud package (copied into a new set)
stopwords = set(STOPWORDS)
# stopwords.update(["a"]) # manually add stopwords

### Standardize words

In [None]:
import unicodedata
import re
import nltk

# Matches of this pattern are replaced by a space; it mainly serves to detach punctuation marks from the surrounding words
STANDARDIZE_SUBSTITUTION = re.compile(r"[!\"$#%&()*+,\-./:;<=>?[\]^_`{|}~]")

# Runs of two or more consecutive spaces (collapsed into one by standardize)
CONSECUTIVE_SPACE_SUBSTITUTION = re.compile(r"  +")

def remove_accents(input_str):
    """
    Replace accented characters with their base characters.

    The text is decomposed (NFKD) and then encoded to ASCII ignoring errors,
    so combining marks and any character without an ASCII form are dropped.
    Source: https://stackoverflow.com/a/1207479
    """
    decomposed = unicodedata.normalize('NFKD', input_str)
    ascii_bytes = decomposed.encode("ascii", "ignore")
    return ascii_bytes.decode()

# Lemmatization is done with NLTK;
# it replaces, for example, plural words with their singular form.
lemmatizer = nltk.wordnet.WordNetLemmatizer()
LEMMATIZATION_BLACKLIST = set([
    "was",  # Not lemmatized: the result is confusing for this word
    "as",
])

# README spelling variants and the canonical form they map to.
# NOTE(review): not referenced by the code visible in this part of the
# notebook — confirm whether it is still needed.
WORD_REPLACEMENTS = {
    "read-me": "readme",
    "read.me": "readme",
    "readme.md": "readme"
}

# README written as one word or two ("read me"), optionally followed by an
# "md" token. It is applied after punctuation has been turned into spaces, so
# it also covers "read-me", "read.me", "read_me" and "readme.md". The word
# boundaries keep it from touching unrelated text such as "thread memory",
# which the previous unescaped "read.me" pattern merged into "threadmemory".
_README_VARIANTS = re.compile(r"\bread ?me(?: md)?\b")

def standardize(word):
    """
    Normalise a piece of text (a single word or a whole message).

    Steps, in order: lowercase, replace punctuation with spaces, unify README
    spellings, strip accents / non-ASCII characters, drop possessive "'s",
    collapse repeated spaces, trim, and lemmatize each token that is not in
    LEMMATIZATION_BLACKLIST.

    :param word: text to standardize
    :return: standardized text, tokens separated by single spaces
    """
    word = word.lower()  # Lowercase
    word = re.sub(STANDARDIZE_SUBSTITUTION, " ", word)  # Symbols become spaces
    word = re.sub(_README_VARIANTS, "readme", word)  # Unify README spellings
    word = remove_accents(word)  # Drop accents, combining glyphs and characters without an ASCII form

    # Remove possessives
    word = word.replace("'s", "")

    # Remove hashtags
    word = word.replace("#", "")

    # Collapse consecutive spaces
    word = re.sub(CONSECUTIVE_SPACE_SUBSTITUTION, " ", word)
    word = word.strip()

    # Lemmatize token by token, leaving blacklisted tokens untouched
    tokens = [
        token if token in LEMMATIZATION_BLACKLIST else lemmatizer.lemmatize(token)
        for token in word.split(" ")
    ]

    # Rebuild the text
    return " ".join(tokens)


In [None]:
standardize("thIs a TesT")

In [None]:
standardize("readme.md")

In [None]:
# Sample inputs that exercise the individual steps of standardize()
_standardize_test_words = [
    "Testing @here as#Dasd 333232 tests testings gItHUbs testing",  # tests -> test through lemmatization
    "Test_ing !!, (asd,as d).",  # Underscores are treated as word separators, although in practice this affects few texts
    "Test łłłł ñaññañ áaaa",  # Removal of accents and non-standard characters
    "テスト #asdasd arabian, wolves",
]

for word in _standardize_test_words:
    print(standardize(word))

### Remove commits made by bots

In [None]:
# Find all the commits authored by a bot.
from IPython.display import display
pd.set_option('display.max_rows', None)

# Raw string for the regex: "\[" is an invalid escape sequence in a normal
# string literal. na=False makes rows with a missing author/message count as
# "no match"; otherwise the mask contains NaN and boolean indexing fails.
cond_a = df_commits[df_commits["author"].str.contains(r"\[bot]|-bot", na=False)]
cond_c = df_commits[df_commits["message"].str.contains("dependabot", na=False)]
cond_d = df_commits[df_commits["message"].str.contains("renovatebot", na=False)]

unique_auth = cond_a["author"].unique()
unique_msg = cond_c["message"].unique()

# Author names identified as bots (used by the filtering cell below).
# The frames are built directly from literals so that the builtin `dict`
# is no longer shadowed by a variable of the same name.
df_bots = pd.DataFrame({'bots': unique_auth})

# displaying the bots names
display(df_bots)

# Distinct commit messages that mention dependabot (shown for inspection only)
df_bots_msg = pd.DataFrame({'bots_message': unique_msg})

display(df_bots_msg)

In [None]:
# Drop every commit whose author is in the bot list built above.
# The filtered frame keeps the original index labels (the index is not reset).
print("With bots", len(df_commits))
df_commits = df_commits[~df_commits["author"].isin(df_bots["bots"])]
print("Without bots", len(df_commits))

In [None]:
# Small sample: the last five commit messages, joined into one text
df_commits_small = df_commits.tail() # only last 5
print(df_commits_small.tail().message, "\n")
small_text_sample = " ".join(commit for commit in df_commits_small.message) # concatenate them
print("Concatenated text:", small_text_sample)
# The word cloud object is only configured in this cell; nothing is plotted yet
image_name = "first_word_cloud"
word_cloud = WordCloud(stopwords=stopwords, max_font_size=50, max_words=100, background_color="black")

In [None]:
# Word cloud of a single standardized commit message.
# .iloc[0] takes the first remaining row by position. The bot filter keeps the
# original index labels, so the label lookup ["message"][0] raised KeyError
# whenever the commit labelled 0 had been removed.
text_hyper_small = df_commits["message"].iloc[0]
text_hyper_small = standardize(text_hyper_small)

image_name = "first_word_cloud"
word_cloud = WordCloud(stopwords=stopwords, max_font_size=50, max_words=100, background_color="black")

plot_word_cloud(word_cloud, text_hyper_small)

In [None]:
# Word cloud over all remaining commit messages (joined as-is, without standardize())
all_text = " ".join(commit for commit in df_commits.message) # concatenate them
word_cloud = WordCloud(stopwords=stopwords, max_font_size=30, max_words=200, background_color="black")
plot_word_cloud(word_cloud, all_text)

In [None]:
# Use a mask and a color map

# Image credit: <a href='https://dryicons.com/icon/square-github-icon-8312'> Icon by Dryicons </a>
github_image = np.array(Image.open("./Images/wordclouds/github_square.png"))

# Convert the image into the mask array (zeros become 255)
word_cloud_mask = obtain_mask(github_image)
# The word cloud repeats words because it needs to fill the gaps using the correct size.
# print(len(all_text.split(' ')))
# print(len(pd.unique(all_text.split(' '))))

word_cloud = WordCloud(stopwords=stopwords, colormap='rainbow', mask=word_cloud_mask, max_font_size=30, max_words=10000, background_color="black")
# True / "git_mask": also save the result as ./Images/wordclouds/git_mask_word_cloud.png
plot_word_cloud(word_cloud, all_text, True, "git_mask")

In [None]:
# Same as the previous cell, with the git logo as mask and a different palette
git_image = np.array(Image.open("./Images/wordclouds/git.png"))

word_cloud_mask = obtain_mask(git_image)

word_cloud = WordCloud(stopwords=stopwords, mask=word_cloud_mask,colormap='hot', max_font_size=20, max_words=10000, background_color="#474747")
# Displayed only; this cloud is not saved to disk
plot_word_cloud(word_cloud, all_text)

In [None]:
# Re-create the single-message sample and the default word cloud settings.
# .iloc[0] takes the first remaining row by position. The bot filter keeps the
# original index labels, so the label lookup ["message"][0] raised KeyError
# whenever the commit labelled 0 had been removed.
text_hyper_small = df_commits["message"].iloc[0]
text_hyper_small = standardize(text_hyper_small)

image_name = "first_word_cloud"
word_cloud = WordCloud(
    stopwords=stopwords,
    max_font_size=50,
    max_words=100,
    background_color="black"
)

In [None]:
def count_words(df: pd.DataFrame):
    """
    Count, for every word, its total occurrences and the number of messages it appears in.

    The message text is read from the first column of ``df`` and passed through
    ``standardize``. A message whose standardized text contains any digit is
    skipped entirely.

    :param df: DataFrame whose first column holds the message text
    :return: Dictionary with the format {word: {n_ocur: value, n_messages: value}, ...}
    """
    # Using defaultdict for convenience; we won't have to add keys explicitly
    dicc = defaultdict(lambda: {"n_ocur": 0, "n_messages": 0})

    # Select the first column by position explicitly; `row[1][0]` relied on the
    # deprecated positional fallback of Series.__getitem__.
    for raw_text in df.iloc[:, 0]:
        text = standardize(raw_text)  # Apply standardization
        if any(char.isdigit() for char in text):
            continue
        # split() without an argument never yields empty strings, so a message
        # that standardizes to "" no longer registers "" as a word.
        words = text.split()
        # Times that a word appears and times that appears in different messages.
        for word in words:
            dicc[word]["n_ocur"] += 1
        for word in set(words):
            dicc[word]["n_messages"] += 1

    return dicc

# Single-column frame with the commit messages, as expected by count_words
frame = {'Messages': df_commits.message}

result = pd.DataFrame(frame)

# word -> {"n_ocur": ..., "n_messages": ...}
ocurrencesDict = count_words(result)

# Sort the words by occurrences and occurrences in messages.
def obtain_top_n_words(dictionary, N, filter="n_ocur", desc=True):
    """
    Return the first ``N`` non-stop-words ranked by one of their counters.

    :param dictionary: mapping word -> {"n_ocur": int, "n_messages": int}
    :param N: maximum number of entries to return
    :param filter: name of the counter to rank by ("n_ocur" or "n_messages")
    :param desc: True ranks from most to least frequent, False the opposite.
                 (This flag used to be ignored: the order was always descending.)
    :return: list of ``(word, count)`` tuples
    """
    # Get all the words and their frequency, leaving out stop words
    filtered_words = [(word, freq[filter]) for word, freq in dictionary.items() if word not in stopwords]

    # Order by the selected counter, honouring the requested direction
    sorted_words = sorted(filtered_words, key=lambda x: x[1], reverse=desc)

    # Take the first N
    return sorted_words[:N]

# Top 30 words by total occurrences and by number of messages containing them
top_words_list_ocur = obtain_top_n_words(ocurrencesDict, 30, "n_ocur", True)
top_words_list_msg = obtain_top_n_words(ocurrencesDict, 30, "n_messages", True)

print("Ocurrences Top", top_words_list_ocur)
print("Ocurrences per message Top", top_words_list_msg)

# Bar chart of (word, frequency) pairs drawn on a caller-supplied axes
def plot_top_words(topList, yLabel, ax):
    """
    Draw the entries of ``topList`` as bars on ``ax``.

    :param topList: list of ``(word, frequency)`` pairs, in display order
    :param yLabel: label for the Y axis
    :param ax: axes object the chart is drawn on
    """
    labels = [entry_word for entry_word, _ in topList]
    heights = [entry_freq for _, entry_freq in topList]

    # Bars first, then the texts around them
    ax.bar(labels, heights)
    ax.set_title("Top used words")

    # One tick per word, written vertically so the labels do not overlap
    ax.set_xticks(labels)
    ax.set_xticklabels(labels, rotation=90, ha='right')

    ax.set_xlabel("Word")
    ax.set_ylabel(yLabel)

    # plt.show() is left to the caller so that several axes can be shown together

# Create a figure with two side-by-side subplots
fig, axs = plt.subplots(1, 2, figsize=(15, 7))

# Left: top words by total occurrences
plot_top_words(top_words_list_ocur, "Occurrences of word", axs[0])

# Right: top words by number of distinct messages containing them
plot_top_words(top_words_list_msg, "Occurrences of word in different messages", axs[1])

# Adjust layout for better spacing
plt.tight_layout()

# Show the plots
plt.show()

# There is no significant difference between the two rankings, since a commit message does not normally repeat the same word twice.

### Use frequent itemsets to find groups of words that appear together.

In [None]:
# TODO, comment more

In [None]:
# Single-column frame with the (bot-filtered) commit messages
frame = {'Messages': df_commits.message}

result = pd.DataFrame(frame)
print(len(result))
result.head()

In [None]:
#https://rasbt.github.io/mlxtend/user_guide/frequent_patterns/apriori/#apriori-frequent-itemsets-via-the-apriori-algorithm
def generate_data_set(df: pd.DataFrame, rmStopWords = True, useStandard = True):
    """
    Turn commit messages into transactions for frequent-itemset mining.

    Each message (first column of ``df``) becomes one transaction whose items
    are the distinct words of the message. Words containing a digit are dropped.

    :param df: DataFrame whose first column holds the message text
    :param rmStopWords: when true, words present in ``stopwords`` are removed
    :param useStandard: when true, the text is passed through ``standardize`` first
    :return dataSet: list with one list of unique words per message
    """
    data_set = []
    # Select the first column by position explicitly; `row[1][0]` relied on the
    # deprecated positional fallback of Series.__getitem__.
    for text in df.iloc[:, 0]:
        if useStandard:
            text = standardize(text)  # Apply standardization

        # Distinct words of the message, leaving out any word that contains a digit
        unique_words = {word for word in text.split() if not any(char.isdigit() for char in word)}
        if rmStopWords:
            unique_words = unique_words - stopwords
        data_set.append(list(unique_words))

    return data_set
# Transactions built with the defaults: standardized text, stop words removed
data_set = generate_data_set(result)

In [None]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
import math
#!pip install mlxtend

def get_frequent_itemsets(data, min_supp = 0.01):
    """
    Mine frequent itemsets from ``data`` with the apriori algorithm.

    The transactions are one-hot encoded in sparse format to save memory,
    which is especially useful with many distinct items and small transactions.

    :param data: list of transactions, each a list of items
    :param min_supp: minimum support (fraction of transactions) an itemset needs
    :return: DataFrame returned by ``apriori`` plus a 'length' column holding
             the number of items in each itemset
    """
    encoder = TransactionEncoder()
    encoder.fit(data)
    onehot = encoder.transform(data, sparse=True)
    transactions = pd.DataFrame.sparse.from_spmatrix(onehot, columns=encoder.columns_)

    # total messages * support = number of messages in which the bundle appears.
    min_messages = math.ceil(len(data) * min_supp)
    print("We consider that at least the bundle appears in:", min_messages, " messages")

    # Run apriori and record the size of every itemset found
    itemsets = apriori(transactions, min_support=min_supp, use_colnames=True, verbose=1)
    itemsets['length'] = itemsets['itemsets'].apply(len)
    return itemsets

In [None]:
# Frequent itemsets of the standardized messages (default minimum support)
frequent_itemsets = get_frequent_itemsets(data_set)
# Sort by itemset length first and then by support, both descending
frequent_itemsets_sorted = frequent_itemsets.sort_values(by=['length', 'support'], ascending=[False, False])
print(frequent_itemsets_sorted)

In [None]:
def plot_itemsets(data, title):
    """
    Show a bar chart with the support (in percent) of each itemset.

    :param data: DataFrame with an 'itemsets' column and a 'support' column
    :param title: title of the chart
    """
    bar_labels = data['itemsets'].astype(str)
    support_percent = data['support'] * 100

    plt.figure(figsize=(10, 6))
    plt.bar(bar_labels, support_percent, color='skyblue')

    # Texts around the chart
    plt.title(title)
    plt.xlabel('Itemsets')
    plt.ylabel('Support %')

    # Tilt the itemset labels so they stay readable
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

In [None]:
# Single-word itemsets, most frequent first.
# .copy() yields an independent frame, so the column assignment below does not
# write into a slice of frequent_itemsets (which triggers pandas'
# SettingWithCopyWarning).
data_len_one = frequent_itemsets[(frequent_itemsets['length'] == 1)].copy()
# Readable label: the sorted items of the itemset as text
data_len_one['itemsets'] = data_len_one['itemsets'].apply(lambda x: str(sorted(list(x))))
data_len_one = data_len_one.sort_values(by=['support'], ascending=False)

plot_itemsets(data_len_one, 'Support of Frequent Itemsets of Length 1')

In [None]:
# Itemsets with two or more words, longest first.
# .copy() yields an independent frame, so the column assignment below does not
# write into a slice of frequent_itemsets (which triggers pandas'
# SettingWithCopyWarning).
data_group = frequent_itemsets[(frequent_itemsets['length'] >= 2)].copy()
# Readable label: the sorted items of the itemset as text
data_group['itemsets'] = data_group['itemsets'].apply(lambda x: str(sorted(list(x))))
data_group = data_group.sort_values(by=['length'], ascending=False)

plot_itemsets(data_group, 'Support of Frequent Itemsets from larger to smaller')

In [None]:
# TODO, maybe lower the minsupport even more to see strange connections

### Frequent items, but we want to also see the stopwords usage or the verbal tense used.

In [None]:
# Same analysis on the raw messages: no standardization and stop words kept
data_set_no_standard = generate_data_set(result, rmStopWords = False, useStandard = False)
# Note: this rebinds frequent_itemsets, replacing the standardized result
frequent_itemsets = get_frequent_itemsets(data_set_no_standard)
# Sort by itemset length first and then by support, both descending
frequent_itemsets_sorted = frequent_itemsets.sort_values(by=['length', 'support'], ascending=[False, False])
print(frequent_itemsets_sorted)

In [None]:
# TODO, maybe lower the minsupport even more to see strange connections

In [None]:
# Raw messages again, with the minimum support halved (0.005 instead of the 0.01 default)
data_set_no_standard = generate_data_set(result, rmStopWords = False, useStandard = False)
frequent_itemsets = get_frequent_itemsets(data_set_no_standard, min_supp = 0.005) # Warning: a support that is too low can exhaust the machine's resources and crash
# Sort by itemset length first and then by support, both descending
frequent_itemsets_sorted = frequent_itemsets.sort_values(by=['length', 'support'], ascending=[False, False])
print(frequent_itemsets_sorted)