# Data Generation from Scopus database

I created five tables, one for each AI subfield. Each subfield table contains the articles from both of its mapped conferences:

- AI (*scb_b_202401_venue_items_ai*)
  - AAAI
  - IJCAI
- CV (*scb_b_202401_venue_items_cv*)
  - CVPR
  - ICCV
- ML (*scb_b_202401_venue_items_ml*)
  - ICML
  - ICLR
- NLP (*scb_b_202401_venue_items_nlp*)
  - ACL
  - EMNLP
- WIR (*scb_b_202401_venue_items_wir*)
  - WSDM
  - SIGIR

**Calculate outgoing citations from *industry-funded* papers to industry-funded papers**

In [None]:
-- Deduplicated citation edge list (citing item -> cited item) used by the
-- citation-count queries below.
-- The cited column of the raw refs table also holds ids of works that are not
-- Scopus items. Their funding type cannot be determined, so only edges whose
-- cited id exists in the items table are kept.
CREATE TABLE unigjpwahle.unique_refs AS
SELECT DISTINCT
    src.item_id_citing,
    src.item_id_cited
FROM scp_b_202401.refs AS src
WHERE EXISTS (
    SELECT 1
    FROM scp_b_202401.items AS cited_item
    WHERE cited_item.item_id = src.item_id_cited
);

-- Outgoing citations per publication year from industry-funded papers of the
-- two selected AI venues (2018-2023) to industry-funded Scopus items.
--
-- Funding type used throughout:
--   'non-funded'   : the item has no row in funding_agencies_grants
--   'industry'     : at least one of its grants names an agency listed in
--                    unigjpwahle.extracted_funding_agencies
--   'non-industry' : funded, but by none of the listed agencies
WITH all_articles AS (
    -- all articles from 2018-2023 for the two selected AI venues
    SELECT DISTINCT item_id, pubyear
    FROM unigjpwahle.scb_b_202401_venue_items_ai
    WHERE pubyear IN (2018, 2019, 2020, 2021, 2022, 2023)
),
-- declare funding type for each article in scb_b_202401_venue_items_ai
article_funding_type AS (
    SELECT
        aa.item_id,
        aa.pubyear,
        CASE
            -- NOT EXISTS instead of a LEFT JOIN ... IS NULL test: the join
            -- emitted one row per grant, which only a later DISTINCT removed.
            WHEN NOT EXISTS (
                SELECT 1
                FROM scp_b_202401.funding_agencies_grants fg
                WHERE fg.item_id = aa.item_id
            ) THEN 'non-funded'
            WHEN EXISTS (
                SELECT 1
                FROM scp_b_202401.funding_agencies_grants fg2
                INNER JOIN unigjpwahle.extracted_funding_agencies efa ON fg2.funding_agency = efa.funding_agencies
                WHERE fg2.item_id = aa.item_id
            ) THEN 'industry'
            ELSE 'non-industry'
        END AS funding_type
    FROM all_articles aa
),
-- get all items that are industry funded in scb_b_202401_venue_items_ai
industry_funded AS (
    SELECT DISTINCT item_id, pubyear
    FROM article_funding_type
    WHERE funding_type = 'industry'
),
-- declare funding type for each article in scopus database (items table)
items_funding_type AS (
    SELECT
        aa.item_id,
        aa.pubyear,
        CASE
            WHEN NOT EXISTS (
                SELECT 1
                FROM scp_b_202401.funding_agencies_grants fg
                WHERE fg.item_id = aa.item_id
            ) THEN 'non-funded'
            WHEN EXISTS (
                SELECT 1
                FROM scp_b_202401.funding_agencies_grants fg2
                INNER JOIN unigjpwahle.extracted_funding_agencies efa ON fg2.funding_agency = efa.funding_agencies
                WHERE fg2.item_id = aa.item_id
            ) THEN 'industry'
            ELSE 'non-industry'
        END AS funding_type
    -- The alias aa is required: every column reference in this CTE is
    -- qualified with it.
    FROM scp_b_202401.items aa
),
-- get all items that are industry funded from scopus database
-- (use 'non-industry' or 'non-funded' here to count citations to the other funding types)
items AS (
    SELECT DISTINCT item_id, pubyear
    FROM items_funding_type
    WHERE funding_type = 'industry'
)
SELECT
    inf.pubyear,
    -- unique_refs rows are distinct (citing, cited) pairs; DISTINCT guards
    -- against a cited item appearing more than once in the items CTE.
    COUNT(DISTINCT r.*) AS citation_count
FROM unigjpwahle.unique_refs r
INNER JOIN industry_funded inf ON r.item_id_citing = inf.item_id -- references made by industry-funded venue articles
INNER JOIN items i ON r.item_id_cited = i.item_id -- keep only references to industry-funded scopus items
GROUP BY inf.pubyear
ORDER BY inf.pubyear;

**Calculate outgoing citations from *non-funded* papers to industry-funded papers**

In [None]:
-- Yearly number of references made by non-funded papers of the two AI venues
-- (2018-2023) to industry-funded Scopus items.
WITH all_articles AS (
    SELECT DISTINCT
        item_id,
        pubyear
    FROM unigjpwahle.scb_b_202401_venue_items_ai
    WHERE pubyear IN (2018, 2019, 2020, 2021, 2022, 2023)
),
-- Funding type of every venue article: 'non-funded' when it has no grant row,
-- 'industry' when any of its grants names an agency from
-- extracted_funding_agencies, otherwise 'non-industry'.
article_funding_type AS (
    SELECT
        art.item_id,
        art.pubyear,
        CASE
            WHEN fa.item_id IS NULL THEN 'non-funded'
            WHEN EXISTS (
                SELECT 1
                FROM scp_b_202401.funding_agencies_grants AS fa2
                INNER JOIN unigjpwahle.extracted_funding_agencies AS agency
                    ON fa2.funding_agency = agency.funding_agencies
                WHERE fa2.item_id = art.item_id
            ) THEN 'industry'
            ELSE 'non-industry'
        END AS funding_type
    FROM all_articles AS art
    LEFT JOIN scp_b_202401.funding_agencies_grants AS fa
        ON art.item_id = fa.item_id
),
-- Citing side: venue articles without any funding.
non_funded AS (
    SELECT DISTINCT
        item_id,
        pubyear
    FROM article_funding_type
    WHERE funding_type = 'non-funded'
),
-- Same classification, applied to every item of the Scopus items table.
items_funding_type AS (
    SELECT
        it.item_id,
        it.pubyear,
        CASE
            WHEN fa.item_id IS NULL THEN 'non-funded'
            WHEN EXISTS (
                SELECT 1
                FROM scp_b_202401.funding_agencies_grants AS fa2
                INNER JOIN unigjpwahle.extracted_funding_agencies AS agency
                    ON fa2.funding_agency = agency.funding_agencies
                WHERE fa2.item_id = it.item_id
            ) THEN 'industry'
            ELSE 'non-industry'
        END AS funding_type
    FROM scp_b_202401.items AS it
    LEFT JOIN scp_b_202401.funding_agencies_grants AS fa
        ON it.item_id = fa.item_id
),
-- Cited side: Scopus items of the target funding type.
-- Set the filter to 'industry' for industry-funded targets or to
-- 'non-industry' for non-industry targets.
items AS (
    SELECT DISTINCT
        item_id,
        pubyear
    FROM items_funding_type
    WHERE funding_type = 'industry'
)
SELECT
    nf.pubyear,
    COUNT(DISTINCT edge.*) AS citation_count
FROM unigjpwahle.unique_refs AS edge
INNER JOIN non_funded AS nf
    ON edge.item_id_citing = nf.item_id      -- references made by non-funded venue articles
INNER JOIN items AS target
    ON edge.item_id_cited = target.item_id   -- restricted to industry-funded Scopus items
GROUP BY nf.pubyear
ORDER BY nf.pubyear;

# Data Processing

#### Install necessary packages

In [None]:
# One-time setup: install the packages used by the cells below.
# Plain CRAN installs are batched; the relative order of the two special
# cases (profvis from source, bbplot from GitHub) is unchanged.
install.packages(c("devtools", "zoo", "proj4", "ggalt"))
# profvis is requested as a source package.
install.packages("profvis", type = "source")
# bbplot is installed from the bbc/bbplot GitHub repository via devtools.
devtools::install_github("bbc/bbplot")
install.packages(c("readxl", "lubridate", "wesanderson", "viridis",
                   "tidytext", "readr", "ggnewscale"))

#### Load necessary packages

In [None]:
# Attach the packages used by the data-processing and plotting cells.
# The attach order is left as is: a package attached later masks same-named
# functions of packages attached earlier.
library(readxl)
library(ggalt)
library(ggnewscale)
library(lubridate)
library(wesanderson)  # For color palettes
library(RColorBrewer)
library("viridis")
library(tidytext)
library(readr)
# Install pacman if it cannot be loaded, then load the remaining packages
# through pacman::p_load().
if(!require(pacman))install.packages("pacman")
pacman::p_load('dplyr', 'tidyr', 'gapminder',
               'ggplot2',
               'forcats', 'R.utils', 'png', 
               'grid', 'ggpubr', 'scales',
               'bbplot')
# NOTE(review): this updates installed packages without prompting every time
# the cell runs, so package versions can change between runs — confirm this is
# intended for a reproducible analysis.
update.packages(ask = FALSE, checkBuilt = TRUE)

#### Top five industry funders in AI research across key domains

In [None]:
# Top five industry funders per AI subfield, drawn as one bar panel per field
# and saved to ./figures/top_funding_agencies_plot_2.pdf.
csv_path <- "./data/funding_agencies_per_category.csv"
data <- read_csv(csv_path, show_col_types = FALSE)

# Data preprocessing
plot_data <- data %>% 
  # Denominator: total occurrences within each category.
  group_by(category) %>% 
  mutate(total_occ = sum(occurrences, na.rm = TRUE)) %>% 
  # Share of each row within its category, in percent.
  group_by(funding_agency) %>% 
  mutate(occ_per = (occurrences / total_occ)*100) %>% 
  # Keep the five largest shares per category (top_n keeps ties).
  group_by(category) %>%
  top_n(5, occ_per) %>%
  # Rank inside each category; `order` drives the bar order within a facet.
  arrange(desc(occurrences), .by_group = TRUE) %>%
  mutate(order = row_number()) %>%
  ungroup() %>%
  # Relabel "WIR" as "WIr" for display.
  mutate(category = ifelse(category == "WIR", "WIr", category)) %>% 
  mutate(category = factor(category, levels = c("AI", "CV", "ML", "NLP", "WIr")),
         funding_agency = fct_inorder(funding_agency))

# Plot data
subtitle_text <- "% of Industry Contribution per Field"
# NOTE(review): scale_fill_manual() needs one colour per funding_agency level
# and only eight are supplied — confirm the data never yields more agencies.
colors <- c("#0D0887", "#44039E", "#8004A8FF", "#A41F9AFF", "#BB3488FF", "#D45270FF", "#E4695EFF", "#FCA437FF")

# reorder_within() / scale_x_reordered() order the bars separately per facet.
p <- ggplot(plot_data, aes(x = reorder_within(funding_agency, order, category), y = occ_per, fill = funding_agency)) +
  geom_bar(stat = "identity", position = "dodge", width = 0.8) +
  facet_grid(~ category, scales = "free_x", space = "free_x") +
  labs(x = "Number of Papers Funded", y = "Occurrences") +
  labs(subtitle = subtitle_text) +
  theme_minimal() +
  scale_y_continuous(labels = scales::percent_format(scale = 1), breaks = seq(0, 30, by = 5),limits=c(0, 26), expand = c(0, 0)) +
  scale_fill_manual(values = colors) +
  bbc_style() +
  theme(plot.subtitle = element_text(size = 20),
        axis.text.x = ggplot2::element_text(margin=ggplot2::margin(12, b = 0), angle = 55, hjust = 1, size=16),
        # `linewidth` replaces the `size` argument of element_line(),
        # deprecated since ggplot2 3.4.0.
        axis.ticks = element_line(linewidth = 0.5, color = "black"),
        axis.ticks.length = unit(0.25, "cm"),
        axis.ticks.y = element_blank(),
        axis.line.x = element_line(colour = "#333333", linewidth = 1, linetype = "solid"),
        legend.position = "none",
        strip.placement = "inside",
        strip.text = element_text(size = 18, hjust = 0.5)) +
  scale_x_reordered()

ggsave("./figures/top_funding_agencies_plot_2.pdf", p, height = 6.67, width = 6.67)

print(p)

#### Industry funding per year

Funding percentage from 2018 to 2023

In [None]:
# Share of industry-funded papers per year, zoomed to 2018-2023, saved to
# ./figures/funding_percentage_overall_18_23.pdf.
csv_path <- "./data/industry_funding_time.csv"
data <- read_csv(csv_path, show_col_types = FALSE)

# Data processing
# reframe() keeps one row per input row here; only the columns named below
# survive. mean_funding and se_funding are single values computed over every
# row of the file and repeated on each row.
data <- data %>% 
  reframe(year = year,
          funding_percent = (industry_papers / total_papers)*100,
          mean_funding = mean(funding_percent, na.rm = TRUE),
          se_funding = sd(funding_percent, na.rm = TRUE) / sqrt(n()))

# Plot data
subtitle_text <- "% of Industry-Funded Papers Overall, 2018-2023"

p <- ggplot(data, aes(x = year, y = funding_percent)) +
  # Band of +/- one standard error around the yearly percentage.
  geom_ribbon(aes(ymin = funding_percent - se_funding, 
                  ymax = funding_percent + se_funding), 
              fill = "#150E39FF", alpha = 0.1) +
  geom_line(lwd=2, color = "#150E39FF") +
  # Zoom the x axis to the study period without dropping rows.
  coord_cartesian(xlim = c(2018, 2023)) +
  labs(x = "Year", y = "% of Industry Funded Papers", color = "Field") +
  labs(subtitle = subtitle_text) +
  theme_minimal() +
  scale_y_continuous(labels = scales::percent_format(scale = 1), limits = c(4, 12), expand =  c(0, 0)) +
  bbc_style() +
  theme(plot.subtitle = element_text(size = 19.4),
        axis.text.x = ggplot2::element_text(margin=ggplot2::margin(12, b = 0)),
        # `linewidth` replaces the `size` argument of element_line(),
        # deprecated since ggplot2 3.4.0.
        axis.ticks = element_line(linewidth = 0.5, color = "black"),
        axis.ticks.length = unit(0.25, "cm"),
        axis.ticks.y = element_blank(),
        axis.line.x = element_line(colour = "#333333", linewidth = 1, linetype = "solid"))

ggsave("./figures/funding_percentage_overall_18_23.pdf", p, height = 6.67, width = 6.67)

print(p)

Funding percentage from 1998 to 2023

In [None]:
# Share of industry-funded papers per year over the full range of the file,
# saved to ./figures/funding_percentage_overall_98_23.pdf.
csv_path <- "./data/industry_funding_time.csv"
data <- read_csv(csv_path, show_col_types = FALSE)

# Data processing
# reframe() keeps one row per input row here; only the columns named below
# survive. mean_funding and se_funding are single values computed over every
# row of the file and repeated on each row.
data <- data %>% 
  reframe(year = year,
          funding_percent = (industry_papers / total_papers)*100,
          mean_funding = mean(funding_percent, na.rm = TRUE),
          se_funding = sd(funding_percent, na.rm = TRUE) / sqrt(n()))

# Plot data
subtitle_text <- "% of Industry-Funded Papers Overall, 1998-2023"

p <- ggplot(data, aes(x = year, y = funding_percent)) +
  # Band of +/- one standard error around the yearly percentage.
  geom_ribbon(aes(ymin = funding_percent - se_funding, 
                  ymax = funding_percent + se_funding), 
              fill = "#150E39FF", alpha = 0.1) +
  geom_line(lwd=2, color = "#150E39FF") +
  labs(x = "Year", y = "% of Industry Funded Papers", color = "Field") +
  labs(subtitle = subtitle_text) +
  theme_minimal() +
  scale_y_continuous(labels = scales::percent_format(scale = 1), limits = c(0, 12), expand =  c(0, 0)) +
  scale_x_continuous(breaks = seq(1998, 2023, by = 5)) +
  bbc_style() +
  theme(plot.subtitle = element_text(size = 19.4),
        axis.text.x = ggplot2::element_text(margin=ggplot2::margin(12, b = 0)),
        # `linewidth` replaces the `size` argument of element_line(),
        # deprecated since ggplot2 3.4.0.
        axis.ticks = element_line(linewidth = 0.5, color = "black"),
        axis.ticks.length = unit(0.25, "cm"),
        axis.ticks.y = element_blank(),
        axis.line.x = element_line(colour = "#333333", linewidth = 1, linetype = "solid"))

ggsave("./figures/funding_percentage_overall_98_23.pdf", p, height = 6.67, width = 6.67)

print(p)

#### The percentage distribution of funding types overall and split by AI subfields

In [None]:
# Stacked horizontal bars with the percentage split of funding types
# (industry / non-industry / non-funded) per field plus an "Overall" bar,
# saved to ./figures/funding_percentage_funding_types.pdf.
csv_path <- "./data/funded_papers_per_conference.csv"
data <- read_csv(csv_path, show_col_types = FALSE)

# Data cleaning
# Sum the per-conference rows up to one row per field (Group).
# NOTE(review): the input columns end in `_percent` but are summed and later
# divided by total_papers, so they presumably hold paper counts — confirm.
data_field <- data %>%
  group_by(Group) %>%
  reframe(industry_funded_papers = sum(industry_funded_papers_percent, na.rm = TRUE),
          non_industry_funded_papers = sum(non_industry_funded_papers_percent, na.rm = TRUE),
          non_funded_paperst = sum(non_funded_papers_percent, na.rm = TRUE),
          total_papers = sum(total_papers, na.rm = TRUE))

# One extra row with the totals across all fields.
data_overall <- data_field %>% 
  reframe(industry_funded_papers = sum(industry_funded_papers, na.rm = TRUE),
          non_industry_funded_papers = sum(non_industry_funded_papers, na.rm = TRUE),
          non_funded_paperst = sum(non_funded_paperst, na.rm = TRUE),
          total_papers = sum(total_papers, na.rm = TRUE)) %>% 
  mutate(Group = "Overall")

# Factor levels follow row order: the fields first, "Overall" last.
data_field <- rbind(data_field, data_overall) %>% 
  mutate(Group = factor(Group, levels = unique(Group)))

# Convert the three sums to percentages of each group's total.
data_percent <- data_field %>%
  group_by(Group) %>%
  reframe(industry_funded_papers_percent = (industry_funded_papers / total_papers)*100,
          non_industry_funded_papers_percent = (non_industry_funded_papers / total_papers)*100,
          non_funded_papers_percent = (non_funded_paperst / total_papers)*100)

# Mean funded vs. non-funded percentage across groups (not used by the plot).
# NOTE(review): the object is named `mean`, like base::mean; the name is kept
# in case another cell reads it.
mean <- data_percent %>% 
  mutate(funded = industry_funded_papers_percent + non_industry_funded_papers_percent) %>%
  reframe(mean_funded = mean(funded, na.rm = TRUE),
          mean_non_funded_papers = mean(non_funded_papers_percent, na.rm = TRUE))

# Reshape the data to a longer format: one row per (Group, Type).
data_long <- data_percent %>%
  pivot_longer(cols = c(non_funded_papers_percent, non_industry_funded_papers_percent, industry_funded_papers_percent),
               names_to = "Type",
               values_to = "Percentage")

# Rename the values in the 'Type' column to be more readable
data_long$Type <- recode(data_long$Type, 
                         'non_funded_papers_percent' = 'Non-Funded', 
                         'non_industry_funded_papers_percent' = 'Non-Industry',
                         'industry_funded_papers_percent' = 'Industry')

data_long_AI <- data_long %>% filter(Group == "AI")
data_long_CV <- data_long %>% filter(Group == "CV")
data_long_ML <- data_long %>% filter(Group == "ML")
data_long_NLP <- data_long %>% filter(Group == "NLP")
data_long_WIr <- data_long %>% filter(Group == "WIr")
data_long_Overall <- data_long %>% filter(Group == "Overall")

# Plot
subtitle_text <- "% Distribution of Funding Types per Field"
colors <- c("#1B068DFF", "#B52F8CFF", "#F58C46FF")

# Each group is added as its own layer, in a fixed sequence. The sequence is
# kept exactly as written because it may influence the order of the groups on
# the discrete axis.
p <- ggplot() +
  geom_bar(data = data_long_WIr, aes(x = Percentage, y = Group, fill = Type), stat = "identity", position = "stack", width = 0.5) +
  geom_bar(data = data_long_NLP, aes(x = Percentage, y = Group, fill = Type), stat = "identity", position = "stack", width = 0.5) + 
  geom_bar(data = data_long_ML, aes(x = Percentage, y = Group, fill = Type), stat = "identity", position = "stack", width = 0.5) +  
  geom_bar(data = data_long_CV, aes(x = Percentage, y = Group, fill = Type), stat = "identity", position = "stack", width = 0.5) +  
  geom_bar(data = data_long_AI, aes(x = Percentage, y = Group, fill = Type), stat = "identity", position = "stack", width = 0.5) +       
  geom_bar(data = data_long_Overall, aes(x = Percentage, y = Group, fill = Type), stat = "identity", position = "stack", width = 0.5) + 
  # Rounded percentage labels, centred in each stacked segment.
  geom_text(data = data_long_WIr, aes(x = Percentage, y = Group, label = paste0(round(Percentage), "%"), group = Type),
            position = position_stack(vjust = 0.5), color = "white", size = 5.5) +
  geom_text(data = data_long_NLP, aes(x = Percentage, y = Group, label = paste0(round(Percentage), "%"), group = Type),
            position = position_stack(vjust = 0.5), color = "white", size = 5.5) +
  geom_text(data = data_long_ML, aes(x = Percentage, y = Group, label = paste0(round(Percentage), "%"), group = Type),
            position = position_stack(vjust = 0.5), color = "white", size = 5.5) +        
  geom_text(data = data_long_CV, aes(x = Percentage, y = Group, label = paste0(round(Percentage), "%"), group = Type),
            position = position_stack(vjust = 0.5), color = "white", size = 5.5) +    
  geom_text(data = data_long_AI, aes(x = Percentage, y = Group, label = paste0(round(Percentage), "%"), group = Type),
            position = position_stack(vjust = 0.5), color = "white", size = 5.5) +
  geom_text(data = data_long_Overall, aes(x = Percentage, y = Group, label = paste0(round(Percentage), "%"), group = Type),
            position = position_stack(vjust = 0.5), color = "white", size = 5.5) +
  labs(x = "Conferences", y = "Funding percentage (%)", fill="Funding Type") +
  labs(subtitle = subtitle_text) +
  theme_minimal() +
  scale_fill_manual(values = colors, breaks = c("Industry", "Non-Industry", "Non-Funded")) +
  scale_x_continuous(labels = scales::percent_format(scale = 1), limits = c(0, 104.3),breaks = seq(0, 100, by = 25), expand = c(0, 0)) +
  bbc_style() +
  theme(
    legend.position = "top",
    legend.justification='left',
    plot.subtitle = element_text(size = 20),
    axis.text.x = ggplot2::element_text(margin=ggplot2::margin(12, b = 0)),
    # `linewidth` replaces the `size` argument of element_line(),
    # deprecated since ggplot2 3.4.0.
    axis.ticks = element_line(linewidth = 0.5, color = "black"),
    axis.ticks.length = unit(0.25, "cm"),
    axis.ticks.y = element_blank(),
    axis.line.x = element_line(colour = "#333333", linewidth = 1, linetype = "solid"))

ggsave("./figures/funding_percentage_funding_types.pdf", p, height = 6.67, width = 6.67)

print(p)

#### % Proportion of Industry-Funded Papers per Field

In [None]:
# Each field's share of all industry-funded papers, as horizontal bars, saved
# to ./figures/funding_percentage_per_field.pdf.
csv_path <- "./data/funded_papers_over_time_field.csv"
data <- read_csv(csv_path, show_col_types = FALSE)

# Data processing
# Denominator: industry-funded papers summed over all rows of the file.
# (The variable name keeps its original spelling so other cells that may
# reference it still work.)
total_indunstry_articles <- data %>% 
  reframe(industry_funded_papers = sum(industry_funded_papers_percent, na.rm = TRUE))

data_clean <- data %>% 
  select(-non_industry_funded_papers_percent, -total, -Year) %>%
  # Sum over the years within each field ...
  group_by(Group) %>%
  reframe(industry_funded_papers_percent = sum(industry_funded_papers_percent, na.rm = TRUE)) %>% 
  # ... and express the sum as a percentage of the overall total.
  group_by(Group) %>%
  reframe(industry_funded_papers_percent = (industry_funded_papers_percent / total_indunstry_articles$industry_funded_papers)*100) %>% 
  mutate(Group = factor(Group, levels = unique(Group))) %>%
  # Ascending rank, used to order the bars.
  arrange(industry_funded_papers_percent) %>% 
  mutate(order = row_number())

subtitle_text <- "% Proportion of Industry-Funded Papers per Field"
colors <- c("AI" = "#1B068DFF", "CV"="#681C81FF", "ML"="#AE347BFF", "NLP"="#D84D3EFF", "WIr"="#FCB216FF")

p <- ggplot() +
  geom_bar(data = data_clean, aes(x = industry_funded_papers_percent, y = reorder(Group, order), fill = Group), stat = "identity", width = 0.5) +
  # Rounded percentage, right-aligned just inside the end of each bar.
  geom_text(data = data_clean, aes(x = industry_funded_papers_percent, label = paste0(round(industry_funded_papers_percent), "%"), y = Group, group = Group), hjust = 1, nudge_x = -.5, color = "white", size = 5.5) +
  labs(x = "Conferences", y = "% of Industry Funded Papers", fill="Funding Type") +
  labs(subtitle = subtitle_text) +
  theme_minimal() +
  scale_fill_manual(values = colors) +
  scale_x_continuous(labels = scales::percent_format(scale = 1), 
                     limits = c(0, 33),
                     breaks = seq(0, 30, by = 10),
                     expand = c(0, 0)) +
  bbc_style() +
  theme(plot.subtitle = element_text(size = 19.4),
        axis.text.x = ggplot2::element_text(margin=ggplot2::margin(12, b = 0)),
        # `linewidth` replaces the `size` argument of element_line(),
        # deprecated since ggplot2 3.4.0.
        axis.ticks = element_line(linewidth = 0.5, color = "black"),
        axis.ticks.length = unit(0.25, "cm"),
        axis.ticks.y = element_blank(),
        axis.line.x = element_line(colour = "#333333", linewidth = 1, linetype = "solid"),
        legend.position = "none")

ggsave("./figures/funding_percentage_per_field.pdf", p, height = 6.67, width = 6.67)

print(p)

#### % of Industry-Funded Papers per Field

In [None]:
# Industry-funded papers as a percentage of a fixed article total: one
# "Overall" bar next to a stacked "Fields" bar, saved to
# ./figures/funding_percentage_per_field_total.pdf.
csv_path <- "./data/funded_papers_over_time_field.csv"
data <- read_csv(csv_path, show_col_types = FALSE)

# Data processing
# Denominator for every percentage in this cell.
# NOTE(review): hard-coded; presumably the total number of articles in the
# analysed venues — confirm and keep in sync with the data.
total_articles_cag <- 57317

# Per-field percentage, largest first.
data_clean <- data %>% 
  select(-non_industry_funded_papers_percent, -total, -Year) %>%
  group_by(Group) %>%
  reframe(industry_funded_papers_percent = sum(industry_funded_papers_percent, na.rm = TRUE)) %>% 
  group_by(Group) %>%
  reframe(industry_funded_papers_percent = (industry_funded_papers_percent / total_articles_cag)*100) %>% 
  mutate(Group = factor(Group, levels = unique(Group))) %>%
  arrange(desc(industry_funded_papers_percent))

# Single-row percentage over all fields.
data_overall <- data %>% 
  select(-non_industry_funded_papers_percent, -total, -Year) %>%
  reframe(industry_funded_papers_percent = sum(industry_funded_papers_percent, na.rm = TRUE),
          industry_funded_papers_percent = (industry_funded_papers_percent / total_articles_cag)*100) %>% 
  mutate(Group = "Overall",
         Group = factor(Group, levels = unique(Group)))

subtitle_text <- "% of Industry-Funded Papers per Field"
colors <- c("AI" = "#1B068DFF", "CV"="#681C81FF", "ML"="#AE347BFF", "NLP"="#D84D3EFF", "WIr"="#FCB216FF")

p <- ggplot() +
  # "Overall" bar with its rounded percentage.
  geom_bar(data = data_overall, aes(x = Group, y = industry_funded_papers_percent), fill = "#150E39FF", stat = "identity", position = "identity", width = 0.7) + 
  geom_text(data = data_overall, aes(x = Group, label = paste0(round(industry_funded_papers_percent), "%"), y = industry_funded_papers_percent, group = Group), position = position_stack(vjust = 0.5), color = "white", size = 5.5) +
  # "Fields" bar: the per-field percentages stacked, labelled with the field name.
  geom_bar(data = data_clean, aes(x = "Fields", y = industry_funded_papers_percent, fill = fct_reorder(Group, industry_funded_papers_percent)), stat = "identity", width = 0.7) + 
  geom_text(data = data_clean, aes(x = "Fields", label = Group, y = industry_funded_papers_percent, group = fct_reorder(Group, industry_funded_papers_percent)), position = position_stack(vjust = 0.5), color = "white", size = 5.5) +
  # Arrows between the two bars at the running mid-point of each field's
  # percentage, annotated with the percentage.
  geom_segment(data = data_clean,
               aes(x = 1.35, xend = 1.65,
                   y = cumsum(industry_funded_papers_percent) - industry_funded_papers_percent/2,
                   yend = cumsum(industry_funded_papers_percent) - industry_funded_papers_percent/2),
               color = "#333333",
               # Stray empty argument removed from the original arrow() call.
               arrow = arrow(type = "closed", length = unit(0.1, "inches"))) +
  geom_text(data = data_clean,
            aes(x = 1.5, 
                y = cumsum(industry_funded_papers_percent) - industry_funded_papers_percent/2,
                label = paste0(round(industry_funded_papers_percent, 1), "%")),
            vjust = -0.5, size = 5, color = "#333333") +
  labs(x = "Conferences", y = "% of Industry Funded Papers", fill="Funding Type") +
  labs(subtitle = subtitle_text) +
  theme_minimal() +
  scale_fill_manual(values = colors) +
  scale_y_continuous(labels = scales::percent_format(scale = 1), 
                     limits = c(0, 10),
                     expand = c(0, 0)) +
  bbc_style() +
  theme(plot.subtitle = element_text(size = 20),
        axis.text.x = ggplot2::element_text(margin=ggplot2::margin(12, b = 0)),
        # `linewidth` replaces the `size` argument of element_line(),
        # deprecated since ggplot2 3.4.0.
        axis.ticks = element_line(linewidth = 0.5, color = "black"),
        axis.ticks.length = unit(0.25, "cm"),
        axis.ticks.y = element_blank(),
        axis.line.x = element_line(colour = "#333333", linewidth = 1, linetype = "solid"),
        legend.position = "none")

ggsave("./figures/funding_percentage_per_field_total.pdf", p, height = 6.67, width = 6.67)

print(p)

#### $FIFP$ from 2018 to 2023

In [None]:
# Yearly line per field of the normalised industry-funded share
# (industry_funded_papers_norm, scaled to percent), saved to
# ./figures/funding_percentage_per_field_plot.pdf.
csv_path <- "./data/funded_papers_over_time_field.csv"
data <- read_csv(csv_path, show_col_types = FALSE)

# Data processing
data_long <- data %>%
  select(Year, Group, industry_funded_papers_norm) %>%
  # Scale to percent and drop rows without a value.
  mutate(industry_funded_papers_norm = industry_funded_papers_norm * 100) %>% 
  filter(if_all(c("industry_funded_papers_norm"), ~!is.na(.))) %>% 
  pivot_longer(cols = c(industry_funded_papers_norm),
               names_to = "Type",
               values_to = "Percentage")

# Rename the values in the 'Type' column to be more readable
data_long$Type <- recode(data_long$Type, 
                         'industry_funded_papers_norm' = 'Industry Funded Papers')

# One data frame per field; each is drawn as its own line layer.
ai_df <- data_long %>% filter(Group == "AI")
cv_df <- data_long %>% filter(Group == "CV")
ml_df <- data_long %>% filter(Group == "ML")
nlp_df <- data_long %>% filter(Group == "NLP")
wir_df <- data_long %>% filter(Group == "WIr")

# Plot
subtitle_text <- "% of Industry-Funded Papers per Field, 2018-2023"

colors <- c("AI" = "#1B068DFF", 
            "CV" = "#681C81FF", 
            "ML" = "#AE347BFF",
            "NLP" = "#D84D3EFF", 
            "WIr" = "#FCB216FF", 
            "Overall" = "#150E39FF")

# The constant strings inside aes(color = ...) are legend keys that
# scale_color_manual() maps to the named colours above.
p <- ggplot() +    
  geom_line(data = ai_df, aes(x = Year, y = Percentage, color = "AI"), lwd=2) +
  geom_line(data = cv_df, aes(x = Year, y = Percentage, color = "CV"), lwd=2) + 
  geom_line(data = ml_df, aes(x = Year, y = Percentage, color = "ML"), lwd=2) + 
  geom_line(data = nlp_df, aes(x = Year, y = Percentage, color = "NLP"), lwd=2) + 
  geom_line(data = wir_df, aes(x = Year, y = Percentage, color = "WIr"), lwd=2) + 
  labs(x = "Year", y = "% of Industry Funded Papers", color = "Field") +
  labs(subtitle = subtitle_text) +
  theme_minimal() +
  scale_color_manual(values = colors) +
  scale_y_continuous(labels = scales::percent_format(scale = 1), limits = c(0, 4.1), expand = c(0, 0)) +
  bbc_style() +
  theme(
    legend.position = "top",
    legend.justification='left',
    plot.subtitle = element_text(size = 19.4),
    axis.text.x = ggplot2::element_text(margin=ggplot2::margin(12, b = 0)),
    # `linewidth` replaces the `size` argument of element_line(),
    # deprecated since ggplot2 3.4.0.
    axis.ticks = element_line(linewidth = 0.5, color = "black"),
    axis.ticks.length = unit(0.25, "cm"),
    axis.ticks.y = element_blank(),
    axis.line.x = element_line(colour = "#333333", linewidth = 1, linetype = "solid"))

ggsave("./figures/funding_percentage_per_field_plot.pdf", p, height = 6.67, width = 6.67)

print(p)

#### Citation Preference Ratio of AI, 2018-2023

In [None]:
# Citation preference ratio per year and cited funding type: the share of all
# citations going to a funding type divided by that type's share of papers.
# A value of 1 (dotted reference line) means citations are proportional to the
# paper share. Saved to ./figures/out_cit_overall_cpr.pdf.
csv_path <- "./data/industry_to_x_citations_per_year.csv"
data <- read_csv(csv_path, show_col_types = FALSE)

# Data processing
# Per year: citations received by each funding type (summed over the three
# citing types), all citations, and paper counts per funding type.
data_overall <- data %>% 
  group_by(Year) %>% 
  reframe(total_industry_citations = sum(total_industry_to_industry_citations, total_non_funded_to_industry_citations, total_non_industry_to_industry, na.rm = TRUE),
          total_non_industry_funded_citations = sum(total_industry_to_non_industry_funded_citations, total_non_funded_to_non_industry_funded_citations, total_non_industry_to_non_industry, na.rm = TRUE),
          total_non_funded_citations = sum(total_industry_to_non_funded_citations, total_non_funded_to_non_funded_citations, total_non_industry_to_non_funded, na.rm = TRUE),
          total_citations = sum(total_industry_citations, total_non_industry_funded_citations, total_non_funded_citations, na.rm = TRUE),
          total_industry_funded_papers = sum(total_industry_funded_papers, na.rm = TRUE),
          total_non_funded_papers = sum(total_non_funded_papers, na.rm = TRUE),
          total_non_industry_funded_papers = sum(total_non_industry_funded_papers, na.rm = TRUE),
          total_papers = sum(total_industry_funded_papers, total_non_funded_papers, total_non_industry_funded_papers, na.rm = TRUE))

# Proportion of papers per funding type
data_overall <- data_overall %>% 
  group_by(Year) %>%
  mutate(p_industry = (total_industry_funded_papers / total_papers),
         p_non_industry_funded = (total_non_industry_funded_papers / total_papers),
         p_non_funded = (total_non_funded_papers / total_papers))

# Citation Preference Ratio: observed citations / (all citations * paper share)
data_overall <- data_overall %>% 
  group_by(Year) %>%
  reframe(cbr_industry = total_industry_citations / (total_citations * p_industry),
          cbr_non_industry = total_non_industry_funded_citations / (total_citations * p_non_industry_funded),
          cbr_non_funded = total_non_funded_citations / (total_citations * p_non_funded))

# Plot
subtitle_text <- "Citation Preference Ratio of AI, 2018-2023"

colors <- c("Industry" = "#1B068DFF", 
            "Non-industry" = "#B52F8CFF",
            "Non-funded" = "#F58C46FF")

p <- ggplot() +
  geom_hline(yintercept = 1, color = "#333333", lwd=2, linetype="dotted") +
  geom_line(data=data_overall, aes(x = Year, y = cbr_industry, color="Industry"), lwd = 2) +
  geom_line(data=data_overall, aes(x = Year, y = cbr_non_industry, color="Non-industry"), lwd = 2) +
  geom_line(data=data_overall, aes(x = Year, y = cbr_non_funded, color="Non-funded"), lwd = 2) +
  labs(x = "Year",
       y = "Normalized Outgoing Citations") +
  theme_minimal() +
  labs(subtitle = subtitle_text) +
  scale_color_manual(values = colors, breaks = c("Industry", "Non-industry", "Non-funded"), labels = c("Industry", "Non-Industry", "Non-Funded")) +
  scale_y_continuous(limits = c(.6, 1.6), expand = c(0, 0)) +
  bbc_style() +
  theme(
    legend.position = "top",
    legend.justification='left',
    plot.subtitle = element_text(size = 20),
    axis.text.x = ggplot2::element_text(margin=ggplot2::margin(12, b = 0)),
    # `linewidth` replaces the `size` argument of element_line(),
    # deprecated since ggplot2 3.4.0.
    axis.ticks = element_line(linewidth = 0.5, color = "black"),
    axis.ticks.length = unit(0.25, "cm"),
    axis.ticks.y = element_blank(),
    axis.line.x = element_line(colour = "#333333", linewidth = 1, linetype = "solid"))

ggsave("./figures/out_cit_overall_cpr.pdf", p, height = 6.67, width = 6.67)

print(p)


#### % of Outgoing Citations to Industry, 2018-2023

In [None]:
# Average percentage of a paper's outgoing citations that go to industry-funded
# work, per citing funding type and year, plus the mean of the three series.
# Saved to ./figures/oc_indsutry.pdf.
csv_path <- "./data/outgoing_citation_data.csv"
data <- read_csv(csv_path, show_col_types = FALSE)

# Data processing
# Per citing paper: number of citation rows, how many of them point to
# industry-funded work, and the resulting percentage.
# NOTE(review): because citing_funding_type is passed through as a full
# column, reframe() returns one row per citation row, so each paper is
# repeated total_cit times and the mean below is weighted by reference count.
# Confirm this is intended; a per-paper mean would need one row per paper.
data_pro <- data %>% 
  group_by(citing_item_id, citing_pubyear) %>% 
  reframe(total_cit = n(),
          industry_cit = sum(cited_funding_type == "industry"),
          citing_funding_type = citing_funding_type,
          industry_per = (industry_cit / total_cit)*100) %>% 
  # Papers without any citation to industry-funded work are excluded.
  filter(industry_per != 0) %>% 
  group_by(citing_funding_type, citing_pubyear) %>%
  reframe(avg = mean(industry_per, na.rm = TRUE))

data_industry <- data_pro %>% 
  filter(citing_funding_type == "industry") %>% 
  mutate(funding_type = "Industry",
         funding_type = factor(funding_type))

data_non_industry <- data_pro %>% 
  filter(citing_funding_type == "non-industry") %>% 
  mutate(funding_type = "Non-Industry",
         funding_type = factor(funding_type))

data_non_funded <- data_pro %>%
  filter(citing_funding_type == "non-funded") %>% 
  mutate(funding_type = "Non-Funded",
         funding_type = factor(funding_type))

# Unweighted mean of the per-funding-type averages for each year.
data_avg <- data_pro %>% 
  group_by(citing_pubyear) %>% 
  reframe(avg = mean(avg, na.rm = TRUE),
          funding_type = "Avg.",
          funding_type = factor(funding_type))

# Plot
subtitle_text <- "% of Outgoing Citations to Industry, 2018-2023"

colors <- c("Avg." = "#150E39FF",
            "Industry" = "#1B068DFF", 
            "Non-Industry" = "#B52F8CFF",
            "Non-Funded" = "#F58C46FF")

p <- ggplot() +
  geom_line(data=data_industry, aes(x = citing_pubyear, y = avg, color="Industry"), lwd=2) +
  geom_line(data=data_non_industry, aes(x = citing_pubyear, y = avg, color="Non-Industry"), lwd=2) +
  geom_line(data=data_non_funded, aes(x = citing_pubyear, y = avg, color="Non-Funded"), lwd=2) +
  geom_line(data=data_avg, aes(x = citing_pubyear, y = avg, color="Avg."), lwd=2) +
  labs(x = "Year",
       y = "% of Intra-Field Citations") +
  labs(subtitle = subtitle_text) +
  theme_minimal() +
  scale_color_manual(values = colors, breaks = c("Avg.", "Industry", "Non-Industry", "Non-Funded"), labels = c("Avg.", "Industry", "Non-Industry", "Non-Funded")) +
  scale_y_continuous(labels = scales::percent_format(scale = 1), limits = c(8, 16), expand =  c(0, 0)) +
  bbc_style() +
  theme(
    legend.position = "top",
    legend.justification='left',
    plot.subtitle = element_text(size = 20),
    axis.text.x = ggplot2::element_text(margin=ggplot2::margin(12, b = 0)),
    # `linewidth` replaces the `size` argument of element_line(),
    # deprecated since ggplot2 3.4.0.
    axis.ticks = element_line(linewidth = 0.5, color = "black"),
    axis.ticks.length = unit(0.25, "cm"),
    axis.ticks.y = element_blank(),
    axis.line.x = element_line(colour = "#333333", linewidth = 1, linetype = "solid"))

ggsave("./figures/oc_indsutry.pdf", p, height = 6.67, width = 6.67)

print(p)

#### Outgoing Relative Citational Prominence ($ORCP$)

In [None]:
# Functions
# Share of a citing group's outgoing citations that go to one target group:
# citations_f_to_f divided by total_citations_f_to_all (vectorised).
X_function <- function(citations_f_to_f, total_citations_f_to_all) {
  share <- citations_f_to_f / total_citations_f_to_all
  share
}

# Mean share across citing funding types.
#
# cit_avg: SUM of the per-type shares (callers build it as
#          sum(industry_per, non_industry_per, non_funded_per)).
# n_types: number of funding types that were summed; defaults to 3
#          (industry, non-industry, non-funded), which reproduces the
#          previously hard-coded divisor.
# Returns cit_avg / n_types.
Y_function <- function(cit_avg, n_types = 3) {
  return(cit_avg / n_types)
}

# ORCP = (share of the citing group's citations that go to the target group)
#        minus (the value Y_function() derives from cit_avg).
# The result is a fraction, not a percentage.
calculate_ORCP <- function(citations_f_to_f, total_citations_f_to_all, cit_avg) {
  X_function(citations_f_to_f, total_citations_f_to_all) - Y_function(cit_avg)
}

# Load the citation-count table used by all three ORCP sections below
csv_path <- "./data/industry_to_x_citations_per_year.csv"
data <- read_csv(csv_path, show_col_types = FALSE)

# -----------------------------------------------------------------------------industry
# Data processing
# ORCP of INDUSTRY-funded citing papers towards each cited funding type.
# reframe() is applied to `data` without grouping, so every sum() below totals
# the listed column(s) over all rows and each frame ends up with a single row.

# Cited type = industry.
data_industry <- data %>%
  reframe(citations_industry_to_industry = sum(total_industry_to_industry_citations, na.rm = TRUE),
          total_citations_industry_to_all = sum(total_industry_to_industry_citations, total_industry_to_non_industry_funded_citations, total_industry_to_non_funded_citations, na.rm = TRUE),
          citations_non_industry_to_industry = sum(total_non_industry_to_industry, na.rm = TRUE),
          total_citations_non_industry_to_all = sum(total_non_industry_to_industry, total_non_industry_to_non_industry, total_non_industry_to_non_funded, na.rm = TRUE),
          citations_non_funded_to_industry = sum(total_non_funded_to_industry_citations, na.rm = TRUE),
          total_citations_non_funded_to_all = sum(total_non_funded_to_industry_citations, total_non_funded_to_non_industry_funded_citations, total_non_funded_to_non_funded_citations, na.rm = TRUE),
          # Share of each citing type's outgoing citations that go to industry-funded papers.
          industry_per = citations_industry_to_industry / total_citations_industry_to_all,
          non_industry_per = citations_non_industry_to_industry / total_citations_non_industry_to_all,
          non_funded_per = citations_non_funded_to_industry / total_citations_non_funded_to_all,
          # Sum (not mean) of the three shares; Y_function() divides it inside calculate_ORCP().
          cit_avg = sum(industry_per, non_industry_per, non_funded_per, na.rm = TRUE)) %>%
  # Industry's own share minus the average share (still a fraction here).
  mutate(ORCP = calculate_ORCP(citations_industry_to_industry, total_citations_industry_to_all, cit_avg),
         funding_type = "Industry")

# Cited type = non-industry.
data_non_industry <- data %>%
  reframe(citations_industry_to_non_industry = sum(total_industry_to_non_industry_funded_citations, na.rm = TRUE),
          total_citations_industry_to_all = sum(total_industry_to_industry_citations, total_industry_to_non_industry_funded_citations, total_industry_to_non_funded_citations, na.rm = TRUE),
          citations_non_industry_to_non_industry = sum(total_non_industry_to_non_industry, na.rm = TRUE),
          total_citations_non_industry_to_all = sum(total_non_industry_to_industry, total_non_industry_to_non_industry, total_non_industry_to_non_funded, na.rm = TRUE),
          citations_non_funded_to_non_industry = sum(total_non_funded_to_non_industry_funded_citations, na.rm = TRUE),
          total_citations_non_funded_to_all = sum(total_non_funded_to_industry_citations, total_non_funded_to_non_industry_funded_citations, total_non_funded_to_non_funded_citations, na.rm = TRUE),
          # Share of each citing type's outgoing citations that go to non-industry-funded papers.
          industry_per = citations_industry_to_non_industry / total_citations_industry_to_all,
          non_industry_per = citations_non_industry_to_non_industry / total_citations_non_industry_to_all,
          non_funded_per = citations_non_funded_to_non_industry / total_citations_non_funded_to_all,
          cit_avg = sum(industry_per, non_industry_per, non_funded_per, na.rm = TRUE)) %>%
  mutate(ORCP = calculate_ORCP(citations_industry_to_non_industry, total_citations_industry_to_all, cit_avg),
         funding_type = "Non-Industry")


# Cited type = non-funded.
data_non_funded <- data %>%
  reframe(citations_industry_to_non_funded = sum(total_industry_to_non_funded_citations, na.rm = TRUE),
          total_citations_industry_to_all = sum(total_industry_to_industry_citations, total_industry_to_non_industry_funded_citations, total_industry_to_non_funded_citations, na.rm = TRUE),
          citations_non_industry_to_non_funded = sum(total_non_industry_to_non_funded, na.rm = TRUE),
          total_citations_non_industry_to_all = sum(total_non_industry_to_industry, total_non_industry_to_non_industry, total_non_industry_to_non_funded, na.rm = TRUE),
           citations_non_funded_to_non_funded = sum(total_non_funded_to_non_funded_citations, na.rm = TRUE),
          total_citations_non_funded_to_all = sum(total_non_funded_to_industry_citations, total_non_funded_to_non_industry_funded_citations, total_non_funded_to_non_funded_citations, na.rm = TRUE),
          # Share of each citing type's outgoing citations that go to non-funded papers.
          industry_per = citations_industry_to_non_funded / total_citations_industry_to_all,
          non_industry_per = citations_non_industry_to_non_funded / total_citations_non_industry_to_all,
          non_funded_per = citations_non_funded_to_non_funded / total_citations_non_funded_to_all,
          cit_avg = sum(industry_per, non_industry_per, non_funded_per, na.rm = TRUE)) %>%
  mutate(ORCP = calculate_ORCP(citations_industry_to_non_funded, total_citations_industry_to_all, cit_avg),
         funding_type = "Non-Funded")

# Stack the three one-row frames, keep only label and ORCP, fix the factor
# level order used on the plot's y axis, and convert the fraction to percent.
plot_data <- bind_rows(data_industry, data_non_industry, data_non_funded) %>% 
  select(funding_type, ORCP) %>% 
  mutate(funding_type = factor(funding_type,levels = c("Non-Funded", "Non-Industry", "Industry")),
         ORCP = ORCP * 100)

# Plot
# Named palette keyed by the funding_type labels in plot_data.
colors <- c("Industry" = "#1B068DFF", 
            "Non-Industry" = "#B52F8CFF",
            "Non-Funded" = "#F58C46FF")

p <- ggplot() +
  # Bar from 0 to each ORCP value; a fixed grey colour is passed for the bar and
  # fully transparent colours ("#ffffff00") for both end points.
  geom_dumbbell(data = plot_data, aes(y = funding_type, x = ORCP, xend = 0, color = funding_type), size = 3, colour = "#dddddd", colour_x = "#ffffff00", colour_xend = "#ffffff00") +
  # Zero reference line (ORCP = 0 means "equal to the average share").
  geom_vline(xintercept = 0, color = "black", lwd = 1.6) +
  # Coloured marker at each ORCP value; legend suppressed for this layer.
  geom_point(
    data = plot_data,
    aes(x = ORCP, y = funding_type, color = funding_type),
    size = 4,
    show.legend = FALSE) +
  # NOTE(review): y is mapped to funding_type but titled "Field" - confirm the label.
  labs(x = "Difference from Average (%)", y = "Field") +
  labs(subtitle = "") +
  bbc_style() +
  guides(color = guide_legend(override.aes = list(shape = 16))) +
  scale_color_manual(values = colors) +
  # Fixed x range of -2..3 (ORCP is already in percent, so "%" is only appended).
  scale_x_continuous(limits = c(-2, 3), breaks = c(-2, -1, 0, 1, 2, 3), labels = function(x) paste0(x, "%")) +
  # "Above/Below average" captions and arrows are positioned above the last
  # category (y > nrow(plot_data)), i.e. outside the ylim set in
  # coord_cartesian(); clip = "off" there is what lets them be drawn.
  geom_text(aes(x = 0.3, y = nrow(plot_data) + 1.1, label = "Above average"), size = 18/.pt,
            vjust = 1, hjust = 0, color = "#777777") +
   geom_text(aes(x = -0.3, y = nrow(plot_data) + 1.1, label = "Below average"), size = 18/.pt,
            vjust = 1, hjust = 1, color = "#777777") +
  geom_segment(aes(x = 0.3, xend = 1.5, y = nrow(plot_data) + 1.2, yend = nrow(plot_data) + 1.2), color="#777777",
               arrow = arrow(length = unit(0.16, "cm"))) +
   geom_segment(aes(x = -0.3, xend = -1.5, y = nrow(plot_data) + 1.2, yend = nrow(plot_data) + 1.2), color="#777777",
               arrow = arrow(length = unit(0.16, "cm"))) +
  coord_cartesian(clip = "off", ylim = c(0.9, nrow(plot_data)+0.2)) +  # Adjusted ylim
  theme(
    plot.subtitle = element_text(hjust = -.2, vjust=8),
    panel.grid.major.x = element_line(colour = "#cccccc"),
    panel.grid.major.y = element_blank(),
    axis.ticks.y = element_blank(),
    legend.margin = margin(0, 0, 30, -550))

ggsave("./figures/ORCP_industry.pdf", p, height = 3.67, width = 6.67)

print(p)

# -----------------------------------------------------------------------------non-industry
# Data processing
# ORCP of NON-INDUSTRY-funded citing papers towards each cited funding type.
# The reframe() calls compute the same totals and shares as in the industry
# section; only the arguments passed to calculate_ORCP() differ (they use the
# non_industry_* numerator and denominator). These assignments overwrite the
# data_industry / data_non_industry / data_non_funded frames built above.

# Cited type = industry.
data_industry <- data %>%
  reframe(citations_industry_to_industry = sum(total_industry_to_industry_citations, na.rm = TRUE),
          total_citations_industry_to_all = sum(total_industry_to_industry_citations, total_industry_to_non_industry_funded_citations, total_industry_to_non_funded_citations, na.rm = TRUE),
          citations_non_industry_to_industry = sum(total_non_industry_to_industry, na.rm = TRUE),
          total_citations_non_industry_to_all = sum(total_non_industry_to_industry, total_non_industry_to_non_industry, total_non_industry_to_non_funded, na.rm = TRUE),
          citations_non_funded_to_industry = sum(total_non_funded_to_industry_citations, na.rm = TRUE),
          total_citations_non_funded_to_all = sum(total_non_funded_to_industry_citations, total_non_funded_to_non_industry_funded_citations, total_non_funded_to_non_funded_citations, na.rm = TRUE),
          # Share of each citing type's outgoing citations that go to industry-funded papers.
          industry_per = citations_industry_to_industry / total_citations_industry_to_all,
          non_industry_per = citations_non_industry_to_industry / total_citations_non_industry_to_all,
          non_funded_per = citations_non_funded_to_industry / total_citations_non_funded_to_all,
          # Sum (not mean) of the three shares; Y_function() divides it inside calculate_ORCP().
          cit_avg = sum(industry_per, non_industry_per, non_funded_per, na.rm = TRUE)) %>%
  # Non-industry's share towards industry minus the average share (a fraction).
  mutate(ORCP = calculate_ORCP(citations_non_industry_to_industry, total_citations_non_industry_to_all, cit_avg),
         funding_type = "Industry")

# Cited type = non-industry.
data_non_industry <- data %>%
  reframe(citations_industry_to_non_industry = sum(total_industry_to_non_industry_funded_citations, na.rm = TRUE),
          total_citations_industry_to_all = sum(total_industry_to_industry_citations, total_industry_to_non_industry_funded_citations, total_industry_to_non_funded_citations, na.rm = TRUE),
          citations_non_industry_to_non_industry = sum(total_non_industry_to_non_industry, na.rm = TRUE),
          total_citations_non_industry_to_all = sum(total_non_industry_to_industry, total_non_industry_to_non_industry, total_non_industry_to_non_funded, na.rm = TRUE),
          citations_non_funded_to_non_industry = sum(total_non_funded_to_non_industry_funded_citations, na.rm = TRUE),
          total_citations_non_funded_to_all = sum(total_non_funded_to_industry_citations, total_non_funded_to_non_industry_funded_citations, total_non_funded_to_non_funded_citations, na.rm = TRUE),
          # Share of each citing type's outgoing citations that go to non-industry-funded papers.
          industry_per = citations_industry_to_non_industry / total_citations_industry_to_all,
          non_industry_per = citations_non_industry_to_non_industry / total_citations_non_industry_to_all,
          non_funded_per = citations_non_funded_to_non_industry / total_citations_non_funded_to_all,
          cit_avg = sum(industry_per, non_industry_per, non_funded_per, na.rm = TRUE)) %>%
  mutate(ORCP = calculate_ORCP(citations_non_industry_to_non_industry, total_citations_non_industry_to_all, cit_avg),
         funding_type = "Non-Industry")


# Cited type = non-funded.
data_non_funded <- data %>%
  reframe(citations_industry_to_non_funded = sum(total_industry_to_non_funded_citations, na.rm = TRUE),
          total_citations_industry_to_all = sum(total_industry_to_industry_citations, total_industry_to_non_industry_funded_citations, total_industry_to_non_funded_citations, na.rm = TRUE),
          citations_non_industry_to_non_funded = sum(total_non_industry_to_non_funded, na.rm = TRUE),
          total_citations_non_industry_to_all = sum(total_non_industry_to_industry, total_non_industry_to_non_industry, total_non_industry_to_non_funded, na.rm = TRUE),
           citations_non_funded_to_non_funded = sum(total_non_funded_to_non_funded_citations, na.rm = TRUE),
          total_citations_non_funded_to_all = sum(total_non_funded_to_industry_citations, total_non_funded_to_non_industry_funded_citations, total_non_funded_to_non_funded_citations, na.rm = TRUE),
          # Share of each citing type's outgoing citations that go to non-funded papers.
          industry_per = citations_industry_to_non_funded / total_citations_industry_to_all,
          non_industry_per = citations_non_industry_to_non_funded / total_citations_non_industry_to_all,
          non_funded_per = citations_non_funded_to_non_funded / total_citations_non_funded_to_all,
          cit_avg = sum(industry_per, non_industry_per, non_funded_per, na.rm = TRUE)) %>%
  mutate(ORCP = calculate_ORCP(citations_non_industry_to_non_funded, total_citations_non_industry_to_all, cit_avg),
         funding_type = "Non-Funded")

# Stack the three one-row frames, keep only label and ORCP, fix the factor
# level order used on the plot's y axis, and convert the fraction to percent.
plot_data <- bind_rows(data_industry, data_non_industry, data_non_funded) %>% 
  select(funding_type, ORCP) %>% 
  mutate(funding_type = factor(funding_type,levels = c("Non-Funded", "Non-Industry", "Industry")),
         ORCP = ORCP * 100)

# Plot
# Named palette keyed by the funding_type labels in plot_data.
colors <- c("Industry" = "#1B068DFF", 
            "Non-Industry" = "#B52F8CFF",
            "Non-Funded" = "#F58C46FF")

p <- ggplot() +
  # Bar from 0 to each ORCP value; a fixed grey colour is passed for the bar and
  # fully transparent colours ("#ffffff00") for both end points.
  geom_dumbbell(data = plot_data, aes(y = funding_type, x = ORCP, xend = 0, color = funding_type), size = 3, colour = "#dddddd", colour_x = "#ffffff00", colour_xend = "#ffffff00") +
  # Zero reference line (ORCP = 0 means "equal to the average share").
  geom_vline(xintercept = 0, color = "black", lwd = 1.6) +
  # Coloured marker at each ORCP value; legend suppressed for this layer.
  geom_point(
    data = plot_data,
    aes(x = ORCP, y = funding_type, color = funding_type),
    size = 4,
    show.legend = FALSE) +
  # NOTE(review): y is mapped to funding_type but titled "Field" - confirm the label.
  labs(x = "Difference from Average (%)", y = "Field") +
  labs(subtitle = "") +
  bbc_style() +
  guides(color = guide_legend(override.aes = list(shape = 16))) +
  scale_color_manual(values = colors) +
  # Fixed x range of -3..4 here (the industry-section plot uses -2..3).
  scale_x_continuous(limits = c(-3, 4), breaks = c(-3, -2, -1, 0, 1, 2, 3, 4), labels = function(x) paste0(x, "%")) +
  # "Above/Below average" captions and arrows are positioned above the last
  # category (y > nrow(plot_data)), i.e. outside the ylim set in
  # coord_cartesian(); clip = "off" there is what lets them be drawn.
  geom_text(aes(x = 0.3, y = nrow(plot_data) + 1.1, label = "Above average"), size = 18/.pt,
            vjust = 1, hjust = 0, color = "#777777") +
   geom_text(aes(x = -0.3, y = nrow(plot_data) + 1.1, label = "Below average"), size = 18/.pt,
            vjust = 1, hjust = 1, color = "#777777") +
  geom_segment(aes(x = 0.3, xend = 1.5, y = nrow(plot_data) + 1.2, yend = nrow(plot_data) + 1.2), color="#777777",
               arrow = arrow(length = unit(0.16, "cm"))) +
   geom_segment(aes(x = -0.3, xend = -1.5, y = nrow(plot_data) + 1.2, yend = nrow(plot_data) + 1.2), color="#777777",
               arrow = arrow(length = unit(0.16, "cm"))) +
  coord_cartesian(clip = "off", ylim = c(0.9, nrow(plot_data)+0.2)) +  # Adjusted ylim
  theme(
    plot.subtitle = element_text(hjust = -.2, vjust=8),
    panel.grid.major.x = element_line(colour = "#cccccc"),
    panel.grid.major.y = element_blank(),
    axis.ticks.y = element_blank(),
    legend.margin = margin(0, 0, 30, -550))

ggsave("./figures/ORCP_non_industry.pdf", p, height = 3.67, width = 6.67)

print(p)

# -----------------------------------------------------------------------------non-funded
# Data processing
# ORCP of NON-FUNDED citing papers towards each cited funding type.
# The reframe() calls compute the same totals and shares as in the two sections
# above; only the arguments passed to calculate_ORCP() differ (they use the
# non_funded_* numerator and denominator). These assignments again overwrite
# data_industry / data_non_industry / data_non_funded.

# Cited type = industry.
data_industry <- data %>%
  reframe(citations_industry_to_industry = sum(total_industry_to_industry_citations, na.rm = TRUE),
          total_citations_industry_to_all = sum(total_industry_to_industry_citations, total_industry_to_non_industry_funded_citations, total_industry_to_non_funded_citations, na.rm = TRUE),
          citations_non_industry_to_industry = sum(total_non_industry_to_industry, na.rm = TRUE),
          total_citations_non_industry_to_all = sum(total_non_industry_to_industry, total_non_industry_to_non_industry, total_non_industry_to_non_funded, na.rm = TRUE),
          citations_non_funded_to_industry = sum(total_non_funded_to_industry_citations, na.rm = TRUE),
          total_citations_non_funded_to_all = sum(total_non_funded_to_industry_citations, total_non_funded_to_non_industry_funded_citations, total_non_funded_to_non_funded_citations, na.rm = TRUE),
          # Share of each citing type's outgoing citations that go to industry-funded papers.
          industry_per = citations_industry_to_industry / total_citations_industry_to_all,
          non_industry_per = citations_non_industry_to_industry / total_citations_non_industry_to_all,
          non_funded_per = citations_non_funded_to_industry / total_citations_non_funded_to_all,
          # Sum (not mean) of the three shares; Y_function() divides it inside calculate_ORCP().
          cit_avg = sum(industry_per, non_industry_per, non_funded_per, na.rm = TRUE)) %>%
  # Non-funded's share towards industry minus the average share (a fraction).
  mutate(ORCP = calculate_ORCP(citations_non_funded_to_industry, total_citations_non_funded_to_all, cit_avg),
         funding_type = "Industry")

# Cited type = non-industry.
data_non_industry <- data %>%
  reframe(citations_industry_to_non_industry = sum(total_industry_to_non_industry_funded_citations, na.rm = TRUE),
          total_citations_industry_to_all = sum(total_industry_to_industry_citations, total_industry_to_non_industry_funded_citations, total_industry_to_non_funded_citations, na.rm = TRUE),
          citations_non_industry_to_non_industry = sum(total_non_industry_to_non_industry, na.rm = TRUE),
          total_citations_non_industry_to_all = sum(total_non_industry_to_industry, total_non_industry_to_non_industry, total_non_industry_to_non_funded, na.rm = TRUE),
          citations_non_funded_to_non_industry = sum(total_non_funded_to_non_industry_funded_citations, na.rm = TRUE),
          total_citations_non_funded_to_all = sum(total_non_funded_to_industry_citations, total_non_funded_to_non_industry_funded_citations, total_non_funded_to_non_funded_citations, na.rm = TRUE),
          # Share of each citing type's outgoing citations that go to non-industry-funded papers.
          industry_per = citations_industry_to_non_industry / total_citations_industry_to_all,
          non_industry_per = citations_non_industry_to_non_industry / total_citations_non_industry_to_all,
          non_funded_per = citations_non_funded_to_non_industry / total_citations_non_funded_to_all,
          cit_avg = sum(industry_per, non_industry_per, non_funded_per, na.rm = TRUE)) %>%
  mutate(ORCP = calculate_ORCP(citations_non_funded_to_non_industry, total_citations_non_funded_to_all, cit_avg),
         funding_type = "Non-Industry")


# Cited type = non-funded.
data_non_funded <- data %>%
  reframe(citations_industry_to_non_funded = sum(total_industry_to_non_funded_citations, na.rm = TRUE),
          total_citations_industry_to_all = sum(total_industry_to_industry_citations, total_industry_to_non_industry_funded_citations, total_industry_to_non_funded_citations, na.rm = TRUE),
          citations_non_industry_to_non_funded = sum(total_non_industry_to_non_funded, na.rm = TRUE),
          total_citations_non_industry_to_all = sum(total_non_industry_to_industry, total_non_industry_to_non_industry, total_non_industry_to_non_funded, na.rm = TRUE),
           citations_non_funded_to_non_funded = sum(total_non_funded_to_non_funded_citations, na.rm = TRUE),
          total_citations_non_funded_to_all = sum(total_non_funded_to_industry_citations, total_non_funded_to_non_industry_funded_citations, total_non_funded_to_non_funded_citations, na.rm = TRUE),
          # Share of each citing type's outgoing citations that go to non-funded papers.
          industry_per = citations_industry_to_non_funded / total_citations_industry_to_all,
          non_industry_per = citations_non_industry_to_non_funded / total_citations_non_industry_to_all,
          non_funded_per = citations_non_funded_to_non_funded / total_citations_non_funded_to_all,
          cit_avg = sum(industry_per, non_industry_per, non_funded_per, na.rm = TRUE)) %>%
  mutate(ORCP = calculate_ORCP(citations_non_funded_to_non_funded, total_citations_non_funded_to_all, cit_avg),
         funding_type = "Non-Funded")

# Stack the three one-row frames, keep only label and ORCP, fix the factor
# level order used on the plot's y axis, and convert the fraction to percent.
plot_data <- bind_rows(data_industry, data_non_industry, data_non_funded) %>% 
  select(funding_type, ORCP) %>% 
  mutate(funding_type = factor(funding_type,levels = c("Non-Funded", "Non-Industry", "Industry")),
         ORCP = ORCP * 100)

# Plot
# Named palette keyed by the funding_type labels in plot_data.
colors <- c("Industry" = "#1B068DFF", 
            "Non-Industry" = "#B52F8CFF",
            "Non-Funded" = "#F58C46FF")

p <- ggplot() +
  # Bar from 0 to each ORCP value; a fixed grey colour is passed for the bar and
  # fully transparent colours ("#ffffff00") for both end points.
  geom_dumbbell(data = plot_data, aes(y = funding_type, x = ORCP, xend = 0, color = funding_type), size = 3, colour = "#dddddd", colour_x = "#ffffff00", colour_xend = "#ffffff00") +
  # Zero reference line (ORCP = 0 means "equal to the average share").
  geom_vline(xintercept = 0, color = "black", lwd = 1.6) +
  # Coloured marker at each ORCP value; legend suppressed for this layer.
  geom_point(
    data = plot_data,
    aes(x = ORCP, y = funding_type, color = funding_type),
    size = 4,
    show.legend = FALSE) +
  # NOTE(review): y is mapped to funding_type but titled "Field" - confirm the label.
  labs(x = "Difference from Average (%)", y = "Field") +
  labs(subtitle = "") +
  bbc_style() +
  guides(color = guide_legend(override.aes = list(shape = 16))) +
  scale_color_manual(values = colors) +
  # Fixed x range of -3..4 here (the industry-section plot uses -2..3).
 scale_x_continuous(limits = c(-3, 4), breaks = c(-3, -2, -1, 0, 1, 2, 3, 4), labels = function(x) paste0(x, "%")) +
  # "Above/Below average" captions and arrows are positioned above the last
  # category (y > nrow(plot_data)), i.e. outside the ylim set in
  # coord_cartesian(); clip = "off" there is what lets them be drawn.
  geom_text(aes(x = 0.3, y = nrow(plot_data) + 1.1, label = "Above average"), size = 18/.pt,
            vjust = 1, hjust = 0, color = "#777777") +
   geom_text(aes(x = -0.3, y = nrow(plot_data) + 1.1, label = "Below average"), size = 18/.pt,
            vjust = 1, hjust = 1, color = "#777777") +
  geom_segment(aes(x = 0.3, xend = 1.5, y = nrow(plot_data) + 1.2, yend = nrow(plot_data) + 1.2), color="#777777",
               arrow = arrow(length = unit(0.16, "cm"))) +
   geom_segment(aes(x = -0.3, xend = -1.5, y = nrow(plot_data) + 1.2, yend = nrow(plot_data) + 1.2), color="#777777",
               arrow = arrow(length = unit(0.16, "cm"))) +
  coord_cartesian(clip = "off", ylim = c(0.9, nrow(plot_data)+0.2)) +  # Adjusted ylim
  theme(
    plot.subtitle = element_text(hjust = -.2, vjust=8),
    panel.grid.major.x = element_line(colour = "#cccccc"),
    panel.grid.major.y = element_blank(),
    axis.ticks.y = element_blank(),
    legend.margin = margin(0, 0, 30, -550))

ggsave("./figures/ORCP_non_funded.pdf", p, height = 3.67, width = 6.67)

print(p)

#### Outgoing Citation Fields

In [None]:
# Per-funding-type occurrence counts of cited fields (class_name / occurence).
csv_path <- "./data/thesis_excel/csv/top_10_fields.csv"
data <- read_csv(csv_path, show_col_types = FALSE)

# ------------------------------------------------------------------------------industry
# Data processing
# Percentage of occurrences per field among industry-funded rows, reduced to
# the ten largest. reframe() keeps only the three columns named below.
data_industry <- data %>%
  filter(funding_type == "industry") %>%
  reframe(class_name = class_name,
          total_occ = sum(occurence, na.rm = TRUE),
          occurence_per = (occurence / total_occ) * 100) %>%
  # Ascending order defines the factor levels, i.e. the bottom-to-top order on
  # the plot's y axis.
  arrange(occurence_per) %>%
  mutate(class_name = factor(class_name, levels = unique(class_name))) %>%
  # with_ties = FALSE guarantees at most 10 rows, matching the 10-colour
  # palette used by the plot; the default could return more rows on a tie.
  slice_max(n = 10, order_by = occurence_per, with_ties = FALSE)

# Cumulative share of the four and the ten largest fields. slice_max() returns
# rows in descending order, so head() takes the largest ones.
# (The former `[0:4]` index selected the same elements 1-4, because R ignores index 0.)
sum(head(data_industry$occurence_per, 4))
sum(head(data_industry$occurence_per, 10))

# Percentage of occurrences per field among non-industry-funded rows, reduced
# to the ten largest. reframe() keeps only the three columns named below.
data_non_industry <- data %>%
  filter(funding_type == "non-industry") %>%
  reframe(class_name = class_name,
          total_occ = sum(occurence, na.rm = TRUE),
          occurence_per = (occurence / total_occ) * 100) %>%
  # Ascending order defines the factor levels (y-axis order on the plot).
  arrange(occurence_per) %>%
  mutate(class_name = factor(class_name, levels = unique(class_name))) %>%
  # with_ties = FALSE guarantees at most 10 rows, matching the 10-colour
  # palette used by the plot; the default could return more rows on a tie.
  slice_max(n = 10, order_by = occurence_per, with_ties = FALSE)

# Cumulative share of the four and the ten largest fields.
# (The former `[0:4]` index selected the same elements 1-4, because R ignores index 0.)
sum(head(data_non_industry$occurence_per, 4))
sum(head(data_non_industry$occurence_per, 10))

# Percentage of occurrences per field among non-funded rows, reduced to the
# ten largest. reframe() keeps only the three columns named below.
data_non_funded <- data %>%
  filter(funding_type == "non-funded") %>%
  reframe(class_name = class_name,
          total_occ = sum(occurence, na.rm = TRUE),
          occurence_per = (occurence / total_occ) * 100) %>%
  # Ascending order defines the factor levels (y-axis order on the plot).
  arrange(occurence_per) %>%
  mutate(class_name = factor(class_name, levels = unique(class_name))) %>%
  # with_ties = FALSE guarantees at most 10 rows, matching the 10-colour
  # palette used by the plot; the default could return more rows on a tie.
  slice_max(n = 10, order_by = occurence_per, with_ties = FALSE)

# Cumulative share of the four and the ten largest fields.
# (The former `[0:4]` index selected the same elements 1-4, because R ignores index 0.)
sum(head(data_non_funded$occurence_per, 4))
sum(head(data_non_funded$occurence_per, 10))

#Plot
# Ten colours, one per field level; shared by the three field plots.
colors <- c("#FCA437FF", "#F1804DFF", "#E4695EFF", "#D45270FF", "#BB3488FF", "#A41F9AFF", "#8004A8FF", "#6F00A8", "#44039E", "#0D0887")

p <- ggplot() +
  # Bar from 0 to each percentage; fixed grey bar, fully transparent end points.
  geom_dumbbell(data = data_industry, aes(y = class_name, x = occurence_per, xend = 0, color = funding_type), size = 3, colour = "#dddddd", colour_x = "#ffffff00", colour_xend = "#ffffff00") +
  geom_vline(xintercept = 0, color = "black", lwd = 1.6) +
  ggnewscale::new_scale_color() +
  # Coloured marker per field; legend suppressed for this layer.
  geom_point(
    data = data_industry,
    aes(x = occurence_per, y = class_name, color = class_name),
    size = 4,
    show.legend = FALSE) +
  labs(x = "Percentage",
       y = "Field",
       color = "Funding Type") +
  theme_minimal() +
  scale_color_manual(values = colors) +
  # Keep 0 and 15 inside the x range without dropping larger values.
  # The former xlim(0, 15) was removed: the scale_x_continuous() below replaced
  # that scale (ggplot2 prints a "Scale for x is already present" message), so
  # it never took effect.
  expand_limits(x = c(0, 15)) +
  scale_x_continuous(breaks = c(0, 5, 10, 15), labels = function(x) paste0(x, "%")) +
  coord_cartesian(
    clip = "off",
    ylim = c(0.5, nrow(data_industry) + 0.5)
  ) +
  bbc_style() +
  theme(
    legend.position = "top",
    legend.justification = "left",
    plot.subtitle = element_text(size = 20),
    plot.margin = margin(0, 0, 0, 0, "cm"),
    panel.grid.major.x = element_line(colour = "#cccccc"),
    panel.grid.major.y = element_blank(),
    axis.ticks.y = element_blank())

ggsave("./figures/citing_field_industry.pdf", p, height = 8.5, width = 8.5)

print(p)

# ------------------------------------------------------------------------------non-industry
# Same chart as the industry field plot, drawn from data_non_industry and
# reusing the `colors` palette defined for that plot.
p <- ggplot() +
  # Bar from 0 to each percentage; fixed grey bar, fully transparent end points.
  geom_dumbbell(data = data_non_industry, aes(y = class_name, x = occurence_per, xend = 0, color = funding_type), size = 3, colour = "#dddddd", colour_x = "#ffffff00", colour_xend = "#ffffff00") +
  geom_vline(xintercept = 0, color = "black", lwd = 1.6) +
  ggnewscale::new_scale_color() +
  # Coloured marker per field; legend suppressed for this layer.
  geom_point(
    data = data_non_industry,
    aes(x = occurence_per, y = class_name, color = class_name),
    size = 4,
    show.legend = FALSE) +
  labs(x = "Percentage",
       y = "Field",
       color = "Funding Type") +
  theme_minimal() +
  scale_color_manual(values = colors) +
  # Keep 0 and 15 inside the x range without dropping larger values.
  # The former xlim(0, 15) was removed: the scale_x_continuous() below replaced
  # that scale, so it never took effect.
  expand_limits(x = c(0, 15)) +
  scale_x_continuous(breaks = c(0, 5, 10, 15), labels = function(x) paste0(x, "%")) +
  coord_cartesian(
    clip = "off",
    # Sized from the frame that is actually plotted (was nrow(data_industry)).
    ylim = c(0.5, nrow(data_non_industry) + 0.5)
  ) +
  bbc_style() +
  theme(
    legend.position = "top",
    legend.justification = "left",
    plot.subtitle = element_text(size = 20),
    plot.margin = margin(0, 0, 0, 0, "cm"),
    panel.grid.major.x = element_line(colour = "#cccccc"),
    panel.grid.major.y = element_blank(),
    axis.ticks.y = element_blank())

ggsave("./figures/citing_field_non_industry.pdf", p, height = 8.5, width = 8.5)

print(p)

# ------------------------------------------------------------------------------non-funded
# Same chart as the industry field plot, drawn from data_non_funded and
# reusing the `colors` palette defined for that plot.
p <- ggplot() +
  # Bar from 0 to each percentage; fixed grey bar, fully transparent end points.
  geom_dumbbell(data = data_non_funded, aes(y = class_name, x = occurence_per, xend = 0, color = funding_type), size = 3, colour = "#dddddd", colour_x = "#ffffff00", colour_xend = "#ffffff00") +
  geom_vline(xintercept = 0, color = "black", lwd = 1.6) +
  ggnewscale::new_scale_color() +
  # Coloured marker per field; legend suppressed for this layer.
  geom_point(
    data = data_non_funded,
    aes(x = occurence_per, y = class_name, color = class_name),
    size = 4,
    show.legend = FALSE) +
  labs(x = "Percentage",
       y = "Field",
       color = "Funding Type") +
  theme_minimal() +
  scale_color_manual(values = colors) +
  # Keep 0 and 15 inside the x range without dropping larger values.
  # The former xlim(0, 15) was removed: the scale_x_continuous() below replaced
  # that scale, so it never took effect.
  expand_limits(x = c(0, 15)) +
  scale_x_continuous(breaks = c(0, 5, 10, 15), labels = function(x) paste0(x, "%")) +
  coord_cartesian(
    clip = "off",
    # Sized from the frame that is actually plotted (was nrow(data_industry)).
    ylim = c(0.5, nrow(data_non_funded) + 0.5)
  ) +
  bbc_style() +
  theme(
    legend.position = "top",
    legend.justification = "left",
    plot.subtitle = element_text(size = 20),
    plot.margin = margin(0, 0, 0, 0, "cm"),
    panel.grid.major.x = element_line(colour = "#cccccc"),
    panel.grid.major.y = element_blank(),
    axis.ticks.y = element_blank())

ggsave("./figures/citing_field_non_funded.pdf", p, height = 8.5, width = 8.5)

print(p)

Outgoing Citation Fields as Sankey charts

In [None]:
!pip install -r requirements.txt

In [None]:
import pandas as pd
import plotly.graph_objects as go
from plotly.io import write_image

# Load the per-funding-type field table
df = pd.read_csv('./figures/top_10_fields.csv')

# Fail early when any column used further down is absent
required_columns = ['funding_type', 'class_name', 'occurence', 'occurence_per']
if any(col not in df.columns for col in required_columns):
    raise ValueError(f"CSV file must contain the following columns: {', '.join(required_columns)}")

# Raw funding_type value -> label shown on the Sankey source node
funding_type_mapping = dict([
    ('industry', 'Industry Funded AI Paper'),
    ('non-industry', 'Non-Industry Funded AI Paper'),
    ('non-funded', 'Non-Funded AI Paper'),
])

# Node colours: one per source-node label, then up to ten for the target nodes
funding_type_colors = {
    "Industry Funded AI Paper": "#440154",
    "Non-Industry Funded AI Paper": "#3E4A89",
    "Non-Funded AI Paper": "#1F9E89",
}
class_name_colors = [
    "#0D0887", "#44039E", "#6F00A8", "#9512A1", "#B6308B",
    "#D14E72", "#E76E5B", "#F79044", "#FEB72D", "#F0F921",
]

def create_sankey_diagram(funding_type_data, funding_type):
    """Build a single-source Sankey figure for one funding type.

    Args:
        funding_type_data: DataFrame rows for one funding type. The columns
            'class_name', 'occurence' and 'occurence_per' are read; one target
            node is created per row, in row order.
        funding_type: Label of the source node. It must be a key of the
            module-level ``funding_type_colors`` dict.

    Returns:
        A ``plotly.graph_objects.Figure`` holding the Sankey trace and layout.
    """
    # Node 0 is the funding type; nodes 1..n are the target fields.
    node_labels = [funding_type]
    source = []
    target = []
    value = []
    link_labels = []

    for node_index, (_, row) in enumerate(funding_type_data.iterrows(), start=1):
        source.append(0)  # every link starts at the single source node
        target.append(node_index)
        value.append(row['occurence_per'])
        # NOTE(review): "k" is appended to the raw 'occurence' value without any
        # scaling - confirm that the column is already expressed in thousands.
        link_label = f"{row['class_name']} ({row['occurence']:,.1f}k, {row['occurence_per']:.1f}%)"
        link_labels.append(link_label)
        node_labels.append(link_label)

    # Source colour first, then one palette colour per target node.
    node_colors = [funding_type_colors[funding_type]] + class_name_colors[:len(funding_type_data)]

    fig = go.Figure(data=[go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=node_labels,
            color=node_colors
        ),
        link=dict(
            arrowlen=8,
            source=source,
            target=target,
            value=value,
            label=link_labels,
            color='rgba(200, 200, 200, 0.5)'  # semi-transparent gray
        ))])

    fig.update_layout(
        font_size=15,
        paper_bgcolor='white',  # opaque white figure background
        plot_bgcolor='white',   # opaque white plot area
        annotations=[
            # Caption centred below the chart.
            dict(
                x=0.5,
                y=-0.082,
                showarrow=False,
                text="Outgoing Citation Fields",
                xref="paper",
                yref="paper",
                font=dict(size=16),
                xanchor='center',
                yanchor='top'
            ),
            # Text-less annotation used only for its arrow, drawn below the caption.
            dict(
                x=0.628,  # arrow head position (paper coordinates)
                y=-0.15,
                showarrow=True,
                text="",
                xref="paper",
                yref="paper",
                arrowcolor="lightgrey",
                ax=-189,  # tail sits 189 px left of the head, so the arrow points right
                ay=0,     # no vertical offset: horizontal arrow
                arrowwidth=3,
                arrowhead=2,
                arrowsize=1,
                xanchor='center',
                yanchor='top'
            )
        ],
        width=900,
        height=600
    )

    # The former fig.update_traces(...) call was dropped: it re-applied exactly
    # the node/link values already passed to go.Sankey above.
    return fig

# Function to create and save the diagram for a specific funding type
def create_and_display_diagram(funding_type, diagram_title=None):
    """Render the Sankey diagram for one funding type and save it as a PNG.

    Despite the name, the figure is only written to disk; it is not shown.

    Args:
        funding_type: Raw value of the 'funding_type' column; must be a key of
            ``funding_type_mapping``.
        diagram_title: File name stem for the output image. Defaults to
            ``"<funding_type>_cit_fields"`` when empty or None.
    """
    renamed_funding_type = funding_type_mapping[funding_type]
    # Largest share first, so the target nodes are listed in descending order.
    funding_type_data = df[df['funding_type'] == funding_type].sort_values('occurence_per', ascending=False)
    fig = create_sankey_diagram(funding_type_data, renamed_funding_type)
    if not diagram_title:
        diagram_title = f"{funding_type}_cit_fields"
    # f-string: without the prefix every diagram was written to the literal
    # file "./figures/{diagram_title}.png" and overwrote the previous one.
    output_path = f"./figures/{diagram_title}.png"
    write_image(fig, output_path, scale=2.5)

# Render one diagram per funding type, in the same order as before
for _funding_type in ('industry', 'non-industry', 'non-funded'):
    create_and_display_diagram(_funding_type)

#### % age of Citations per Funding Type

In [None]:
# Load the citation counts. Columns used below: cited_item_id,
# cited_funding_type, age_of_citation, citations_at_age.
csv_path <- "./data/industry_to_x_citations_per_year.csv"
data <- read_csv(csv_path, show_col_types = FALSE)

# Step 1: Calculate total citations for each paper
# summarise() replaces reframe() here: reframe() has no `.groups` argument, so
# `.groups = "drop"` was silently stored as a constant column named `.groups`.
# na.rm = TRUE keeps these totals consistent with the per-age sums further
# down, which all ignore missing counts.
df_total_citations <- data %>%
  group_by(cited_item_id, cited_funding_type) %>%
  summarise(total_citations = sum(citations_at_age, na.rm = TRUE),
            .groups = "drop")

# Step 2: Join total citations back to original data and calculate percentages
df_citation_per <- data %>%
  left_join(df_total_citations, by = c("cited_item_id", "cited_funding_type"))

# Grand totals per funding type (each a 1x1 tibble); they are the denominators
# of the percentage calculations that follow.
industry_total <- df_total_citations %>%
  filter(cited_funding_type == "industry") %>%
  reframe(total_citations = sum(total_citations, na.rm = TRUE))

non_funded_total <- df_total_citations %>%
  filter(cited_funding_type == "non-funded") %>%
  reframe(total_citations = sum(total_citations, na.rm = TRUE))

non_industry_total <- df_total_citations %>%
  filter(cited_funding_type == "non-industry") %>%
  reframe(total_citations = sum(total_citations, na.rm = TRUE))

# Denominator for the "Overall" curve: all funding types together.
overall_total <- df_total_citations %>%
  reframe(total_citations = sum(total_citations, na.rm = TRUE))

# Step 3: Percentage of citations received at each citation age.
# For every funding type, the citations in an age bucket are summed and divided
# by that funding type's grand total. Despite the `avg_` prefix there is exactly
# one value per (funding type, age) group, so no averaging takes place.

df_industy_avrg <- df_citation_per %>%
  filter(cited_funding_type == "industry") %>%
  group_by(cited_funding_type, age_of_citation) %>%
  reframe(
    avg_citation_percentage =
      (sum(citations_at_age, na.rm = TRUE) / industry_total$total_citations) * 100
  )

df_non_funded_avrg <- df_citation_per %>%
  filter(cited_funding_type == "non-funded") %>%
  group_by(cited_funding_type, age_of_citation) %>%
  reframe(
    avg_citation_percentage =
      (sum(citations_at_age, na.rm = TRUE) / non_funded_total$total_citations) * 100
  )

df_non_industry_funded_avrg <- df_citation_per %>%
  filter(cited_funding_type == "non-industry") %>%
  group_by(cited_funding_type, age_of_citation) %>%
  reframe(
    avg_citation_percentage =
      (sum(citations_at_age, na.rm = TRUE) / non_industry_total$total_citations) * 100
  )

# Same calculation across all funding types, labelled "Overall" for the legend.
df_overall_avrg <- df_citation_per %>%
  group_by(age_of_citation) %>%
  reframe(
    avg_citation_percentage =
      (sum(citations_at_age, na.rm = TRUE) / overall_total$total_citations) * 100
  ) %>%
  mutate(cited_funding_type = "Overall")

# Plot: one line per funding type plus the overall curve, showing the
# percentage of citations received at each citation age.
subtitle_text <- "% age of Citations per Funding Type"

# Names must match the constant strings passed as `color` inside aes() below.
colors <- c("Industry" = "#1B068DFF",
            "Non-industry" = "#B52F8CFF",
            "Non-funded" = "#F58C46FF",
            "Overall" = "#150E39FF")

# `linewidth` is used throughout (instead of `lwd` / `size`): since
# ggplot2 3.4.0 the `size` argument of element_line() is deprecated, and the
# axis.line.x element below already relies on `linewidth`.
p <- ggplot() +
  geom_line(data = df_industy_avrg, aes(x = age_of_citation, y = avg_citation_percentage, color = "Industry"), linewidth = 2) +
  geom_line(data = df_non_industry_funded_avrg, aes(x = age_of_citation, y = avg_citation_percentage, color = "Non-industry"), linewidth = 2) +
  geom_line(data = df_non_funded_avrg, aes(x = age_of_citation, y = avg_citation_percentage, color = "Non-funded"), linewidth = 2) +
  geom_line(data = df_overall_avrg, aes(x = age_of_citation, y = avg_citation_percentage, color = "Overall"), linewidth = 2) +
  labs(x = "Age when Cited in Years", y = "% age of Citation", color = "Field") +
  labs(subtitle = subtitle_text) +
  theme_minimal() +
  scale_color_manual(values = colors, breaks = c("Overall", "Industry", "Non-industry", "Non-funded"), labels = c("Overall", "Industry", "Non-Industry", "Non-Funded")) +
  scale_y_continuous(
    labels = scales::percent_format(scale = 1),  # values are already percentages
    limits = c(0, 32),
    expand = expansion(mult = c(0, 0)),
    breaks = seq(0, 32, by = 5)
  ) +
  scale_x_continuous(breaks = seq(0, 6, by = 2), limits = c(0, 6.1), labels = c("0", "2", "4", "6 years")) +
  # NOTE(review): bbc_style() is presumably from the bbplot package - confirm.
  # The theme() overrides must come after it to take effect.
  bbc_style() +
  theme(
    legend.position = "top",
    legend.justification = 'left',
    plot.subtitle = element_text(size = 20),
    axis.text.x = ggplot2::element_text(margin = ggplot2::margin(12, b = 0)),
    axis.ticks = element_line(linewidth = 0.5, color = "black"),
    axis.ticks.length = unit(0.25, "cm"),
    axis.ticks.y = element_blank(),
    axis.line.x = element_line(colour = "#333333", linewidth = 1, linetype = "solid"))

ggsave("./figures/aoc_funding_type.pdf", p, height = 6.67, width = 7)

print(p)

#### Distribution and Trend of mAoC, 2018-2023

In [None]:
# Plot constants: subtitle and one fill/outline colour per publication year.
subtitle_text <- "Distribution and Trend of mAoC, 2018-2023"

year_colors <- c(
  "2018" = "#0D0887FF",
  "2019" = "#47039FFF",
  "2020" = "#9C179EFF",
  "2021" = "#BD3786FF",
  "2022" = "#D8576BFF",
  "2023" = "#FA9E3BFF"
)

csv_path <- "./data/AoC_data_CAD.csv"
data <- read_csv(csv_path, show_col_types = FALSE)

# 1. mAoC per citing paper: mean age of the references it cites
df <- reframe(
  group_by(data, citing_item_id, funding_type, citing_pubyear),
  mAoC = mean(cited_citation_age, na.rm = TRUE)
)

# 2. Restrict to 2018-2023 and keep a single row per citing paper
in_window <- filter(df, citing_pubyear >= 2018 & citing_pubyear <= 2023)
plot_df <- distinct(in_window, citing_item_id, .keep_all = TRUE)

# Yearly mean and standard deviation of mAoC, used for the trend line and the
# "SD" labels
line_data <- reframe(
  group_by(plot_df, citing_pubyear),
  avg_mAoC = mean(mAoC, na.rm = TRUE),
  sd_mAoC = sd(mAoC, na.rm = TRUE)
)

# Violin + box plot of per-paper mAoC for each publication year, overlaid with
# the yearly mean as a trend line and the yearly SD as text below the axis.
# `linewidth` replaces `lwd` / `size`: since ggplot2 3.4.0 the `size` argument
# of element_line() is deprecated, and axis.line.x below already uses
# `linewidth`.
p <- ggplot() +
  geom_violin(data = plot_df,
              aes(x = factor(citing_pubyear), y = mAoC,
                  fill = factor(citing_pubyear),
                  color = factor(citing_pubyear)),  # outline matches the fill
              trim = FALSE, alpha = 0.6) +
  # Standard deviation label per year, placed in the band below y = 0
  geom_text(data = line_data,
            aes(x = factor(citing_pubyear),
                y = -1.9,
                label = sprintf("SD: %.2f", sd_mAoC)),
            size = 6,
            hjust = 0.5,
            color = "black") +
  geom_boxplot(data = plot_df, aes(x = factor(citing_pubyear), y = mAoC),
               width = 0.1, fill = "#cccccc", color = "#cccccc", alpha = 1,
               outlier.shape = NA) +
  # Median marker drawn on top of the box
  stat_summary(data = plot_df, aes(x = factor(citing_pubyear), y = mAoC),
               fun = median, geom = "point", shape = 23, size = 3,
               fill = "white", color = "#cccccc") +
  # group = 1 connects the yearly means although x is a discrete factor
  geom_line(data = line_data, aes(x = factor(citing_pubyear), y = avg_mAoC, group = 1),
            color = "#150E39FF", linewidth = 1) +
  labs(x = "Year of Publication", y = "Mean Age of Citation (mAoC)") +
  labs(subtitle = subtitle_text) +
  theme_minimal() +
  scale_fill_manual(values = year_colors) +
  scale_color_manual(values = year_colors) +
  # The lower limit of -2.5 leaves room for the SD labels at y = -1.9.
  # NOTE(review): limits on a continuous scale discard observations outside the
  # range before the violin/boxplot statistics are computed; confirm that no
  # mAoC exceeds 15, or zoom with coord_cartesian(ylim = ...) instead.
  scale_y_continuous(breaks = seq(0, 20, by = 5), limits = c(-2.5, 15), expand = c(0, 0)) +
  # NOTE(review): bbc_style() is presumably from the bbplot package - confirm.
  bbc_style() +
  theme(
    legend.position = "none",
    plot.subtitle = element_text(size = 20),
    axis.text.x = ggplot2::element_text(margin = ggplot2::margin(12, b = 0)),
    axis.ticks = element_line(linewidth = 0.5, color = "black"),
    axis.ticks.length = unit(0.25, "cm"),
    axis.ticks.y = element_blank(),
    axis.line.x = element_line(colour = "#333333", linewidth = 1, linetype = "solid"))

ggsave("./figures/mAoc.pdf", p, height = 5, width = 10)

print(p)

## Statistical Analysis

In [None]:
# Read data
csv_path <- "./data/sql_results.csv"

# Install data.table only when it is missing, then load it
if (!requireNamespace("data.table", quietly = TRUE)) {
  install.packages("data.table")
}
library(data.table)

# Fast reading with automatic detection of column types. The file is read once
# here; the earlier read_csv() of the same path was dropped because its result
# was never used before `data` is reassigned below. fread() reads all rows and
# all columns by default.
df <- fread(csv_path,
            na.strings = "NA")      # specify NA values

# Citation counts of the cited papers, split by the cited paper's funding type
industry_citations <- df %>%
  filter(funding_type_cited_paper == "industry") %>%
  pull(citation_count_cited_paper)

non_industry_citations <- df %>%
  filter(funding_type_cited_paper == "non-industry") %>%
  pull(citation_count_cited_paper)

non_funded_citations <- df %>%
  filter(funding_type_cited_paper == "non-funded") %>%
  pull(citation_count_cited_paper)

# Install (only when missing) and load the 'car' package for Levene's Test
if (!requireNamespace("car", quietly = TRUE)) {
  install.packages("car")
}
library(car)

# Combine data into a dataframe: one row per citation count, labelled with its
# group. The rep() lengths must follow the same order as the concatenation.
citations <- c(industry_citations, non_industry_citations, non_funded_citations)
group <- factor(c(rep("industry", length(industry_citations)), rep("non-industry", length(non_industry_citations)), rep("non-funded", length(non_funded_citations))))
data <- data.frame(citations, group)

# Perform Levene's Test
leveneTest(citations ~ group, data = data)