# Unicorn Companies

A unicorn company is a privately held company with a current valuation of over $1 billion USD. This dataset consists of unicorn companies and startups across the globe as of November 2021, including country of origin, sector, select investors, and valuation of each unicorn. 

_Note: former unicorn companies that have since exited through an IPO or an acquisition are not included in this list._

Not sure where to begin? Scroll to the bottom to find challenges!

## Load Libraries

In [None]:
#Install "pkgcond" only when it is not available yet, so that re-running the
#notebook does not download and reinstall the package every time
if (!requireNamespace("pkgcond", quietly = TRUE)) {
    install.packages("pkgcond")
}

suppressPackageStartupMessages(library(pkgcond))

#Run the helper script with the conditions it raises silenced
#(presumably it loads the remaining libraries and helper functions - verify against that file)
suppress_conditions(source("librs_and_funcs.R"))

#Read the raw dataset; show_col_types = FALSE hides the column specification message
unicorns <- read_csv('data/unicorn_companies.csv', show_col_types = FALSE)

#### First peek at data

In [None]:
#Look at the first 10 observations in the dataframe
head(unicorns, n = 10)

## Clean 

In [None]:
#Give the six columns short snake_case names so they are easier to reference
names(unicorns) <- c('company','valuation_billions','date_added','country','category','select_investors')


#Normalise two investor strings before the column is split on commas:
#drop the "and" in front of "Sequoia Capital China" and turn the "/td>" fragment into "TD"
unicorns$select_investors <- gsub("and Sequoia Capital China", "Sequoia Capital China", unicorns$select_investors, fixed = TRUE)
unicorns$select_investors <- gsub("/td>", "TD", unicorns$select_investors, fixed = TRUE)



#Strip leading and trailing whitespace from "company", "country" and "category"
unicorns[c('company', 'country', 'category')] <- lapply(unicorns[c('company', 'country', 'category')], trimws)


#Clean-up "select_investors" column in "unicorns"
#
#Splits every comma-separated investor string into a character vector of
#trimmed, title-cased investor names.
#
#investor_groups: character vector of comma-separated investor names; defaults to
#                 unicorns$select_investors so the existing call clean_investors() keeps working
#Returns a list holding exactly one character vector per input element.
#Blank tokens, empty strings and NA entries are reported as 'Unknown'.
clean_investors <- function(investor_groups = unicorns$select_investors){
    lapply(investor_groups, function(investor_group){
        #A missing or empty entry produces no tokens at all; returning 'Unknown' here keeps
        #one list element per row instead of a NULL that would be dropped from the result
        if(is.na(investor_group) || !nzchar(trimws(investor_group))){
            return('Unknown')
        }
        investor_names <- trimws(strsplit(investor_group, split = ",")[[1]], which = "both")
        #Trim before testing for blanks so that tokens made of whitespace only are caught too
        is_blank <- !nzchar(investor_names)
        investor_names[!is_blank] <- stri_trans_totitle(investor_names[!is_blank])
        investor_names[is_blank] <- 'Unknown'
        investor_names
        })
    }

unicorns$select_investors <- clean_investors()

## Transform

In [None]:
#Remove the '$' (and any thousands separator) from the "valuation_billions" column
#Matching the characters instead of cutting by position keeps values that have no '$' prefix
#or that are longer than six characters intact
unicorns$valuation_billions <- gsub("[$,]", "", unicorns$valuation_billions)


#Convert the chr strings in "valuation_billions" to numeric values
unicorns$valuation_billions <- as.numeric(unicorns$valuation_billions)


#Create "over_5bill" to flag companies valued at 5 billion or more (the comparison is inclusive)
unicorns$over_5bill <- unicorns$valuation_billions >= 5.0




#Create "num_of_investors" column
#
#Counts the investors listed for each company.
#Reads the global unicorns$select_investors and returns an integer vector with
#one count per element (integer(0) when there are no rows).
investor_count <- function(){
    #lengths() gives the element-wise length in one call, without growing a vector inside a loop
    lengths(unicorns$select_investors, use.names = FALSE)
    }
    
#Store the number of investors per company as an integer column
unicorns$num_of_investors <- investor_count()
unicorns$num_of_investors <- as.integer(unicorns$num_of_investors)




#Get Today's Date
today <- Sys.Date()

#Convert the chr date in "date_added" to date
#"%y" reads only two digits of the year, so a value such as "4/7/2017" would be parsed as
#year 2020; values that end in a four-digit year are therefore parsed with "%Y" instead
unicorns$date_added <- local({
    raw_dates <- unicorns$date_added
    parsed_dates <- as.Date(raw_dates, format = "%m/%d/%y")
    four_digit_year <- grepl("/[0-9]{4}\\s*$", raw_dates)
    parsed_dates[four_digit_year] <- as.Date(raw_dates[four_digit_year], format = "%m/%d/%Y")
    parsed_dates
})

#Create "age_years" column: time between "date_added" and today in years
#(365.25 days per year), rounded to 2 decimals
unicorns$age_years <- round(as.numeric(difftime(today, unicorns$date_added, units = "days"))/365.25, digits = 2)




#Standardise "category" labels: correct casing and typos and map labels onto the preferred names
unicorns$category[unicorns$category == 'Artificial intelligence'] <- 'Artificial Intelligence'
unicorns$category[unicorns$category == 'Finttech'] <- 'Fintech'
unicorns$category[unicorns$category == 'E-commerce & direct-to-consumer'] <- 'Ecommerce & Direct-to-Consumer'
unicorns$category[unicorns$category == 'Internet software & services'] <- 'Web Products/Services'
unicorns$category[unicorns$category == 'Supply chain, logistics, & delivery'] <- 'Supply Chain & Logistics'
unicorns$category[unicorns$category == 'Data management & analytics'] <- 'Data Management & Analytics'
unicorns$category[unicorns$category == 'Auto & transportation'] <- 'Transportation & Auto'
unicorns$category[unicorns$category == 'Consumer & retail'] <- 'Retail'
unicorns$category[unicorns$category == 'Mobile & telecommunications'] <- 'Telecommunications & Mobile'


#Vectors of company names, one vector per target "category"
#Each vector lists the companies that are re-labelled with that category further down
aerospace <- c('SpaceX', 'ABL Space Systems', 'Firefly Aerospace', 'Relativity Space')
gaming <- c('Epic Games', 'Wildlife Studios', 'Voodoo', 'Playco')
food_beverage <- c('HEYTEA', 'Wenheyou', 'Manner', 'Ynsect')
batteries <- c('Northvolt', 'Sila Nanotechnologies', 'OCSiAl')
investment <- c('Black Unicorn Factory', 'NuCom Group', 'Mensa Brands')
web_prod_serv <- c('OVH', 'Sentry')
waste_recycle <- c('Redwood Materials', 'Rubicon Global')
film <- c('Dadi Cinema', 'Skydance Media')  
agriculture <- c('Bowery Farming', 'Farmers Business Network', 'Apeel Sciences', 'Inari')
advertising <- c('Trader Interactive', 'You & Mr Jones', 'Emerging Markets Property Group')  #now, the brandtech group
construction <- c('Nexii', 'Revolution Precrafted')
energy <- c('Uplight', 'OVO Energy')
#NOTE: the name is misspelled ("transpotation") but it is referenced under exactly this
#name when the categories are assigned, so it is left unchanged here
transpotation_auto <- c('SITECH DEV', 'Boom Supersonic')
aviation <- c('Vista Global')
satellites <- c('Planet Labs')
metaverse <- c('Improbable')
diamonds <- c('Diamond Foundry')
financial_services <- c('Five Star Business Finance')
cosmetics <- c('GPclub')
restaurants <- c('Cava Group')
biomaterials <- c('Spiber')
travel <- c('LifeMiles')
media <- c('Red Ventures')
space <- c('Axiom Space')
ecommerce_dtc <- c('Printful')
insurance <- c('Howden Group Holdings')
retail <- c('Thrasio')


#Re-label the companies listed in the per-category vectors (values labeled "Other") with a specific "category"
#The assignments run in order, so a company listed in two vectors ends up with the later category
unicorns$category[unicorns$company %in% aerospace] <- 'Aerospace'
unicorns$category[unicorns$company %in% gaming] <- 'Gaming'
unicorns$category[unicorns$company %in% food_beverage] <- 'Food & Beverage'
unicorns$category[unicorns$company %in% batteries] <- 'Batteries'
unicorns$category[unicorns$company %in% investment] <- 'Investment'
unicorns$category[unicorns$company %in% web_prod_serv] <- 'Web Products/Services'
unicorns$category[unicorns$company %in% waste_recycle] <- 'Waste & Recycle'
unicorns$category[unicorns$company %in% film] <- 'Film'
unicorns$category[unicorns$company %in% agriculture] <- 'Agriculture'
unicorns$category[unicorns$company %in% advertising] <- 'Advertising'
unicorns$category[unicorns$company %in% construction] <- 'Construction'
unicorns$category[unicorns$company %in% energy] <- 'Energy'
unicorns$category[unicorns$company %in% transpotation_auto] <- 'Transportation & Auto'
unicorns$category[unicorns$company %in% aviation] <- 'Aviation'
unicorns$category[unicorns$company %in% satellites] <- 'Satellites'
unicorns$category[unicorns$company %in% metaverse] <- 'Metaverse'
unicorns$category[unicorns$company %in% diamonds] <- 'Diamonds'
unicorns$category[unicorns$company %in% financial_services] <- 'Financial Services'
unicorns$category[unicorns$company %in% cosmetics] <- 'Cosmetics'
unicorns$category[unicorns$company %in% restaurants] <- 'Restaurants'
unicorns$category[unicorns$company %in% biomaterials] <- 'Biomaterials'
unicorns$category[unicorns$company %in% travel] <- 'Travel'
unicorns$category[unicorns$company %in% media] <- 'Media'
unicorns$category[unicorns$company %in% space] <- 'Space'
unicorns$category[unicorns$company %in% ecommerce_dtc] <- 'Ecommerce & Direct-to-Consumer'
unicorns$category[unicorns$company %in% insurance] <- 'Insurance'
unicorns$category[unicorns$company %in% retail] <- 'Retail'




#Fix values 'United States,' and 'Santa Clara' to 'United States'
unicorns$country <- replace(unicorns$country, unicorns$country %in% c('Santa Clara', 'United States,'), 'United States')


#Vectors to create "continent" column based on "country" name
#A country that is missing from these vectors cannot be matched to its continent

#North America
na <- c('United States', 'Mexico', 'Canada')

#Europe
eu <- c('Sweden','United Kingdom', 'Bermuda', 'Germany', 'Netherlands', 'Belgium', 'Estonia', 'Lithuania', 'France', 'Austria', 'Ireland', 'Switzerland', 'Luxembourg', 'Finland', 'Denmark', 'Norway', 'Spain', 'Czech Republic', 'Croatia')

#South America
#The country is spelled 'Colombia'; the previous spelling 'Columbia' is kept as well
#so that data using either spelling is matched
sa <- c('Brazil', 'Columbia', 'Colombia', 'Argentina', 'Chile')

#Asia
as <- c('China', 'Hong Kong', 'India', 'Singapore', 'South Korea', 'Israel', 'Thailand', 'Malaysia', 'Indonesia')

#Africa
af <- c('Senegal', 'South Africa')




#Create "continent" column and assign it to "unicorns"
#
#Maps every value of unicorns$country to a continent name using the global
#lookup vectors na, eu, sa, as and af.
#
#x: unused; kept so that existing calls stay valid
#Returns a character vector with one continent per row of "unicorns".
#A country found in none of the lookup vectors is labelled 'Unknown' and listed
#in a warning, instead of silently being counted as 'Africa'.
continents <- function(x){
  countries <- as.character(unicorns$country)
  continent_names <- rep('Unknown', length(countries))

  #Assign from lowest to highest precedence so that the original order of the
  #checks (North America, Europe, South America, Asia, Africa) is preserved
  continent_names[countries %in% af] <- 'Africa'
  continent_names[countries %in% as] <- 'Asia'
  continent_names[countries %in% sa] <- 'South America'
  continent_names[countries %in% eu] <- 'Europe'
  continent_names[countries %in% na] <- 'North America'

  unmapped <- unique(countries[continent_names == 'Unknown'])
  if(length(unmapped) > 0){
    warning("No continent mapping for: ", paste(unmapped, collapse = ", "), call. = FALSE)
  }

  continent_names
}
    
unicorns$continent <- continents()
    
    
    
    
#Function to convert a non_numeric column into a number-coded column based on the amount of unique values
#
#df_column: vector (typically character) to encode
#Returns an integer vector of the same length in which every value is replaced by the
#position of its first occurrence among the unique values (1 for the first distinct
#value, 2 for the second, ...). NA is treated as a value of its own and an empty
#input gives integer(0).
coderize_column <- function(df_column){
    #match() looks every value up in the vector of unique values in a single pass
    as.integer(match(df_column, unique(df_column)))
}


#Coderize "country" column
unicorns$coded_country <- coderize_column(unicorns$country)     
    
#Coderize "continent" column
unicorns$coded_continent <- coderize_column(unicorns$continent)    
    
#Coderize "category" column
unicorns$coded_category <- coderize_column(unicorns$category)
                

#### Check "unicorns" after cleaning & transformations

In [None]:
#Take a glimpse at columns and some values in unicorn
glimpse(unicorns)

In [None]:
##### First 5 observations of "unicorns"
head(unicorns, n = 5)

In [None]:
#"unicorns" with a valuation over 5 Billion
as.integer(count(unicorns[unicorns$over_5bill == TRUE,]))

#"unicorns" with a valuation under 5 Billion
as.integer(count(unicorns[unicorns$over_5bill == FALSE,]))

### Create "unicorn_investors" based on "unicorns", switching from 'company' profiles to 'investor' profiles

In [None]:
#Flatten "select_investors" into one vector with an entry for every (company, investor) pair
#x is not used; the function reads the global "unicorns" directly
investor_frequency <- function(x){
  all_investors <- unlist(unicorns$select_investors, use.names = FALSE)
  #Nothing to trim when there are no investors at all
  if(is.null(all_investors)){
    return(all_investors)
  }
  trimws(all_investors, which = c("both"))
}
    
#One entry per (company, investor) pair
investor_freq <- investor_frequency()


#Create a vector to hold unique Investors
investors <- unlist(unique(investor_freq), use.names = FALSE)


#Create a vector to hold investment count per Investor
#Counted explicitly and aligned with "investors": unless "frequency" is redefined by the
#sourced helper script (TODO confirm), the name resolves to stats::frequency, which reports
#a time-series sampling frequency (1 for a plain vector) rather than per-investor counts
investment_count <- tabulate(match(unlist(investor_freq, use.names = FALSE), investors), nbins = length(investors))


#Dataframe based Unicorn Investors
unicorn_investors <- data.frame(investors, investment_count)

#### "unicorn_investors" transformations

In [None]:
#Function that collects investments by "investor"
#
#Reads the globals "unicorn_investors" (column "investors") and "unicorns"
#(columns "company" and "select_investors").
#Returns a list with exactly one element per investor: the trimmed names of the
#companies whose "select_investors" contain that investor, in row order
#(character(0) when an investor backs no company).
companies_by_investor <- function(){
    company_names <- trimws(unicorns$company, which = c('both'))
    lapply(as.character(unicorn_investors$investors), function(investor){
        #TRUE for every company that lists this investor
        is_backed <- vapply(unicorns$select_investors,
                            function(investor_group) investor %in% investor_group,
                            logical(1), USE.NAMES = FALSE)
        company_names[is_backed]
        })
    }
  

#Assign the result of "companies_by_investor" to new column "investments" in "unicorn_investors"
unicorn_investors$investments <- companies_by_investor()





#Function that calculates the total valuation in billions by "investor"
#
#For every investor in "unicorn_investors" the valuations of the companies listed in its
#"investments" are looked up in "unicorns" and summed. Returns a numeric vector, one total per investor.
valuation_by_investor <- function(){
    investor_names <- as.character(unicorn_investors$investors)
    totals <- vapply(investor_names, function(investor){
        portfolios <- unicorn_investors[unicorn_investors$investors == investor,]$investments
        held_valuations <- lapply(portfolios, function(held_companies){
            unicorns[unicorns$company %in% held_companies,]$valuation_billions
            })
        sum(unlist(held_valuations, use.names = FALSE))
        }, numeric(1), USE.NAMES = FALSE)
    as.numeric(totals)
    }
    
  
#Assign the result of "valuation_by_investor" to new column "valuation_billions" in "unicorn_investors"
unicorn_investors$valuation_billions <- valuation_by_investor()





#Function that calculates the average valuation in billions by "investor"
#
#For every investor in "unicorn_investors" the valuations of the companies listed in its
#"investments" are looked up in "unicorns" and averaged, rounded to 2 decimals.
#Returns a numeric vector, one mean per investor.
investor_mean_valuation <- function(){
    investor_names <- as.character(unicorn_investors$investors)
    means <- vapply(investor_names, function(investor){
        portfolios <- unicorn_investors[unicorn_investors$investors == investor,]$investments
        held_valuations <- lapply(portfolios, function(held_companies){
            unicorns[unicorns$company %in% held_companies,]$valuation_billions
            })
        round(mean(as.numeric(unlist(held_valuations, use.names = FALSE))), digits = 2)
        }, numeric(1), USE.NAMES = FALSE)
    as.numeric(means)
    }
    
  
#Assign the result of "investor_mean_valuation" to new column "mean_valuation" in "unicorn_investors"
unicorn_investors$mean_valuation <- investor_mean_valuation()

#### View "unicorn_investors"

In [None]:
#First 5 observations of "unicorn_investors"
head(unicorn_investors, n = 5)

#### Use "unicorn_investors" to create new df "cor_unicorns"
- "associate_companies"
- "num_of_associates"
- "association_valuation"
- "avg_associate_valuation"

In [None]:
cor_unicorns <- unicorns

In [None]:
#Function that collects a "company"'s associated companies by "investor"
#
#Two companies are associated when they appear in the "investments" of the same investor.
#Reads the globals "cor_unicorns" (column "company") and "unicorn_investors" (column "investments").
#Returns a list with exactly one element per company: the other companies found in every
#portfolio that contains the company (a company is repeated once per shared investor),
#or character(0) when the company has no associates.
associative_companies <- function(){
    portfolios <- lapply(unicorn_investors$investments, unlist)
    lapply(as.character(cor_unicorns$company), function(company){
        co_investments <- lapply(portfolios, function(portfolio){
            #Only portfolios that actually hold the company contribute associates
            if(company %in% portfolio){
                portfolio[portfolio != company]
                }else{
                NULL
                }
            })
        associates <- unlist(co_investments, use.names = FALSE)
        #Keep an empty element rather than NULL: assigning NULL would drop the
        #company from the result and shift or shorten the column
        if(is.null(associates)) character(0) else associates
        })
    }

#Assign the result of "associative_companies" to new column "associate_companies" in "cor_unicorns"
cor_unicorns$associate_companies <- associative_companies()





#Create a column for the number of "associate_companies" by "company"
#lengths() gives one count per row; taking length() of the first element only
#would copy the first company's count to every row
unicorns$num_of_associates <- lengths(cor_unicorns$associate_companies, use.names = FALSE)

#Create "num_of_associates" column
#
#Counts the associated companies of every company.
#Reads the global cor_unicorns$associate_companies and returns an integer vector with
#one count per element (integer(0) when there are no rows).
associate_count <- function(){
    #lengths() gives the element-wise length in one call, without growing a vector inside a loop
    lengths(cor_unicorns$associate_companies, use.names = FALSE)
    }
    
cor_unicorns$num_of_associates <- associate_count()


#Convert "num_of_associates" list values to integers
cor_unicorns$num_of_associates <- as.integer(cor_unicorns$num_of_associates)

In [None]:
#Function that calculates the total valuation in billions of each company's "associate_companies"
#
#Every associate is looked up by name in "cor_unicorns"; an associate listed several
#times is counted several times. Returns a numeric vector, one total per company
#(0 for a company without associates).
valuation_by_association <- function(){
    group_totals <- vapply(cor_unicorns$associate_companies, function(association_group){
        members <- unlist(association_group, use.names = FALSE)
        member_valuations <- lapply(members, function(member){
            cor_unicorns[cor_unicorns$company %in% member,]$valuation_billions
            })
        sum(unlist(member_valuations, use.names = FALSE))
        }, numeric(1), USE.NAMES = FALSE)
    as.numeric(group_totals)
}
    
  
#Assign the result of "valuation_by_association" to new column "association_valuation" in "cor_unicorns"
cor_unicorns$association_valuation <- valuation_by_association()




#Function that calculates the average valuation in billions of each company's "associate_companies"
#NOTE: this redefines "valuation_by_association" (the earlier definition sums instead of averaging)
#
#Every associate is looked up by name in "cor_unicorns"; an associate listed several
#times is counted several times. Returns a numeric vector, one mean per company
#(NaN for a company without associates).
valuation_by_association <- function(){
    group_means <- vapply(cor_unicorns$associate_companies, function(association_group){
        members <- unlist(association_group, use.names = FALSE)
        member_valuations <- lapply(members, function(member){
            cor_unicorns[cor_unicorns$company %in% member,]$valuation_billions
            })
        mean(as.numeric(unlist(member_valuations, use.names = FALSE)))
        }, numeric(1), USE.NAMES = FALSE)
    as.numeric(group_means)
}
    
  
#Assign the result of the redefined "valuation_by_association" to new column "avg_associate_valuation" in "cor_unicorns"
cor_unicorns$avg_associate_valuation <- valuation_by_association()




#Categorize "num_of_associates" for correlation
#
#Bins cor_unicorns$num_of_associates into five levels:
#  1: fewer than 26   2: 26 to 50   3: 51 to 75   4: 76 to 100   5: 101 or more
#Returns a numeric vector with one level per row; NA counts stay NA.
associates_level <- function(){
    #findInterval() returns how many of the lower bounds 26, 51, 76 and 101 a value has
    #reached (0 to 4); adding 1 turns that into the level
    lower_bounds <- c(26, 51, 76, 101)
    as.numeric(findInterval(cor_unicorns$num_of_associates, lower_bounds) + 1)
    }

cor_unicorns$associates_level <- associates_level()

In [None]:
#cor_unicorns <- cor_unicorns[cor_unicorns$num_of_associates > 0,]

#### Check "cor_unicorns"

In [None]:
#Use when exporting to csv
#cor_unicorns$date_added = format(cor_unicorns$date_added, format = "%y/%d/%m")

In [None]:
#head(cor_unicorns, n = 5)

#### Use "cor_unicorns" & "unicorn_investors" to create 2 new columns in "unicorn_investors"

In [None]:
#Function that counts how many "investments" of each investor have a valuation of 5 Billion or more
#
#Reads the globals "unicorn_investors" (columns "investors" and "investments") and
#"cor_unicorns" (columns "company" and "valuation_billions").
#Returns an integer vector, one count per investor.
over_5bill_companies <- function(){
    investor_names <- as.character(unicorn_investors$investors)
    counts <- integer(length(investor_names))
    for(position in seq_along(investor_names)){
        current_investments <- unlist(unicorn_investors[unicorn_investors$investors == investor_names[[position]],]$investments)
        #Valuation of every investment, summed over all rows carrying that company name
        company_values <- vapply(current_investments, function(company){
            sum(cor_unicorns[cor_unicorns$company == company,]$valuation_billions)
            }, numeric(1), USE.NAMES = FALSE)
        counts[[position]] <- as.integer(sum(company_values >= 5))
        }
     return(counts)
    }

#Assign the result of "over_5bill_companies" to new column "num_over_5bill" in "unicorn_investors"
unicorn_investors$num_over_5bill <- over_5bill_companies()





#Function that calculates the percentage of each investor's "investments" valued at 5 Billion or more
#
#Divides unicorn_investors$num_over_5bill by unicorn_investors$investment_count and
#returns the result as a whole-number percentage (numeric vector, one value per investor).
pct_over_5bill <- function(){
    #Both columns are already aligned row by row, so no lookup by investor name is needed
    success_rate <- unicorn_investors$num_over_5bill / unicorn_investors$investment_count
    as.numeric(round(success_rate * 100))
    }

#Assign the result of "pct_over_5bill" to new column "pct_5bill" in "unicorn_investors"
unicorn_investors$pct_5bill <- pct_over_5bill()

In [None]:
head(unicorn_investors, n = 5)

#### Create "investors_avg_pct_over5bill" column for "cor_unicorns"

In [None]:
#Function that determines, per "company", the average "pct_5bill" of its "select_investors"
#
#For every company the investors are read from "cor_unicorns", their "pct_5bill" values
#are looked up in "unicorn_investors" and the mean is taken.
#Returns a numeric vector, one average per company.
investors_pct_over5bill <- function(){
    company_names <- as.character(cor_unicorns$company)
    average_pcts <- vapply(company_names, function(company){
        investor_group <- unlist(cor_unicorns[cor_unicorns$company == company,]$select_investors, use.names = FALSE)
        investor_pcts <- lapply(investor_group, function(investor){
            unicorn_investors[unicorn_investors$investors == investor,]$pct_5bill
            })
        mean(unlist(investor_pcts, use.names = FALSE))
        }, numeric(1), USE.NAMES = FALSE)
    as.numeric(average_pcts)
    }

#Assign the result of "investors_pct_over5bill" to new column "investors_avg_pct_over5bill" in "cor_unicorns"
cor_unicorns$investors_avg_pct_over5bill <- investors_pct_over5bill()

#### Skim over "unicorns" and "unicorn_investors" for basic statistics

In [None]:
#Skim over "cor_unicorns" for an overview of basic statistics
skim_tee(cor_unicorns)

In [None]:
#Skim over "unicorn_investors" for an overview of basic statistics
skim_tee(unicorn_investors)

In [None]:
#sort descending
#cor_unicorns[cor_unicorns$num_of_associates == 0, ]

In [None]:
# Unicorns without associates have a missing (NA/NaN) "avg_associate_valuation"; replace it with their own "valuation_billions"
cor_unicorns <- cor_unicorns %>%
   mutate(avg_associate_valuation = ifelse(is.na(avg_associate_valuation), valuation_billions, avg_associate_valuation))

In [None]:
cor_unicorns[cor_unicorns$valuation_billions > 105.00,]

## Analysis + Visualizations

#### "unicorns" correlation

In [None]:
colnames(cor_unicorns)

In [None]:
#Columns of "cor_unicorns" used for the Pearson correlation matrix
unicorns_pearson <- cor_unicorns[ , c('valuation_billions',
                                      'age_years',
                                      'associates_level',
                                      'num_of_associates',
                                      'association_valuation',
                                      'avg_associate_valuation',
                                      'investors_avg_pct_over5bill')]


#Columns of "cor_unicorns" used for the Spearman correlation matrix:
#the Pearson columns plus "over_5bill", "num_of_investors" and the three coded columns
unicorns_spearman <- cor_unicorns[ , c('valuation_billions',
                                       'age_years',
                                       'over_5bill',
                                       'num_of_investors',
                                       'coded_country',
                                       'coded_continent',
                                       'coded_category',
                                       'num_of_associates',
                                       'associates_level',
                                       'association_valuation',
                                       'avg_associate_valuation',
                                       'investors_avg_pct_over5bill')]
                                  

In [None]:
cor(unicorns_pearson, method = "pearson")

In [None]:
cor(unicorns_spearman, method = "spearman")

In [None]:
# Draw a scatterplot matrix of the Pearson inputs to judge linearity by eye
unicorns_pearson_linearity <- plot(unicorns_pearson, main="Matrix Scatterplot", col="steelblue")

In [None]:
#Scatter plot of "valuation_billions" against "investors_avg_pct_over5bill", coloured by "over_5bill"
fig <- cor_unicorns %>%
    plot_ly(width = 700, height = 500) %>%
    add_trace(x = ~investors_avg_pct_over5bill,
              y = ~valuation_billions,
              color = ~over_5bill,
              colors = 'Set1',
              type = 'scatter',
              mode = 'markers')

fig

In [None]:
#make this example reproducible
set.seed(101)

#use 70% of dataset as training set and 30% as test set
#every row is drawn independently (TRUE with probability 0.7), so the split is
#approximately, not exactly, 70/30
#NOTE: the mask is stored under the name "sample", the same name as the function being called
sample <- sample(c(TRUE, FALSE), nrow(cor_unicorns), replace=TRUE, prob=c(0.7,0.3))
#rows flagged TRUE form the training set, the remaining rows the test set
train  <- cor_unicorns[sample, ]
test   <- cor_unicorns[!sample, ]

In [None]:
#Simple regression plot on "cor_unicorns"
ggplot(cor_unicorns, aes(x = investors_avg_pct_over5bill, y = valuation_billions)) +
    geom_point() +
    stat_smooth(method = "lm", col = "red", formula = y ~ x)

In [None]:
mlm_summary <- lm(valuation_billions ~investors_avg_pct_over5bill + age_years + num_of_associates + association_valuation, data = cor_unicorns)

In [None]:
#1. The distribution of model residuals should be approximately normal.
#hist(residuals(mlm_summary), col = "steelblue")

fig <- plot_ly(x = fitted(mlm_summary), y = residuals(mlm_summary), type = "scatter")

fig <- fig %>% layout(yaxis = list(showgrid = FALSE), width = 500, height = 400)

fig

In [None]:
#2. The variance of the residuals should be consistent for all observations.
#create fitted value vs residual plot
plot(fitted(mlm_summary), residuals(mlm_summary))

#add horizontal line at 0
abline(h = 0, lty = 2)

In [None]:
plot(mlm_summary)

In [None]:
summary(mlm_summary)

In [None]:
lm_predict <- lm(valuation_billions ~ investors_avg_pct_over5bill + age_years + num_of_associates + associates_level, data = train)

summary(lm_predict)

pred <- predict(lm_predict, newdata = test, interval = "confidence")

#### Over 5 Billion

In [None]:
#Number of companies per "over_5bill" group and their share of all companies in percent
cor_unicorns %>%
    group_by(over_5bill) %>%
    summarise(amount = n(),
              percentage_of_whole = round((n() / nrow(cor_unicorns)) * 100))

In [None]:
#Box plot of "investors_avg_pct_over5bill" per "continent", coloured by "over_5bill"
fig1 <- cor_unicorns[order(-cor_unicorns$over_5bill),] %>%
    plot_ly(width = 1000, height = 400) %>%
    add_trace(x = ~continent,
              y = ~investors_avg_pct_over5bill,
              color = ~over_5bill,
              colors = 'Set1',
              type = 'box') %>%
    layout(yaxis = list(showgrid = FALSE))


fig1

In [None]:
#Box plot of "valuation_billions" per "continent", coloured by "over_5bill"
fig1 <- cor_unicorns[order(-cor_unicorns$over_5bill),] %>%
    plot_ly(width = 1000, height = 400) %>%
    add_trace(x = ~continent,
              y = ~valuation_billions,
              color = ~over_5bill,
              colors = 'Set1',
              type = 'box') %>%
    layout(yaxis = list(showgrid = FALSE))


fig1

#### Associates Level

In [None]:
#Spread and central tendency of "associates_level", split by "over_5bill"
cor_unicorns %>%
    group_by(over_5bill) %>%
    summarise(sd_associates_level = sd(associates_level),
              mean_associates_level = mean(associates_level),
              mode_associates_level = find_mode(round(associates_level)),
              #median(range(x)) is the midpoint between min and max, not the median,
              #so the median is taken over the values themselves
              median_associates_level = median(associates_level))

In [None]:
#Box plots of four metrics per "associates_level", coloured by "over_5bill", shown as a 2 x 2 grid

#"age_years"
fig1 <- cor_unicorns[order(-cor_unicorns$associates_level),] %>%
    plot_ly(width = 1000, height = 400) %>%
    add_trace(x = ~associates_level,
              y = ~age_years,
              color = ~over_5bill,
              colors = 'Set1',
              type = 'box')

#"investors_avg_pct_over5bill"
fig2 <- cor_unicorns[order(-cor_unicorns$associates_level),] %>%
    plot_ly(width = 1000, height = 400) %>%
    add_trace(x = ~associates_level,
              y = ~investors_avg_pct_over5bill,
              color = ~over_5bill,
              colors = 'Set1',
              type = 'box')

#"valuation_billions"
fig3 <- cor_unicorns[order(-cor_unicorns$associates_level),] %>%
    plot_ly(width = 1000, height = 400) %>%
    add_trace(x = ~associates_level,
              y = ~valuation_billions,
              color = ~over_5bill,
              colors = 'Set1',
              type = 'box')

#"association_valuation"
fig4 <- cor_unicorns[order(-cor_unicorns$associates_level),] %>%
    plot_ly(width = 1100, height = 600) %>%
    add_trace(x = ~associates_level,
              y = ~association_valuation,
              color = ~over_5bill,
              colors = 'Set1',
              type = 'box')

#Arrange the four figures in two rows
fig <- subplot(fig1, fig2, fig3, fig4, nrows = 2)

fig

#### Association Valuation

In [None]:
#Spread and central tendency of "association_valuation", split by "over_5bill"
cor_unicorns %>%
    group_by(over_5bill) %>%
    summarise(sd_association_valuation = sd(association_valuation),
              mean_association_valuation = mean(association_valuation),
              mode_association_valuation = find_mode(round(association_valuation)),
              #median(range(x)) is the midpoint between min and max, not the median,
              #so the median is taken over the values themselves
              median_association_valuation = median(association_valuation))

#### Number of Associates

In [None]:
#Spread and central tendency of "num_of_associates", split by "over_5bill"
cor_unicorns %>%
    group_by(over_5bill) %>%
    summarise(sd_num_of_associates = sd(num_of_associates),
              mean_num_of_associates = mean(num_of_associates),
              #median(range(x)) is the midpoint between min and max, not the median,
              #so the median is taken over the values themselves
              median_num_of_associates = median(num_of_associates))

#### Age (years)

In [None]:
#Spread and central tendency of "age_years", split by "over_5bill"
cor_unicorns %>%
    group_by(over_5bill) %>%
    summarise(sd_age = sd(age_years),
              mean_age = mean(age_years),
              mode_age = find_mode(round(age_years)),
              #median(range(x)) is the midpoint between min and max, not the median,
              #so the median is taken over the values themselves
              median_age = median(age_years))

#### Valuation (billions)

In [None]:
#Spread and central tendency of "valuation_billions", split by "over_5bill"
cor_unicorns %>%
    group_by(over_5bill) %>%
    summarise(sd_valuations = sd(valuation_billions),
              mean_valuations = mean(valuation_billions),
              mode_valuations = find_mode(as.integer(valuation_billions)),
              #median(range(x)) is the midpoint between min and max, not the median,
              #so the median is taken over the values themselves
              median_valuations = median(valuation_billions))

#### Investors Average Percentage Over & Under 5 Billion

In [None]:
#Spread and central tendency of "investors_avg_pct_over5bill", split by "over_5bill"
cor_unicorns %>%
    group_by(over_5bill) %>%
    summarise(sd_avg_pct_over_5bill = sd(investors_avg_pct_over5bill),
              mean_avg_pct_over_5bill = mean(investors_avg_pct_over5bill),
              mode_avg_pct_over_5bill = find_mode(investors_avg_pct_over5bill),
              #median(range(x)) is the midpoint between min and max, not the median,
              #so the median is taken over the values themselves
              median_avg_pct_over_5bill = median(investors_avg_pct_over5bill))

#### "unicorn_investors" correlation

In [None]:
investors_cor <- unicorn_investors[ , c('investment_count', 'valuation_billions','mean_valuation','num_over_5bill','pct_5bill')]

In [None]:
cor(investors_cor)

In [None]:
# Number of unique portfolios among all investors:
# count the distinct values of the "investments" column
unicorn_investors$investments %>%
  unique() %>%
  length()

In [None]:
# Investment firms with only 1 investment
# NOTE(review): find_lode() and investor_freq are defined outside this
# section; confirm that find_lode() returns the single-investment firms.
investor_freq %>%
  find_lode() %>%
  length()

In [None]:
# Number of investment firms with more than 1 investment:
# distinct values of investor_freq minus the values returned by find_lode()
(investor_freq %>% unique() %>% length()) -
  (investor_freq %>% find_lode() %>% length())

In [None]:
# Correlation between a company's valuation and its number of investors
cor(
  x = unicorns$valuation_billions,
  y = unicorns$num_of_investors
)

In [None]:
# Per-"category" summary of "valuation_billions": total, number of companies,
# standard deviation and mean
categorical_valuation <- summarise(
  group_by(unicorns, category),
  valuation = sum(valuation_billions),
  num_of_companies = n(),
  standard_deviation = sd(valuation_billions),
  avg_valuation_billions = mean(valuation_billions)
)

In [None]:
# Display the 'categorical_valuation' summary table (one row per category)
categorical_valuation

## Visualizations

In [None]:
# Plot 'categorical_valuation' as two stacked panels

# Top panel: total valuation per category as bars (x tick labels hidden)
fig1 <- categorical_valuation %>%
  plot_ly() %>%
  add_trace(
    data = categorical_valuation,
    x = ~category,
    y = ~valuation,
    name = "Valuation (Billions)",
    type = "bar"
  ) %>%
  layout(
    xaxis = list(showticklabels = FALSE),
    yaxis = list(showgrid = FALSE)
  )

# Bottom panel: number of companies per category as a line
fig2 <- categorical_valuation %>%
  plot_ly(width = 800, height = 600) %>%
  add_trace(
    data = categorical_valuation,
    x = ~category,
    y = ~num_of_companies,
    name = "Number of Companies",
    type = "scatter",
    mode = "lines"
  ) %>%
  layout(
    xaxis = list(showgrid = FALSE),
    yaxis = list(showgrid = FALSE)
  )

# Disabled draft of a third panel, kept for reference.
# NOTE(review): it relies on a "categorical_strength" column; confirm that
# column exists before re-enabling.
#fig3 <- categorical_valuation <- plot_ly(width = 1000, height = 800)

#fig3 <- fig3 %>% add_trace(data = categorical_valuation,,
#                           x = ~category,
#                           y = ~categorical_strength,
#                           name = 'Categorical Strength',
#                           type = 'scatter')

# Stack the two panels vertically and display the result
fig <- subplot(fig1, fig2, nrows = 2)

fig

In [None]:
# Scatter plot of each company: "age_years" (x) against "valuation_billions"
# (y), coloured by "over_5bill"
fig <- unicorns %>%
  plot_ly(width = 1000, height = 800) %>%
  add_trace(
    data = unicorns,
    x = ~age_years,
    y = ~valuation_billions,
    color = ~over_5bill,
    colors = "Set2",
    type = "scatter",
    mode = "markers"
  )

fig

In [None]:
# Scatter plot of each company: "num_of_investors" (x) against
# "valuation_billions" (y), with grid lines hidden on both axes
fig <- unicorns %>%
  plot_ly(width = 600, height = 400) %>%
  add_trace(
    data = unicorns,
    x = ~num_of_investors,
    y = ~valuation_billions,
    type = "scatter",
    mode = "markers"
  ) %>%
  layout(
    xaxis = list(showgrid = FALSE),
    yaxis = list(showgrid = FALSE)
  )

fig

In [None]:
# Distinct values of "num_of_investors", kept as a one-column table
unique(unicorns["num_of_investors"])

In [None]:
# Unicorns by "continent": mean valuation, total valuation, company count
unicorns_by_continent <- unicorns %>%
  group_by(continent) %>%
  summarise(
    # The mean must come first: the next line replaces "valuation_billions"
    # with its per-continent sum
    avg_valuation = mean(valuation_billions),
    valuation_billions = sum(valuation_billions),
    num_of_companies = n()
  )

In [None]:
# Show the per-continent average valuations as a plain vector
unicorns_by_continent[["avg_valuation"]]

In [None]:
# Plot the per-continent summary as two stacked panels

# Top panel: total valuation (bars) and number of companies (line)
fig1 <- unicorns_by_continent %>%
  plot_ly(width = 1000, height = 800) %>%
  add_trace(
    x = ~continent,
    y = ~valuation_billions,
    name = "Valuation by Continent",
    type = "bar"
  ) %>%
  add_trace(
    x = ~continent,
    y = ~num_of_companies,
    name = "Number of Companies",
    type = "scatter",
    mode = "lines"
  ) %>%
  layout(
    xaxis = list(showgrid = FALSE),
    yaxis = list(showgrid = FALSE)
  )

# Bottom panel: average valuation per continent (line)
fig2 <- unicorns_by_continent %>%
  plot_ly(width = 1000, height = 800) %>%
  add_trace(
    x = ~continent,
    y = ~avg_valuation,
    name = "Average Valuation in Billions",
    type = "scatter",
    mode = "lines"
  ) %>%
  layout(
    xaxis = list(showgrid = FALSE),
    yaxis = list(showgrid = FALSE)
  )

# Stack the two panels vertically and display the result
fig <- subplot(fig1, fig2, nrows = 2)

fig

In [None]:
# Scatter plot of each investor: "investment_count" (x) against
# "num_over_5bill" (y)
fig <- unicorn_investors %>%
  plot_ly(width = 700, height = 700) %>%
  add_trace(
    x = ~investment_count,
    y = ~num_over_5bill,
    type = "scatter",
    mode = "markers"
  )

fig

---

[Source](https://www.cbinsights.com/research-unicorn-companies) of dataset. 

## Don't know where to start?

**Challenges are brief tasks designed to help you practice specific skills:**

- 🗺️ **Explore**: Which investors are the most represented in the dataset?
- 📊 **Visualize**: Create a plot that visualizes the valuation of a company by the date it was added. Add annotations to outliers indicating the name of each company.
- 🔎 **Analyze**: Does the number of investors a company has correlate with its valuation?

**Scenarios are broader questions to help you develop an end-to-end project for your portfolio:**

You have been hired as a data scientist for a company that invests in start-ups. Your manager is interested in whether it is possible to predict whether a company reaches a valuation over 5 billion based on characteristics such as its country of origin, its category, and details about its investors.

Using the dataset provided, you have been asked to test whether such predictions are possible, and the confidence one can have in the results.

You will need to prepare a report that is accessible to a broad audience. It will need to outline your motivation, steps, findings, and conclusions.

---

✍️ _If you have an idea for an interesting Scenario or Challenge, or have feedback on our existing ones, let us know! You can submit feedback by pressing the question mark in the top right corner of the screen and selecting "Give Feedback". Include the phrase "Content Feedback" to help us flag it in our system._

<br></br>


In [None]:
# Convert every column to character before export.
# apply() works column by column (MARGIN = 2) on a matrix version of its
# input, so both names are overwritten with the converted result here.
cor_unicorns <- apply(cor_unicorns, MARGIN = 2, FUN = as.character)
unicorn_investors <- apply(unicorn_investors, MARGIN = 2, FUN = as.character)

# Write both converted tables to CSV without a row-name column
write.csv(cor_unicorns, file = "cor_unicorns.csv", row.names = FALSE)
write.csv(unicorn_investors, file = "unicorn_investors.csv", row.names = FALSE)

In [None]:
# First-rows previews of each table, stored as data frames
uni_head <- data.frame(head(unicorns, n = 5))
uni_inv_head <- data.frame(head(unicorn_investors, n = 5))
cor_uni_head <- data.frame(head(cor_unicorns, n = 5))
# Same preview with column 14 dropped. No n is given, so head() uses its
# default row count.
# NOTE(review): the other previews use n = 5; confirm the difference is
# intended.
cor_uni_head_ac <- data.frame(head(subset(cor_unicorns, select = -14)))

In [None]:
# Show only the first row of the "cor_uni_head_ac" preview
cor_uni_head_ac %>% head(n = 1)

In [None]:
# Convert each column of the "uni_head" preview to character, then write it
# to CSV without a row-name column
uni_head <- apply(uni_head, MARGIN = 2, FUN = as.character)
write.csv(uni_head, file = "uni_head.csv", row.names = FALSE)

In [None]:
# Convert each column of the "uni_inv_head" preview to character, then write
# it to CSV without a row-name column
uni_inv_head <- apply(uni_inv_head, MARGIN = 2, FUN = as.character)
write.csv(uni_inv_head, file = "uni_inv_head.csv", row.names = FALSE)

In [None]:
# Convert each column of the "cor_uni_head" preview to character, then write
# it to CSV without a row-name column
cor_uni_head <- apply(cor_uni_head, MARGIN = 2, FUN = as.character)
write.csv(cor_uni_head, file = "cor_uni_head.csv", row.names = FALSE)

In [None]:
# Convert each column of the "cor_uni_head_ac" preview to character, then
# write it to CSV without a row-name column
cor_uni_head_ac <- apply(cor_uni_head_ac, MARGIN = 2, FUN = as.character)
write.csv(cor_uni_head_ac, file = "cor_ac_uni_head.csv", row.names = FALSE)