https://www.kaggle.com/datasets/justinas/startup-investments

# Mount Drive

In [None]:
from google.colab import drive

In [None]:
# Mount local google drive so this notebook can access it (have to give permission)
def mount_drive(drivename):
  """Mount Google Drive in the Colab runtime.

  Args:
    drivename: Mount point handed to `drive.mount` (e.g. '/content/drive').
  """
  drive.mount(drivename)

In [None]:
# Note: Everyone's local environment will need to follow this same directory hierarchy
drivename = '/content/drive'
kagglefile_directory = '/content/drive/MyDrive/CS573_DataMining_FinalProject/Data/KaggleFiles/'

mount_drive(drivename)

# Load Data / Data Preprocessing (part 1)

1. Read in data as dictionary of data frames
2. Replace all nan values with '' empty string

In [None]:
import os
import pandas as pd

In [None]:
# Replace all nan values with an empty string ''
  # Note: I think all the columns with nan values might contain non-numeric values, but I'm not 100% sure
  # TODO: Put this note in a comment near the main function
def replace_nan(df, replace_val):
    """Fill every missing (NaN) entry of `df` with `replace_val`, in place.

    The very same DataFrame object is handed back, so the call can also be
    used on the right-hand side of an assignment.
    """
    df.fillna(replace_val, inplace=True)
    return df

In [None]:
# Read in csv file as a pandas dataframe
def read_csv(filepath):
  """Load the csv at `filepath` as a DataFrame with missing values replaced by ''."""
  return replace_nan(pd.read_csv(filepath, engine="python"), '')

In [None]:
# Build a python dictionary of pandas dataframes (one df per csv file)
#   Keys are the csv filenames without extension, i.e. the tables (e.g. 'objects')
#   Each value is the corresponding pandas dataframe
#     Within this dataframe,
#     keys are the column names (e.g., domain)
#     values are the column values
data_dict = {}
for filename in os.listdir(kagglefile_directory):
    # Only csv files are tables; anything else in the folder (sub-folders, stray
    # files) would make read_csv fail or add a bogus entry
    key, extension = os.path.splitext(filename)
    if extension.lower() != '.csv':
        continue

    filepath = os.path.join(kagglefile_directory, filename)
    data_dict[key] = read_csv(filepath)

###  How to use the data dictionary of data frames:

In [None]:
# Data_dict is a dictionary of dataframes
print(data_dict.keys())
print(data_dict['objects'].keys())
print(data_dict['objects']['domain'][0])

In [None]:
# In each of the 11 tables, the 'id' column uniquely corresponds to the same company
  # The exception is the 'objects' file, where the column is titled 'entity_id'
print(data_dict['objects']['entity_id'][0])
print(data_dict['acquisitions']['id'][0])

In [None]:
# The 'objects.csv' file is the primary file
numcompanies = len(data_dict['objects']['id'])

# This is the number of companies with STATUS (our labels) data :
print(numcompanies)

In [None]:
# ...But not all companies listed in the 'objects' table are included across all files
print(len(data_dict['objects']['entity_id']))
print(len(data_dict['acquisitions']['id']))

# Sachit EDA

In [None]:
#EDA Sachit
#Acquisitions, IPO's, Investments
STARTUPS = (data_dict['objects'].query('entity_type == "Company" and status != "" and country_code != "CSS" and country_code != "FST"')
            .drop(columns='entity_id')
            .drop_duplicates())
a = pd.crosstab(STARTUPS['funding_rounds'], STARTUPS['status'])
#a <- table(STARTUPS$funding_rounds, STARTUPS$status)
#a <- cbind(a, rep(0, nrow(a)))
#a = pd.pivot_table(STARTUPS, index='funding_rounds', columns='status',
#                   aggfunc=len, fill_value=0, margins=True, margins_name='total of status')
#a.loc['total of funding rounds'] = a.sum(axis=0)

# Calculate the row and column totals
#a['tot_funding_rounds'] = a.sum(axis=1)
#a.loc['total of status'] = a.sum(axis=0)

# Remove the margin totals
#a = a.iloc[:-1, :-1]

# Reorder the columns
#a = a[['acquired', 'closed', 'ipo', 'operating', 'tot_funding_rounds']]

# Reorder the rows
#a = a.loc[list(range(1, 16)) + ['total of funding rounds']]

# Calculate the total of each column and add it as a row
#a.loc['total of status'] = a.sum(axis=0)

# Print the resulting table
print(a)

#No idea how to get rid of the first three tables.


In [None]:
import numpy as np
pd.set_option('display.float_format', lambda x: '%.3f' % x)
np.set_printoptions(precision=3, suppress=True)

# Create the summary table and round the values to 3 decimal places
#   The comparison needs its own parentheses: `&` binds tighter than `!=`, so
#   `a.notnull() & a != 0` is evaluated as `(a.notnull() & a) != 0` and never
#   filters out the zero funding totals.
funding_total = STARTUPS['funding_total_usd']
has_funding = funding_total.notnull() & (funding_total != 0)
funding_summary = round(funding_total[has_funding].describe(), 3)

# Convert the summary series to a DataFrame and transpose it
funding_summary_df = pd.DataFrame(funding_summary).T

# Display the summary table
print(funding_summary_df)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Count the rows of the acquisitions table per acquiring object
#   ('num_acquisizioni_effettuate' is Italian for "number of acquisitions made")
t = data_dict['acquisitions'].groupby('acquiring_object_id').size().reset_index(name='num_acquisizioni_effettuate')

# Left-merge with STARTUPS so that every startup is kept, then keep only the columns needed for the plot
t = pd.merge(STARTUPS, t, left_on='id', right_on='acquiring_object_id', how='left')[['id', 'num_acquisizioni_effettuate', 'status']]

# NOTE(review): the "replace missing values with 0" step is disabled, so startups
#   without a matching acquisitions row keep NaN here and presumably do not show up
#   in the plot below -- confirm this is intended
# (disabled) t['num_acquisizioni_effettuate'].fillna(0, inplace=True)
t['num_acquisizioni_effettuate_cat'] = pd.Categorical(t['num_acquisizioni_effettuate'])

# Count plot: one group of bars per number of acquisitions made, coloured by status
barp = sns.catplot(x='num_acquisizioni_effettuate_cat', kind='count', hue='status', data=t,
                   height=5, aspect=2, palette='muted')
barp.set(xlabel='Number of acquired companies', ylabel='Count')
plt.show()
# This is a great representation of the data, but I'm not able to set it up so that it has a decent zoom level or works on percentages instead.

In [None]:
data_dict['objects'].describe()

# Exploratory Data Analysis Shafkat

Milestone->degrees->relationship

In [None]:
print("Table: Milestone")
id = data_dict['milestones']['id']
print("Number of milestones data: ", len(id))
print("Table: degrees")
id = data_dict['degrees']['id']
print("Number of degrees data: ", len(id))
print("Table: Relationship")
id = data_dict['relationships']['id']
print("Number of relationships data: ", len(id))

In [None]:
df  = data_dict['milestones']

df.describe(include='all')

In [None]:
df  = data_dict['relationships']

df.describe(include='all')

In [None]:
df  = data_dict['degrees']

df.describe(include='all')

In [None]:
df  = data_dict['objects']

df.describe(include='all')

In [None]:
print(df.shape)
df2=df.dropna()
df2.shape

In [None]:
df2.head()

In [None]:
print(data_dict['objects'].keys())

In [None]:
print(data_dict['milestones'].keys())

In [None]:
from collections import Counter
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

def create_frequency_plot(column, column_name, top_number):
  """Draw a bar chart of the `top_number` most frequent values in `column`.

  Empty strings and the placeholders "unaffiliated" / "unknown" are filtered
  out with remove_values before counting. `column_name` is used for the
  x-axis label and the chart title.
  """
  # Strip the placeholder entries, one pass per placeholder
  for placeholder in ("", "unaffiliated", "unknown"):
    column = remove_values(column, placeholder)

  top_entries = Counter(column).most_common(top_number)
  labels = [entry[0] for entry in top_entries]
  frequencies = [entry[1] for entry in top_entries]

  # Wide canvas with vertical tick labels so that long category names stay readable
  figure(figsize=(30, 10), dpi=80)
  plt.rcParams.update({'font.size': 10})
  plt.xticks(rotation='vertical')

  chart_title = str(top_number) + " Most Common " + column_name
  plt.bar(labels, frequencies)
  plt.title(chart_title)
  plt.ylabel("Frequency", fontsize=12)
  plt.xlabel(column_name, fontsize=12)
  plt.show()

In [None]:
def remove_values(column, target_val):
  """Return the entries of `column` that do not match `target_val` (case-insensitive).

  Args:
    column: Any iterable of values (list, pandas Series, ...).
    target_val: String to drop; compared case-insensitively with string entries.

  Returns:
    A new list without the matching entries. Non-string entries can never match
    a string target, so they are kept.

  Also prints how many entries were removed (an empty target is reported as "NaN").
  """
  target_lower = target_val.lower()
  new_column = []
  count = 0
  # Iterate over the values themselves: `column[i]` is a label lookup on a pandas
  # Series and fails as soon as the index is not 0..n-1 (e.g. after filtering)
  for col_val in column:
    if isinstance(col_val, str) and col_val.lower() == target_lower:
      count += 1
    else:
      new_column.append(col_val)

  if target_val == "":
    target_val = "NaN"
  print("Number of ", target_val,  "values removed: ", count)
  return new_column

Analyze Milestones

In [None]:
print("Column: milestone_at")
milestone_at = data_dict['milestones']['milestone_at']
print("Number of unique values: ", len(set(milestone_at)))
create_frequency_plot(milestone_at, "milestone_at", 50)

In [None]:
print("Column: milestone_code")
milestone_code = data_dict['milestones']['milestone_code']
print("Number of unique values: ", len(set(milestone_code)))
# Plot the milestone_code column loaded above (not the milestone_at column)
create_frequency_plot(milestone_code, "milestone_code", 50)

In [None]:
print("Column: description")
description = data_dict['milestones']['description']
print("Number of unique values: ", len(set(description)))
create_frequency_plot(description, "description", 50)

In [None]:
print("Column: source_url")
source_url = data_dict['milestones']['source_url']
print("Number of unique values: ", len(set(source_url)))
create_frequency_plot(source_url, "source_url", 50)

In [None]:
# Header names the column that is actually analysed in this cell
print("Column: source_description")
source_description = data_dict['milestones']['source_description']
print("Number of unique values: ", len(set(source_description)))
create_frequency_plot(source_description, "source_description", 50)

In [None]:
print("Column: created_at")
created_at = data_dict['milestones']['created_at']
print("Number of unique values: ", len(set(created_at)))
create_frequency_plot(created_at, "created_at", 50)

In [None]:
print("Column: updated_at")
updated_at = data_dict['milestones']['updated_at']
print("Number of unique values: ", len(set(updated_at)))
create_frequency_plot(updated_at, "updated_at", 50)

Analyze Degrees

In [None]:
print(data_dict['degrees'].keys())

In [None]:
print("Column: updated_at")
updated_at = data_dict['degrees']['updated_at']
print("Number of unique values: ", len(set(updated_at)))
create_frequency_plot(updated_at, "updated_at", 50)

In [None]:
print("Column: created_at")
created_at = data_dict['degrees']['created_at']
print("Number of unique values: ", len(set(created_at)))
create_frequency_plot(created_at, "created_at", 50)

In [None]:
print("Column: graduated_at")
graduated_at = data_dict['degrees']['graduated_at']
print("Number of unique values: ", len(set(graduated_at)))
create_frequency_plot(graduated_at, "graduated_at", 50)

In [None]:
print("Column: institution")
institution = data_dict['degrees']['institution']
print("Number of unique values: ", len(set(institution)))
create_frequency_plot(institution, "institution", 50)

In [None]:
print("Column: subject")
subject = data_dict['degrees']['subject']
print("Number of unique values: ", len(set(subject)))
create_frequency_plot(subject, "subject", 50)

In [None]:
print("Column: degree_type")
degree_type = data_dict['degrees']['degree_type']
print("Number of unique values: ", len(set(degree_type)))
create_frequency_plot(degree_type, "degree_type", 50)

Analyze Relationship

In [None]:
print(data_dict['relationships'].keys())

In [None]:
print("Column:title")
title = data_dict['relationships']['title']
print("Number of unique values: ", len(set(title)))
# Label the chart with the column that is plotted ('title', not 'sequence')
create_frequency_plot(title, "title", 50)

In [None]:
print("Column: end_at")
end_at = data_dict['relationships']['end_at']
print("Number of unique values: ", len(set(end_at)))
create_frequency_plot(end_at, "end_at", 50)

In [None]:
print("Column: start_at")
start_at = data_dict['relationships']['start_at']
print("Number of unique values: ", len(set(start_at)))
create_frequency_plot(start_at, "start_at", 50)

# Nikhil

In [None]:
#Funding rounds and investments seems to have almost the same data, so I think I did funding rounds as well. Take a look at my tables (Sachit) and if you find it enough you can skip funding rounds.

funding_rounds, funds, *objects*

# EDA Caitlin

People, Offices

In [None]:
def count_values(column, target_val):
  """Print how many entries of `column` are equal to `target_val`.

  Args:
    column: Any iterable of values (list, pandas Series, ...).
    target_val: Value compared with `==` against every entry.
  """
  # Iterate over the values themselves: `column[i]` is a label lookup on a pandas
  # Series and fails as soon as the index is not 0..n-1 (e.g. after filtering)
  count = sum(1 for col_val in column if col_val == target_val)
  print("Number of ", target_val,  "values: ", count)

In [None]:
def remove_values(column, target_val):
  """Return the entries of `column` that do not match `target_val` (case-insensitive).

  Args:
    column: Any iterable of values (list, pandas Series, ...).
    target_val: String to drop; compared case-insensitively with string entries.

  Returns:
    A new list without the matching entries. Non-string entries can never match
    a string target, so they are kept.

  Also prints how many entries were removed (an empty target is reported as "NaN").
  """
  target_lower = target_val.lower()
  new_column = []
  count = 0
  # Iterate over the values themselves: `column[i]` is a label lookup on a pandas
  # Series and fails as soon as the index is not 0..n-1 (e.g. after filtering)
  for col_val in column:
    if isinstance(col_val, str) and col_val.lower() == target_lower:
      count += 1
    else:
      new_column.append(col_val)

  if target_val == "":
    target_val = "NaN"
  print("Number of ", target_val,  "values removed: ", count)
  return new_column

In [None]:
from collections import Counter
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

def create_frequency_plot(column, column_name, top_number):
  """Draw a bar chart of the `top_number` most frequent values in `column`.

  Empty strings and the placeholders "unaffiliated" / "unknown" are filtered
  out with remove_values before counting. `column_name` is used for the
  x-axis label and the chart title.
  """
  # Strip the placeholder entries, one pass per placeholder
  for placeholder in ("", "unaffiliated", "unknown"):
    column = remove_values(column, placeholder)

  top_entries = Counter(column).most_common(top_number)
  labels = [entry[0] for entry in top_entries]
  frequencies = [entry[1] for entry in top_entries]

  # Wide canvas with vertical tick labels so that long category names stay readable
  figure(figsize=(30, 10), dpi=80)
  plt.rcParams.update({'font.size': 10})
  plt.xticks(rotation='vertical')

  chart_title = str(top_number) + " Most Common " + column_name
  plt.bar(labels, frequencies)
  plt.title(chart_title)
  plt.ylabel("Frequency", fontsize=12)
  plt.xlabel(column_name, fontsize=12)
  plt.show()

In [None]:
print("Table: People")
id = data_dict['people']['id']
print("Number of people: ", len(id))

In [None]:
print("Column: Birthplaces")
birthplaces = data_dict['people']['birthplace']
print("Number of unique values: ", len(set(birthplaces)))
create_frequency_plot(birthplaces, "Birthplaces (People)", 50)

In [None]:
print("Column: Affiliation Name")
affiliation_name = data_dict['people']['affiliation_name']
print("Number of unique values: ", len(set(affiliation_name)))
create_frequency_plot(affiliation_name, "Affiliations (People)", 50)

In [None]:
print("Table: Offices")
id = data_dict['offices']['id']
print("Number of offices: ", len(id))

In [None]:
print("Column: Region")
region = data_dict['offices']['region']
print("Number of unique values: ", len(set(region)))
create_frequency_plot(region, "Office Regions", 50)

# Data Preprocessing (part 2)

(From Sachit)

Relevant features:
1. Degree Subject (CS, Law etc) -> Degrees
2. Market segment -> category_code in objects
3. Raised_amount_USD -> funding_rounds
4. IPO -> IPO’s to connect which companies did IPO

These 4 are the most logically relevant features and do not contain many garbage values. The rest are less relevant or contain more garbage values, but are listed because 4 features feels like too few:

5. Number of acquisitions made by the startup ->Acquisitions
6. Institution of degree -> Degrees
7. Funding total USD -> Objects, lots of 0.0


_____

(From Caitlin)

**NOTE:** The code below is for generating the old train/test splits that did not include the country_code and relationships features.<br>
The code for the latter is in my folder.<br>

The cleaned feature matrix is NxD.<br>
  N is the number of companies<br>
  D is the number of features<br>

Associated pickle files (in Data/TrainTestSplit/): train_data_2, train_labels_2, test_data_2, test_labels_2<br>

**Feature ordering** (index into feature matrix):
- 0-41: **category_code** (one-hot encoded) [objects]
- 42-206: **country_code** (one-hot encoded) [objects]
- 207: **funding_total_usd** (sum of previous raised_amount_usd feature) [objects]
- 208: **founded_at** (only the year) [objects]
- 209: **funding_rounds** (total number of funding rounds) [objects]<br>
- 210: **relationships** (total number of relationships) [objects]<br>
- 211: **acquisitions** (count of acquiring_object_id entries) [acquisitions]


Notes:
1. A one-hot encoding is applied to the original category_code and country_code features.
2. The founded_at feature only includes the year, not the month or day.
3. Funding rounds: The purpose of creating this feature is to differentiate between companies with \$0 for funding_total_usd due to having 0 funding rounds and those with one or more funding rounds that raised \$0.
<br>

**Feature matrix cleaning**:<br>
If one feature was in violation, I removed the entire company from the feature matrix.
1. category_code, country_code, founded_at: Removed if null
2. funding_total_usd: Removed if the company's funding total was zero AND the number of funding_rounds was greater than zero AND the first_funding_at entry was null

**Original Objects table**:<br>
N = 196,553 companies<br>
operating: 183441<br>
acquired: 9394<br>
closed: 2584<br>
ipo: 1134<br>

**Feature matrix** (post-cleaning):<br>
N = 64,010 companies<br>
operating: 57900<br>
acquired: 3898<br>
closed: 1601<br>
ipo: 611<br>


________

**INFORMATION ABOUT THE OLD PICKLE FILES**

(OLD) Associated pickle files (in Data/TrainTestSplit/): train_data, train_labels, test_data, test_labels<br>

(OLD) Feature ordering (index into feature matrix):
- 0-41: **category_code** (one-hot encoded) [objects]
- 42: **funding_total_usd** (sum of our old raised_amount_usd feature) [objects]
- 43: **founded_at** (only the year) [objects]
- 44: **total number of acquisitions** (count of acquiring_object_id entries) [acquisitions]
- 45: **total number of funding rounds** (count of object_id entries) [funding_rounds]<br>

(OLD) Feature matrix (post-cleaning):<br>
N = 88,214 companies<br>
operating: 81413<br>
acquired: 4258<br>
closed: 1919<br>
ipo: 624<br><br>

_________

Later:

More data processing:

Definitely: Normalize the numerical variables with respect to the training set

Maybe: Handle outliers in the funding_total_usd and raised_amount_usd features by removing rows with values outside mean ± 3*stdev (but this assumes a normal distribution).

____

Feature selection:<br>
It seems that funding_total_usd is just a linear combination of the raised_amount_usd feature, so we will probably want to drop the former feature.

Feature engineering:<br>
Technically, the total number of acquisitions and the total number of funding rounds are engineered features.

In [None]:
filelocation_featurematrix = '/content/drive/MyDrive/CS573_DataMining_FinalProject/Data/FeatureMatrix/'

## Construct the Feature Matrix

In [None]:
# Labels: Obtain ids for each company in the Objects table and their corresponding status
objects_table = data_dict['objects']
status_labels = list(objects_table.loc[objects_table['entity_type'] == "Company", 'status'].values)
status_company_ids = list(objects_table.loc[objects_table['entity_type'] == "Company", 'id'].values)

In [None]:
# Feature matrix is a list of lists
num_features = 5
feature_matrix = [[None] * num_features for i in range(len(status_labels))]

In [None]:
# Feature matrix: Incorporate Objects table features
objects_table = data_dict['objects']
category_codes = list(objects_table.loc[objects_table['entity_type'] == "Company", 'category_code'].values)
funding_total_usd = list(objects_table.loc[objects_table['entity_type'] == "Company", 'funding_total_usd'].values)
founded_at = list(objects_table.loc[objects_table['entity_type'] == "Company", 'founded_at'].values)

for i in range(len(feature_matrix)):
  feature_matrix[i][0] = category_codes[i]
  feature_matrix[i][1] = funding_total_usd[i]
  feature_matrix[i][2] = founded_at[i]

In [None]:
# Feature matrix: For each company with a status, store how often it appears as the acquirer
#   in the acquisitions table (feature column 3)
acquiring_objects_column = data_dict['acquisitions']['acquiring_object_id']
acquired_company_count = acquiring_objects_column.value_counts()
for row_idx, company_id in enumerate(status_company_ids):
  # Companies that never appear as an acquirer get a count of zero
  made_acquisitions = company_id in acquired_company_count
  feature_matrix[row_idx][3] = acquired_company_count[company_id] if made_acquisitions else 0

In [None]:
# Feature matrix:
  # For each company with a status,
  #   store the number of rows it has in the funding_rounds table (feature column 4)
funding_rounds_table = data_dict['funding_rounds']
funding_rounds_count = funding_rounds_table['object_id'].value_counts()
for row_idx, company_id in enumerate(status_company_ids):
  # Companies without any funding_rounds row get a count of zero
  has_rounds = company_id in funding_rounds_count
  feature_matrix[row_idx][4] = funding_rounds_count[company_id] if has_rounds else 0

In [None]:
# Save the uncleaned feature matrix
import pickle

with open(filelocation_featurematrix + 'feature_matrix_uncleaned.pkl', 'wb') as f:
  pickle.dump(feature_matrix, f)

## Clean the Feature Matrix

In [None]:
# Read in the uncleaned feature matrix
import pickle

with open(filelocation_featurematrix + 'feature_matrix_uncleaned.pkl', 'rb') as f:
  feature_matrix_uncleaned = pickle.load(f)

In [None]:
# Helper data structure:
# Map each company id to [funding_rounds, first_funding_at] taken from the Objects table
objects_table = data_dict['objects']
is_company = objects_table['entity_type'] == "Company"
objects_table_companies = objects_table.loc[is_company]

# Pull the three columns out once and walk them in lockstep
company_ids = objects_table_companies['id'].values
funding_round_counts = objects_table_companies['funding_rounds'].values
first_funding_dates = objects_table_companies['first_funding_at'].values

objects_dict = {
  cid: [num_rounds, first_date]
  for cid, num_rounds, first_date in zip(company_ids, funding_round_counts, first_funding_dates)
}

print(len(objects_table_companies))

In [None]:
import numpy as np
# A company is kept only if its features pass all the missing value checks;
#   otherwise the whole company is dropped from the cleaned feature matrix
#   (see note at top of Data Preprocessing (part 2) section for further details)
kept_indices = []
for i, features in enumerate(feature_matrix_uncleaned):

  company_id = status_company_ids[i]

  # Feature: Category Code (must not be missing)
  category_code = features[0]
  if category_code == '':
    continue

  # Feature: Founded At (must not be missing)
  founded_at = features[2]
  if founded_at == '':
    continue

  # Feature: Funding total USD
  #   A zero total is rejected when the company has funding rounds but no first funding date
  funding_total_usd = features[1]
  if funding_total_usd == 0 and objects_dict[company_id][0] > 0 and objects_dict[company_id][1] == '':
    continue

  kept_indices.append(i)

# Every row that was not kept is deleted from the matrix and from the label/id arrays
kept_lookup = set(kept_indices)
removal_indices = [idx for idx in range(0, len(feature_matrix_uncleaned)) if idx not in kept_lookup]
feature_matrix_cleaned = np.delete(feature_matrix_uncleaned, removal_indices, axis=0)
status_labels_cleaned = np.delete(status_labels, removal_indices, axis=0)
status_company_ids_cleaned = np.delete(status_company_ids, removal_indices, axis=0)

In [None]:
# Save the cleaned feature matrix and corresponding status-related arrays
with open(filelocation_featurematrix + 'feature_matrix_cleaned.pkl', 'wb') as f:
  pickle.dump(feature_matrix_cleaned, f)

with open(filelocation_featurematrix + 'status_labels_cleaned.pkl', 'wb') as f:
  pickle.dump(status_labels_cleaned, f)

with open(filelocation_featurematrix + 'status_company_ids_cleaned.pkl', 'wb') as f:
  pickle.dump(status_company_ids_cleaned, f)

## Further Data Preprocessing

In [None]:
# Read in the cleaned feature matrix and corresponding status-related arrays
import pickle

with open(filelocation_featurematrix + 'feature_matrix_cleaned.pkl', 'rb') as f:
  feature_matrix_cleaned = pickle.load(f)

with open(filelocation_featurematrix + 'status_labels_cleaned.pkl', 'rb') as f:
  status_labels_cleaned = pickle.load(f)

with open(filelocation_featurematrix + 'status_company_ids_cleaned.pkl', 'rb') as f:
  status_company_ids_cleaned = pickle.load(f)

In [None]:
# Strip the month and date from the founded_at feature, and convert it to an integer
for i in range(len(feature_matrix_cleaned)):
  date = feature_matrix_cleaned[i][2]
  year = date.split('-', 1)[0]
  feature_matrix_cleaned[i][2] = int(year)

In [None]:
# One hot encoding on category_code
from sklearn.preprocessing import OneHotEncoder

# Get all the category codes for all the startups
category_codes = np.empty((len(feature_matrix_cleaned), 1), dtype="object")
for i in range(len(feature_matrix_cleaned)):
  category_codes[i] = feature_matrix_cleaned[i][0]

# Create the one hot encoding of category code feature
ohencoder = OneHotEncoder()
oh_category_codes = ohencoder.fit_transform(category_codes).toarray()
print("Length of category code array: ", len(oh_category_codes[0]))

In [None]:
# Create new feature matrix with expanded category code
num_startups = len(feature_matrix_cleaned)
one_hot_length = len(oh_category_codes[0])
# Size the matrix from the cleaned matrix loaded in this section: the in-memory
#   `feature_matrix` only exists if the construction cells ran in the same session
num_features = (len(feature_matrix_cleaned[0]) - 1) + one_hot_length
print(num_startups)
print(num_features)
feature_matrix_final = np.zeros((num_startups, num_features))
for i in range(len(feature_matrix_final)):
  # Copy one-hot encoded category code feature vector and split into separate feature entries
  feature_matrix_final[i, 0:one_hot_length] = oh_category_codes[i]
  # Copy remaining features (everything after the raw category code) from the cleaned feature matrix
  feature_matrix_final[i, one_hot_length:] = feature_matrix_cleaned[i][1:]

In [None]:
# Save the final feature matrix and corresponding status-related arrays
with open(filelocation_featurematrix + 'feature_matrix_final.pkl', 'wb') as f:
  pickle.dump(feature_matrix_final, f)

with open(filelocation_featurematrix + 'status_labels_final.pkl', 'wb') as f:
  pickle.dump(status_labels_cleaned, f)

with open(filelocation_featurematrix + 'status_company_ids_final.pkl', 'wb') as f:
  pickle.dump(status_company_ids_cleaned, f)

# Feature Selection

It seems that funding_total_usd is just a linear combination of the raised_amount_usd feature, so we will probably want to drop the former feature.



# Feature Engineering

# Split Data into Train/Test
### (Stratified Random Sampling)

**Training data**: train_data, train_labels<br>
**Test data**: test_data, test_labels<br>

In [None]:
filelocation_featurematrix = '/content/drive/MyDrive/CS573_DataMining_FinalProject/Data/FeatureMatrix/'
filelocation_traintestsplit = '/content/drive/MyDrive/CS573_DataMining_FinalProject/Data/TrainTestSplit/'

In [None]:
# Read in final startup data (features and labels)
import pickle

with open(filelocation_featurematrix + 'feature_matrix_final.pkl', 'rb') as f:
  feature_matrix_final = pickle.load(f)

with open(filelocation_featurematrix + 'status_labels_final.pkl', 'rb') as f:
  status_labels_final = pickle.load(f)

In [None]:
# Print the counts of each status label category
def print_label_value_counts(status_labels):
  """Print the number of occurrences of each distinct status label, then the total.

  Args:
    status_labels: Sequence or array of labels accepted by `np.unique`.
  """
  # np.unique returns the sorted distinct labels together with their counts,
  # so the labels do not have to be rescanned once per class
  unique_status, status_counts = np.unique(status_labels, return_counts=True)
  for status, status_count in zip(unique_status, status_counts):
    print("Count for ", status, ": ", int(status_count))

  print("Total label count: ", int(status_counts.sum()))

In [None]:
# Print status label counts for the final dataset
print_label_value_counts(status_labels_final)

In [None]:
from sklearn.model_selection import train_test_split
import math

In [None]:
# Train/test split ratios
ratio_train = 0.8
ratio_test = 0.2

# Seed for random generator
seed = 42

In [None]:
number_of_startups = len(status_labels_final)
startup_ids = list(range(number_of_startups))

# Size of the test split, rounded up to a whole number of startups
number_of_test = math.ceil(number_of_startups*ratio_test)

# Get idxs for train/test split (stratified on the status labels to resolve label class imbalances)
train_ids, test_ids = train_test_split(
    startup_ids,
    test_size=number_of_test,
    random_state=seed,
    stratify=status_labels_final,
)

In [None]:
# Form train/test splits from the sampled row indices
train_data = feature_matrix_final[train_ids]
train_labels = status_labels_final[train_ids]

test_data = feature_matrix_final[test_ids]
test_labels = status_labels_final[test_ids]

In [None]:
print("Total number of startups: ", number_of_startups)

# Print status label counts for the splits
print("\nTrain labels:")
print_label_value_counts(train_labels)
print("Percentage of total startups: ", len(train_labels)/number_of_startups)

print("\nTest labels:")
print_label_value_counts(test_labels)
print("Percentage of total startups: ", len(test_labels)/number_of_startups)

In [None]:
# Save the training and test feature and label matrices/vectors
with open(filelocation_traintestsplit + 'train_data.pkl', 'wb') as f:
  pickle.dump(train_data, f)

with open(filelocation_traintestsplit + 'train_labels.pkl', 'wb') as f:
  pickle.dump(train_labels, f)

with open(filelocation_traintestsplit + 'test_data.pkl', 'wb') as f:
  pickle.dump(test_data, f)

with open(filelocation_traintestsplit + 'test_labels.pkl', 'wb') as f:
  pickle.dump(test_labels, f)

# Data Pre-processing (Part 3)

Normalize training and test (non-categorical) data

In [None]:
filelocation_traintestsplit = '/content/drive/MyDrive/CS573_DataMining_FinalProject/Data/TrainTestSplit/'

In [None]:
# Read in training and test feature and label matrices/vectors
import pickle

with open(filelocation_traintestsplit + 'train_data.pkl', 'rb') as f:
  train_data = pickle.load(f)

with open(filelocation_traintestsplit + 'test_data.pkl', 'rb') as f:
  test_data = pickle.load(f)

In [None]:
import copy

# Normalize train and test data according to train mean and std (non-categorical features)
#   Columns 42-45 are the numeric features that follow the 42 one-hot category_code columns
#   (see the (OLD) feature ordering notes in Data Preprocessing (part 2))
numeric_cols = slice(42, 46)
train_mean = np.mean(train_data[:, numeric_cols], axis=0)
train_std = np.std(train_data[:, numeric_cols], axis=0)
# A column that is constant in the training set has std 0; scale it by 1 instead
#   so the division does not produce NaN/inf values
train_std = np.where(train_std == 0, 1.0, train_std)

train_data_normalized = copy.copy(train_data)
test_data_normalized = copy.copy(test_data)

# The test data is scaled with the TRAINING statistics so no test information leaks in
train_data_normalized[:, numeric_cols] = (train_data[:, numeric_cols] - train_mean) / train_std
test_data_normalized[:, numeric_cols] = (test_data[:, numeric_cols] - train_mean) / train_std

In [None]:
# Save the normalized training and test feature vectors
with open(filelocation_traintestsplit + 'train_data.pkl', 'wb') as f:
  pickle.dump(train_data_normalized, f)

with open(filelocation_traintestsplit + 'test_data.pkl', 'wb') as f:
  pickle.dump(test_data_normalized, f)

# Logistic Regression

In [None]:
%cd '/content/drive/MyDrive/CS573_DataMining_FinalProject/Data/TrainTestSplit'


In [None]:
import pickle
with open('test_data_5.pkl', 'rb') as f:
    test_dataS = pickle.load(f)

with open('test_labels_5.pkl', 'rb') as f:
    test_labelsS = pickle.load(f)

with open('train_data_5.pkl', 'rb') as f:
    train_dataS = pickle.load(f)

with open('train_labels_5.pkl', 'rb') as f:
    train_labelsS = pickle.load(f)


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score

# Baseline: L2-regularised logistic regression, C tuned by grid search, evaluated on the test split

# create a logistic regression model
# NOTE(review): no max_iter is passed here, so the search runs with the library default,
#   while the final model below uses max_iter = 80000 -- confirm the search fits converge
model = LogisticRegression(solver='lbfgs')

# define the hyperparameters of interest and their ranges
param_grid = {'C': [0.01, 0.1, 1.0, 10.0],
              'penalty': ['l2']}

# perform a grid search with 10-fold cross validation
grid_search = GridSearchCV(model, param_grid, cv=10)
grid_search.fit(train_dataS, train_labelsS)

# get the best hyperparameters also printing them so you guys can check this makes sense
best_params = grid_search.best_params_
print('Best hyperparameters:', best_params)

# train the final model with the best hyperparameters based on the meeting discussion
final_model = LogisticRegression(solver='lbfgs', **best_params, max_iter = 80000)
final_model.fit(train_dataS, train_labelsS)

# make predictions on the test data
test_predictions = final_model.predict(test_dataS)

# evaluate the performance on the test set
#   (macro averaging: every class contributes equally to f1/precision/recall)
accuracy = accuracy_score(test_labelsS, test_predictions)
f1 = f1_score(test_labelsS, test_predictions, average='macro')
precision = precision_score(test_labelsS, test_predictions, average='macro')
recall = recall_score(test_labelsS, test_predictions, average='macro')
conf_mat = confusion_matrix(test_labelsS, test_predictions)
print('Accuracy:', accuracy)
print('F1 score:', f1)
print('Precision:', precision)
print('Recall:', recall)
print('Confusion matrix:')
print(conf_mat)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
import numpy as np

# Logistic regression with hand-picked class weights to counter the label imbalance

# create a logistic regression model
model = LogisticRegression(solver='lbfgs', multi_class='auto')

# define the hyperparameters of interest and their ranges
#   Only 'l2' is searched: the lbfgs solver does not support the 'l1' penalty, so every
#   'l1' candidate would fail to fit and could never be selected
param_grid = {'C': [0.01, 0.1, 1.0, 10.0],
              'penalty': ['l2']}

# define class weights
# NOTE(review): the keys must match the label values in train_labelsS -- confirm the
#   labels of this split are encoded as 0..3
class_weights = {0: 1, 1: 2, 2: 5, 3: 10}

# perform a grid search with 10-fold cross validation
# NOTE(review): the search is run WITHOUT the class weights that the final model uses --
#   confirm this is intended
grid_search = GridSearchCV(model, param_grid, cv=10)
grid_search.fit(train_dataS, train_labelsS)

# get the best hyperparameters
best_params = grid_search.best_params_
print('Best hyperparameters:', best_params)

# train the final model with the best hyperparameters and class weights
final_model = LogisticRegression(solver='lbfgs', multi_class='auto', class_weight=class_weights, **best_params, max_iter = 80000)
final_model.fit(train_dataS, train_labelsS)

# make predictions on the test data
test_predictions = final_model.predict(test_dataS)

# evaluate the performance on the test set
#   (macro averaging: every class contributes equally to f1/precision/recall)
accuracy = accuracy_score(test_labelsS, test_predictions)
f1 = f1_score(test_labelsS, test_predictions, average='macro')
precision = precision_score(test_labelsS, test_predictions, average='macro')
recall = recall_score(test_labelsS, test_predictions, average='macro')
conf_mat = confusion_matrix(test_labelsS, test_predictions)
print('Accuracy:', accuracy)
print('F1 score:', f1)
print('Precision:', precision)
print('Recall:', recall)
print('Confusion matrix:')
print(conf_mat)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from imblearn.under_sampling import RandomUnderSampler

# Logistic regression on an undersampled training split: tune C with 10-fold
# cross validation, refit with the best setting, and report test metrics.

# create a logistic regression model; same iteration budget as the final model
# so the search and the final fit use an identical estimator configuration
model = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=80000)

# define the hyperparameters of interest and their ranges
# (the lbfgs solver does not support an l1 penalty, so only l2 is searched;
#  every l1 candidate would fail to fit)
param_grid = {'C': [0.01, 0.1, 1.0, 10.0],
              'penalty': ['l2']}

# randomly undersample the majority class
# (seeded so the selected rows, and therefore the reported metrics, are reproducible)
sampler = RandomUnderSampler(sampling_strategy='majority', random_state=1)
train_data_resampled, train_labels_resampled = sampler.fit_resample(train_dataS, train_labelsS)

# perform a grid search with 10-fold cross validation
grid_search = GridSearchCV(model, param_grid, cv=10)
grid_search.fit(train_data_resampled, train_labels_resampled)

# get the best hyperparameters
best_params = grid_search.best_params_
print('Best hyperparameters:', best_params)

# train the final model with the best hyperparameters on the undersampled training data
final_model = LogisticRegression(solver='lbfgs', multi_class='auto', **best_params, max_iter = 80000)
final_model.fit(train_data_resampled, train_labels_resampled)

# make predictions on the (untouched, not resampled) test data
test_predictions = final_model.predict(test_dataS)

# evaluate the performance on the test set
# (macro averaging: every class contributes equally to F1/precision/recall)
accuracy = accuracy_score(test_labelsS, test_predictions)
f1 = f1_score(test_labelsS, test_predictions, average='macro')
precision = precision_score(test_labelsS, test_predictions, average='macro')
recall = recall_score(test_labelsS, test_predictions, average='macro')
conf_mat = confusion_matrix(test_labelsS, test_predictions)
print('Accuracy:', accuracy)
print('F1 score:', f1)
print('Precision:', precision)
print('Recall:', recall)
print('Confusion matrix:')
print(conf_mat)

# Shafkat - PCA, normalization, clustering, multilayer perceptron

In [None]:
import pickle


def _read_pickle(path):
    # Deserialize and return the single object stored in the pickle file at `path`.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Load the test/train features and labels from the *_5.pkl files
# (relative paths, so they are resolved against the working directory).
test_data = _read_pickle('test_data_5.pkl')
test_labels = _read_pickle('test_labels_5.pkl')
train_data = _read_pickle('train_data_5.pkl')
train_labels = _read_pickle('train_labels_5.pkl')

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# Randomly drop rows of the majority class from the training split.
# The sampler is seeded so that the same rows are kept on every run; without a
# seed the models trained afterwards see a different sample each time.
# NOTE: this overwrites train_data / train_labels in place, so re-running the
# cell undersamples the already-undersampled data again.
sampler = RandomUnderSampler(sampling_strategy='majority', random_state=1)
train_data, train_labels = sampler.fit_resample(train_data, train_labels)

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification

# Train a multi-layer perceptron (default architecture, at most 100 iterations)
# on train_data / train_labels.
# The estimator is fitted exactly once: calling fit() a second time on this
# estimator retrains it from scratch with the same seed, which doubles the
# runtime and yields the same model.
clf = MLPClassifier(random_state=1, max_iter=100)
#clf = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(500, 2), random_state=1)
clf.fit(train_data, train_labels)

In [None]:
# Mean accuracy of the fitted classifier on the test split (shown as the cell output)
clf.score(test_data, test_labels)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score

# Predict the test split with the fitted MLP (clf)
test_predictions = clf.predict(test_data)

# Macro averaging: every class contributes equally to F1/precision/recall
macro_avg = {'average': 'macro'}
accuracy = accuracy_score(test_labels, test_predictions)
f1 = f1_score(test_labels, test_predictions, **macro_avg)
precision = precision_score(test_labels, test_predictions, **macro_avg)
recall = recall_score(test_labels, test_predictions, **macro_avg)
conf_mat = confusion_matrix(test_labels, test_predictions)

# Print one "name value" line per scalar metric, then the confusion matrix
for heading, score in (('Accuracy:', accuracy), ('F1 score:', f1),
                       ('Precision:', precision), ('Recall:', recall)):
    print(heading, score)
print('Confusion matrix:', conf_mat, sep='\n')

In [None]:
# Display train_data as the cell output for a quick visual inspection
train_data

In [None]:
# Keep every row but only the columns from index 42 onward.
# NOTE(review): presumably the first 42 columns are encoded categorical features
# and the remainder are the four numeric columns named in the next cell —
# confirm against the preprocessing that produced train_data.
train1= train_data[:,42:]

In [None]:
# Column names for train1, in column order
column_values = [
    'funding_total_usd',
    'founded_at',
    'total_number_of_acquisitions',
    'total_number_of_funding_rounds',
]

# Wrap train1 in a labelled DataFrame so the columns can be addressed by name
df = pd.DataFrame(train1, columns=column_values)

In [None]:
# Add the training labels to df as a new 'labels' column.
# NOTE(review): assumes train_labels has one entry per row of df, in the same
# row order (and, if it is a pandas Series, a matching index) — confirm.
df['labels'] = train_labels

In [None]:
# List the distinct values present in the 'labels' column
df['labels'].unique()

In [None]:
# Preview the first rows of df
df.head()

In [None]:
# Pairwise correlation between the columns of df.
# NOTE(review): if the 'labels' column holds non-numeric values, recent pandas
# versions raise here unless numeric_only=True is passed — confirm the dtype.
df.corr()

In [None]:
from sklearn import preprocessing

# Map each distinct label to an integer code and store the codes in a new
# 'categorical_label' column; `le` stays fitted for later use.
le = preprocessing.LabelEncoder()
df['categorical_label'] = le.fit_transform(df['labels'])

In [None]:
# One semi-transparent histogram of founded_at per label value
df.groupby('labels')['founded_at'].hist(alpha=0.4)

In [None]:
import matplotlib.pyplot as plt

# One semi-transparent histogram of funding_total_usd per label value
df.groupby('labels')['funding_total_usd'].hist(alpha=0.4)
plt.title("Funding Total USD vs. Frequency", color='red')
# NOTE(review): legend entries are assigned in plotting order, which presumably
# follows the sorted label values — confirm that order matches these names.
plt.legend(["Operating", "Acquired", "IPO", "Closed"])

In [None]:
# One semi-transparent histogram of total_number_of_acquisitions per label value
df.groupby('labels')['total_number_of_acquisitions'].hist(alpha=0.4)

In [None]:
# One semi-transparent histogram of total_number_of_funding_rounds per label value
df.groupby('labels')['total_number_of_funding_rounds'].hist(alpha=0.4)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
%matplotlib inline
customcmap = ListedColormap(["crimson", "mediumblue", "darkmagenta", "yellow"])

fig, ax = plt.subplots(figsize=(8, 6))
plt.scatter(x=df['founded_at'], y=df['total_number_of_funding_rounds'], s=20000,
            c=df['categorical_label'].astype('category'),
            cmap = customcmap)
ax.set_xlabel(r'x', fontsize=14)
ax.set_ylabel(r'y', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
%matplotlib inline
customcmap = ListedColormap(["crimson", "mediumblue", "darkmagenta", "yellow"])

fig, ax = plt.subplots(figsize=(8, 6))
plt.scatter(x=df['funding_total_usd'], y=df['total_number_of_funding_rounds'], s=20000,
            c=df['categorical_label'].astype('category'),
            cmap = customcmap)
ax.set_xlabel(r'x', fontsize=14)
ax.set_ylabel(r'y', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
%matplotlib inline
customcmap = ListedColormap(["crimson", "mediumblue", "darkmagenta", "yellow"])

fig, ax = plt.subplots(figsize=(8, 6))
plt.scatter(x=df['total number of acquisitions'], y=df['total_number_of_funding_rounds'], s=20000,
            c=df['categorical_label'].astype('category'),
            cmap = customcmap)
ax.set_xlabel(r'x', fontsize=14)
ax.set_ylabel(r'y', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

#XGBoost