In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the raw startup records into a DataFrame.
dataset = pd.read_csv('big_startup_success_dataset.csv')

# Show the first rows. A bare `dataset.head(5)` expression in the middle of a
# cell (or in a script) is evaluated and then discarded, so it displays
# nothing; printing makes the preview actually appear.
print(dataset.head(5))

# Print the distinct values of the two columns that drive the labelling
# below. Columns are picked by name rather than by skipping a hard-coded list
# of positional indices, so the selection stays correct if the column order
# changes and the intent is visible at a glance.
for column in ('status', 'funding_rounds'):
    print(f'Unique values in column "{column}":')
    print(dataset[column].unique())
    print('-' * 40)  # Separate columns for clarity
    
# Report the dtype pandas inferred for every column.
print("Data Types of Columns:")
column_data_types = dataset.dtypes
print(column_data_types)

# Count the missing values in every column and print one line per column.
# isna() and isnull() are aliases and behave the same for every dtype, so no
# per-dtype branching is needed; one vectorised call covers the whole frame.
null_counts = dataset.isna().sum()
for column, null_values in null_counts.items():
    print(f'Null values in column "{column}": {null_values}')
    print('-' * 40)  # Separate columns for clarity

# Replace missing entries in the descriptive text/date columns with the
# placeholder string "NULL".
# The filled result is assigned back instead of calling
# `dataset[col].fillna(..., inplace=True)`: that chained in-place form is
# deprecated by pandas and no longer updates the DataFrame once copy-on-write
# is active.
# NOTE: pandas.read_csv treats the literal text "NULL" as a missing value by
# default, so this placeholder turns back into NaN if the data is written to
# CSV and re-read without keep_default_na=False.
placeholder_columns = [
    'name',
    'homepage_url',
    'category_list',
    'country_code',
    'state_code',
    'region',
    'city',
    'founded_at',
    'first_funding_at',
]
dataset[placeholder_columns] = dataset[placeholder_columns].fillna('NULL')

# Convert 'funding_total_usd' to numeric (float) and replace missing amounts
# with 0. The column is loaded as object dtype, i.e. it holds non-numeric
# text; errors='coerce' turns such entries into NaN. Filling AFTER the
# conversion ensures those coerced values are zero-filled as well (filling
# first would leave them as NaN).
dataset['funding_total_usd'] = pd.to_numeric(
    dataset['funding_total_usd'], errors='coerce'
).fillna(0)

# Check the data type after conversion
print(dataset['funding_total_usd'].dtype)

# Define a function to classify startups as "success" or "fail"
def classify_startup(
    row,
    min_funding_usd=500000,
    min_funding_rounds=1,
    success_statuses=('operating', 'acquired', 'ipo'),
):
    """Label one startup record as 'success' or 'fail'.

    A record is a success only when all three conditions hold: its total
    funding reaches ``min_funding_usd``, it has at least
    ``min_funding_rounds`` funding rounds, and its status is one of
    ``success_statuses``. Anything else is a fail; this includes a NaN
    funding amount, because a comparison with NaN is False.

    Args:
        row: Mapping-like record (e.g. a DataFrame row or a dict) exposing
            the keys 'funding_total_usd', 'funding_rounds' and 'status'.
        min_funding_usd: Minimum total funding, inclusive.
        min_funding_rounds: Minimum number of funding rounds, inclusive.
        success_statuses: Status values that count as still successful.

    Returns:
        The string 'success' or 'fail'.
    """
    is_success = (
        row['funding_total_usd'] >= min_funding_usd
        and row['funding_rounds'] >= min_funding_rounds
        and row['status'] in success_statuses
    )
    return 'success' if is_success else 'fail'

# Apply the classification function to each row and create a new column "startup_success"
dataset['startup_success'] = dataset.apply(classify_startup, axis=1)

# Save the updated dataset to a new CSV file
dataset.to_csv('startup_success_dataset_with_labels.csv', index=False)

# Re-read the file that was just written.
labeled_dataset = pd.read_csv('startup_success_dataset_with_labels.csv')

# NOTE: the labels in `labeled_dataset` are the ones computed above, not an
# independent ground truth. The comparison below therefore only checks that
# the "startup_success" column survived the CSV round trip; it is NOT a model
# accuracy and will be 100% whenever the file was written and read correctly.
correct_predictions = (dataset['startup_success'] == labeled_dataset['startup_success']).sum()
total_predictions = len(labeled_dataset)

# The variable names are kept so any later cell that reads them still works;
# only the printed label is corrected to say what is actually measured.
accuracy = correct_predictions / total_predictions
print(f'Round-trip label agreement: {accuracy:.2%}')


Unique values in column "status":
['operating' 'acquired' 'closed' 'ipo']
----------------------------------------
Unique values in column "funding_rounds":
[ 1  2  4  3  9  5  6  7  8 10 11 12 15 13 14 17 18 16 19]
----------------------------------------
Data Types of Columns:
permalink            object
name                 object
homepage_url         object
category_list        object
funding_total_usd    object
status               object
country_code         object
state_code           object
region               object
city                 object
funding_rounds        int64
founded_at           object
first_funding_at     object
last_funding_at      object
dtype: object
Null values in column "permalink": 0
----------------------------------------
Null values in column "name": 1
----------------------------------------
Null values in column "homepage_url": 5058
----------------------------------------
Null values in column "category_list": 3148
-----------------------------------