# In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Read the startup dataset from its CSV file into a DataFrame
dataset = pd.read_csv(filepath_or_buffer='big_startup_success_dataset.csv')

# Define a function to classify startups as "success" or "fail"
def classify_startup(row):
    """Label one startup record as ``'success'`` or ``'fail'``.

    A record is a success when all of the following hold:

    * ``row['funding_total_usd']`` parses to a number >= 500000,
    * ``row['funding_rounds']`` is >= 1,
    * ``row['status']`` is one of ``'operating'``, ``'acquired'``, ``'ipo'``.

    Args:
        row: A mapping-like record (e.g. a DataFrame row passed by
            ``DataFrame.apply(..., axis=1)``) indexable by the three
            column names above.

    Returns:
        ``'success'`` or ``'fail'``. Records with a missing column, an
        unparseable funding amount, or values that cannot be compared are
        labelled ``'fail'``.
    """
    try:
        # Unparseable amounts (e.g. '-') become NaN; NaN >= 500000 is False,
        # so such records fall through to 'fail'.
        funding_total_usd = pd.to_numeric(row['funding_total_usd'], errors='coerce')
        is_success = (
            funding_total_usd >= 500000
            and row['funding_rounds'] >= 1
            and row['status'] in ('operating', 'acquired', 'ipo')
        )
    except (KeyError, TypeError, ValueError):
        # Only data problems (missing column, incomparable or ambiguous
        # values) map to 'fail'. Anything else, including KeyboardInterrupt,
        # propagates instead of being silently swallowed.
        return 'fail'
    return 'success' if is_success else 'fail'

# Apply the classification function to create the target variable
dataset['startup_success'] = dataset.apply(classify_startup, axis=1)

# Define the features (X) and target variable (y).
# The raw columns can hold non-numeric placeholders such as '-', which the
# classifier cannot convert to float. Coerce both feature columns to numbers
# and treat unparseable entries as 0, consistent with classify_startup
# labelling unparseable funding amounts as 'fail'.
feature_columns = ['funding_total_usd', 'funding_rounds']
X = dataset[feature_columns].apply(pd.to_numeric, errors='coerce').fillna(0)
y = dataset['startup_success']

# NOTE(review): y is derived from these same two columns plus 'status', so the
# accuracy below mostly measures how well the forest recovers that rule.

# Split the data into training and testing sets (80% / 20%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Random Forest classifier (fixed seed for reproducible results)
rf_classifier = RandomForestClassifier(random_state=42)

# Fit the classifier to the training data
rf_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = rf_classifier.predict(X_test)

# Evaluate the model's accuracy on the held-out test set
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2%}')


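# Recorded notebook output (raised while the feature columns still contained
# the raw string '-'):
# ValueError: could not convert string to float: '-'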