In [None]:
# Notebook setup: imports, plotting backend selection, and global configuration.
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation and data-quality warnings from
# pandas / scikit-learn — confirm a blanket 'ignore' is really wanted.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib
# Select the non-interactive Agg backend; this call is placed before the
# pyplot import below, so statement order matters here.
matplotlib.use('Agg') 
import matplotlib.pyplot as plt
# Agg is selected a second time, and then the IPython magic asks for inline
# rendering.
# NOTE(review): presumably the inline magic is what actually takes effect in a
# notebook kernel, which would make both Agg calls redundant — confirm before
# removing them.
plt.switch_backend('Agg')  
%matplotlib inline

import seaborn as sns
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, roc_auc_score

# Seed constant for reproducible results.
RANDOM_STATE = 42
# Apply seaborn's 'whitegrid' style to the plots drawn afterwards.
sns.set(style='whitegrid')

In [None]:
# Load the raw startup dataset from the Kaggle input directory.
df = pd.read_csv(
    '/kaggle/input/startup-failure-prediction-dataset/download.csv',
    delimiter=',',
    encoding='ascii',
)

# Report the size and the column names of the frame that was just loaded.
print(f'Dataset shape: {df.shape}')
print(f'Columns: {df.columns.tolist()}')

In [None]:
# Quick inspection of the raw data, printed one after another:
#   1. the first few rows,
#   2. summary statistics for the numeric columns,
#   3. the count of missing values in each column.
print(df.head(), df.describe(), df.isna().sum(), sep='\n')

In [None]:
# Name of the label column the model will predict.
target = 'Startup_Status'

# Categorical columns to one-hot encode; any that are absent from the loaded
# frame are dropped from the list so get_dummies does not fail on them.
categorical_features = [
    name
    for name in ('Industry', 'Market_Size', 'Business_Model')
    if name in df.columns
]

# One-hot encode, dropping the first level of each categorical column.
df_encoded = pd.get_dummies(df, drop_first=True, columns=categorical_features)

# Every numeric column other than the target is a candidate model input.
model_features = [
    name
    for name, dtype in df_encoded.dtypes.items()
    if name != target and pd.api.types.is_numeric_dtype(dtype)
]

# Fill missing values in the model inputs with 0.
df_encoded[model_features] = df_encoded[model_features].fillna(0)

print(f'Encoded dataset shape: {df_encoded.shape}')

In [None]:
# ---------------------------------------------------------------------------
# Train and evaluate a RandomForest classifier on the encoded startup data.
#
# Steps: select numeric predictors -> stratified train/test split -> upsample
# the minority class in the TRAINING partition only -> fit -> report accuracy,
# confusion matrix, ROC curve (binary targets only) and feature importances.
#
# Reads:   df_encoded, target, model_features, RANDOM_STATE (earlier cells).
# Defines: features, X, y, X_train, X_test, y_train, y_test, df_balanced,
#          rf_model, y_pred, accuracy, cm, importances, indices.
# ---------------------------------------------------------------------------

# Use only the numeric predictors collected in `model_features`: scikit-learn's
# forest cannot consume raw strings, so a leftover text column would make
# fit() raise. 'Startup_Name' is excluded explicitly as an identifier.
features = [col for col in model_features if col != 'Startup_Name']
X = df_encoded[features]
y = df_encoded[target]

# Split BEFORE balancing. Upsampling first would place copies of the same
# minority row in both partitions, letting the model be scored on rows it has
# memorised and inflating every metric reported below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# Balance the training partition only; the test set keeps the real class mix.
train_df = pd.concat([X_train, y_train], axis=1)
majority_label = y_train.mode()[0]
majority_class = train_df[train_df[target] == majority_label]
minority_class = train_df[train_df[target] != majority_label]

if minority_class.empty:
    # Only one class present: there is nothing to upsample, and resample()
    # would raise on an empty frame.
    df_balanced = train_df
else:
    minority_upsampled = resample(
        minority_class,
        replace=True,
        n_samples=len(majority_class),
        random_state=RANDOM_STATE,
    )
    # Shuffle so the upsampled rows are not grouped at the end.
    df_balanced = pd.concat([majority_class, minority_upsampled]).sample(
        frac=1, random_state=RANDOM_STATE
    )

X_train = df_balanced[features]
y_train = df_balanced[target]

print("y_test class distribution:")
print(y_test.value_counts())

rf_model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Prediction Accuracy: {accuracy:.2f}")

# Pin the label order so the heatmap ticks show the actual class labels.
class_labels = list(rf_model.classes_)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
plt.figure(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# A ROC curve is only defined for a binary problem, and it needs both classes
# to be present in the evaluation labels.
if len(class_labels) == 2 and y_test.nunique() == 2:
    # Prefer label 1 as the positive class; otherwise use the greater label,
    # which is the one scikit-learn's binary metrics treat as positive.
    pos_label = 1 if 1 in rf_model.classes_ else class_labels[1]
    pos_index = class_labels.index(pos_label)
    y_proba = rf_model.predict_proba(X_test)[:, pos_index]
    # Pass pos_label explicitly so labels other than {0, 1} / {-1, 1} work,
    # and score the AUC against the same positive class as the probabilities.
    fpr, tpr, thresholds = roc_curve(y_test, y_proba, pos_label=pos_label)
    auc_score = roc_auc_score((y_test == pos_label).astype(int), y_proba)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'ROC Curve (area = {auc_score:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc='lower right')
    plt.show()
else:
    print('ROC curve skipped: it requires exactly two classes in both the trained model and y_test.')

# Horizontal bar chart of feature importances, sorted ascending so the most
# important feature is drawn at the top.
importances = rf_model.feature_importances_
indices = np.argsort(importances)

plt.figure(figsize=(10, 8))
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.title('Feature Importances from RandomForestClassifier')
plt.show()

In [None]:
# Exploratory plots on the raw (un-encoded) frame. Each plot is guarded so a
# missing column produces a message instead of aborting the whole cell.

# --- Correlation heatmap: drawn only when there are at least 4 numeric columns.
numeric_df = df.select_dtypes(include=[np.number])
if numeric_df.shape[1] >= 4:
    plt.figure(figsize=(10, 8))
    corr = numeric_df.corr()
    sns.heatmap(corr, annot=True, cmap='coolwarm')
    plt.title('Correlation Heatmap of Numeric Features')
    plt.show()
else:
    print('Not enough numeric columns for a correlation heatmap.')

# --- Histogram of startup age. Guarded like the pair plot below: indexing
# df['Startup_Age'] directly would raise KeyError if the column is absent.
if 'Startup_Age' in df.columns:
    plt.figure(figsize=(8, 6))
    sns.histplot(df['Startup_Age'], kde=True, bins=20, color='purple')
    plt.title('Distribution of Startup Age')
    plt.xlabel('Startup Age (years)')
    plt.ylabel('Frequency')
    plt.show()
else:
    print('Startup_Age column is missing; skipping the age distribution plot.')

# --- Pair plot of selected features, drawn only when all of them are present.
selected_features = ['Startup_Age', 'Funding_Amount', 'Employees_Count', 'Revenue']
if all(feat in df.columns for feat in selected_features):
    sns.pairplot(df[selected_features])
    # y > 1 lifts the title above the top row of subplots.
    plt.suptitle('Pair Plot of Selected Features', y=1.02)
    plt.show()
else:
    print('Some selected features for pair plot are missing.')

In [None]:
# Print the headline classification metrics for the test-set predictions,
# each scaled to a 0-100 range and shown with two decimals.
for metric_name, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1", f1_score),
):
    print(f"{metric_name}: {metric_fn(y_test, y_pred) * 100:.2f}")