In [38]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import tensorflow as tf

# Load your dataset
data = pd.read_csv("startupdata.csv")


In [39]:
# Ensure the columns are numeric to avoid type errors
cols_to_check = ['Competitive_Density', 'Market_Size_Growth', 'Market_Adoption_Rate', 
                 'Debt_Equity_Ratio', 'Burn_Rate', 'Founder_Experience', 
                 'Team_Experience', 'Product_Differentiation']

# Convert columns to numeric, coercing errors to NaN
data[cols_to_check] = data[cols_to_check].apply(pd.to_numeric, errors='coerce')

# Handle any NaN values (e.g., fill with 0 or a meaningful placeholder)
data.fillna(0, inplace=True)

# Financial Risk: Higher Debt_Equity_Ratio and Burn_Rate suggest higher financial risk
y_financial = (data['Debt_Equity_Ratio'] + data['Burn_Rate']) / 2

# Market Risk: Higher Competitive_Density and lower Market_Size_Growth & Market_Adoption_Rate suggest higher market risk
y_market = (data['Competitive_Density'] - data['Market_Size_Growth'] - data['Market_Adoption_Rate']) / 3

# Operational Risk: Lower experience and education levels with low product differentiation and outdated tech stack increase operational risk
y_operational = (1 / (data['Founder_Experience'] + data['Team_Experience'] + 1)) * (1 - data['Product_Differentiation'])

# Combine into a target DataFrame
y = pd.DataFrame({
    'Financial_Risk': y_financial,
    'Market_Risk': y_market,
    'Operational_Risk': y_operational
})


In [40]:
features = [
    'Location', 'Initial_Funding', 'Funding_Rounds', 'Revenue_Growth_Rate',
    'Profit_Margin', 'Annual_Revenue', 'Burn_Rate', 'Market_Size_Growth',
    'Competitive_Density', 'Market_Adoption_Rate', 'Founder_Experience',
    'Team_Experience', 'Education_Level', 'Product_Differentiation'
]
X = data[features]


In [41]:
# Check for missing values in the dataset
print(data.isnull().sum())

# Fill missing values with 0 or a meaningful value (e.g., mean/median) for each column
data.fillna(0, inplace=True)


Startup_ID                   0
Industry                     0
Location                     0
Founding_Year                0
Initial_Funding              0
Funding_Rounds               0
Funding_Type                 0
Revenue_Growth_Rate          0
Profit_Margin                0
Annual_Revenue               0
Burn_Rate                    0
Valuation                    0
Debt_Equity_Ratio            0
Market_Size_Growth           0
Competitive_Density          0
Market_Adoption_Rate         0
Partnerships                 0
Founder_Experience           0
Team_Experience              0
Education_Level              0
Product_Differentiation      0
Technology_Stack             0
Success_Probability          0
Projected_Turnover_Year_1    0
Projected_Turnover_Year_2    0
Projected_Turnover_Year_3    0
Projected_Turnover_Year_4    0
Projected_Turnover_Year_5    0
dtype: int64


In [42]:
# One-hot encode the 'Location' column, dropping the first category to avoid multicollinearity
X = pd.get_dummies(X, columns=['Location'], drop_first=True)


In [43]:
# Check for missing values in the dataset
if X.isnull().sum().any() or y.isnull().sum().any():
    print("There are missing values. Handling them now...")
    X = X.fillna(X.mean())  # Fill missing values with the mean of each column
    y = y.fillna(y.mean())  # Fill missing target values with the mean


In [44]:
from sklearn.preprocessing import StandardScaler

# Separate numerical features from categorical ones
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
X_numerical = X[numerical_features]

# Initialize the scaler
scaler = StandardScaler()

# Apply scaling to numerical features only
X_scaled = scaler.fit_transform(X_numerical)

# Replace the scaled numerical columns in the original dataset
X[numerical_features] = X_scaled


In [45]:
# Check the result after preprocessing
print(X.head())


   Initial_Funding  Funding_Rounds  Revenue_Growth_Rate  Profit_Margin  \
0        -1.621894        0.007056             1.490205       0.353526   
1        -0.928268       -1.404139             1.606459      -0.098731   
2         2.708215       -0.698541            -1.328942       0.318737   
3        -0.272903        0.712653            -1.212689      -1.142401   
4         0.736672       -1.404139             0.589241      -0.098731   

   Annual_Revenue  Burn_Rate  Market_Size_Growth  Competitive_Density  \
0        0.613396   0.593864            1.378207             1.135965   
1       -0.034664   1.487922            1.196497            -0.015870   
2       -3.418261  -1.641283           -0.438889             1.135965   
3        0.566436  -1.730689           -0.984018            -0.015870   
4       -2.037537   0.951487            0.651368            -0.783760   

   Market_Adoption_Rate  Founder_Experience  Team_Experience Education_Level  \
0              1.598708             

In [46]:
categorical_columns = X.select_dtypes(include=['object']).columns  # Select categorical (non-numerical) columns
X = pd.get_dummies(X, columns=categorical_columns, drop_first=True)  # One-hot encoding


In [47]:

X = X.astype(np.float32)  # Convert X to float32
y = y.astype(np.float32)  # Convert y to float32 (for multi-output regr

In [48]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [49]:
import tensorflow as tf
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(3, activation='sigmoid')  # Three outputs for each risk type
])

# Compile the model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])  # Using mean squared error for continuous risk scores


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [50]:
import numpy as np
X = np.array(X)  # Ensure X is a numpy array after scaling

# Now proceed with splitting data and training the model
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build and compile the model
import tensorflow as tf
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(3, activation='sigmoid')  # Three outputs for each risk type
])

model.compile(optimizer='adam', loss='mse', metrics=['mae'])  # Using mean squared e

In [51]:
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

Epoch 1/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - loss: 0.5750 - mae: 0.5307 - val_loss: 0.3710 - val_mae: 0.3918
Epoch 2/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.3601 - mae: 0.3832 - val_loss: 0.3553 - val_mae: 0.3693
Epoch 3/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.3558 - mae: 0.3737 - val_loss: 0.3365 - val_mae: 0.3480
Epoch 4/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.3319 - mae: 0.3478 - val_loss: 0.3223 - val_mae: 0.3263
Epoch 5/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.3222 - mae: 0.3311 - val_loss: 0.3201 - val_mae: 0.3194
Epoch 6/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.3169 - mae: 0.3244 - val_loss: 0.3189 - val_mae: 0.3158
Epoch 7/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.3197 -

In [52]:
test_loss, test_mae = model.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss}, Test MAE: {test_mae}")


[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.3304 - mae: 0.3085  
Test Loss: 0.33456364274024963, Test MAE: 0.31424272060394287


In [53]:
predictions = model.predict(X_test)
financial_risk, market_risk, operational_risk = predictions[:, 0], predictions[:, 1], predictions[:, 2]

# Convert to risk percentages if needed
financial_risk_percent = financial_risk * 100
market_risk_percent = market_risk * 100
operational_risk_percent = operational_risk * 100


[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


In [54]:
results = pd.DataFrame({
    'Predicted Financial Risk (%)': financial_risk_percent,
    'Predicted Market Risk (%)': market_risk_percent,
    'Predicted Operational Risk (%)': operational_risk_percent,
    'Actual Financial Risk (%)': y_test['Financial_Risk'].values * 100,
    'Actual Market Risk (%)': y_test['Market_Risk'].values * 100,
    'Actual Operational Risk (%)': y_test['Operational_Risk'].values * 100
})
print(results.head())


   Predicted Financial Risk (%)  Predicted Market Risk (%)  \
0                     62.817703                  99.588776   
1                     38.116810                  99.999985   
2                     29.406239                  99.999962   
3                     43.059654                  99.999931   
4                     62.609501                  37.679657   

   Predicted Operational Risk (%)  Actual Financial Risk (%)  \
0                        0.584043                  60.500000   
1                        1.031462                  30.000002   
2                        9.966451                  76.000000   
3                       11.894053                  42.500000   
4                        2.639676                  58.499996   

   Actual Market Risk (%)  Actual Operational Risk (%)  
0              124.000000                          0.0  
1              217.666672                          0.0  
2              227.666656                         10.0  
3             

In [58]:
# Debugging: Check the structure of y_test and predictions
print("y_test shape:", y_test.shape)
print("predictions shape:", predictions.shape)
print("y_test type:", type(y_test))
print("predictions type:", type(predictions))

# If y_test is a DataFrame, inspect the column names
if isinstance(y_test, pd.DataFrame):
    print("y_test columns:", y_test.columns)


y_test shape: (600, 3)
predictions shape: (600, 3)
y_test type: <class 'pandas.core.frame.DataFrame'>
predictions type: <class 'numpy.ndarray'>
y_test columns: Index(['Financial_Risk', 'Market_Risk', 'Operational_Risk'], dtype='object')


In [59]:

from sklearn.metrics import accuracy_score

# Step 1: Make predictions on the test set
predictions = model.predict(X_test)

# Step 2: Define a threshold for each risk type (e.g., 0.5 for simplicity)
threshold = 0.5

# Convert predictions into binary outcomes (high risk = 1, low risk = 0)
predicted_risk = (predictions >= threshold).astype(int)

# Step 3: Ensure y_test is in the right format (convert it to a NumPy array)
y_test = y_test.values  # Convert y_test to NumPy array

# Convert y_test to binary outcomes (high risk = 1, low risk = 0)
actual_risk = (y_test >= threshold).astype(int)

# Step 4: Create a DataFrame with predictions and actual values for comparison
financial_risk_percent = predictions[:, 0] * 100  # Convert to percentage
market_risk_percent = predictions[:, 1] * 100
operational_risk_percent = predictions[:, 2] * 100

results = pd.DataFrame({
    'Predicted Financial Risk (%)': financial_risk_percent,
    'Predicted Market Risk (%)': market_risk_percent,
    'Predicted Operational Risk (%)': operational_risk_percent,
    'Actual Financial Risk (%)': y_test[:, 0] * 100,  # y_test is assumed to be in the same shape as predictions
    'Actual Market Risk (%)': y_test[:, 1] * 100,
    'Actual Operational Risk (%)': y_test[:, 2] * 100
})

# Step 5: Calculate accuracy for each risk type
accuracy_financial = accuracy_score(actual_risk[:, 0], predicted_risk[:, 0])
accuracy_market = accuracy_score(actual_risk[:, 1], predicted_risk[:, 1])
accuracy_operational = accuracy_score(actual_risk[:, 2], predicted_risk[:, 2])

# Step 6: Print the results and accuracy for each risk type
print(results.head())  # Preview of the predictions vs actual values
print(f"Accuracy for Financial Risk: {accuracy_financial:.4f}")
print(f"Accuracy for Market Risk: {accuracy_market:.4f}")
print(f"Accuracy for Operational Risk: {accuracy_operational:.4f}")


[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
   Predicted Financial Risk (%)  Predicted Market Risk (%)  \
0                     62.817703                  99.588776   
1                     38.116810                  99.999985   
2                     29.406239                  99.999962   
3                     43.059654                  99.999931   
4                     62.609501                  37.679657   

   Predicted Operational Risk (%)  Actual Financial Risk (%)  \
0                        0.584043                  60.500000   
1                        1.031462                  30.000002   
2                        9.966451                  76.000000   
3                       11.894053                  42.500000   
4                        2.639676                  58.499996   

   Actual Market Risk (%)  Actual Operational Risk (%)  
0              124.000000                          0.0  
1              217.666672                          0.0

In [61]:
# Save the trained model to a file
model.save('risk-assessment-model.h5')


