# Startup Success Prediction - Analysis & Model Building

This notebook performs Exploratory Data Analysis (EDA) on the startup dataset, trains two regression models (linear regression and random forest) to predict profit, and saves a trained model for later use.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
import pickle
import os

## 1. Load Dataset

In [None]:
# Location of the raw CSV, relative to the notebook's working directory.
dataset_path = '../dataset/startup_data.csv'

# Read the whole file into a DataFrame; later cells refer to it as `df`.
df = pd.read_csv(dataset_path)

# Show the first rows as the cell output for a quick sanity check.
df.head()

## 2. Exploratory Data Analysis (EDA)

In [None]:
# Print the frame's structure (columns, dtypes, non-null counts) to stdout.
df.info()

# Compute descriptive statistics and leave them as the cell's final
# expression so the notebook renders them as a table.
summary_stats = df.describe()
summary_stats

In [None]:
# Draw pairwise relationships between the columns of `df` in a grid of plots.
pair_grid = sns.pairplot(data=df)

# Render the figure explicitly so no object repr is echoed as cell output.
plt.show()

## 3. Preprocessing

In [None]:
# Encoding Categorical Data (State)
# One-hot encode the 'State' column, dropping the first level so the dummy
# columns are not perfectly collinear.
#
# The result is assigned back to `df`, so after the first run the 'State'
# column no longer exists. The guard below makes the cell safe to re-run:
# without it a second execution raises KeyError from get_dummies.
if 'State' in df.columns:
    df = pd.get_dummies(df, columns=['State'], drop_first=True)

# Preview the encoded frame.
df.head()

## 4. Model Building

In [None]:
# Target is the 'Profit' column; every remaining column is used as a feature.
y = df['Profit']
X = df.drop(columns='Profit')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit an ordinary least-squares baseline (fit() returns the estimator itself).
lr = LinearRegression().fit(X_train, y_train)

# Score the baseline on the held-out rows.
y_pred_lr = lr.predict(X_test)
lr_r2 = r2_score(y_test, y_pred_lr)
print(f"Linear Regression R2 Score: {lr_r2}")

In [None]:
# Fit a 100-tree random forest on the same training split; the fixed seed
# makes the fitted ensemble reproducible (fit() returns the estimator itself).
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the forest on the held-out rows.
y_pred_rf = rf.predict(X_test)
rf_r2 = r2_score(y_test, y_pred_rf)
print(f"Random Forest R2 Score: {rf_r2}")

## 5. Save Best Model

In [None]:
# Determine the best model by comparing held-out R2 scores instead of
# assuming the random forest wins. Ties go to the random forest.
# NOTE(review): the choice is made on the same test split used for reporting,
# so the reported score of the winner is slightly optimistic; use a separate
# validation split or cross-validation if an unbiased estimate is needed.
score_lr = r2_score(y_test, y_pred_lr)
score_rf = r2_score(y_test, y_pred_rf)
best_model = rf if score_rf >= score_lr else lr
print(f"Selected model: {type(best_model).__name__}")

# Create the output directory if needed; exist_ok avoids the separate
# exists-then-create check and makes the cell safe to re-run.
os.makedirs('../models', exist_ok=True)

# Persist the chosen estimator.
with open('../models/model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

# Persist the feature column names (in training order) alongside the model so
# that code loading the model can build inputs with the same column layout.
with open('../models/columns.pkl', 'wb') as f:
    pickle.dump(X.columns.tolist(), f)