In [10]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [12]:
data = pd.read_csv('Startups.csv')

In [13]:
# One-hot encoding for the 'State' column
data = pd.get_dummies(data, columns=['State'], drop_first=True)

In [14]:
# Ensure all columns are numeric
data = data.apply(pd.to_numeric, errors='coerce')

In [15]:
# Check for and handle any missing values
data = data.dropna()

In [16]:
# Define the independent variables (X) and the dependent variable (y)
X = data.drop('Profit', axis=1)
y = data['Profit']

In [17]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
# Add a constant to the model (intercept)
X_train = sm.add_constant(X_train)
X_test = sm.add_constant(X_test)

In [21]:
# Ensure the training and testing data are numpy arrays
X_train = np.asarray(X_train).astype(float)
X_test = np.asarray(X_test).astype(float)
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

In [34]:
# Fit the multiple linear regression model
model = sm.OLS(y_train, X_train).fit()

In [23]:
# Summary of the model
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.954
Model:                            OLS   Adj. R-squared:                  0.947
Method:                 Least Squares   F-statistic:                     140.1
Date:                Sun, 26 May 2024   Prob (F-statistic):           1.13e-21
Time:                        08:16:33   Log-Likelihood:                -420.63
No. Observations:                  40   AIC:                             853.3
Df Residuals:                      34   BIC:                             863.4
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       5.403e+04   8457.293      6.388      0.0

In [24]:
# Predicting the Test set results
y_pred = model.predict(X_test)

In [25]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print('Mean Squared Error:', mse)
print('R-squared:', r2)



Mean Squared Error: 82010363.03969769
R-squared: 0.8987266414385483


In [26]:

# Predicting profit for a new startup
new_startup = pd.DataFrame({
    'R&D Spend': [150000],
    'Administration': [120000],
    'Marketing Spend': [300000],
    'State_Florida': [0],
    'State_New York': [1]
})

In [35]:
new_startup['State_California'] = 0
new_startup = sm.add_constant(new_startup)
predicted_profit = model.predict(new_startup)
print('Predicted Profit for the new startup:', predicted_profit[0])

Predicted Profit for the new startup: 8104282917.144777
