***Name: SADDAM HUSSAIN***

***REG ID: GO_STP_6834***

---



---



# Predicting a Startup's Profit/Success Rate Using Multiple Linear Regression in Python

This dataset describes 50 startups and contains 5 columns: “R&D Spend”, “Administration”, “Marketing Spend”, “State”, and “Profit”.

The first three columns give each startup's spending on research and development, administration, and marketing, respectively. State indicates the state in which the startup is based. Profit indicates the profit earned by the startup.

Clearly, this is a multiple linear regression problem, because there is more than one independent variable.

Prepare a prediction model for the profit in the 50_Startups data using Python.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the 50-startups dataset into a DataFrame.
# NOTE(review): '/content/...' looks like a Google Colab upload location —
# adjust the path when running elsewhere.
df = pd.read_csv('/content/50_Startups.csv')

In [None]:
# Preview the first five rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [None]:
# Dimensions of the frame as (rows, columns).
df.shape

(50, 5)

In [None]:
# Column dtypes and non-null counts; a quick check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [None]:
# Summary statistics for the numeric columns.
df.describe()
# 'R&D Spend' and 'Marketing Spend' both have a minimum of 0, so these columns contain zero entries.

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
count,50.0,50.0,50.0,50.0
mean,73721.6156,121344.6396,211025.0978,112012.6392
std,45902.256482,28017.802755,122290.310726,40306.180338
min,0.0,51283.14,0.0,14681.4
25%,39936.37,103730.875,129300.1325,90138.9025
50%,73051.08,122699.795,212716.24,107978.19
75%,101602.8,144842.18,299469.085,139765.9775
max,165349.2,182645.56,471784.1,192261.83


In [None]:
# Treat a spend of exactly 0 as a missing value so it can be imputed.
# Only the two columns whose minimum was 0 are touched: a frame-wide
# replace(0, np.nan) would also blank legitimate zeros in any other column,
# including the Profit target.
zero_as_missing = ['R&D Spend', 'Marketing Spend']
df[zero_as_missing] = df[zero_as_missing].replace(0, np.nan)

In [None]:
# Re-inspect the non-null counts to see how many values are now NaN.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        48 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  47 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [None]:
# Replace the NaN values with the mean of the respective column.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the df[...] selection: that chained in-place form is
# deprecated in pandas 2.x and does not update df under Copy-on-Write.
# NOTE(review): the means are computed over all rows, including rows that are
# later held out as the test set.
df['R&D Spend'] = df['R&D Spend'].fillna(df['R&D Spend'].mean())

df['Marketing Spend'] = df['Marketing Spend'].fillna(df['Marketing Spend'].mean())

In [None]:
# Check the non-null counts again after the mean imputation.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [None]:
# Summary statistics again, to see how the imputation changed the spend columns.
df.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
count,50.0,50.0,50.0,50.0
mean,76793.349583,121344.6396,224494.784894,112012.6392
std,43312.151465,28017.802755,109792.846033,40306.180338
min,542.05,51283.14,1903.93,14681.4
25%,46117.0325,103730.875,142431.385,90138.9025
50%,75791.365,122699.795,224494.784894,107978.19
75%,101602.8,144842.18,299469.085,139765.9775
max,165349.2,182645.56,471784.1,192261.83


In [None]:
# Number of startups per state (distribution of the categorical column).
df['State'].value_counts()

California    17
New York      17
Florida       16
Name: State, dtype: int64

In [None]:
# 'State' is categorical and must be converted to a numeric feature.
#
# Plain label encoding (A -> 1, B -> 2, C -> 3) is avoided because it would
# impose an artificial ordering on the states.
#
# One-hot encoding (pd.get_dummies(df['State']) joined back onto df) is the
# usual alternative, but it adds one column per state; target encoding is used
# here instead so that the state stays a single feature.
#
# Target encoding: replace each state with the mean Profit of that state.
# NOTE(review): the means are computed from all rows, including rows that later
# end up in the test set, so the encoded feature leaks the target — consider
# fitting the encoding on the training split only.
targ_en_mean_Val = df.groupby('State')['Profit'].mean()

# Map every row's state to its mean profit directly (no merge / '_x', '_y'
# suffix rename round-trip). The text column is dropped first and the encoded
# one appended last, which keeps the column order
# R&D Spend, Administration, Marketing Spend, Profit, State.
df = df.drop(columns='State').assign(
    State=df['State'].map(targ_en_mean_Val)
)

In [None]:
# Preview the frame after encoding; 'State' now holds a numeric value per row.
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State
0,165349.2,136897.8,471784.1,192261.83,113756.446471
1,162597.7,151377.59,443898.53,191792.06,103905.175294
2,153441.51,101145.55,407934.54,191050.39,118774.024375
3,144372.41,118671.85,383199.62,182901.99,113756.446471
4,142107.34,91391.77,366168.42,166187.94,118774.024375


In [None]:
# Separate the features (every column except 'Profit') from the target.
X = df.drop(columns='Profit')
y = df.loc[:, 'Profit']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 15% of the rows for testing; random_state pins the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=5,
)

In [None]:
# Shape of the training features as (rows, columns).
X_train.shape

(42, 4)

# Model Creation

In [None]:
import sklearn
from sklearn.linear_model import LinearRegression

In [None]:
# Create a linear regression estimator with its default settings.
lm = LinearRegression()

In [None]:
# Fit the regression on the training split.
# NOTE(review): scikit-learn's fit() is documented to return the estimator
# itself, so `model` and `lm` presumably refer to the same object.
model = lm.fit(X_train, y_train)

In [None]:
# Predict Profit for the held-out test features.
y_pred = model.predict(X_test)

In [None]:
# Fitted intercept (constant term) of the regression.
model.intercept_

-10853.114917760715

In [None]:
# Fitted coefficients; presumably one per feature column, in the column order
# of X_train.
model.coef_

array([0.67491189, 0.01103958, 0.07921964, 0.45254324])

In [None]:
# Side-by-side view of the test set: the actual Profit values (y_test) form
# the index and the predictions are the single unnamed column 0.
comp = pd.DataFrame(data=y_pred, index=y_test)
comp

Unnamed: 0_level_0,0
Profit,Unnamed: 1_level_1
71498.49,64910.501808
101004.64,95081.387383
156122.51,138764.738166
122776.86,118431.405861
103282.38,98852.210497
125370.37,128498.86898
191050.39,179889.79828
69758.98,55312.848002


# Error

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# For getting error and other functionalities
import sklearn.metrics as metrics

In [None]:
# R^2 (coefficient of determination): the share of the variance in the
# dependent variable that is explained by the independent variables.
# R^2 = 1 - (RSS / TSS)
metrics.r2_score(y_test, y_pred)

0.9358107537794443

In [None]:
# Mean absolute error: the average absolute difference between actual and
# predicted Profit on the test split.
metrics.mean_absolute_error(y_test,y_pred)

8422.482373036242

In [None]:
# Mean squared error on the test split, kept in `mse` and printed.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(mse)

95165646.20627034


In [None]:
# RMSE: square root of the mean squared error, expressed in the same units as
# the target (Profit).
np.sqrt(metrics.mean_squared_error(y_test,y_pred))

# It is roughly the standard deviation of the residuals; the lower, the better.

# On average, each prediction deviates from the actual value (y_test) by about this much.

9755.288114980016