In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import copy
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
# Load the raw data once and work on an independent copy,
# so `startups` stays untouched by the transformations below.
startups = pd.read_csv("/content/50_StartUp.csv")
df = startups.copy(deep=True)

In [None]:
# Preview the first rows of the working copy.
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [None]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [None]:
# (number of rows, number of columns)
df.shape

(50, 5)

In [None]:
# Summary statistics; with default arguments only the numeric columns are described.
df.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
count,50.0,50.0,50.0,50.0
mean,73721.6156,121344.6396,211025.0978,112012.6392
std,45902.256482,28017.802755,122290.310726,40306.180338
min,0.0,51283.14,0.0,14681.4
25%,39936.37,103730.875,129300.1325,90138.9025
50%,73051.08,122699.795,212716.24,107978.19
75%,101602.8,144842.18,299469.085,139765.9775
max,165349.2,182645.56,471784.1,192261.83


In [None]:
# Count missing values per column.
df.isna().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [None]:
# Column labels of the working copy.
df.columns

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'State', 'Profit'], dtype='object')

In [None]:
# One-hot encode the categorical State column into State_<name> columns.
# dtype=int pins the encoded values to 0/1 integers; without it the dtype
# depends on the pandas version (pandas >= 2.0 returns booleans).
# NOTE(review): all three state columns are kept (no drop_first), so together
# with the model's intercept they are linearly dependent and the individual
# state coefficients are not uniquely determined. Later cells look up all
# three columns by name, so the encoding is left complete here.
dfDummies = pd.get_dummies(df["State"], prefix="State", dtype=int)

In [None]:
# Append the one-hot State columns to the working frame.
# Dummy columns left over from an earlier run of this cell are removed first,
# so re-running the cell does not produce duplicated State_* columns.
df = pd.concat(
    [df.drop(columns=dfDummies.columns, errors="ignore"), dfDummies],
    axis=1,
)

In [None]:
# Remove the original text State column now that it has been one-hot encoded.
# errors="ignore" keeps the cell safe to re-run once "State" is already gone
# (a plain drop would raise KeyError on the second run).
df = df.drop(columns=["State"], errors="ignore")
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_California,State_Florida,State_New York
0,165349.2,136897.8,471784.1,192261.83,0,0,1
1,162597.7,151377.59,443898.53,191792.06,1,0,0
2,153441.51,101145.55,407934.54,191050.39,0,1,0
3,144372.41,118671.85,383199.62,182901.99,0,0,1
4,142107.34,91391.77,366168.42,166187.94,0,1,0


In [None]:
# Target: the Profit column. Features: every remaining column.
y = df["Profit"]
x = df.drop(columns="Profit")

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=35,
)

In [None]:
# Ordinary least-squares regression fitted on the training split.
my_model = LinearRegression()
my_model.fit(X=x_train, y=y_train)

In [None]:
# Predict Profit for the held-out rows.
# NOTE(review): y_pred is not referenced by any later cell visible here, and no
# evaluation metric is computed against y_test - confirm whether that is intended.
y_pred = my_model.predict(x_test)

In [None]:
# Keep the fitted parameters under short names:
# m holds the per-feature coefficients, b the intercept.
m, b = my_model.coef_, my_model.intercept_

In [None]:
# Print each fitted parameter followed by a separator rule.
separator = "="*50
for label, value in (
    ('Intercept of the model:\n', b),
    ('Coefficient of the line:\n', m),
):
    print(label, value)
    print(separator)

Intercept of the model:
 48671.0246154623
Coefficient of the line:
 [ 8.32481595e-01 -2.06061406e-02  2.58634933e-02 -9.35515441e+02
  3.09155827e+03 -2.15604283e+03]


In [None]:
# Pair every fitted coefficient with the name of the feature it belongs to,
# so coefficients are looked up by label rather than by position.
coef_series = pd.Series(my_model.coef_, index=my_model.feature_names_in_)

# Spend coefficients
rd_spend_coef = coef_series.loc['R&D Spend']
administration_spend_coef = coef_series.loc['Administration']
marketing_spend_coef = coef_series.loc['Marketing Spend']

# One-hot state coefficients
california_coef = coef_series.loc['State_California']
florida_coef = coef_series.loc['State_Florida']
newyork_coef = coef_series.loc['State_New York']

In [None]:
def _ask_float(prompt):
    """Prompt repeatedly until the reply parses as a finite float, then return it."""
    import math  # local import: only this helper needs it

    while True:
        reply = input(prompt)
        try:
            value = float(reply)
        except ValueError:
            print(f"{reply!r} is not a number - please try again.")
            continue
        if math.isfinite(value):
            return value
        print(f"{reply!r} is not a finite number - please try again.")


def _ask_flag(prompt):
    """Prompt repeatedly until the reply is 0 or 1, then return it as a float."""
    while True:
        value = _ask_float(prompt)
        if value in (0.0, 1.0):
            return value
        print("Please answer 0 or 1.")


# Collect one startup's feature values from the user.
# Values are parsed and validated here, so a typo is caught at the prompt
# instead of failing later inside the profit formula. They are stored as
# floats, which the downstream float(...) conversions accept unchanged.
rd_spend = _ask_float('Enter R&D Spend ')
administration_spend = _ask_float('Enter Administration Spend ')
marketing_spend = _ask_float('Enter Marketing Spend ')
is_california = _ask_flag('Is California ? 0 or 1 ? ')
is_florida = _ask_flag('Is Florida ? 0 or 1 ?')
is_newyork = _ask_flag('Is NewYork ? 0 or 1 ?')

# The state columns are a one-hot encoding, so exactly one flag must be set;
# any other combination does not correspond to a state and would yield a
# meaningless prediction.
if is_california + is_florida + is_newyork != 1:
    raise ValueError(
        "Exactly one of California / Florida / NewYork must be 1 and the others 0."
    )

Enter R&D Spend 144372.41
Enter Administration Spend 118671.85
Enter Marketing Spend 383199.62
Is California ? 0 or 1 0
Is Florida ? 0 or 1 0
Is NewYork ? 0 or 1 1


In [None]:
# Evaluate the fitted linear model by hand:
#   profit = intercept + sum(coefficient * feature value)
# The terms are added in the order listed below.
weighted_inputs = (
    (rd_spend_coef, rd_spend),
    (administration_spend_coef, administration_spend),
    (marketing_spend_coef, marketing_spend),
    (california_coef, is_california),
    (florida_coef, is_florida),
    (newyork_coef, is_newyork),
)

profit = b
for coefficient, raw_value in weighted_inputs:
    profit = profit + (coefficient * float(raw_value))

print(profit)



174167.86799100385
