# Startup  company profit prediction

#### Import libraries for Data Preprocessing

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

####  Import the Dataset

In [2]:
dataset=pd.read_csv(r"C:\Users\USER\1pythonfiles\datasets\50_Startups.csv")

In [3]:
dataset

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94
5,131876.9,99814.71,362861.36,New York,156991.12
6,134615.46,147198.87,127716.82,California,156122.51
7,130298.13,145530.06,323876.68,Florida,155752.6
8,120542.52,148718.95,311613.29,New York,152211.77
9,123334.88,108679.17,304981.62,California,149759.96


#### Handle Missing Values

In [4]:
dataset.shape

(50, 5)

In [5]:
dataset.isnull().any()

R&D Spend           True
Administration     False
Marketing Spend     True
State               True
Profit             False
dtype: bool

In [6]:
#checking sum of null values
dataset.isnull().sum()

R&D Spend          1
Administration     0
Marketing Spend    1
State              1
Profit             0
dtype: int64

In [7]:
dataset.shape

(50, 5)

#### Fill the Missing Values

In [8]:
dataset.mean()

R&D Spend           74641.163673
Administration     121344.639600
Marketing Spend    210654.978367
Profit             112012.639200
dtype: float64

In [9]:
dataset.fillna(dataset.mean(),inplace=True)
dataset.fillna(dataset["State"].mode()[0],inplace=True)

In [10]:
dataset.isnull().any()

R&D Spend          False
Administration     False
Marketing Spend    False
State              False
Profit             False
dtype: bool

####  Encoding Categorical text data

In [11]:
dataset

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94
5,131876.9,99814.71,362861.36,New York,156991.12
6,134615.46,147198.87,127716.82,California,156122.51
7,130298.13,145530.06,323876.68,Florida,155752.6
8,120542.52,148718.95,311613.29,New York,152211.77
9,123334.88,108679.17,304981.62,California,149759.96


In [12]:
dataset=pd.get_dummies(dataset,columns=['State'])

In [13]:
dataset

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_California,State_Florida,State_New York
0,165349.2,136897.8,471784.1,192261.83,0,0,1
1,162597.7,151377.59,443898.53,191792.06,1,0,0
2,153441.51,101145.55,407934.54,191050.39,0,1,0
3,144372.41,118671.85,383199.62,182901.99,0,0,1
4,142107.34,91391.77,366168.42,166187.94,0,1,0
5,131876.9,99814.71,362861.36,156991.12,0,0,1
6,134615.46,147198.87,127716.82,156122.51,1,0,0
7,130298.13,145530.06,323876.68,155752.6,0,1,0
8,120542.52,148718.95,311613.29,152211.77,0,0,1
9,123334.88,108679.17,304981.62,149759.96,1,0,0


#### Split Dependent and Independent Variables

In [14]:
#Independent Variables 
x=dataset.iloc[:,[0,1,2,4,5,6]]
x.head(2)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_California,State_Florida,State_New York
0,165349.2,136897.8,471784.1,0,0,1
1,162597.7,151377.59,443898.53,1,0,0


In [15]:
y=dataset.iloc[:,3].values
y

array([192261.83, 191792.06, 191050.39, 182901.99, 166187.94, 156991.12,
       156122.51, 155752.6 , 152211.77, 149759.96, 146121.95, 144259.4 ,
       141585.52, 134307.35, 132602.65, 129917.04, 126992.93, 125370.37,
       124266.9 , 122776.86, 118474.03, 111313.02, 110352.25, 108733.99,
       108552.04, 107404.34, 105733.54, 105008.31, 103282.38, 101004.64,
        99937.59,  97483.56,  97427.84,  96778.92,  96712.8 ,  96479.51,
        90708.19,  89949.14,  81229.06,  81005.76,  78239.91,  77798.83,
        71498.49,  69758.98,  65200.33,  64926.08,  49490.75,  42559.73,
        35673.41,  14681.4 ])

####  Split dataset into Training and test set

In [16]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=0.2,random_state=23)

In [17]:
x.shape, " ",x_train.shape, "  ",x_test.shape,"  ",y_train.shape," ", y_test.shape

((50, 6), ' ', (40, 6), '  ', (10, 6), '  ', (40,), ' ', (10,))

In [18]:
from sklearn.preprocessing import MinMaxScaler
sc=MinMaxScaler()

In [19]:
x_train=sc.fit_transform(x_train)
x_test=sc.transform(x_test)

In [44]:
#bulilding the model
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
model=DecisionTreeRegressor(criterion='mse',
                            max_features='auto')

In [45]:
#training model
model.fit(x_train,y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features='auto',
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [46]:
#prediction
y_pred=model.predict(x_test)
y_pred

array([149759.96, 129917.04, 129917.04, 107404.34, 141585.52,  97483.56,
       108733.99, 149759.96, 105733.54, 101004.64])

In [47]:
y_test

array([132602.65, 124266.9 , 125370.37,  97427.84, 134307.35,  96778.92,
       118474.03, 152211.77,  90708.19, 103282.38])

In [48]:
pd.DataFrame([y_pred,y_test]).T

Unnamed: 0,0,1
0,149759.96,132602.65
1,129917.04,124266.9
2,129917.04,125370.37
3,107404.34,97427.84
4,141585.52,134307.35
5,97483.56,96778.92
6,108733.99,118474.03
7,149759.96,152211.77
8,105733.54,90708.19
9,101004.64,103282.38


In [49]:
#r2 score
from sklearn.metrics import r2_score
r2_score(y_test,y_pred)

0.7674168673374064

### Conclusion

Here, We predicted the profit of a start up company in some location by using the data of previous startup company profits by using the Decision tree ML Algorithm with score of 76%.