# A venture capitalist company has hired you as a Data Scientist. Your role is to create a model that can predict the profit of the company based on Company's Spending Pattern and Company's Location

In [1]:
import numpy as np
import pandas as pd

In [2]:
startupData = pd.read_csv('50_Startups.csv')

In [3]:
startupData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
R&D Spend          50 non-null float64
Administration     50 non-null float64
Marketing Spend    50 non-null float64
State              50 non-null object
Profit             50 non-null float64
dtypes: float64(4), object(1)
memory usage: 2.0+ KB


In [5]:
startupData.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [10]:
sorted(startupData.State.unique())

['California', 'Florida', 'New York']

In [6]:
#Seperate your data as features and label
features = startupData.iloc[:,[0,1,2,3]].values
label = startupData.iloc[:,[4]].values

In [11]:
#Sklearn Method to handle Categorical Data.

# LabelEncoder --- Number of LabelEncoder object is always equal to number of categorical features to be handled. In this case ONE.

from sklearn.preprocessing import LabelEncoder
stateLE = LabelEncoder()
features[:,3] = stateLE.fit_transform(features[:,3])


In [13]:
stateLE.classes_

array(['California', 'Florida', 'New York'], dtype=object)

In [14]:
#OHE --- Use only one object irrespective of number of categorical label encoded columns
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(categorical_features=[3])
features = ohe.fit_transform(features).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [19]:
#Create Train Test Split

# The way how we sample the data directly impacts the model's performance. This is statistically proven hence using this as an optimization mechanis
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

for randomState in range(1,51):
    
    X_train,X_test,y_train,y_test = train_test_split(features,
                                                label,
                                                test_size=0.35,
                                                random_state=randomState)
    model = LinearRegression()
    model.fit(X_train,y_train)
    
    train_score = model.score(X_train,y_train)
    test_score = model.score(X_test,y_test)
    
    if test_score > train_score:
        print("Test Score: {} Training Score: {} Seed: {}".format(test_score,train_score,randomState))
    

Test Score: 0.9476029634294583 Training Score: 0.9419535391136752 Seed: 2
Test Score: 0.97090234158928 Training Score: 0.9283202503508703 Seed: 5
Test Score: 0.9698590593046786 Training Score: 0.9357679490495387 Seed: 10
Test Score: 0.9604869053005188 Training Score: 0.9429461415257581 Seed: 14
Test Score: 0.9568907399088784 Training Score: 0.9433334268317072 Seed: 21
Test Score: 0.9625086729055357 Training Score: 0.9395744583966136 Seed: 31
Test Score: 0.9512888656452243 Training Score: 0.9477167013783682 Seed: 38
Test Score: 0.9465718635233321 Training Score: 0.9461304330121761 Seed: 42
Test Score: 0.950342712527009 Training Score: 0.9434279245548758 Seed: 48


In [23]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(features,
                                                label,
                                                test_size=0.2,
                                                random_state=10)

#Create Model
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train,y_train)

#Check the quality of model
print("Training Accuracy ",model.score(X_train,y_train))
print("Testing Accuracy ",model.score(X_test,y_test))

Training Accuracy  0.9385918220043519
Testing Accuracy  0.9901105113397478


In [24]:
#From the above we understand and conclude the given model is a GENERALIZED model. The accuracy received is satisfactory.
#Thus eligible for deployment

In [25]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error


In [26]:
r2_score(y_test,model.predict(X_test))

0.9901105113397477

In [27]:
mean_squared_error(y_test,model.predict(X_test))

17941201.05401927

In [28]:
mean_absolute_error(y_test,model.predict(X_test))

2969.0527677482

In [30]:
r2_score(y_test,model.predict(X_test), multioutput='variance_weighted')

0.9901105113397478

  y = column_or_1d(y, warn=True)


array([0])

In [43]:
#Deployment Test

#features
#rdSpend,adminSpend,marketSpend,location

rdSpend = float(input("Enter R&D Spend: "))
admin = float(input("Enter Admin Spend: "))
markg = float(input("Enter Marketing Spend: " ))
state = input("Enter State: ")

if state in stateLE.classes_:
    features = np.array([[rdSpend,admin,markg,state]])
    print("Accepted User Input ", features)
    
    #Perform Label Encoding Transformation with existing object
    features[:,3] = stateLE.transform(features[:,3])
    print("After Label Encoding: ",features)
    
    #Perform OHE transformation with existing object
    features = ohe.transform(features).toarray()
    print("After OHE : ",features)
    
    #Predict profit using existing model
    profit = model.predict(features)
    
    #output the profit
    print("The expected profit is ", profit)
else:
    print("Model cant understand state: ",state)
#stateLEObj = stateLE.transform(np.array([[state]]))

#features 

Enter R&D Spend:  123456
Enter Admin Spend:  23456
Enter Marketing Spend:  2345
Enter State:  California


Accepted User Input  [['123456.0' '23456.0' '2345.0' 'California']]
After Label Encoding:  [['123456.0' '23456.0' '2345.0' '0']]
After OHE :  [[1.00000e+00 0.00000e+00 0.00000e+00 1.23456e+05 2.34560e+04 2.34500e+03]]
The expected profit is  [[149206.00517152]]


In [44]:
#Pickle Object
# Model, StateLE, ohe
import pickle
pickle.dump(model,open('ProfitPredictor.model','wb'))
pickle.dump(stateLE,open('EncodeState.obj','wb'))
pickle.dump(ohe, open('EncodeToDummy.dummy','wb'))