In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Load the startup dataset; the relative path is resolved against the
# notebook's working directory.
csv_path = "50_Startups.csv"
data = pd.read_csv(csv_path)

# Column dtypes and non-null counts: the quickest way to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
R&D Spend          45 non-null float64
Administration     47 non-null float64
Marketing Spend    42 non-null float64
State              50 non-null object
Profit             50 non-null float64
dtypes: float64(4), object(1)
memory usage: 2.0+ KB


In [3]:
# Summary statistics of the numeric columns; a `count` below 50 marks a column
# that has missing values.
data.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
count,45.0,47.0,42.0,50.0
mean,81000.976889,120322.412979,230042.12,112012.6392
std,51091.759462,28522.933941,110413.286286,40306.180338
min,542.05,51283.14,1903.93,14681.4
25%,46014.02,102101.52,152118.51,90138.9025
50%,75328.87,121597.55,239452.75,107978.19
75%,114523.61,144606.78,302423.7675,139765.9775
max,250000.0,182645.56,471784.1,192261.83


In [30]:
# Preview the first five rows of the loaded frame.
data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,230042.12,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


# 1. Fill Missing Values

In [6]:
# Impute every missing numeric value with the mean of its own column.
#
# numeric_only=True keeps the text column `State` out of the mean computation:
# pandas >= 2.0 raises a TypeError when asked for the mean of an object column
# (older versions dropped it silently, later with a FutureWarning).
# `State` has no missing values (50 non-null in data.info()), and fillna only
# touches columns present in `column_means`, so it passes through unchanged.
column_means = data.mean(numeric_only=True)

# Rebind instead of inplace=True: same result, but the cell is an explicit
# "old frame -> new frame" step and is safe to re-run.
data = data.fillna(column_means)

In [7]:
# Show the first 15 rows to check that former gaps now hold the column mean.
data.head(15)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,230042.12,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94
5,131876.9,99814.71,362861.36,New York,156991.12
6,134615.46,147198.87,127716.82,New York,156122.51
7,81000.976889,145530.06,323876.68,Florida,155752.6
8,120542.52,120322.412979,311613.29,New York,152211.77
9,123334.88,108679.17,304981.62,California,149759.96


# 2. Handling Categorical Values

In [8]:
# `State` is the only categorical (text) column in the dataset; pull it out by
# name into its own Series so it can be encoded separately.
cat_variables = data["State"]
cat_variables.head()

0      New York
1    California
2       Florida
3      New York
4       Florida
Name: State, dtype: object

In [9]:
# Get dummy variables for `State`: one 0/1 indicator column per state.
#
# dtype is pinned to uint8 because the get_dummies default is version-dependent
# (uint8 on older pandas, bool from pandas 2.0 on); pinning keeps the 0/1 integer
# columns, and everything computed from them, identical on every version.
#
# The dummies are built straight from data["State"] so that this cell does not
# read the very name (`cat_variables`) it is about to overwrite with a
# different type (Series -> DataFrame).
#
# NOTE(review): all three indicator columns are kept, so they always sum to 1.
# For a linear model with an intercept consider drop_first=True; the downstream
# positional column selectors would have to be updated accordingly.
cat_variables = pd.get_dummies(data["State"], dtype="uint8")

In [10]:
# Preview the indicator columns produced by get_dummies.
cat_variables.head()

Unnamed: 0,California,Florida,New York
0,0,0,1
1,1,0,0
2,0,1,0
3,0,0,1
4,0,1,0


In [28]:
# Replace the text `State` column with its indicator columns: keep the four
# numeric columns of `data` and append the dummies on the right (axis=1).
numeric_columns = ["R&D Spend", "Administration", "Marketing Spend", "Profit"]
numeric_part = data[numeric_columns]
trained_data = pd.concat([numeric_part, cat_variables], axis=1)



In [29]:
# Preview the combined frame: numeric columns followed by the state indicators.
trained_data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,California,Florida,New York
0,165349.2,136897.8,471784.1,192261.83,0,0,1
1,162597.7,151377.59,443898.53,191792.06,1,0,0
2,153441.51,101145.55,407934.54,191050.39,0,1,0
3,144372.41,118671.85,230042.12,182901.99,0,0,1
4,142107.34,91391.77,366168.42,166187.94,0,1,0


# 3. Detecting And Removing Outliers

In [13]:
# IQR rule: a value is treated as an outlier when it lies more than 1.5 * IQR
# below the first quartile or above the third quartile of its own column.
#
# Only the continuous columns are screened. A 0/1 indicator column has no
# meaningful IQR: as soon as a state covers fewer than 25% of the rows its
# Q1 and Q3 are both 0, the fences collapse to [0, 0], and every row that
# belongs to that state would be flagged and dropped.
continuous_columns = ["R&D Spend", "Administration", "Marketing Spend", "Profit"]
continuous_data = trained_data[continuous_columns]

Q1 = continuous_data.quantile(0.25)
Q3 = continuous_data.quantile(0.75)

IQR = Q3 - Q1

# Boolean frames (same index as trained_data, one column per screened
# variable) marking the values that fall outside the lower / upper fence.
below_IQR = continuous_data < (Q1 - 1.5 * IQR)
above_IQR = continuous_data > (Q3 + 1.5 * IQR)


In [14]:
# A row is discarded as soon as any one of its values lies outside the fences.
has_outlier = (below_IQR | above_IQR).any(axis=1)
data_without_outliers = trained_data[~has_outlier]

# Remaining (rows, columns) after the filter.
data_without_outliers.shape

(48, 7)

In [15]:
# Check dtypes and non-null counts of the filtered frame.
data_without_outliers.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48 entries, 0 to 48
Data columns (total 7 columns):
R&D Spend          48 non-null float64
Administration     48 non-null float64
Marketing Spend    48 non-null float64
Profit             48 non-null float64
California         48 non-null uint8
Florida            48 non-null uint8
New York           48 non-null uint8
dtypes: float64(4), uint8(3)
memory usage: 2.0 KB


# 4. Standardize Data

In [16]:
# All seven columns (spend columns, Profit and the state indicators) are handed
# to the scaler, so select every column position 0..6.
all_positions = list(range(7))
data_to_standardize = data_without_outliers.iloc[:, all_positions]
data_to_standardize.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,California,Florida,New York
0,165349.2,136897.8,471784.1,192261.83,0,0,1
1,162597.7,151377.59,443898.53,191792.06,1,0,0
2,153441.51,101145.55,407934.54,191050.39,0,1,0
3,144372.41,118671.85,230042.12,182901.99,0,0,1
4,142107.34,91391.77,366168.42,166187.94,0,1,0


In [17]:
from sklearn.preprocessing import StandardScaler

# Cast to float64 up front. The indicator columns are small integers, and
# handing StandardScaler a mixed integer/float frame makes older scikit-learn
# releases emit a DataConversionWarning while performing this same conversion
# implicitly (the stray `return self.partial_fit(X, y)` lines in the recorded
# output look like the tail of that warning).
values_to_scale = data_to_standardize.astype("float64")
column_names = values_to_scale.columns

# Standardize: each column is centred on 0 and scaled to unit variance.
# NOTE(review): the target `Profit` and the 0/1 indicator columns are scaled
# together with the spend columns - confirm this is intended.
scalar = StandardScaler()
scaled_values = scalar.fit_transform(values_to_scale)

# fit_transform hands back a plain array, so rebuild a DataFrame with the
# original column names. Rows receive a fresh 0..n-1 index.
scaled_data = pd.DataFrame(scaled_values, columns=column_names)

scaled_data.head()


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,California,Florida,New York
0,2.079528,0.625424,2.41711,2.056256,-0.6742,-0.774597,1.48324
1,2.01441,1.152018,2.133776,2.043942,1.48324,-0.774597,-0.6742
2,1.797718,-0.674796,1.76836,2.024501,-0.6742,1.290994,-0.6742
3,1.583087,-0.037408,-0.039133,1.810912,-0.6742,-0.774597,1.48324
4,1.529481,-1.029517,1.343992,1.372795,-0.6742,1.290994,-0.6742


In [18]:
# Sanity check: after standardization every column mean should be 0 up to
# floating-point error.
scaled_data.mean()

R&D Spend          1.572816e-16
Administration    -7.308968e-16
Marketing Spend    1.384888e-16
Profit            -4.348374e-16
California         6.013708e-17
Florida           -4.163336e-17
New York           6.938894e-17
dtype: float64

In [19]:
# Sanity check on the spread. pandas reports the sample standard deviation
# (ddof=1) whereas StandardScaler divides by the population one (ddof=0), so
# with 48 rows the expected value is sqrt(48/47) ~= 1.0106 rather than exactly 1.
scaled_data.std()

R&D Spend          1.010582
Administration     1.010582
Marketing Spend    1.010582
Profit             1.010582
California         1.010582
Florida            1.010582
New York           1.010582
dtype: float64

# 5. Divide Data into Features and Target

In [20]:
# Alias for the fully preprocessed frame. This is a second name for the same
# DataFrame object, not a copy.
data_after_cleaning = scaled_data


In [21]:
# Feature matrix: every column except the target `Profit`, as a 2-D NumPy array.
feature_columns = [
    "R&D Spend",
    "Administration",
    "Marketing Spend",
    "California",
    "Florida",
    "New York",
]
features = data_after_cleaning[feature_columns].values


In [22]:
# Target vector: the `Profit` column, shaped as a single-column 2-D array (n, 1).
profit_values = data_after_cleaning["Profit"].values
target = profit_values.reshape(-1, 1)

In [26]:
# Preview the frame that `features` and `target` are sliced from.
data_after_cleaning.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,California,Florida,New York
0,2.079528,0.625424,2.41711,2.056256,-0.6742,-0.774597,1.48324
1,2.01441,1.152018,2.133776,2.043942,1.48324,-0.774597,-0.6742
2,1.797718,-0.674796,1.76836,2.024501,-0.6742,1.290994,-0.6742
3,1.583087,-0.037408,-0.039133,1.810912,-0.6742,-0.774597,1.48324
4,1.529481,-1.029517,1.343992,1.372795,-0.6742,1.290994,-0.6742


In [24]:
# Expect (n_rows, 6): one column per feature.
features.shape

(48, 6)

In [25]:
# Expect (n_rows, 1): a single target column.
target.shape

(48, 1)