In [21]:
# Introduction to Model Building

from pandas import read_csv
from numpy import set_printoptions 

dataframe = read_csv('BBC.csv')

# Show first five rows of dataframe
dataframe.head()

Unnamed: 0,MaritalStatus,Gender,YearlyIncome,TotalChildren,NumberChildrenAtHome,EnglishEducation,HouseOwnerFlag,NumberCarsOwned,CommuteDistance,Region,Age,BikeBuyer
0,5,1,90000,2,0,5,1,0,2,2,50,1
1,5,1,60000,3,3,5,0,1,1,2,51,1
2,5,1,60000,3,3,5,1,1,5,2,51,1
3,5,2,70000,0,0,5,0,1,10,2,49,1
4,5,2,80000,5,5,5,1,4,2,2,48,1


In [12]:
# For seeing current working directory

import os

print(os.getcwd())

C:\Users\Pavan


In [13]:
# shows last five rows of dataframe
dataframe.tail(5)

Unnamed: 0,MaritalStatus,Gender,YearlyIncome,TotalChildren,NumberChildrenAtHome,EnglishEducation,HouseOwnerFlag,NumberCarsOwned,CommuteDistance,Region,Age,BikeBuyer
18479,4,1,30000,1,0,4,1,0,1,3,58,1
18480,4,2,30000,3,0,4,1,0,1,3,56,1
18481,4,1,30000,3,0,4,0,0,1,3,57,1
18482,5,1,30000,3,0,5,1,0,1,3,58,1
18483,5,1,30000,0,0,5,1,0,1,3,57,1


In [14]:
# shows shape(rows, columns) of a dataframe
dataframe.shape

(18484, 12)

In [15]:
# shows mean, median, mode of the data

# mean = average
# median = values in the middle of the rage
# mode = The number which appears most often in a set of numbers

dataframe.describe()

Unnamed: 0,MaritalStatus,Gender,YearlyIncome,TotalChildren,NumberChildrenAtHome,EnglishEducation,HouseOwnerFlag,NumberCarsOwned,CommuteDistance,Region,Age,BikeBuyer
count,18484.0,18484.0,18484.0,18484.0,18484.0,18484.0,18484.0,18484.0,18484.0,18484.0,18484.0,18484.0
mean,3.310106,1.494103,57305.77797,1.844352,1.004058,3.310106,0.676369,1.502705,6.003246,1.78971,54.603928,0.494049
std,1.426147,0.499979,32285.841703,1.612408,1.52266,1.426147,0.467874,1.138394,6.382562,0.872664,11.255252,0.499978
min,1.0,1.0,10000.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,36.0,0.0
25%,2.0,1.0,30000.0,0.0,0.0,2.0,0.0,1.0,1.0,1.0,46.0,0.0
50%,3.0,1.0,60000.0,2.0,0.0,3.0,1.0,2.0,2.0,1.0,53.0,0.0
75%,5.0,2.0,70000.0,3.0,2.0,5.0,1.0,2.0,10.0,3.0,62.0,1.0
max,5.0,2.0,170000.0,5.0,5.0,5.0,1.0,4.0,20.0,3.0,106.0,1.0


In [16]:
dataframe.values

array([[    5,     1, 90000, ...,     2,    50,     1],
       [    5,     1, 60000, ...,     2,    51,     1],
       [    5,     1, 60000, ...,     2,    51,     1],
       ...,
       [    4,     1, 30000, ...,     3,    57,     1],
       [    5,     1, 30000, ...,     3,    58,     1],
       [    5,     1, 30000, ...,     3,    57,     1]], dtype=int64)

In [18]:
# include all items in the dataframe from 0 to 11
dataframe.values[:,0:11]

array([[    5,     1, 90000, ...,     2,     2,    50],
       [    5,     1, 60000, ...,     1,     2,    51],
       [    5,     1, 60000, ...,     5,     2,    51],
       ...,
       [    4,     1, 30000, ...,     1,     3,    57],
       [    5,     1, 30000, ...,     1,     3,    58],
       [    5,     1, 30000, ...,     1,     3,    57]], dtype=int64)

In [40]:
# column in the dataframe mapped to the 11th row
dataframe.values[:, 11]

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [33]:
# shows only bikebuyer column
bike_buyers = dataframe.groupby('BikeBuyer')

# size shows number of distributions of o's(non_bike_buyers) and 1'(bike_buyers)
bike_buyers.size()

BikeBuyer
0    9352
1    9132
dtype: int64

Number of bikers and non-bike buyers are evenly distributed
As we can see the distributions 9352(non_bike_buyers) and 9152(bike_buyers) above.

Even number of distributions really matters for finding the accuracy of our predictive model.




# Creation of model to check the accuracy


[Building model using SVC]

In [52]:
from pandas import read_csv
from sklearn.model_selection import train_test_split     # For testing
from sklearn.svm import SVC                              # Classifier

dataframe = read_csv('BBC.csv')

x = dataframe.values[:, 0:11]                           # Whole Training data, Includes all items in the dataframe from 0 to 11
y = dataframe.values[:, 11]                             # BikeBuyers column in the dataframe mapped to the 11th row

# In the cell below we are specifying the test size. 
# If our test size is 25% of the data then our training size is 75% of the data.
# The sole pupose of seed is to reproduce the results.

test_size = 0.30
seed = 45

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = test_size, random_state = seed)

model = SVC(gamma = 'auto')                       # Setting gamma explicitly to auto for the purpose of avoiding future warning
model.fit(x_test, y_test)

result = model.score(x_test, y_test) * 100
print('Accuracy is {}'.format(result))


Accuracy is 81.87883159033538



[Building model using KNeighborsClassifier]

In [53]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

dataframe = read_csv('BBC.csv')

x = dataframe.values[:, 0:11]                           # Whole Training data, Includes all items in the dataframe from 0 to 11
y = dataframe.values[:, 11]                             # BikeBuyers column in the dataframe mapped to the 11th row

# In the cell below we are specifying the test size. 
# If our test size is 25% of the data then our training size is 75% of the data.
# The sole pupose of seed is to reproduce the results.

test_size = 0.30
seed = 45

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = test_size, random_state = seed)

model = KNeighborsClassifier()                    
model.fit(x_test, y_test)

result = model.score(x_test, y_test) * 100
print('Accuracy is {}'.format(result))

Accuracy is 82.58204111071042



[Building Model Using RandomForestClassifier]

In [55]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

dataframe = read_csv('BBC.csv')

x = dataframe.values[:, 0:11]                           # Whole Training data, Includes all items in the dataframe from 0 to 11
y = dataframe.values[:, 11]                             # BikeBuyers column in the dataframe mapped to the 11th row

# In the cell below we are specifying the test size. 
# If our test size is 25% of the data then our training size is 75% of the data.
# The sole pupose of seed is to reproduce the results.

test_size = 0.30
seed = 45

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = test_size, random_state = seed)

model = RandomForestClassifier(n_estimators = 100)      # Setting n_estimator to 100 for the purpose of avoiding future warning
model.fit(x_test, y_test)

result = model.score(x_test, y_test) * 100
print('Accuracy is {}'.format(result))

Accuracy is 90.75009015506672
