In [1]:
#Importing libraries and Datasets!
import pandas as pd
df1=pd.read_csv('AdvWorksCusts.csv')
df2=pd.read_csv('AW_BikeBuyer.csv')
df3=pd.read_csv('AW_AveMonthSpend.csv')
# Reading test file to predict
test=pd.read_csv('AW_test.csv')

In [2]:
#Creating a master dataset from all 3 datasets!
df4=pd.merge(df1, df2, on='CustomerID')
df=pd.merge(df4,df3,on='CustomerID')

In [3]:
#Checking the dataset columns!
df.columns

Index(['CustomerID', 'Title', 'FirstName', 'MiddleName', 'LastName', 'Suffix',
       'AddressLine1', 'AddressLine2', 'City', 'StateProvinceName',
       'CountryRegionName', 'PostalCode', 'PhoneNumber', 'BirthDate',
       'Education', 'Occupation', 'Gender', 'MaritalStatus', 'HomeOwnerFlag',
       'NumberCarsOwned', 'NumberChildrenAtHome', 'TotalChildren',
       'YearlyIncome', 'BikeBuyer', 'AveMonthSpend'],
      dtype='object')

In [4]:
#Checking the datatypes of the columns
df.dtypes

CustomerID               int64
Title                   object
FirstName               object
MiddleName              object
LastName                object
Suffix                  object
AddressLine1            object
AddressLine2            object
City                    object
StateProvinceName       object
CountryRegionName       object
PostalCode              object
PhoneNumber             object
BirthDate               object
Education               object
Occupation              object
Gender                  object
MaritalStatus           object
HomeOwnerFlag            int64
NumberCarsOwned          int64
NumberChildrenAtHome     int64
TotalChildren            int64
YearlyIncome             int64
BikeBuyer                int64
AveMonthSpend            int64
dtype: object

In [100]:
#Lets see how our data looks like
df.head()

Unnamed: 0,CustomerID,Title,FirstName,MiddleName,LastName,Suffix,AddressLine1,AddressLine2,City,StateProvinceName,...,Occupation,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome,BikeBuyer,AveMonthSpend
0,11000,,Jon,V,Yang,,3761 N. 14th St,,Rockhampton,Queensland,...,Professional,M,M,1,0,0,2,137947,0,89
1,11001,,Eugene,L,Huang,,2243 W St.,,Seaford,Victoria,...,Professional,M,S,0,1,3,3,101141,1,117
2,11002,,Ruben,,Torres,,5844 Linden Land,,Hobart,Tasmania,...,Professional,M,M,1,1,3,3,91945,0,123
3,11003,,Christy,,Zhu,,1825 Village Pl.,,North Ryde,New South Wales,...,Professional,F,S,0,1,0,0,86688,0,50
4,11004,,Elizabeth,,Johnson,,7553 Harness Circle,,Wollongong,New South Wales,...,Professional,F,S,1,4,5,5,92771,1,95


Now time to select features to make this model. Let's study it one by one!

**CustomerID** : This is compulsory as all data will be indexed to this unique ID.

**Title**,**FirstName**,**MiddleName**,**LastName**,**Suffix**: All these doesn't determine any significant thing as AveMonthSpend and BikeBuyer doesn't depend on Name!

**AddressLine1**,**AddressLine2** : I'm avoiding these to prevent the overfitting the data as we already have 2-3 more location parameters!

**City**,**StateProvinceName**,**CountryRegionName**: All these will play major role as they gives us the location and economy varies economy to economy. 

**PostalCode**: Leaving this behind since we already have lots of location variable.

**PhoneNumber** : Phone number and our target variable doesn't have any relation!

**BirthDate** : It is good idea to extract the Age of customer from this column.

**Education** ,**Occupation** : They will be key factor in predicitng the total capital someone have! So we must select them.

**Gender**, **MaritalStatus** : Both plays important role in determining the present condition of someone.It is not directly linked but still very important factor. Females are tends to buy less bikes. Same goes for the Married People if we compare them to the Unmarried Young adults.

**NumberCarsOwned**,**TotalChildren**,**NumberChildrenAtHome** : All are key financial factors and must be considered!




In [5]:
#Creating a new feature from Date of Birth column!!
df['Age'] = pd.DatetimeIndex(df['BirthDate']).year
df['Age']=2019-df.Age
test['Age'] = pd.DatetimeIndex(test['BirthDate']).year
test['Age']=2019-test.Age

In [6]:
#I will be selecting important columns for our model!
cols=['CustomerID','City','StateProvinceName','CountryRegionName','NumberCarsOwned',
      'Education','Occupation','Gender','Age',
     'MaritalStatus','HomeOwnerFlag','NumberChildrenAtHome',
      'TotalChildren','YearlyIncome']

In [7]:
#Lets see how our test dataset looks like!
test.head()

Unnamed: 0,CustomerID,Title,FirstName,MiddleName,LastName,Suffix,AddressLine1,AddressLine2,City,StateProvinceName,...,Education,Occupation,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome,Age
0,18988,,Courtney,A,Baker,,8727 Buena Vista Ave.,,Fremont,California,...,Bachelors,Management,F,S,0,2,0,5,86931,74
1,29135,,Adam,C,Allen,,3491 Cook Street,,Haney,British Columbia,...,Bachelors,Skilled Manual,M,M,1,2,2,4,100125,55
2,12156,,Bonnie,,Raji,,359 Pleasant Hill Rd,,Burbank,California,...,Graduate Degree,Management,F,M,1,2,0,4,103985,85
3,13749,,Julio,C,Alonso,,8945 Euclid Ave.,,Burlingame,California,...,Graduate Degree,Skilled Manual,M,M,1,0,0,4,127161,61
4,27780,,Christy,A,Andersen,,"42, boulevard Tremblay",,Dunkerque,Nord,...,High School,Manual,F,M,1,1,2,2,21876,54


In [8]:
#Selecting the important features
test=test[cols]
df_d=df[cols]

In [9]:
#Checking whether our both the data looks similar or not!
if (len(df_d.columns)== len(test.columns)):
    print("Your data seems cool!! \nMOVE AHEAD")

Your data seems cool!! 
MOVE AHEAD


In [10]:
train_objs_num = len(df)

In [11]:
#Merging both datasets to create dummy variables
dataset = pd.concat([df_d,test],axis=0)
dummies=pd.get_dummies(data=dataset,columns=['City', 'StateProvinceName','CountryRegionName','Education','Occupation'
                                        ,'Gender','MaritalStatus'])
dataset=pd.concat([dataset,dummies],axis=1)

In [12]:
#Separating the datasset
train= dataset[:train_objs_num]
test = dataset[train_objs_num:]

In [13]:
#Dropping the columns for which we have already created dummmies! 
train.drop(['City', 'StateProvinceName','CountryRegionName','Education','Occupation','Gender',
        'MaritalStatus'],axis=1,inplace=True)
test.drop(['City', 'StateProvinceName','CountryRegionName','Education','Occupation','Gender',
        'MaritalStatus'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [14]:
#Creating a cross validation set !
from sklearn.model_selection import train_test_split
predictors=train.drop(['CustomerID'],axis=1)
target=df['AveMonthSpend']
x_train,x_cv,y_train,y_cv=train_test_split(predictors,target,test_size=0.35,random_state=0)

For this project, we will calculate the coefficient of determination, R², to quantify the model’s performance. The coefficient of determination for a model is a useful statistic in regression analysis, as it often describes how “good” that model is at making predictions.Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). A constant model that always predicts the expected value of y, disregarding the input features, would get a R^2 score of 0.0.



In [15]:
#Importing necessary Algorithms and tools
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score

In [16]:
#Making a function to check the scores of Algorithms
def scores(i):
    lin = i()
    lin.fit(x_train, y_train)
    y_pred=lin.predict(x_cv)
    lin_r= r2_score(y_cv, y_pred)
    s.append(lin_r)
#Checking the scores by using our function
algos=[LinearRegression,KNeighborsRegressor,RandomForestRegressor,Lasso,ElasticNet,DecisionTreeRegressor]
s=[]
for i in algos:
    scores(i)    



In [17]:
models = pd.DataFrame({
    'Method': ['LinearRegression', 'KNeighborsRegressor', 
              'RandomForestRegressor', 'Lasso','DecisionTreeRegressor'],
    'Score': [s[0],s[1],s[2],s[3],s[4]]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Method,Score
2,RandomForestRegressor,0.985038
0,LinearRegression,0.948368
3,Lasso,0.934851
4,DecisionTreeRegressor,0.85696
1,KNeighborsRegressor,0.330099


We will be using **RandomForestRegressor** to predict **AveMonthSpend** for out test set.

Now lets try to build a model for our target variable called **BikeBuyer**.

In [18]:
predictors1=train.drop(['CustomerID'],axis=1)
target1=df.BikeBuyer
x_train1,x_cv1,y_train1,y_cv1=train_test_split(predictors1,target1,test_size=0.35,random_state=0)

In [19]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier

In [20]:
def scores1(i):
    abc = i()
    abc.fit(x_train1, y_train1)
    y_pred1=abc.predict(x_cv1)
    abc = round(accuracy_score(y_pred1, y_cv1) * 100, 2)
    k.append(abc)
#Checking the scores by using our function
algos=[DecisionTreeClassifier,RandomForestClassifier,MLPClassifier,KNeighborsClassifier]
k=[]
for i in algos:
    scores1(i)    



In [21]:
dtc = DecisionTreeClassifier()
dtc.fit(x_train1, y_train1)
y_pred1=dtc.predict(x_cv1)
dtc_r = round(accuracy_score(y_pred1, y_cv1) * 100, 2)
print(dtc_r)

75.55


In [22]:
knn = KNeighborsClassifier()
knn.fit(x_train1, y_train1)
y_pred1=knn.predict(x_cv1)
knn_r = round(accuracy_score(y_pred1, y_cv1) * 100, 2)
print(knn_r)

64.64


In [23]:
rfc = RandomForestClassifier()
rfc.fit(x_train1, y_train1)
y_pred1=rfc.predict(x_cv1)
rfc_r = round(accuracy_score(y_pred1, y_cv1) * 100, 2)
print(rfc_r)



78.02


In [24]:
from sklearn.neural_network import MLPClassifier
nn = MLPClassifier()
nn.fit(x_train1, y_train1)
y_pred1=nn.predict(x_cv1)
nn = round(accuracy_score(y_pred1, y_cv1) * 100, 2)
print(nn)

75.46


In [124]:
models = pd.DataFrame({
    'Method': ['DecisionTreeClassifier', 
              'RandomForestClassifier', 'MLPClassifier','KneighborsClassifier'],
    'Score': [k[0],k[1],k[2],k[3]]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Method,Score
1,RandomForestClassifier,78.02
2,MLPClassifier,76.66
0,DecisionTreeClassifier,75.32
3,KneighborsClassifier,64.64


We will be using **RandomForestClassifier** to predict the **BikeBuyer** feature for the test set!

In [125]:
#Time for predictition for our test datatset

In [25]:
#predicting  BikeBuyer
model_bike = RandomForestClassifier()
model_bike.fit(x_train1, y_train1)
y_pred1 = model_bike.predict(test.drop(test.columns[0],axis=1))
print(y_pred1)



[0 1 0 0 0 0 1 1 1 0 1 0 0 0 0 0 1 0 1 1 0 1 1 1 0 0 0 1 1 1 1 0 0 0 1 1 1
 0 0 0 0 1 0 0 0 0 0 1 1 0 0 1 0 0 0 1 0 1 0 0 0 0 0 1 0 1 0 1 0 1 0 0 0 0
 0 1 1 0 1 0 0 0 0 1 0 0 0 1 1 0 0 1 0 1 0 1 0 0 0 0 1 0 0 1 1 0 0 0 0 0 1
 0 0 0 0 1 0 0 1 0 1 1 0 0 0 0 1 0 1 0 0 0 1 0 0 0 1 1 0 0 1 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 1 0 0 1 0 0 1 0 0
 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 0 0 0 0 1 0 1 1 0 1 0 0 1 0 0 1 0 0 1 0 0 0
 0 1 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 1
 0 0 0 0 0 1 1 0 1 1 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1
 1 0 0 0 0 0 0 1 0 0 1 1 1 1 0 1 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 1 0 1
 0 1 1 0 0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 0 1
 0 0 0 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 1 0 0
 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 1 0 0 0 0 0 1 0 1 1 1 1
 1 1 0 0 0 0 0 0 0 1 1 0 

In [26]:
model_avg = RandomForestRegressor()
model_avg.fit(x_train, y_train)
y_pred = model_avg.predict(test.drop(test.columns[0],axis=1))
print(y_pred)



[ 44.7        113.1         47.2         85.4         57.3
  46.6         94.7        141.7         99.1         56.5
  60.5         52.1         71.6         48.4         38.5
  55.7         86.2         71.4         96.2         57.5
  67.8         74.         165.6         83.4         54.1
  68.1         90.         121.5         76.3         59.4
  63.8         78.6         45.9         70.7         97.1
 102.5        168.3         96.1         54.6         86.1
  50.2         76.5         76.4         50.          57.5
  74.7         60.8         76.7        124.          81.8
  75.9         87.2         79.1         62.2         47.8
  74.7         58.8         71.8         58.4         64.6
  46.4         62.          92.9         75.8         46.4
  75.          76.4        149.8         61.9         99.
  78.          64.2         94.1         47.2         64.9
  75.6        114.8         59.6         56.2         49.2
  82.53333333  75.4         51.7         82.2         86.

In [27]:
#Adding these values into out orginal test set!
test_org=pd.read_csv('AW_test.csv')

In [28]:
#Assigning our predictions
test_org['BikeBuyer']=y_pred1
test_org['AveSpendMonth']=y_pred

In [132]:
#Saving our predictions in a csv file for further use!
test_org.to_csv('AW_test1.csv')

In [33]:
test_org.head()

Unnamed: 0,CustomerID,Title,FirstName,MiddleName,LastName,Suffix,AddressLine1,AddressLine2,City,StateProvinceName,...,Occupation,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome,BikeBuyer,AveSpendMonth
0,18988,,Courtney,A,Baker,,8727 Buena Vista Ave.,,Fremont,California,...,Management,F,S,0,2,0,5,86931,0,44.7
1,29135,,Adam,C,Allen,,3491 Cook Street,,Haney,British Columbia,...,Skilled Manual,M,M,1,2,2,4,100125,1,113.1
2,12156,,Bonnie,,Raji,,359 Pleasant Hill Rd,,Burbank,California,...,Management,F,M,1,2,0,4,103985,0,47.2
3,13749,,Julio,C,Alonso,,8945 Euclid Ave.,,Burlingame,California,...,Skilled Manual,M,M,1,0,0,4,127161,0,85.4
4,27780,,Christy,A,Andersen,,"42, boulevard Tremblay",,Dunkerque,Nord,...,Manual,F,M,1,1,2,2,21876,0,57.3


In [29]:
import seaborn as sns