In [1]:
# import all the packages

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import numpy.random as nr
import math
from datetime import datetime, date

from sklearn import preprocessing
import sklearn.model_selection as ms
from sklearn import linear_model
import sklearn.metrics as sklm

%matplotlib inline


In [2]:
#Read and combine the training files

work = pd.read_csv('AdvWorksCusts.csv')
print(work.shape)
avemon = pd.read_csv('AW_AveMonthSpend.csv')
print(avemon.shape)
bikebuy = pd.read_csv('AW_BikeBuyer.csv')
print(bikebuy.shape)

work1 = pd.merge(work,avemon, how = 'inner', on = 'CustomerID')
print(work1.shape)

train = pd.merge(work1,bikebuy, how = 'inner', on = 'CustomerID')

print(train.shape)
train.head()

#Read and combine the test files
test = pd.read_csv('AW_test.csv')
print(test.shape)
test.head()

(16519, 23)
(16519, 2)
(16519, 2)
(16749, 24)
(17209, 25)
(500, 23)


Unnamed: 0,CustomerID,Title,FirstName,MiddleName,LastName,Suffix,AddressLine1,AddressLine2,City,StateProvinceName,...,BirthDate,Education,Occupation,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome
0,18988,,Courtney,A,Baker,,8727 Buena Vista Ave.,,Fremont,California,...,1/5/1945,Bachelors,Management,F,S,0,2,0,5,86931
1,29135,,Adam,C,Allen,,3491 Cook Street,,Haney,British Columbia,...,10/4/1964,Bachelors,Skilled Manual,M,M,1,2,2,4,100125
2,12156,,Bonnie,,Raji,,359 Pleasant Hill Rd,,Burbank,California,...,1/12/1934,Graduate Degree,Management,F,M,1,2,0,4,103985
3,13749,,Julio,C,Alonso,,8945 Euclid Ave.,,Burlingame,California,...,9/22/1958,Graduate Degree,Skilled Manual,M,M,1,0,0,4,127161
4,27780,,Christy,A,Andersen,,"42, boulevard Tremblay",,Dunkerque,Nord,...,3/19/1965,High School,Manual,F,M,1,1,2,2,21876


In [3]:
#drop unnecessary column - training dataset

train.drop(['Title'], axis=1, inplace=True)
train.drop(['MiddleName'], axis=1, inplace=True)
train.drop(['AddressLine2'], axis=1, inplace=True)
train.drop(['Suffix'], axis=1, inplace=True)

print(train.shape)
train.head()

#drop unnecessary column - testing dataset

test.drop(['Title'], axis=1, inplace=True)
test.drop(['MiddleName'], axis=1, inplace=True)
test.drop(['AddressLine2'], axis=1, inplace=True)
test.drop(['Suffix'], axis=1, inplace=True)

print(test.shape)
test.head()

(17209, 21)
(500, 19)


Unnamed: 0,CustomerID,FirstName,LastName,AddressLine1,City,StateProvinceName,CountryRegionName,PostalCode,PhoneNumber,BirthDate,Education,Occupation,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome
0,18988,Courtney,Baker,8727 Buena Vista Ave.,Fremont,California,United States,94536,133-555-0128,1/5/1945,Bachelors,Management,F,S,0,2,0,5,86931
1,29135,Adam,Allen,3491 Cook Street,Haney,British Columbia,Canada,V2W 1W2,252-555-0173,10/4/1964,Bachelors,Skilled Manual,M,M,1,2,2,4,100125
2,12156,Bonnie,Raji,359 Pleasant Hill Rd,Burbank,California,United States,91502,409-555-0193,1/12/1934,Graduate Degree,Management,F,M,1,2,0,4,103985
3,13749,Julio,Alonso,8945 Euclid Ave.,Burlingame,California,United States,94010,175-555-0196,9/22/1958,Graduate Degree,Skilled Manual,M,M,1,0,0,4,127161
4,27780,Christy,Andersen,"42, boulevard Tremblay",Dunkerque,Nord,France,59140,1 (11) 500 555-0122,3/19/1965,High School,Manual,F,M,1,1,2,2,21876


In [4]:
#check unique rows - Train
print(train.shape)
print(train.CustomerID.unique().shape)

#drop duplicates - Train
train.drop_duplicates(subset='CustomerID',keep='first',inplace=True)

#check unique rows - Train
print(train.shape)
print(train.CustomerID.unique().shape)

#check unique rows - Test
print(test.shape)
print(test.CustomerID.unique().shape)

#drop duplicates - Test
test.drop_duplicates(subset='CustomerID',keep='first',inplace=True)

#check unique rows - Test
print(test.shape)
print(test.CustomerID.unique().shape)


(17209, 21)
(16404,)
(16404, 21)
(16404,)
(500, 19)
(500,)
(500, 19)
(500,)


In [5]:
#calculating age from birthdate and assigning to bins - train

train.dtypes
train['BirthDate'] = pd.to_datetime(train['BirthDate'])
today = datetime.strptime('01 01 1998', "%d %m %Y")
train['age'] = (today - train['BirthDate']).astype('>m8[Y]')
train['age'] = pd.to_numeric(train['age'])

bins = [0, 25, 45, 55, 100]
names = ['<25', '25-45', '45-55', '>55']

train['AgeRange'] = pd.cut(train['age'], bins, labels=names)
train['AgeRange'] = train['AgeRange'].astype('str')

train['agegender'] = train['AgeRange']+train['Gender']
print(train.dtypes)

#calculating age from birthdate and assigning to bins - test

test.dtypes
test['BirthDate'] = pd.to_datetime(test['BirthDate'])
today = datetime.strptime('01 01 1998', "%d %m %Y")
test['age'] = (today - test['BirthDate']).astype('>m8[Y]')
test['age'] = pd.to_numeric(test['age'])

#bins = [0, 25, 45, 55, 100]
#names = ['<25', '25-45', '45-55', '>55']

test['AgeRange'] = pd.cut(test['age'], bins, labels=names)
test['AgeRange'] = test['AgeRange'].astype('str')

test['agegender'] = test['AgeRange']+test['Gender']
print(test.dtypes)

CustomerID                       int64
FirstName                       object
LastName                        object
AddressLine1                    object
City                            object
StateProvinceName               object
CountryRegionName               object
PostalCode                      object
PhoneNumber                     object
BirthDate               datetime64[ns]
Education                       object
Occupation                      object
Gender                          object
MaritalStatus                   object
HomeOwnerFlag                    int64
NumberCarsOwned                  int64
NumberChildrenAtHome             int64
TotalChildren                    int64
YearlyIncome                     int64
AveMonthSpend                    int64
BikeBuyer                        int64
age                            float64
AgeRange                        object
agegender                       object
dtype: object
CustomerID                       int64
FirstName  

In [6]:
#define label to be predicted
labels = np.array(train['BikeBuyer'])

#one hot encoding for categorical data 

def encode_string(cat_features):
    
    enc = preprocessing.LabelEncoder()
    enc.fit(cat_features)
    enc_cat_features = enc.transform(cat_features)
    
    ## Now, apply one hot encoding
    
    ohe = preprocessing.OneHotEncoder()
    encoded = ohe.fit(enc_cat_features.reshape(-1,1))
    return encoded.transform(enc_cat_features.reshape(-1,1)).toarray()

# encoding training data
categorical_columns = ['Gender', 'MaritalStatus']

Features = encode_string(train['Occupation'])
for col in categorical_columns:
    temp = encode_string(train[col])
    Features = np.concatenate([Features, temp], axis = 1)

Features = np.concatenate([Features, np.array(train[['NumberCarsOwned', 'NumberChildrenAtHome', 
                            'TotalChildren', 'YearlyIncome','age']])], axis = 1)    

# encoding testing data
categorical_columns = ['Gender', 'MaritalStatus']

Test_Features = encode_string(test['Occupation'])
for col in categorical_columns:
    temp = encode_string(test[col])
    Test_Features = np.concatenate([Test_Features, temp], axis = 1)

Test_Features = np.concatenate([Test_Features, np.array(test[['NumberCarsOwned', 'NumberChildrenAtHome', 
                            'TotalChildren', 'YearlyIncome','age']])], axis = 1)    


#defining training data
train_data = Features


#defining testing data
test_data = Test_Features

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [7]:
#Train Model
model = linear_model.LogisticRegression() 
model.fit(train_data, labels)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [14]:
#Test Model
prediction = model.predict(test_data)

In [20]:
result = pd.DataFrame(prediction)
result.CustomerID=test.CustomerID
result.columns = ["prediction"]
result.to_csv("prediction_results.csv")

  from ipykernel import kernelapp as app
