## Customer Data with Customer Score and Weighted Age(0-90)

In [13]:
# for basic mathematics operation 
import numpy as np
import pandas as pd
from pandas import plotting

# for visualizations
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

# for interactive visualizations
import plotly.offline as py
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
from plotly import tools
init_notebook_mode(connected = True)
import plotly.figure_factory as ff

# for path
import os

In [14]:
# importing the dataset
data = pd.read_csv('in_cart_customer.csv')

dat = ff.create_table(data.head())

py.iplot(dat)

In [15]:
# checking if there is any NULL data
data.isnull().any().any()

False

In [16]:
data.head()

Unnamed: 0,user_id,gender,age,income,customer score
0,1,Male,55,112000,11
1,3,Male,68,70000,13
2,5,Female,58,51000,5
3,8,Female,62,71000,4
4,12,Female,40,71000,6


In [17]:
# check to make sure numbers are ints
data.dtypes

user_id            int64
gender            object
age                int64
income             int64
customer score     int64
dtype: object

In [18]:
# Change age to reflect wheighted younger age group, and remove "118" with random millenial age 20-30.
data['age'] = np.where(data['age'] > 90,
                          np.random.uniform(20, 30, size=len(data)),
                          data['age'])

In [19]:
# check max and mins
data.describe()

Unnamed: 0,user_id,age,income,customer score
count,12950.0,12950.0,12950.0,12950.0
mean,7423.445869,53.380011,65329.266409,16.633668
std,4274.906366,17.079656,21583.713051,16.770312
min,1.0,18.0,30000.0,4.0
25%,3744.25,41.25,49000.0,6.0
50%,7392.5,55.0,63000.0,10.0
75%,11128.75,66.0,79000.0,20.0
max,14824.0,90.0,120000.0,100.0


In [20]:
#Save New File
output_file_path = "data/final_customer.csv"
data.to_csv(output_file_path, index=False)

In [21]:
#create binned option for customer score 
#mean is 16.6 so we will create a range from 0-30 as a trial for ML

data['score_bins'] = pd.cut(x=data['customer score'], bins=[0, 5, 10, 15, 20, 25, 30, 100])

In [22]:
data.head()

Unnamed: 0,user_id,gender,age,income,customer score,score_bins
0,1,Male,55.0,112000,11,"(10, 15]"
1,3,Male,68.0,70000,13,"(10, 15]"
2,5,Female,58.0,51000,5,"(0, 5]"
3,8,Female,62.0,71000,4,"(0, 5]"
4,12,Female,40.0,71000,6,"(5, 10]"


In [25]:
#verify the unique age_bins values
data['score_bins'].unique()

[(10, 15], (0, 5], (5, 10], (20, 25], (30, 100], (15, 20], (25, 30]]
Categories (7, interval[int64]): [(0, 5] < (5, 10] < (10, 15] < (15, 20] < (20, 25] < (25, 30] < (30, 100]]

In [33]:
#create label coulumn
data['score'] = pd.cut(x=data['customer score'], bins=[0, 5, 10, 15, 20, 25, 30, 100], labels=['5','10','15','20','25','30','35'])

In [34]:
data.dtypes

user_id              int64
gender              object
age                float64
income               int64
customer score       int64
score_bins        category
score             category
dtype: object

In [35]:
data['score'].isnull().sum()

0

In [36]:
data.head()

Unnamed: 0,user_id,gender,age,income,customer score,score_bins,score
0,1,Male,55.0,112000,11,"(10, 15]",15
1,3,Male,68.0,70000,13,"(10, 15]",15
2,5,Female,58.0,51000,5,"(0, 5]",5
3,8,Female,62.0,71000,4,"(0, 5]",5
4,12,Female,40.0,71000,6,"(5, 10]",10


In [38]:
# drop colums
new_data = data.drop(['customer score', 'score_bins'], axis=1)
new_data.head()

Unnamed: 0,user_id,gender,age,income,score
0,1,Male,55.0,112000,15
1,3,Male,68.0,70000,15
2,5,Female,58.0,51000,5
3,8,Female,62.0,71000,5
4,12,Female,40.0,71000,10


In [43]:
# change from category to int 
#df['col'].replace(to_replace=['category_1', 'category_2', 'category_3'], value=[1, 2, 3], inplace=True)
new_data['score'] = new_data['score'].astype('int')

In [44]:
new_data.dtypes

user_id      int64
gender      object
age        float64
income       int64
score        int64
dtype: object

In [45]:
new_data.rename(columns = {'score': 'customer score'}, inplace = True)

In [46]:
new_data.head()

Unnamed: 0,user_id,gender,age,income,customer score
0,1,Male,55.0,112000,15
1,3,Male,68.0,70000,15
2,5,Female,58.0,51000,5
3,8,Female,62.0,71000,5
4,12,Female,40.0,71000,10


In [47]:
#Save New File
output_file_path = "data/final_bin_customer.csv"
new_data.to_csv(output_file_path, index=False)