# Past Years Library Usage

Business Question:

    The San Francisco Library is attempting to increase its annual number of patrons by re-engaging old patrons through targeted advertising. 

This is an in-depth analysis aimed at understanding the old patrons and classifying them into useful segments. 

Assumptions: 
- There are no incorrect values
- Only patrons from 2016 may be considered active

In [None]:
#import and read in data

import pandas as pd 
import numpy as np
import seaborn as sb
from matplotlib import rcParams
import matplotlib.pyplot as plt

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's 'whitegrid' style to the plots drawn below.
sb.set_style('whitegrid')
# Creates a new, empty figure; `fig` is not referenced again in the visible cells.
fig = plt.figure()

# Load the library-usage CSV from an absolute, machine-specific path.
# NOTE(review): the analysis cells below read a frame named `df`, not `data`,
# and the SVM cell rebinds `data` to a sample -- presumably `df` is derived
# from this frame in cells that are not shown here; confirm.
data = pd.read_csv('C:/Users/Scott/Desktop/Data/Library_Usage.csv')

#### KMeans Clustering

In [None]:
from sklearn.cluster import KMeans
from scipy import stats
from sklearn.preprocessing import scale 
from sklearn.preprocessing import LabelEncoder

# Cluster patrons with k-means on their two activity counts plus
# integer-encoded categorical attributes, then plot the cluster assignment in
# checkout/renewal space.

# (source column in `df`, name of the encoded feature added to `meaner`)
encoded_features = (
    ('Age Range', 'bin_age'),
    ('Home Library Definition', 'bin_Library'),
    ('Circulation Active Month', 'bin_Month'),
    ('Circulation Active Year', 'bin_year'),
    ('Notice Preference Definition', 'bin_notice'),
    ('Provided Email Address', 'bin_email'),
    ('Year Patron Registered', 'bin_registered'),
    ('Outside of County', 'bin_county'),
)

# Select with a list and take an explicit copy so the column assignments below
# write to an independent frame rather than to a selection of `df`
# (avoids pandas' SettingWithCopyWarning).
meaner = df.loc[:, ['Total Checkouts', 'Total Renewals']].copy()

num = LabelEncoder()
for source_column, feature_name in encoded_features:
    # Casting to str gives the encoder a single comparable type per column.
    meaner[feature_name] = num.fit_transform(df[source_column].astype('str'))

# NOTE(review): the features are clustered unscaled, and the label-encoded
# codes are treated as numeric distances; `scale` is imported above but never
# applied -- confirm whether standardising was intended.

# Cluster on a 5% random sample. The fixed seeds make both the sample and the
# k-means initialisation, and therefore the figure, reproducible between runs.
meaner = meaner.sample(frac=.05, random_state=0)
# n_init is pinned so the number of restarts does not depend on the installed
# scikit-learn version's default.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=0)
kmeans = kmeans.fit(meaner)
labels = kmeans.labels_


# Colour each sampled patron by its cluster label.
plt.figure(figsize=(8,6))
plt.scatter(meaner['Total Checkouts'], meaner['Total Renewals'], c=labels,cmap='rainbow')
plt.ylabel('Total Renewals')
plt.xlabel('Total Checkouts')
plt.show()

This graphic hints at how a classifier might work.

#### SVM Classifier

In [None]:
from sklearn import svm
from sklearn.model_selection import train_test_split

# Train an SVM to predict a patron's age range from the remaining attributes,
# report train/test accuracy, and plot true vs. predicted labels for the
# held-out split.

# Fixed seed so the sample (and the scores printed below) can be reproduced.
sample = df.sample(frac=.05, random_state=0)
datay = sample['Age Range']

# `columns=` replaces the positional axis argument (`drop(label, 1)`), which
# pandas 2.0 no longer accepts. 'Age Range' is dropped as well: it is the
# prediction target, and leaving it in would one-hot encode the answer into
# the feature matrix (label leakage).
data = sample.drop(columns=['Patron Type Definition', 'Age Range',
                            'Total Checkouts', 'Total Renewals'])

# One-hot encode the remaining attributes, then append the two activity counts
# from the same sampled rows.
datax = pd.get_dummies(data)
y = datay
x = datax.join(sample[['Total Checkouts', 'Total Renewals']])

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3,
                                                    random_state=0)

clf = svm.SVC()
clf.fit(x_train, y_train)
predict = clf.predict(x_test)

print('train')
print(clf.score(x_train,y_train))
print('test')
print(clf.score(x_test,y_test))

# Both plots show the held-out rows: first coloured by the true age range,
# then by the model's prediction. `assign` returns a new frame, so nothing is
# written back into `x_test`. No `plt.figure(...)` call is needed because
# seaborn's pairplot creates its own figure.
count_columns = ['Total Checkouts', 'Total Renewals']
true_rfc = x_test[count_columns].assign(true_type=y_test)
sb.pairplot(true_rfc, hue = 'true_type', palette = 'hls')

pred_rfc = x_test[count_columns].assign(predicted_range=predict)
sb.pairplot(pred_rfc, hue = 'predicted_range', palette = 'hls')

These plots show the true and the predicted groupings for the held-out test data. The score is around 80% on the training set but drops significantly, to 68%, on the test set. Such a large drop suggests there are problems with the model, for example that it is overfitting the training data. To obtain a better test score, it may be worthwhile to segment the users back into the low, high, and outlier slices.

In [None]:
# Print the number of rows in each activity slice, one per line, in the
# order: low, high, outlier.
for segment in (low_df_slice, high_df_slice, outlier_df_slice):
    print(len(segment))

### Low Activity 

In [None]:
# Train an SVM to predict age range for the low-activity slice, report
# train/test accuracy, and plot true vs. predicted labels for the held-out
# split.

# Fixed seed so the sample (and the scores printed below) can be reproduced.
sample = low_df_slice.sample(frac=.01, random_state=0)
datay = sample['Age Range']

# `columns=` replaces the positional axis argument (`drop(label, 1)`), which
# pandas 2.0 no longer accepts. 'Age Range' is dropped as well: it is the
# prediction target, and leaving it in would one-hot encode the answer into
# the feature matrix (label leakage).
data = sample.drop(columns=['Patron Type Definition', 'Age Range',
                            'Total Checkouts', 'Total Renewals'])

# One-hot encode the remaining attributes, then append the two activity
# counts. They are taken from the sampled rows themselves rather than
# re-joined from `df`, so the result does not depend on the slice sharing
# `df`'s index.
datax = pd.get_dummies(data)
y = datay
x = datax.join(sample[['Total Checkouts', 'Total Renewals']])

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3,
                                                    random_state=0)

clf = svm.SVC()
clf.fit(x_train, y_train)
predict = clf.predict(x_test)

print('train')
print(clf.score(x_train,y_train))
print('test')
print(clf.score(x_test,y_test))

# Both frames hold the held-out rows: one labelled with the true age range,
# one with the model's prediction. `assign` returns a new frame, so nothing
# is written back into `x_test`.
count_columns = ['Total Checkouts', 'Total Renewals']
low_true_rfc = x_test[count_columns].assign(true_type=y_test)
low_pred_rfc = x_test[count_columns].assign(predicted_range=predict)

# pairplot creates its own figure, so no `plt.figure(...)` call is needed, and
# the title is set on the grid's figure; `plt.title` would only label the last
# subplot. The titles name what is actually plotted: both grids show the test
# split, not the training data.
true_grid = sb.pairplot(low_true_rfc, hue = 'true_type', palette = 'hls')
true_grid.fig.suptitle('Low Activity Test: True Age Range', y=1.02)

pred_grid = sb.pairplot(low_pred_rfc, hue = 'predicted_range', palette = 'hls')
pred_grid.fig.suptitle('Low Activity Test: Predicted Age Range', y=1.02)

### High Activity

In [None]:
# Train an SVM to predict age range for the high-activity slice, report
# train/test accuracy, and plot true vs. predicted labels for the held-out
# split.

# Fixed seed so the sample (and the scores printed below) can be reproduced.
sample = high_df_slice.sample(frac=.05, random_state=0)
datay = sample['Age Range']

# `columns=` replaces the positional axis argument (`drop(label, 1)`), which
# pandas 2.0 no longer accepts. 'Age Range' is dropped as well: it is the
# prediction target, and leaving it in would one-hot encode the answer into
# the feature matrix (label leakage).
data = sample.drop(columns=['Patron Type Definition', 'Age Range',
                            'Total Checkouts', 'Total Renewals'])

# One-hot encode the remaining attributes, then append the two activity
# counts. They are taken from the sampled rows themselves rather than
# re-joined from `df`, so the result does not depend on the slice sharing
# `df`'s index.
datax = pd.get_dummies(data)
y = datay
x = datax.join(sample[['Total Checkouts', 'Total Renewals']])

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3,
                                                    random_state=0)

clf = svm.SVC()
clf.fit(x_train, y_train)
predict = clf.predict(x_test)

print('train')
print(clf.score(x_train,y_train))
print('test')
print(clf.score(x_test,y_test))

# Both frames hold the held-out rows: one labelled with the true age range,
# one with the model's prediction. `assign` returns a new frame, so nothing
# is written back into `x_test`.
count_columns = ['Total Checkouts', 'Total Renewals']
high_true_rfc = x_test[count_columns].assign(true_type=y_test)
high_pred_rfc = x_test[count_columns].assign(predicted_range=predict)

# pairplot creates its own figure, so no `plt.figure(...)` call is needed, and
# the title is set on the grid's figure; `plt.title` would only label the last
# subplot. The titles name what is actually plotted: both grids show the test
# split, not the training data.
true_grid = sb.pairplot(high_true_rfc, hue = 'true_type', palette = 'hls')
true_grid.fig.suptitle('High Activity Test: True Age Range', y=1.02)

pred_grid = sb.pairplot(high_pred_rfc, hue = 'predicted_range', palette = 'hls')
pred_grid.fig.suptitle('High Activity Test: Predicted Age Range', y=1.02)

### Outliers

In [None]:
# Train an SVM to predict age range for the outlier slice, report train/test
# accuracy, and plot true vs. predicted labels for the held-out split.

# Fixed seed so the sample (and the scores printed below) can be reproduced.
sample = outlier_df_slice.sample(frac=.05, random_state=0)
datay = sample['Age Range']

# `columns=` replaces the positional axis argument (`drop(label, 1)`), which
# pandas 2.0 no longer accepts. 'Age Range' is dropped as well: it is the
# prediction target, and leaving it in would one-hot encode the answer into
# the feature matrix (label leakage).
data = sample.drop(columns=['Patron Type Definition', 'Age Range',
                            'Total Checkouts', 'Total Renewals'])

# One-hot encode the remaining attributes, then append the two activity
# counts. They are taken from the sampled rows themselves rather than
# re-joined from `df`, so the result does not depend on the slice sharing
# `df`'s index.
datax = pd.get_dummies(data)
y = datay
x = datax.join(sample[['Total Checkouts', 'Total Renewals']])

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3,
                                                    random_state=0)

clf = svm.SVC()
clf.fit(x_train, y_train)
predict = clf.predict(x_test)

print('train')
print(clf.score(x_train,y_train))
print('test')
print(clf.score(x_test,y_test))

# Both frames hold the held-out rows: one labelled with the true age range,
# one with the model's prediction. `assign` returns a new frame, so nothing
# is written back into `x_test`.
count_columns = ['Total Checkouts', 'Total Renewals']
out_true_rfc = x_test[count_columns].assign(true_type=y_test)
out_pred_rfc = x_test[count_columns].assign(predicted_range=predict)

# pairplot creates its own figure, so no `plt.figure(...)` call is needed, and
# the title is set on the grid's figure; `plt.title` would only label the last
# subplot. The titles name what is actually plotted: both grids show the test
# split, not the training data.
true_grid = sb.pairplot(out_true_rfc, hue = 'true_type', palette = 'hls')
true_grid.fig.suptitle('Outliers Test: True Age Range', y=1.02)

pred_grid = sb.pairplot(out_pred_rfc, hue = 'predicted_range', palette = 'hls')
pred_grid.fig.suptitle('Outliers Test: Predicted Age Range', y=1.02)