# Sklearn Pipeline for Scoring New Data

In [1]:
import psycopg2
import pickle
import pandas as pd
import requests

# Import Data
The dataset (`bank_customers_churn_dataset`) is available on Kaggle.

In [2]:
## Alternative (disabled): load the data from a local csv file
#df = pd.read_csv(
#        filepath_or_buffer = '../data/bank_customers_churn_dataset.csv',
#        index_col='customer_id'
#)
#
## drop churn column for segmentation
#df.drop('churn', axis=1, inplace=True)
#
## change type of categorical columns "credit_card" and "active_member"
#df['credit_card'] = df['credit_card'].apply(lambda x: 'yes' if x == 1 else 'no')
#df['active_member'] = df['active_member'].apply(lambda x: 'yes' if x == 1 else 'no')
#
#df.head()

In [3]:
def get_data_from_my_postgre_db(password:str, table_name:str)->pd.DataFrame:
    """Return every row of ``table_name`` from the marketing_analytics database.

    Connects to the PostgreSQL server on localhost:5432 (database
    "marketing_analytics", user "postgres"), runs ``SELECT *`` against the
    table and returns the result as a pandas DataFrame whose columns are the
    column names reported by the cursor.

    Args:
        password: password of the "postgres" database user.
        table_name: table to read; may be schema-qualified ("schema.table").
            The name is quoted as an SQL identifier, so it cannot inject SQL
            and is matched case-sensitively.

    Returns:
        DataFrame with one row per table row. An empty table yields an empty
        DataFrame that still carries the column names.

    Raises:
        Whatever psycopg2 raises when connecting or querying fails; the
        connection is closed before the exception propagates.
    """
    # Local import: the file only imports the top-level ``psycopg2`` package.
    from psycopg2 import sql

    # Compose the statement with a quoted identifier instead of string
    # concatenation. Passing several parts to Identifier yields a
    # dot-separated qualified name (requires psycopg2 >= 2.8).
    query = sql.SQL("SELECT * FROM {}").format(
        sql.Identifier(*table_name.split("."))
    )

    # Connect to the database
    conn = psycopg2.connect(
        database="marketing_analytics",
        user="postgres",
        password=password,
        host="localhost",
        port="5432")
    try:
        # The cursor context manager closes the cursor when the block exits.
        with conn.cursor() as cur:
            cur.execute(query)
            # Get the column names from the cursor description
            columns = [desc[0] for desc in cur.description]
            # Fetch the results i.e. values
            rows = cur.fetchall()
    finally:
        # Always release the connection, even when the query fails.
        conn.close()

    # Building from rows + explicit columns keeps the header for empty tables.
    return pd.DataFrame(rows, columns=columns)

In [4]:
# Read the database password from a local file next to the notebook folder.
# The context manager closes the file handle, and trailing line breaks are
# stripped so a newline at the end of the file does not become part of the
# password.
with open("../private.txt", "r") as f:
    pw = f.read().rstrip("\r\n")

In [5]:
# Get the data from the Postgres db and shape it in one chain, so no step
# mutates an already-displayed frame in place.
table_name = "bank_customers_churn_dataset"
df = (
    get_data_from_my_postgre_db(pw, table_name)
    # set customer id as index
    .set_index('customer_id')
    # drop churn column for segmentation
    .drop(columns='churn')
    # change data types to numeric (vectorized cast instead of per-row apply)
    .astype({'tenure': 'int64', 'products_number': 'int64'})
)

# Change type of categorical columns "credit_card" and "active_member":
# cast to integer, then recode 1 -> 'yes' and anything else -> 'no'.
for flag_col in ('credit_card', 'active_member'):
    df[flag_col] = df[flag_col].astype('int64').eq(1).map({True: 'yes', False: 'no'})

df.head()

Unnamed: 0_level_0,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
15634602,619.0,France,Female,42,2,0.0,1,yes,yes,101348.88
15647311,608.0,Spain,Female,41,1,83807.86,1,no,yes,112542.58
15619304,502.0,France,Female,42,8,159660.8,3,yes,no,113931.57
15701354,699.0,France,Female,39,1,0.0,2,no,no,93826.63
15737888,850.0,Spain,Female,43,2,125510.82,1,yes,yes,79084.1


In [6]:
# Get one prediction for one row at a time
input_list = df.to_dict('records')

# select one row
data = input_list[:1000]
data

[{'credit_score': 619.0,
  'country': 'France',
  'gender': 'Female',
  'age': 42,
  'tenure': 2,
  'balance': 0.0,
  'products_number': 1,
  'credit_card': 'yes',
  'active_member': 'yes',
  'estimated_salary': 101348.88},
 {'credit_score': 608.0,
  'country': 'Spain',
  'gender': 'Female',
  'age': 41,
  'tenure': 1,
  'balance': 83807.86,
  'products_number': 1,
  'credit_card': 'no',
  'active_member': 'yes',
  'estimated_salary': 112542.58},
 {'credit_score': 502.0,
  'country': 'France',
  'gender': 'Female',
  'age': 42,
  'tenure': 8,
  'balance': 159660.8,
  'products_number': 3,
  'credit_card': 'yes',
  'active_member': 'no',
  'estimated_salary': 113931.57},
 {'credit_score': 699.0,
  'country': 'France',
  'gender': 'Female',
  'age': 39,
  'tenure': 1,
  'balance': 0.0,
  'products_number': 2,
  'credit_card': 'no',
  'active_member': 'no',
  'estimated_salary': 93826.63},
 {'credit_score': 850.0,
  'country': 'Spain',
  'gender': 'Female',
  'age': 43,
  'tenure': 2,
  '

# Test on FastAPI

In [7]:
# Endpoint that the records are posted to.
# Alternative endpoint on port 8000 (presumably a locally started server -- confirm):
#url = 'http://127.0.0.1:8000/predict/'
# Use this if e.g. the API runs in docker with port 80 published:
#   docker run -d --name mycontainer -p 80:80 63531981/customer_segmentation_api:latest
url = 'http://localhost/predict/'

In [8]:
# Preview the payload that is about to be sent; showing a short slice avoids
# dumping every record into the notebook output.
data[:5]

[{'credit_score': 619.0,
  'country': 'France',
  'gender': 'Female',
  'age': 42,
  'tenure': 2,
  'balance': 0.0,
  'products_number': 1,
  'credit_card': 'yes',
  'active_member': 'yes',
  'estimated_salary': 101348.88},
 {'credit_score': 608.0,
  'country': 'Spain',
  'gender': 'Female',
  'age': 41,
  'tenure': 1,
  'balance': 83807.86,
  'products_number': 1,
  'credit_card': 'no',
  'active_member': 'yes',
  'estimated_salary': 112542.58},
 {'credit_score': 502.0,
  'country': 'France',
  'gender': 'Female',
  'age': 42,
  'tenure': 8,
  'balance': 159660.8,
  'products_number': 3,
  'credit_card': 'yes',
  'active_member': 'no',
  'estimated_salary': 113931.57},
 {'credit_score': 699.0,
  'country': 'France',
  'gender': 'Female',
  'age': 39,
  'tenure': 1,
  'balance': 0.0,
  'products_number': 2,
  'credit_card': 'no',
  'active_member': 'no',
  'estimated_salary': 93826.63},
 {'credit_score': 850.0,
  'country': 'Spain',
  'gender': 'Female',
  'age': 43,
  'tenure': 2,
  '

In [9]:
# Post every record to the API and keep the raw responses in request order.
# One Session reuses the underlying HTTP connection across all requests, and
# the timeout stops an unresponsive server from blocking the notebook forever
# (requests has no timeout by default).
REQUEST_TIMEOUT_S = 10
responses = []
with requests.Session() as session:
    for inputs in data:
        response = session.post(
            url,
            params=inputs,  # the record's fields are sent as query parameters
            timeout=REQUEST_TIMEOUT_S)
        responses.append(response)

In [10]:
# Show the body of each collected response, one per line, in request order.
for scored in responses:
    body = scored.text
    print(body)

{"predicted cluster label ":[1]}
{"predicted cluster label ":[2]}
{"predicted cluster label ":[3]}
{"predicted cluster label ":[3]}
{"predicted cluster label ":[1]}
{"predicted cluster label ":[4]}
{"predicted cluster label ":[1]}
{"predicted cluster label ":[0]}
{"predicted cluster label ":[2]}
{"predicted cluster label ":[1]}
{"predicted cluster label ":[3]}
{"predicted cluster label ":[4]}
{"predicted cluster label ":[3]}
{"predicted cluster label ":[3]}
{"predicted cluster label ":[1]}
{"predicted cluster label ":[0]}
{"predicted cluster label ":[0]}
{"predicted cluster label ":[1]}
{"predicted cluster label ":[4]}
{"predicted cluster label ":[1]}
{"predicted cluster label ":[1]}
{"predicted cluster label ":[4]}
{"predicted cluster label ":[4]}
{"predicted cluster label ":[2]}
{"predicted cluster label ":[1]}
{"predicted cluster label ":[2]}
{"predicted cluster label ":[0]}
{"predicted cluster label ":[3]}
{"predicted cluster label ":[0]}
{"predicted cluster label ":[1]}
{"predicte