# Adding a New Dating Profile
Using Classification or Clustering for a New Dating Profile

### Importing Libraries and Data

In [58]:
!pip3 install jupyter
!pip3 install pandas
!pip3 install ipywidgets widgetsnbextension pandas-profiling
!pip3 install numpy
!pip3 install matplotlib
!pip3 install seaborn
!pip3 install _pickle
!pip3 install sklearn
!pip3 install tqdm

Collecting pandas-profiling
  Downloading pandas_profiling-3.3.0-py2.py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.0/268.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting visions[type_image_path]==0.7.5
  Downloading visions-0.7.5-py3-none-any.whl (102 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.7/102.7 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m00:01[0m
Collecting phik<0.13,>=0.11.1
  Downloading phik-0.12.2-cp37-cp37m-macosx_10_13_x86_64.whl (662 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m662.8/662.8 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting tangled-up-in-unicode==0.2.0
  Downloading tangled_up_in_unicode-0.2.0-py3-none-any.whl (4.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
Collecting missingno<0.6,>=0.4.2
  D

In [59]:
import pandas as pd
pd.set_option('display.max_colwidth', 500)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import _pickle as pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import calinski_harabasz_score, silhouette_score, davies_bouldin_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from tqdm import tqdm_notebook as tqdm

#### Loading the Profiles

In [60]:
# Loading in the cleaned DF
with open("profiles.pkl",'rb') as fp:
    raw_df = pickle.load(fp)

# Viewing the DF    
raw_df.head()

Unnamed: 0,Bios,Movies,TV,Religion,Music,Sports,Books,Politics
0,Typical twitter fanatic. Infuriatingly humble thinker. Lifelong coffee practitioner. Organizer.,5,3,4,1,3,6,7
1,Web junkie. Analyst. Infuriatingly humble introvert. Food nerd. Lifelong music fanatic. Coffee lover.,7,9,5,1,9,4,0
2,Avid web maven. Food practitioner. Gamer. Twitter fanatic. Pop culture scholar. Zombie evangelist.,1,2,6,5,6,5,4
3,Twitteraholic. Extreme web fanatic. Food buff. Infuriatingly humble entrepreneur.,5,2,7,8,2,6,6
4,Bacon enthusiast. Falls down a lot. Freelance social media fan. Infuriatingly humble introvert.,6,6,6,4,3,6,3


#### Loading the Clustered Profiles

In [61]:
# Loading in the clustered DF
with open("../data/clustered_profiles.pkl",'rb') as fp:
    cluster_df = pickle.load(fp)

# Viewing the DF    
cluster_df.tail()

Unnamed: 0,Bios,Movies,TV,Religion,Music,Sports,Books,Politics,Cluster #
6595,Typical pop culture nerd. Infuriatingly humble internet maven. Alcohol evangelist.,7.0,9.0,0.0,0.0,2.0,2.0,4.0,9
6596,Avid web junkie. Lifelong alcohol guru. Hardcore reader. Award-winning twitter evangelist.,4.0,3.0,6.0,3.0,7.0,7.0,2.0,2
6597,Music ninja. Bacon fanatic. Reader. Total communicator. Unapologetic beer specialist.,1.0,4.0,0.0,4.0,9.0,2.0,5.0,0
6598,Communicator. Bacon lover. Award-winning introvert. Amateur internet ninja.,6.0,2.0,0.0,3.0,8.0,9.0,1.0,9
6599,Unapologetic tv aficionado. Devoted twitter enthusiast. Typical coffee guru. Falls down a lot.,2.0,1.0,8.0,7.0,0.0,5.0,5.0,10


## Creating the New Profile Data

In [62]:
# Instantiating a new DF row to append later
new_profile = pd.DataFrame(columns=raw_df.columns)

# Adding random values for new data
for i in new_profile.columns[1:]:
    new_profile[i] = np.random.randint(0,10,1)

# Printing an user interface for inputting new values
print("Enter new profile information...\n\nExample Bio:\nBacon enthusiast. Falls down a lot. Freelance social media fan. Infuriatingly humble introvert.")

# Asking for new profile data
new_profile['Bios'] = input("Enter a Bio for yourself: ")

# Indexing that new profile data
new_profile.index = [raw_df.index[-1] + 1]

Enter new profile information...

Example Bio:
Bacon enthusiast. Falls down a lot. Freelance social media fan. Infuriatingly humble introvert.


### The New Data

In [63]:
new_profile

Unnamed: 0,Bios,Movies,TV,Religion,Music,Sports,Books,Politics
6600,,5,9,9,4,6,6,7


# Two Approaches
1. Cluster all the profiles again with the new profile

2. Classify the new profile with a classification model trained on our previously clustered data

## Clustering the New Profile Data

In [64]:
# Appending the new data
new_cluster = raw_df.append(new_profile)

### Scaling

In [65]:
# Instantiating the Scaler
scaler = MinMaxScaler()

# Scaling the categories then replacing the old values
df = new_cluster[['Bios']].join(pd.DataFrame(scaler.fit_transform(new_cluster.drop('Bios', axis=1)), columns=new_cluster.columns[1:], index=new_cluster.index))

### Vectorizing

In [66]:
# Instantiating the Vectorizer
vectorizer = CountVectorizer()

# Fitting the vectorizer to the Bios
x = vectorizer.fit_transform(df['Bios'])

# Creating a new DF that contains the vectorized words
df_wrds = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names())

# Concating the words DF with the original DF
new_df = pd.concat([df, df_wrds], axis=1)

# Dropping the Bios because it is no longer needed in place of vectorization
new_df.drop('Bios', axis=1, inplace=True)



### PCA

In [67]:
from sklearn.decomposition import PCA

# Instantiating PCA
pca = PCA()

# Fitting and Transforming the DF
df_pca = pca.fit_transform(new_df)

# Finding the exact number of features that explain at least 99% of the variance in the dataset
total_explained_variance = pca.explained_variance_ratio_.cumsum()
n_over_99 = len(total_explained_variance[total_explained_variance>=.99])
n_to_reach_99 = new_df.shape[1] - n_over_99

# Reducing the dataset to the number of features determined before
pca = PCA(n_components=n_to_reach_99)

# Fitting and transforming the dataset to the stated number of features
df_pca = pca.fit_transform(new_df)

# Seeing the variance ratio that still remains after the dataset has been reduced
pca.explained_variance_ratio_.cumsum()[-1]

0.9898750354519308

### Performing Hierarchical Agglomerative Clustering
- First finding the optimum number of clusters

In [69]:
# Setting the amount of clusters to test out
cluster_cnt = [i for i in range(2, 20, 1)]

# Establishing empty lists to store the scores for the evaluation metrics
ch_scores = []

s_scores = []

db_scores = []

# Looping through different iterations for the number of clusters
# TODO generating errors, possible tqdm.notebook.tqdm?
for i in tqdm(cluster_cnt):
    
    # Clustering with different number of clusters
    hac = AgglomerativeClustering(n_clusters=i)
    
    hac.fit(df_pca)
    
    cluster_assignments = hac.labels_
    
    # Appending the scores to the empty lists
    ch_scores.append(calinski_harabasz_score(df_pca, cluster_assignments))
    
    s_scores.append(silhouette_score(df_pca, cluster_assignments))
    
    db_scores.append(davies_bouldin_score(df_pca, cluster_assignments))

AttributeError: 'function' object has no attribute 'notebook'

### Helper Function to Evaluate the Clusters

In [None]:
def cluster_eval(y, x):
    """
    Prints the scores of a set evaluation metric. Prints out the max and min values of the evaluation scores.
    """
    
    # Creating a DataFrame for returning the max and min scores for each cluster
    df = pd.DataFrame(columns=['Cluster Score'], index=[i for i in range(2, len(y)+2)])
    df['Cluster Score'] = y
    
    print('Max Value:\nCluster #', df[df['Cluster Score']==df['Cluster Score'].max()])
    print('\nMin Value:\nCluster #', df[df['Cluster Score']==df['Cluster Score'].min()])

### Evaluation of Clusters

In [None]:
print("The Calinski-Harabasz Score (find max score):")
cluster_eval(ch_scores, cluster_cnt)

print("\nThe Silhouette Coefficient Score (find max score):")
cluster_eval(s_scores, cluster_cnt)

print("\nThe Davies-Bouldin Score (find minimum score):")
cluster_eval(db_scores, cluster_cnt)

### Running HAC
Again but with the optimum cluster count

In [None]:
# Instantiating HAC
hac = AgglomerativeClustering(n_clusters=12)

# Fitting
hac.fit(df_pca)

# Getting cluster assignments
cluster_assignments = hac.labels_

# Unscaling the categories then replacing the scaled values
df = df[['Bios']].join(pd.DataFrame(scaler.inverse_transform(df.drop('Bios', axis=1)), columns=df.columns[1:], index=df.index))

# Assigning the clusters to each profile
df['Cluster #'] = cluster_assignments


### Finding the Exact Cluster for our New Profile

In [None]:
# Getting the Cluster # for the new profile
profile_cluster = df.loc[new_profile.index]['Cluster #'].values[0]

# Using the Cluster # to narrow down the DF
profile_df = df[df['Cluster #']==profile_cluster].drop('Cluster #', axis=1)

### Vectorizing the Selected Cluster

In [None]:
# Fitting the vectorizer to the Bios
cluster_x = vectorizer.fit_transform(profile_df['Bios'])

# Creating a new DF that contains the vectorized words
cluster_v = pd.DataFrame(cluster_x.toarray(), index=profile_df.index, columns=vectorizer.get_feature_names())

# Joining the Vectorized DF to the previous DF
profile_df = profile_df.join(cluster_v).drop('Bios', axis=1)

### Finding Correlation for Top 10 Similar Profiles to the New Profile

In [None]:
# Trasnposing the DF so that we are correlating with the index(users) and finding the correlation
corr = profile_df.T.corr()

# Finding the Top 10 similar or correlated users to the new user
user_n = new_profile.index[0]

# Creating a DF with the Top 10 most similar profiles
top_10_sim = corr[[user_n]].sort_values(by=[user_n],axis=0, ascending=False)[1:11]

### The Top 10 Profiles most likely to Match with the New Profile
(Sorted by descending similarity)

In [None]:
raw_df.loc[top_10_sim.index]

## Classification of the New Profile

### Importing the Different Classification Models

In [None]:
# Importing 3 models
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

### Vectorizing the Data

In [None]:
# Assigning the split variables
X = cluster_df.drop(["Cluster #"], 1)
y = cluster_df['Cluster #']

## Vectorizing
# Instantiating the Vectorizer
vectorizer = CountVectorizer()

# Fitting the vectorizer to the Bios
x = vectorizer.fit_transform(X['Bios'])

# Creating a new DF that contains the vectorized words
df_wrds = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names())

# Concating the words DF with the original DF
X = pd.concat([X, df_wrds], axis=1)

# Dropping the Bios because it is no longer needed in place of vectorization
X.drop(['Bios'], axis=1, inplace=True)

#### Scaling

In [None]:
# Scaling the Data
scaler = MinMaxScaler()

X = pd.DataFrame(scaler.fit_transform(X), index=X.index, columns=X.columns)

### Preparing the New Profile Data
For Vectorization purposes, the new profile will have to be able to fit into trained data (has to have the same columns).

Two Options:
1. __Vectorized the New Profile data with the vectorizer fitted to the dataset as to not include potentially new vocabulary. _(Keeps dimensionality the same)___
2. Vectorized the New Profile data with a new vectorizer fitted to it in order to include new vocabulary. _(Increases dimensionality with every new piece of data)_

#### Vectorizing

In [None]:
# Vectorizing the new data
vect_new_prof = vectorizer.transform(new_profile['Bios'])

# Quick DF of the vectorized words
new_vect_w = pd.DataFrame(vect_new_prof.toarray(), columns=vectorizer.get_feature_names(), index=new_profile.index)

# Concatenating the DFs for the new profile data
new_vect_prof = pd.concat([new_profile, new_vect_w], 1).drop('Bios', 1)

# Scaling the new profile data
new_vect_prof = pd.DataFrame(scaler.transform(new_vect_prof), columns=new_vect_prof.columns, index=new_vect_prof.index)

In [None]:
new_vect_prof

### Train, test, splitting

In [None]:
# Train, test, split
X_train, X_test, y_train, y_test = train_test_split(X, y)

### Finding the Best Model
- Dummy (Baseline Model)
- KNN
- SVM

In [None]:
# Dummy
dummy = DummyClassifier(strategy='stratified')

# KNN
knn = KNeighborsClassifier()

# SVM
svm = SVC()

# List of models
models = [dummy, knn, svm]

# List of model names
names = ['Dummy', 'KNN', 'SVM']

# Zipping the lists
classifiers = dict(zip(names, models))

Since we are dealing with an imbalanced dataset _(because each cluster is not guaranteed to have the same amount of profiles)_, we will resort to using the __Macro Avg__ and __F1 Score__ for evaluating the performances of each model.

In [None]:
# Dictionary containing the model names and their scores
models_f1 = {}

# Looping through each model's predictions and getting their classification reports
for name, model in classifiers.items():
    # Fitting the model
    model.fit(X_train, y_train)
    
    print('\n'+ name + ' (Macro Avg - F1 Score):')
    
    # Classification Report
    report = classification_report(y_test, model.predict(X_test), output_dict=True)
    f1 = report['macro avg']['f1-score']
    
    # Assigning to the Dictionary
    models_f1[name] = f1
    
    print(f1)

#### Model with the Best Performance

In [None]:
print(max(models_f1, key=models_f1.get), 'Score:', max(models_f1.values()))

### Using the Best Model to Classify the New Profile
_(Optional: Tune the model with GridSearch)_

In [None]:
# Fitting the model
svm.fit(X, y)

# Predicting the New Profile data by determining which Cluster it would belong to
designated_cluster = svm.predict(new_vect_prof)

designated_cluster

### DF containing the Profiles of the Designated Cluster

In [None]:
des_cluster = cluster_df[cluster_df['Cluster #']==designated_cluster[0]]

des_cluster

### Finding the Top 10 Similar Profiles to our New Profile

In [None]:
# Appending the new profile data
des_cluster = des_cluster.append(new_profile, sort=False)

# Fitting the vectorizer to the Bios
cluster_x = vectorizer.fit_transform(des_cluster['Bios'])

# Creating a new DF that contains the vectorized words
cluster_v = pd.DataFrame(cluster_x.toarray(), index=des_cluster.index, columns=vectorizer.get_feature_names())

# Joining the Vectorized DF to the previous DF and dropping columns
des_cluster = des_cluster.join(cluster_v).drop(['Bios', 'Cluster #'], axis=1)

des_cluster

#### Correlations to find similar profiles

In [None]:
# Finding the Top 10 similar or correlated users to the new user
user_n = new_profile.index[0]

# Trasnposing the DF so that we are correlating with the index(users) and finding the correlation
corr = des_cluster.T.corrwith(des_cluster.loc[user_n])

# Creating a DF with the Top 10 most similar profiles
top_10_sim = corr.sort_values(ascending=False)[1:11]

### Top 10 Similar profiles

In [None]:
raw_df.loc[top_10_sim.index]

### Saving the Classification Model
For future use

In [None]:
from joblib import dump

dump(svm, "clf_model.joblib")

## Conclusion on the Two Different Approaches
The results for both approaches are the same.  The new profile ends up in the same cluster whether it is clustered or classified to be there.