# Adding a New Dating Profile
Using Classification or Clustering for a New Dating Profile

### Importing Libraries and Data

In [2]:
import pandas as pd
pd.set_option('display.max_colwidth', 500)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import _pickle as pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import calinski_harabasz_score, silhouette_score, davies_bouldin_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from tqdm import tqdm_notebook as tqdm

#### Loading the Profiles

In [3]:
# Loading in the cleaned DF
with open("Data-Gathering-Creation\profiles.pkl",'rb') as fp:
    raw_df = pickle.load(fp)

# Viewing the DF    
raw_df.head()

Unnamed: 0,Bios,Movies,TV,Religion,Music,Sports,Books,Politics
0,Evil communicator. Avid analyst. Freelance gamer. Beer specialist. Incurable troublemaker. Typical entrepreneur.,8,1,2,8,4,7,5
1,Alcohol lover. Evil analyst. Infuriatingly humble gamer. Future teen idol. Entrepreneur.,2,1,1,4,5,1,3
2,Amateur pop culture ninja. Social media expert. Future teen idol. Web lover. Travel advocate. Student. Music scholar.,8,2,0,8,7,8,1
3,Wannabe baconaholic. Music evangelist. Hardcore alcohol ninja. Social media fan.,7,3,8,6,2,4,1
4,Wannabe creator. Student. Social media enthusiast. Amateur gamer. Twitter ninja. Friendly problem solver. Devoted web guru.,2,9,2,2,8,8,8


#### Loading the Clustered Profiles

In [4]:
# Loading in the clustered DF
with open("clustered_profiles.pkl",'rb') as fp:
    cluster_df = pickle.load(fp)

# Viewing the DF    
cluster_df.tail()

Unnamed: 0,Bios,Movies,TV,Religion,Music,Sports,Books,Politics,Cluster #
6235,Freelance web aficionado. Coffee nerd. Twitter maven. Devoted pop culture guru. Tv scholar. Avid explorer.,5.0,8.0,8.0,0.0,8.0,0.0,0.0,4
6236,Bacon lover. Organizer. Subtly charming music advocate. Travel ninja. Passionate reader. Avid web fan. Extreme pop culture scholar.,9.0,2.0,2.0,8.0,2.0,3.0,2.0,0
6237,Unapologetic coffee advocate. Professional tv geek. Alcohol expert. Social media fan. Pop culture practitioner.,6.0,3.0,2.0,8.0,6.0,5.0,8.0,4
6238,Subtly charming pop culture aficionado. Social mediaholic. Bacon junkie. Wannabe twitter advocate. Friendly internet practitioner.,8.0,6.0,5.0,3.0,4.0,9.0,4.0,11
6239,Incurable explorer. Devoted communicator. Subtly charming organizer. Thinker. Beer maven. Avid web specialist. Twitter ninja.,2.0,3.0,8.0,2.0,0.0,1.0,1.0,0


## Creating the New Profile Data

In [5]:
# Instantiating a new DF row to append later
new_profile = pd.DataFrame(columns=raw_df.columns)

# Adding random values for new data
for i in new_profile.columns[1:]:
    new_profile[i] = np.random.randint(0,10,1)

# Printing an user interface for inputting new values
print("Enter new profile information...\n\nExample Bio:\nBacon enthusiast. Falls down a lot. Freelance social media fan. Infuriatingly humble introvert.")

# Asking for new profile data
new_profile['Bios'] = input("Enter a Bio for yourself: ")

# Indexing that new profile data
new_profile.index = [raw_df.index[-1] + 1]

Enter new profile information...

Example Bio:
Bacon enthusiast. Falls down a lot. Freelance social media fan. Infuriatingly humble introvert.
Enter a Bio for yourself: Gaming enthusiast. Falls down a lot. Freelance. Infuriatingly humble introvert.


### The New Data

In [6]:
new_profile

Unnamed: 0,Bios,Movies,TV,Religion,Music,Sports,Books,Politics
6240,Gaming enthusiast. Falls down a lot. Freelance. Infuriatingly humble introvert.,0,4,4,1,0,0,8


# Two Approaches
1. Cluster all the profiles again with the new profile

2. Classify the new profile with a classification model trained on our previously clustered data

## Clustering the New Profile Data

In [9]:
# Appending the new data
new_cluster = raw_df.append(new_profile)

  new_cluster = raw_df.append(new_profile)


### Scaling

In [10]:
# Instantiating the Scaler
scaler = MinMaxScaler()

# Scaling the categories then replacing the old values
df = new_cluster[['Bios']].join(pd.DataFrame(scaler.fit_transform(new_cluster.drop('Bios', axis=1)), columns=new_cluster.columns[1:], index=new_cluster.index))

### Vectorizing

In [11]:
# Instantiating the Vectorizer
vectorizer = CountVectorizer()

# Fitting the vectorizer to the Bios
x = vectorizer.fit_transform(df['Bios'])

# Creating a new DF that contains the vectorized words
df_wrds = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names())

# Concating the words DF with the original DF
new_df = pd.concat([df, df_wrds], axis=1)

# Dropping the Bios because it is no longer needed in place of vectorization
new_df.drop('Bios', axis=1, inplace=True)



### PCA

In [12]:
from sklearn.decomposition import PCA

# Instantiating PCA
pca = PCA()

# Fitting and Transforming the DF
df_pca = pca.fit_transform(new_df)

# Finding the exact number of features that explain at least 99% of the variance in the dataset
total_explained_variance = pca.explained_variance_ratio_.cumsum()
n_over_99 = len(total_explained_variance[total_explained_variance>=.99])
n_to_reach_99 = new_df.shape[1] - n_over_99

# Reducing the dataset to the number of features determined before
pca = PCA(n_components=n_to_reach_99)

# Fitting and transforming the dataset to the stated number of features
df_pca = pca.fit_transform(new_df)

# Seeing the variance ratio that still remains after the dataset has been reduced
pca.explained_variance_ratio_.cumsum()[-1]

0.9898303364752749

### Performing Hierarchical Agglomerative Clustering
- First finding the optimum number of clusters

In [14]:
# Setting the amount of clusters to test out
cluster_cnt = [i for i in range(2, 20, 1)]

# Establishing empty lists to store the scores for the evaluation metrics
ch_scores = []

s_scores = []

db_scores = []

# Looping through different iterations for the number of clusters
for i in tqdm(cluster_cnt):
    
    # Clustering with different number of clusters
    hac = AgglomerativeClustering(n_clusters=i)
    
    hac.fit(df_pca)
    
    cluster_assignments = hac.labels_
    
    # Appending the scores to the empty lists
    ch_scores.append(calinski_harabasz_score(df_pca, cluster_assignments))
    
    s_scores.append(silhouette_score(df_pca, cluster_assignments))
    
    db_scores.append(davies_bouldin_score(df_pca, cluster_assignments))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm(cluster_cnt):


  0%|          | 0/18 [00:00<?, ?it/s]

### Helper Function to Evaluate the Clusters

In [15]:
def cluster_eval(y, x):
    """
    Prints the scores of a set evaluation metric. Prints out the max and min values of the evaluation scores.
    """
    
    # Creating a DataFrame for returning the max and min scores for each cluster
    df = pd.DataFrame(columns=['Cluster Score'], index=[i for i in range(2, len(y)+2)])
    df['Cluster Score'] = y
    
    print('Max Value:\nCluster #', df[df['Cluster Score']==df['Cluster Score'].max()])
    print('\nMin Value:\nCluster #', df[df['Cluster Score']==df['Cluster Score'].min()])

### Evaluation of Clusters

In [16]:
print("The Calinski-Harabasz Score (find max score):")
cluster_eval(ch_scores, cluster_cnt)

print("\nThe Silhouette Coefficient Score (find max score):")
cluster_eval(s_scores, cluster_cnt)

print("\nThe Davies-Bouldin Score (find minimum score):")
cluster_eval(db_scores, cluster_cnt)

The Calinski-Harabasz Score (find max score):
Max Value:
Cluster #    Cluster Score
2     110.751817

Min Value:
Cluster #     Cluster Score
19      73.169492

The Silhouette Coefficient Score (find max score):
Max Value:
Cluster #     Cluster Score
19       0.048401

Min Value:
Cluster #    Cluster Score
2        0.01563

The Davies-Bouldin Score (find minimum score):
Max Value:
Cluster #    Cluster Score
2       6.149872

Min Value:
Cluster #     Cluster Score
18       3.543702


### Running HAC
Again but with the optimum cluster count

In [17]:
# Instantiating HAC
hac = AgglomerativeClustering(n_clusters=12)

# Fitting
hac.fit(df_pca)

# Getting cluster assignments
cluster_assignments = hac.labels_

# Unscaling the categories then replacing the scaled values
df = df[['Bios']].join(pd.DataFrame(scaler.inverse_transform(df.drop('Bios', axis=1)), columns=df.columns[1:], index=df.index))

# Assigning the clusters to each profile
df['Cluster #'] = cluster_assignments


### Finding the Exact Cluster for our New Profile

In [18]:
# Getting the Cluster # for the new profile
profile_cluster = df.loc[new_profile.index]['Cluster #'].values[0]

# Using the Cluster # to narrow down the DF
profile_df = df[df['Cluster #']==profile_cluster].drop('Cluster #', axis=1)

### Vectorizing the Selected Cluster

In [19]:
# Fitting the vectorizer to the Bios
cluster_x = vectorizer.fit_transform(profile_df['Bios'])

# Creating a new DF that contains the vectorized words
cluster_v = pd.DataFrame(cluster_x.toarray(), index=profile_df.index, columns=vectorizer.get_feature_names())

# Joining the Vectorized DF to the previous DF
profile_df = profile_df.join(cluster_v).drop('Bios', axis=1)



### Finding Correlation for Top 10 Similar Profiles to the New Profile

In [20]:
# Trasnposing the DF so that we are correlating with the index(users) and finding the correlation
corr = profile_df.T.corr()

# Finding the Top 10 similar or correlated users to the new user
user_n = new_profile.index[0]

# Creating a DF with the Top 10 most similar profiles
top_10_sim = corr[[user_n]].sort_values(by=[user_n],axis=0, ascending=False)[1:11]

### The Top 10 Profiles most likely to Match with the New Profile
(Sorted by descending similarity)

In [21]:
raw_df.loc[top_10_sim.index]

Unnamed: 0,Bios,Movies,TV,Religion,Music,Sports,Books,Politics
1957,Proud music nerd. Food evangelist. Travelaholic. Evil web fanatic. Future teen idol. Analyst.,1,5,5,1,0,0,9
3055,Falls down a lot. Infuriatingly humble internet practitioner. Freelance alcohol scholar. Friendly entrepreneur. Travel trailblazer. Reader.,1,6,5,2,3,3,7
1068,Food junkie. Organizer. Devoted music scholar. Alcohol enthusiast. Hardcore communicator.,1,7,2,1,3,0,9
593,Zombie evangelist. Troublemaker. Bacon junkie. Tv maven. Travel buff.,0,7,7,3,1,2,7
2643,Writer. Evil coffeeaholic. Organizer. Food advocate. Amateur bacon aficionado. Travel expert. Student.,0,7,9,0,4,0,9
2639,Falls down a lot. Extreme web advocate. Award-winning music trailblazer. Friendly student.,2,3,6,5,2,2,9
6192,Beer fanatic. Travel geek. Pop cultureaholic. Friendly thinker. Twitter lover. Evil bacon nerd.,3,5,3,3,0,2,9
4702,Total analyst. Friendly beer advocate. Amateur communicator. Pop culture enthusiast.,3,8,3,0,1,0,8
2185,Friendly bacon expert. Tv nerd. Food advocate. Zombie fan. Coffee maven. Student. Gamer. Amateur web scholar.,0,3,5,1,1,3,8
4090,Hardcore twitter practitioner. Devoted organizer. Avid student. Beer lover. Extreme explorer.,3,7,4,2,4,1,9


## Classification of the New Profile

### Importing the Different Classification Models

In [22]:
# Importing 3 models
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

### Vectorizing the Data

In [23]:
# Assigning the split variables
X = cluster_df.drop(["Cluster #"], 1)
y = cluster_df['Cluster #']

## Vectorizing
# Instantiating the Vectorizer
vectorizer = CountVectorizer()

# Fitting the vectorizer to the Bios
x = vectorizer.fit_transform(X['Bios'])

# Creating a new DF that contains the vectorized words
df_wrds = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names())

# Concating the words DF with the original DF
X = pd.concat([X, df_wrds], axis=1)

# Dropping the Bios because it is no longer needed in place of vectorization
X.drop(['Bios'], axis=1, inplace=True)

  X = cluster_df.drop(["Cluster #"], 1)


#### Scaling

In [24]:
# Scaling the Data
scaler = MinMaxScaler()

X = pd.DataFrame(scaler.fit_transform(X), index=X.index, columns=X.columns)

### Preparing the New Profile Data
For Vectorization purposes, the new profile will have to be able to fit into trained data (has to have the same columns).

Two Options:
1. __Vectorized the New Profile data with the vectorizer fitted to the dataset as to not include potentially new vocabulary. _(Keeps dimensionality the same)___
2. Vectorized the New Profile data with a new vectorizer fitted to it in order to include new vocabulary. _(Increases dimensionality with every new piece of data)_

#### Vectorizing

In [25]:
# Vectorizing the new data
vect_new_prof = vectorizer.transform(new_profile['Bios'])

# Quick DF of the vectorized words
new_vect_w = pd.DataFrame(vect_new_prof.toarray(), columns=vectorizer.get_feature_names(), index=new_profile.index)

# Concatenating the DFs for the new profile data
new_vect_prof = pd.concat([new_profile, new_vect_w], 1).drop('Bios', 1)

# Scaling the new profile data
new_vect_prof = pd.DataFrame(scaler.transform(new_vect_prof), columns=new_vect_prof.columns, index=new_vect_prof.index)

  new_vect_prof = pd.concat([new_profile, new_vect_w], 1).drop('Bios', 1)
  new_vect_prof = pd.concat([new_profile, new_vect_w], 1).drop('Bios', 1)


In [26]:
new_vect_prof

Unnamed: 0,Movies,TV,Religion,Music,Sports,Books,Politics,advocate,aficionado,alcohol,...,unable,unapologetic,wannabe,web,webaholic,winning,with,writer,zombie,zombieaholic
6240,0.0,0.444444,0.444444,0.111111,0.0,0.0,0.888889,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Train, test, splitting

In [27]:
# Train, test, split
X_train, X_test, y_train, y_test = train_test_split(X, y)

### Finding the Best Model
- Dummy (Baseline Model)
- KNN
- SVM

In [28]:
# Dummy
dummy = DummyClassifier(strategy='stratified')

# KNN
knn = KNeighborsClassifier()

# SVM
svm = SVC()

# List of models
models = [dummy, knn, svm]

# List of model names
names = ['Dummy', 'KNN', 'SVM']

# Zipping the lists
classifiers = dict(zip(names, models))

Since we are dealing with an imbalanced dataset _(because each cluster is not guaranteed to have the same amount of profiles)_, we will resort to using the __Macro Avg__ and __F1 Score__ for evaluating the performances of each model.

In [29]:
# Dictionary containing the model names and their scores
models_f1 = {}

# Looping through each model's predictions and getting their classification reports
for name, model in classifiers.items():
    # Fitting the model
    model.fit(X_train, y_train)
    
    print('\n'+ name + ' (Macro Avg - F1 Score):')
    
    # Classification Report
    report = classification_report(y_test, model.predict(X_test), output_dict=True)
    f1 = report['macro avg']['f1-score']
    
    # Assigning to the Dictionary
    models_f1[name] = f1
    
    print(f1)


Dummy (Macro Avg - F1 Score):
0.08762384403794359

KNN (Macro Avg - F1 Score):
0.9980606943087409

SVM (Macro Avg - F1 Score):
1.0


#### Model with the Best Performance

In [30]:
print(max(models_f1, key=models_f1.get), 'Score:', max(models_f1.values()))

SVM Score: 1.0


### Using the Best Model to Classify the New Profile
_(Optional: Tune the model with GridSearch)_

In [31]:
# Fitting the model
svm.fit(X, y)

# Predicting the New Profile data by determining which Cluster it would belong to
designated_cluster = svm.predict(new_vect_prof)

designated_cluster

array([1], dtype=int64)

### DF containing the Profiles of the Designated Cluster

In [33]:
des_cluster = cluster_df[cluster_df['Cluster #']==designated_cluster[0]]

des_cluster

Unnamed: 0,Bios,Movies,TV,Religion,Music,Sports,Books,Politics,Cluster #
0,Evil communicator. Avid analyst. Freelance gamer. Beer specialist. Incurable troublemaker. Typical entrepreneur.,8.0,1.0,2.0,8.0,4.0,7.0,5.0,1
5,Beer trailblazer. Avid troublemaker. Pop culture specialist. Freelance explorer. Reader. Gamer.,7.0,4.0,5.0,5.0,8.0,1.0,4.0,1
10,Evil pop culture lover. Passionate travel advocate. Beer nerd. Gamer. Internet aficionado. Freelance bacon evangelist.,2.0,3.0,7.0,2.0,9.0,2.0,2.0,1
15,Evil communicator. Avid analyst. Freelance gamer. Beer specialist. Incurable troublemaker. Typical entrepreneur.,3.0,7.0,0.0,2.0,3.0,8.0,1.0,1
20,Beer trailblazer. Avid troublemaker. Pop culture specialist. Freelance explorer. Reader. Gamer.,1.0,0.0,6.0,7.0,2.0,6.0,0.0,1
...,...,...,...,...,...,...,...,...,...
6190,Alcohol specialist. Incurable web ninja. Travel guru. Zombie fanatic. Hardcore explorer.,1.0,6.0,7.0,3.0,3.0,8.0,4.0,1
6192,Beer fanatic. Travel geek. Pop cultureaholic. Friendly thinker. Twitter lover. Evil bacon nerd.,3.0,5.0,3.0,3.0,0.0,2.0,9.0,1
6197,Future teen idol. Travel trailblazer. Proud alcohol guru. Communicator. Foodaholic. Twitter maven. Bacon buff.,5.0,3.0,0.0,7.0,7.0,0.0,5.0,1
6212,Future teen idol. Travel trailblazer. Proud alcohol guru. Communicator. Foodaholic. Twitter maven. Bacon buff.,1.0,7.0,5.0,5.0,9.0,8.0,6.0,1


### Finding the Top 10 Similar Profiles to our New Profile

In [34]:
# Appending the new profile data
des_cluster = des_cluster.append(new_profile, sort=False)

# Fitting the vectorizer to the Bios
cluster_x = vectorizer.fit_transform(des_cluster['Bios'])

# Creating a new DF that contains the vectorized words
cluster_v = pd.DataFrame(cluster_x.toarray(), index=des_cluster.index, columns=vectorizer.get_feature_names())

# Joining the Vectorized DF to the previous DF and dropping columns
des_cluster = des_cluster.join(cluster_v).drop(['Bios', 'Cluster #'], axis=1)

des_cluster

  des_cluster = des_cluster.append(new_profile, sort=False)


Unnamed: 0,Movies,TV,Religion,Music,Sports,Books,Politics,advocate,aficionado,alcohol,...,tvaholic,twitter,twitteraholic,typical,unapologetic,wannabe,web,winning,writer,zombie
0,8.0,1.0,2.0,8.0,4.0,7.0,5.0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
5,7.0,4.0,5.0,5.0,8.0,1.0,4.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,2.0,3.0,7.0,2.0,9.0,2.0,2.0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
15,3.0,7.0,0.0,2.0,3.0,8.0,1.0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
20,1.0,0.0,6.0,7.0,2.0,6.0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6192,3.0,5.0,3.0,3.0,0.0,2.0,9.0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
6197,5.0,3.0,0.0,7.0,7.0,0.0,5.0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
6212,1.0,7.0,5.0,5.0,9.0,8.0,6.0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
6227,3.0,8.0,6.0,9.0,8.0,4.0,4.0,0,0,1,...,0,1,0,0,0,0,0,0,0,0


#### Correlations to find similar profiles

In [35]:
# Finding the Top 10 similar or correlated users to the new user
user_n = new_profile.index[0]

# Trasnposing the DF so that we are correlating with the index(users) and finding the correlation
corr = des_cluster.T.corrwith(des_cluster.loc[user_n])

# Creating a DF with the Top 10 most similar profiles
top_10_sim = corr.sort_values(ascending=False)[1:11]

### Top 10 Similar profiles

In [36]:
raw_df.loc[top_10_sim.index]

Unnamed: 0,Bios,Movies,TV,Religion,Music,Sports,Books,Politics
1957,Proud music nerd. Food evangelist. Travelaholic. Evil web fanatic. Future teen idol. Analyst.,1,5,5,1,0,0,9
3055,Falls down a lot. Infuriatingly humble internet practitioner. Freelance alcohol scholar. Friendly entrepreneur. Travel trailblazer. Reader.,1,6,5,2,3,3,7
593,Zombie evangelist. Troublemaker. Bacon junkie. Tv maven. Travel buff.,0,7,7,3,1,2,7
2639,Falls down a lot. Extreme web advocate. Award-winning music trailblazer. Friendly student.,2,3,6,5,2,2,9
6192,Beer fanatic. Travel geek. Pop cultureaholic. Friendly thinker. Twitter lover. Evil bacon nerd.,3,5,3,3,0,2,9
2412,Unapologetic pop culture specialist. Tv buff. Bacon enthusiast. Creator. Freelance beer advocate. Lifelong reader.,1,8,4,1,2,3,7
6027,Beer fanatic. Travel geek. Pop cultureaholic. Friendly thinker. Twitter lover. Evil bacon nerd.,0,4,3,2,4,0,8
3714,Travel practitioner. Music maven. Professional problem solver. Web fanatic. Communicator. Twitter lover. Proud coffee specialist.,3,8,5,0,1,3,8
4199,Wannabe explorer. Avid introvert. Evil communicator. Music guru. Falls down a lot.,1,7,2,6,1,2,8
5947,Gamer. Bacon nerd. Internet maven. Avid travel junkie. Beer evangelist. Tv fanatic. Social mediaholic. Proud analyst.,4,6,6,4,4,0,9


### Saving the Classification Model
For future use

In [37]:
from joblib import dump

dump(svm, "clf_model.joblib")

['clf_model.joblib']

## Conclusion on the Two Different Approaches
The results for both approaches are the same.  The new profile ends up in the same cluster whether it is clustered or classified to be there.