# Adding a New Dating Profile
Using Classification or Clustering for a New Dating Profile

### Importing Libraries and Data

In [24]:
import pandas as pd
pd.set_option('display.max_colwidth', 500)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import _pickle as pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import calinski_harabasz_score, silhouette_score, davies_bouldin_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from tqdm import tqdm_notebook as tqdm

#### Loading the Profiles

In [3]:
# Loading in the cleaned DF
with open("profiles.pkl",'rb') as fp:
    raw_df = pickle.load(fp)

# Viewing the DF    
raw_df.head()

Unnamed: 0,Bios,Movies,TV,Religion,Music,Sports,Books,Politics
0,Typical twitter fanatic. Infuriatingly humble thinker. Lifelong coffee practitioner. Organizer.,5,3,4,1,3,6,7
1,Web junkie. Analyst. Infuriatingly humble introvert. Food nerd. Lifelong music fanatic. Coffee lover.,7,9,5,1,9,4,0
2,Avid web maven. Food practitioner. Gamer. Twitter fanatic. Pop culture scholar. Zombie evangelist.,1,2,6,5,6,5,4
3,Twitteraholic. Extreme web fanatic. Food buff. Infuriatingly humble entrepreneur.,5,2,7,8,2,6,6
4,Bacon enthusiast. Falls down a lot. Freelance social media fan. Infuriatingly humble introvert.,6,6,6,4,3,6,3


#### Loading the Clustered Profiles

In [4]:
# Loading in the clustered DF
with open("clustered_profiles.pkl",'rb') as fp:
    cluster_df = pickle.load(fp)

# Viewing the DF    
cluster_df.tail()

Unnamed: 0,Bios,Movies,TV,Religion,Music,Sports,Books,Politics,Cluster #
6595,Typical pop culture nerd. Infuriatingly humble internet maven. Alcohol evangelist.,7.0,9.0,0.0,0.0,2.0,2.0,4.0,9
6596,Avid web junkie. Lifelong alcohol guru. Hardcore reader. Award-winning twitter evangelist.,4.0,3.0,6.0,3.0,7.0,7.0,2.0,2
6597,Music ninja. Bacon fanatic. Reader. Total communicator. Unapologetic beer specialist.,1.0,4.0,0.0,4.0,9.0,2.0,5.0,0
6598,Communicator. Bacon lover. Award-winning introvert. Amateur internet ninja.,6.0,2.0,0.0,3.0,8.0,9.0,1.0,9
6599,Unapologetic tv aficionado. Devoted twitter enthusiast. Typical coffee guru. Falls down a lot.,2.0,1.0,8.0,7.0,0.0,5.0,5.0,10


## Creating the New Profile Data

In [6]:
# Instantiating a new DF row to append later
new_profile = pd.DataFrame(columns=raw_df.columns)

# Adding random values for new data
for i in new_profile.columns[1:]:
    new_profile[i] = np.random.randint(0,10,1)

# Printing an user interface for inputting new values
print("Enter new profile information...\n\nExample Bio:\nBacon enthusiast. Falls down a lot. Freelance social media fan. Infuriatingly humble introvert.")

# Asking for new profile data
new_profile['Bios'] = input("Enter a Bio for yourself: ")

# Indexing that new profile data
new_profile.index = [raw_df.index[-1] + 1]

Enter new profile information...

Example Bio:
Bacon enthusiast. Falls down a lot. Freelance social media fan. Infuriatingly humble introvert.
Enter a Bio for yourself: food lover. social media fanatic. extraordinarily humble. life lover.


### The New Data

In [7]:
new_profile

Unnamed: 0,Bios,Movies,TV,Religion,Music,Sports,Books,Politics
6600,food lover. social media fanatic. extraordinarily humble. life lover.,0,2,1,0,8,1,2


# Two Approaches
1. Cluster all the profiles again with the new profile

2. Classify the new profile with a classification model trained on our previously clustered data

## Clustering the New Profile Data

In [35]:
# Appending the new data
new_cluster = raw_df.append(new_profile)

Unnamed: 0,Bios,Movies,TV,Religion,Music,Sports,Books,Politics
6596,Avid web junkie. Lifelong alcohol guru. Hardcore reader. Award-winning twitter evangelist.,4,3,6,3,7,7,2
6597,Music ninja. Bacon fanatic. Reader. Total communicator. Unapologetic beer specialist.,1,4,0,4,9,2,5
6598,Communicator. Bacon lover. Award-winning introvert. Amateur internet ninja.,6,2,0,3,8,9,1
6599,Unapologetic tv aficionado. Devoted twitter enthusiast. Typical coffee guru. Falls down a lot.,2,1,8,7,0,5,5
6600,food lover. social media fanatic. extraordinarily humble. life lover.,4,9,3,6,6,0,3


### Scaling

In [37]:
# Instantiating the Scaler
scaler = MinMaxScaler()

# Scaling the categories then replacing the old values
df = new_cluster[['Bios']].join(pd.DataFrame(scaler.fit_transform(new_cluster.drop('Bios', axis=1)), columns=new_cluster.columns[1:], index=new_cluster.index))

  return self.partial_fit(X, y)


Unnamed: 0,Bios,Movies,TV,Religion,Music,Sports,Books,Politics
0,Typical twitter fanatic. Infuriatingly humble thinker. Lifelong coffee practitioner. Organizer.,0.555556,0.333333,0.444444,0.111111,0.333333,0.666667,0.777778
1,Web junkie. Analyst. Infuriatingly humble introvert. Food nerd. Lifelong music fanatic. Coffee lover.,0.777778,1.000000,0.555556,0.111111,1.000000,0.444444,0.000000
2,Avid web maven. Food practitioner. Gamer. Twitter fanatic. Pop culture scholar. Zombie evangelist.,0.111111,0.222222,0.666667,0.555556,0.666667,0.555556,0.444444
3,Twitteraholic. Extreme web fanatic. Food buff. Infuriatingly humble entrepreneur.,0.555556,0.222222,0.777778,0.888889,0.222222,0.666667,0.666667
4,Bacon enthusiast. Falls down a lot. Freelance social media fan. Infuriatingly humble introvert.,0.666667,0.666667,0.666667,0.444444,0.333333,0.666667,0.333333
5,Pop culture junkie. Tv buff. Reader. Friendly travel expert. Incurable twitteraholic. Social media ninja. Coffee enthusiast. Internet specialist.,0.000000,0.555556,0.777778,0.555556,1.000000,0.222222,0.000000
6,Typical thinker. Amateur explorer. Reader. Extreme student. Tv fanatic. Social media ninja.,0.555556,0.444444,0.777778,0.555556,0.000000,0.888889,0.444444
7,Zombie maven. Travel geek. Professional social media buff. Avid pop culture lover.,1.000000,0.888889,0.888889,0.222222,0.000000,0.000000,0.555556
8,Lifelong introvert. General travel maven. Hipster-friendly web trailblazer. Writer. Alcohol fan. Student. Communicator. Coffee guru.,0.888889,0.888889,0.222222,0.666667,0.111111,0.222222,1.000000
9,Travel ninja. Amateur pop culture evangelist. Web fanatic. Freelance communicator. Zombie geek.,1.000000,0.000000,0.222222,0.222222,0.555556,0.666667,0.888889


### Vectorizing

In [38]:
# Instantiating the Vectorizer
vectorizer = CountVectorizer()

# Fitting the vectorizer to the Bios
x = vectorizer.fit_transform(df['Bios'])

# Creating a new DF that contains the vectorized words
df_wrds = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names())

# Concating the words DF with the original DF
new_df = pd.concat([df, df_wrds], axis=1)

# Dropping the Bios because it is no longer needed in place of vectorization
new_df.drop('Bios', axis=1, inplace=True)

### PCA

In [39]:
from sklearn.decomposition import PCA

# Instantiating PCA
pca = PCA()

# Fitting and Transforming the DF
df_pca = pca.fit_transform(new_df)

# Finding the exact number of features that explain at least 99% of the variance in the dataset
total_explained_variance = pca.explained_variance_ratio_.cumsum()
n_over_99 = len(total_explained_variance[total_explained_variance>=.99])
n_to_reach_99 = new_df.shape[1] - n_over_99

# Reducing the dataset to the number of features determined before
pca = PCA(n_components=n_to_reach_99)

# Fitting and transforming the dataset to the stated number of features
df_pca = pca.fit_transform(new_df)

# Seeing the variance ratio that still remains after the dataset has been reduced
pca.explained_variance_ratio_.cumsum()[-1]

0.9898433044921403

### Performing Hierarchical Agglomerative Clustering
- First finding the optimum number of clusters

In [42]:
# Setting the amount of clusters to test out
cluster_cnt = [i for i in range(2, 20, 1)]

# Establishing empty lists to store the scores for the evaluation metrics
ch_scores = []

s_scores = []

db_scores = []

# Looping through different iterations for the number of clusters
for i in tqdm(cluster_cnt):
    
    # Clustering with different number of clusters
    hac = AgglomerativeClustering(n_clusters=i)
    
    hac.fit(df_pca)
    
    cluster_assignments = hac.labels_
    
    # Appending the scores to the empty lists
    ch_scores.append(calinski_harabasz_score(df_pca, cluster_assignments))
    
    s_scores.append(silhouette_score(df_pca, cluster_assignments))
    
    db_scores.append(davies_bouldin_score(df_pca, cluster_assignments))

HBox(children=(IntProgress(value=0, max=18), HTML(value='')))

  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) / centroid_distances
  score = (intra_dists[:, None] + intra_dists) /




  score = (intra_dists[:, None] + intra_dists) / centroid_distances


### Helper Function to Evaluate the Clusters

In [43]:
def cluster_eval(y, x):
    """
    Prints the scores of a set evaluation metric. Prints out the max and min values of the evaluation scores.
    """
    
    # Creating a DataFrame for returning the max and min scores for each cluster
    df = pd.DataFrame(columns=['Cluster Score'], index=[i for i in range(2, len(y)+2)])
    df['Cluster Score'] = y
    
    print('Max Value:\nCluster #', df[df['Cluster Score']==df['Cluster Score'].max()])
    print('\nMin Value:\nCluster #', df[df['Cluster Score']==df['Cluster Score'].min()])

### Evaluation of Clusters

In [45]:
print("The Calinski-Harabasz Score (find max score):")
cluster_eval(ch_scores, cluster_cnt)

print("\nThe Silhouette Coefficient Score (find max score):")
cluster_eval(s_scores, cluster_cnt)

print("\nThe Davies-Bouldin Score (find minimum score):")
cluster_eval(db_scores, cluster_cnt)

The Calinski-Harabasz Score (find highest score):
Max Value:
Cluster #    Cluster Score
2     120.008244

Min Value:
Cluster #     Cluster Score
19      57.657188

The Silhouette Coefficient Score (find highest score):
Max Value:
Cluster #     Cluster Score
12        0.03048

Min Value:
Cluster #    Cluster Score
2        0.01585

The Davies-Bouldin Score (find lowest score):
Max Value:
Cluster #     Cluster Score
19       5.449615

Min Value:
Cluster #     Cluster Score
12       4.006027


### Running HAC
Again but with the optimum cluster count

In [46]:
# Instantiating HAC
hac = AgglomerativeClustering(n_clusters=12)

# Fitting
hac.fit(df_pca)

# Getting cluster assignments
cluster_assignments = hac.labels_

# Unscaling the categories then replacing the scaled values
df = df[['Bios']].join(pd.DataFrame(scaler.inverse_transform(df.drop('Bios', axis=1)), columns=df.columns[1:], index=df.index))

# Assigning the clusters to each profile
df['Cluster #'] = cluster_assignments


### Finding the Exact Cluster for our New Profile

In [66]:
# Getting the Cluster # for the new profile
profile_cluster = df.loc[new_profile.index]['Cluster #'].values[0]

# Using the Cluster # to narrow down the DF
profile_df = df[df['Cluster #']==profile_cluster].drop('Cluster #', axis=1)

Unnamed: 0,Bios,Movies,TV,Religion,Music,Sports,Books,Politics
6,Typical thinker. Amateur explorer. Reader. Extreme student. Tv fanatic. Social media ninja.,5.0,4.0,7.0,5.0,0.0,8.0,4.0
14,Travel buff. Student. Alcohol trailblazer. Passionate social media ninja. Entrepreneur. Incurable writer.,0.0,4.0,3.0,9.0,6.0,4.0,5.0
17,Beer practitioner. Coffee expert. Tv guru. Social media junkie. Bacon enthusiast. General student. Food nerd.,1.0,4.0,8.0,1.0,3.0,5.0,3.0
46,Explorer. Social media buff. Evil bacon nerd. Friendly zombie evangelist. Typical travel advocate.,4.0,9.0,7.0,6.0,0.0,6.0,2.0
52,Problem solver. Internet evangelist. Devoted social media fanatic. Passionate twitter geek.,3.0,6.0,8.0,9.0,3.0,1.0,3.0
74,Hardcore beer practitioner. Typical bacon evangelist. Certified social media buff.,3.0,7.0,8.0,2.0,1.0,0.0,7.0
85,Freelance social media nerd. Internet ninja. Introvert. Beer evangelist. Alcohol enthusiast. Bacon geek. Troublemaker.,2.0,8.0,2.0,5.0,4.0,9.0,1.0
88,Typical bacon aficionado. Internet guru. Food lover. Music enthusiast. Twitter geek. Social media ninja.,8.0,0.0,8.0,1.0,4.0,3.0,5.0
89,Introvert. Freelance music expert. Devoted social media advocate. Evil reader. Zombie fanatic. Professional analyst.,4.0,6.0,6.0,3.0,4.0,0.0,6.0
97,Thinker. Lifelong travelaholic. Alcohol enthusiast. Incurable social media lover. Communicator.,5.0,8.0,2.0,8.0,0.0,9.0,8.0


### Vectorizing the Selected Cluster

In [67]:
# Fitting the vectorizer to the Bios
cluster_x = vectorizer.fit_transform(profile_df['Bios'])

# Creating a new DF that contains the vectorized words
cluster_v = pd.DataFrame(cluster_x.toarray(), index=profile_df.index, columns=vectorizer.get_feature_names())

# Joining the Vectorized DF to the previous DF
profile_df = profile_df.join(cluster_v).drop('Bios', axis=1)

### Finding Correlation for Top 10 Similar Profiles to the New Profile

In [73]:
# Trasnposing the DF so that we are correlating with the index(users) and finding the correlation
corr = profile_df.T.corr()

# Finding the Top 10 similar or correlated users to the new user
user_n = new_profile.index[0]

# Creating a DF with the Top 10 most similar profiles
top_10_sim = corr[[user_n]].sort_values(by=[user_n],axis=0, ascending=False)[1:11]

### The Top 10 Profiles most likely to Match with the New Profile
(Sorted by descending similarity)

In [72]:
raw_df.loc[top_10_sim.index]

Unnamed: 0,Bios,Movies,TV,Religion,Music,Sports,Books,Politics
4703,Devoted reader. Bacon aficionado. Lifelong internet specialist. Food fan. Extreme twitter buff. Friendly coffee enthusiast. Social media lover.,6,9,1,9,9,1,7
4931,Coffee junkie. Social media ninja. Typical twitter specialist. Tvaholic. Student.,6,8,1,5,5,0,1
4362,Total bacon fanatic. Professional troublemaker. Proud pop culture lover. Hipster-friendly social media evangelist.,9,9,3,9,7,2,1
2718,General beer advocate. Hipster-friendly introvert. Social media nerd. Gamer. Alcohol geek. Professional writer.,2,9,1,6,8,1,2
367,Organizer. Professional alcoholaholic. Hipster-friendly social media fanatic. Total zombie evangelist. Gamer.,4,9,1,9,8,0,0
4553,Alcohol trailblazer. Passionate creator. Typical social media junkie. Avid zombie fanatic. Gamer.,3,8,5,5,9,2,2
3711,Internet lover. General social media advocate. Hardcore music maven. Web buff. Devoted tv fan. Zombie expert.,6,8,5,6,8,0,8
5480,Internet expert. Social media scholar. Hipster-friendly zombie maven. Amateur tv buff.,5,9,3,9,7,5,5
4814,Entrepreneur. Certified foodaholic. Unapologetic thinker. Incurable travel guru. Wannabe alcohol buff.,4,9,3,6,7,4,6
1823,Devoted reader. Alcoholaholic. Coffee nerd. Evil organizer. Analyst. Passionate troublemaker.,5,8,5,9,9,0,9


## Classification of the New Profile

### Importing the Different Classification Models

In [8]:
# Importing 3 models
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

### Vectorizing the Data

In [9]:
# Assigning the split variables
X = cluster_df.drop(["Cluster #"], 1)
y = cluster_df['Cluster #']

## Vectorizing
# Instantiating the Vectorizer
vectorizer = CountVectorizer()

# Fitting the vectorizer to the Bios
x = vectorizer.fit_transform(X['Bios'])

# Creating a new DF that contains the vectorized words
df_wrds = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names())

# Concating the words DF with the original DF
X = pd.concat([X, df_wrds], axis=1)

# Dropping the Bios because it is no longer needed in place of vectorization
X.drop(['Bios'], axis=1, inplace=True)

#### Scaling

In [10]:
# Scaling the Data
scaler = MinMaxScaler()

X = pd.DataFrame(scaler.fit_transform(X), index=X.index, columns=X.columns)

  return self.partial_fit(X, y)


### Preparing the New Profile Data
For Vectorization purposes, the new profile will have to be able to fit into trained data (has to have the same columns).

Two Options:
1. __Vectorized the New Profile data with the vectorizer fitted to the dataset as to not include potentially new vocabulary. _(Keeps dimensionality the same)___
2. Vectorized the New Profile data with a new vectorizer fitted to it in order to include new vocabulary. _(Increases dimensionality with every new piece of data)_

#### Vectorizing

In [11]:
# Vectorizing the new data
vect_new_prof = vectorizer.transform(new_profile['Bios'])

# Quick DF of the vectorized words
new_vect_w = pd.DataFrame(vect_new_prof.toarray(), columns=vectorizer.get_feature_names(), index=new_profile.index)

# Concatenating the DFs for the new profile data
new_vect_prof = pd.concat([new_profile, new_vect_w], 1).drop('Bios', 1)

# Scaling the new profile data
new_vect_prof = pd.DataFrame(scaler.transform(new_vect_prof), columns=new_vect_prof.columns, index=new_vect_prof.index)

In [12]:
new_vect_prof

Unnamed: 0,Movies,TV,Religion,Music,Sports,Books,Politics,advocate,aficionado,alcohol,...,unable,unapologetic,wannabe,web,webaholic,winning,with,writer,zombie,zombieaholic
6600,0.0,0.222222,0.111111,0.0,0.888889,0.111111,0.222222,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Train, test, splitting

In [13]:
# Train, test, split
X_train, X_test, y_train, y_test = train_test_split(X, y)

### Finding the Best Model
- Dummy (Baseline Model)
- KNN
- SVM

In [14]:
# Dummy
dummy = DummyClassifier(strategy='stratified')

# KNN
knn = KNeighborsClassifier()

# SVM
svm = SVC()

# List of models
models = [dummy, knn, svm]

# List of model names
names = ['Dummy', 'KNN', 'SVM']

# Zipping the lists
classifiers = dict(zip(names, models))

Since we are dealing with an imbalanced dataset _(because each cluster is not guaranteed to have the same amount of profiles)_, we will resort to using the __Macro Avg__ and __F1 Score__ for evaluating the performances of each model.

In [15]:
# Dictionary containing the model names and their scores
models_f1 = {}

# Looping through each model's predictions and getting their classification reports
for name, model in classifiers.items():
    # Fitting the model
    model.fit(X_train, y_train)
    
    print('\n'+ name + ' (Macro Avg - F1 Score):')
    
    # Classification Report
    report = classification_report(y_test, model.predict(X_test), output_dict=True)
    f1 = report['macro avg']['f1-score']
    
    # Assigning to the Dictionary
    models_f1[name] = f1
    
    print(f1)


Dummy (Macro Avg - F1 Score):
0.08128343063172593

KNN (Macro Avg - F1 Score):
0.8119051962081735





SVM (Macro Avg - F1 Score):
0.8706956551299337


#### Model with the Best Performance

In [16]:
print(max(models_f1, key=models_f1.get), 'Score:', max(models_f1.values()))

SVM Score: 0.8706956551299337


### Using the Best Model to Classify the New Profile
_(Optional: Tune the model with GridSearch)_

In [17]:
# Fitting the model
svm.fit(X, y)

# Predicting the New Profile data by determining which Cluster it would belong to
designated_cluster = svm.predict(new_vect_prof)

designated_cluster



array([4])

### DataFrame containing the Profiles of the Designated Cluster

In [25]:
des_cluster = cluster_df[cluster_df['Cluster #']==designated_cluster[0]]

des_cluster

Unnamed: 0,Bios,Movies,TV,Religion,Music,Sports,Books,Politics,Cluster #
6,Typical thinker. Amateur explorer. Reader. Extreme student. Tv fanatic. Social media ninja.,5.0,4.0,7.0,5.0,0.0,8.0,4.0,4
14,Travel buff. Student. Alcohol trailblazer. Passionate social media ninja. Entrepreneur. Incurable writer.,0.0,4.0,3.0,9.0,6.0,4.0,5.0,4
17,Beer practitioner. Coffee expert. Tv guru. Social media junkie. Bacon enthusiast. General student. Food nerd.,1.0,4.0,8.0,1.0,3.0,5.0,3.0,4
46,Explorer. Social media buff. Evil bacon nerd. Friendly zombie evangelist. Typical travel advocate.,4.0,9.0,7.0,6.0,0.0,6.0,2.0,4
52,Problem solver. Internet evangelist. Devoted social media fanatic. Passionate twitter geek.,3.0,6.0,8.0,9.0,3.0,1.0,3.0,4
62,Professional twitter junkie. Travel advocate. Hardcore food aficionado. Explorer. Friendly coffee trailblazer. Reader.,6.0,4.0,9.0,2.0,1.0,2.0,9.0,4
74,Hardcore beer practitioner. Typical bacon evangelist. Certified social media buff.,3.0,7.0,8.0,2.0,1.0,0.0,7.0,4
85,Freelance social media nerd. Internet ninja. Introvert. Beer evangelist. Alcohol enthusiast. Bacon geek. Troublemaker.,2.0,8.0,2.0,5.0,4.0,9.0,1.0,4
88,Typical bacon aficionado. Internet guru. Food lover. Music enthusiast. Twitter geek. Social media ninja.,8.0,0.0,8.0,1.0,4.0,3.0,5.0,4
89,Introvert. Freelance music expert. Devoted social media advocate. Evil reader. Zombie fanatic. Professional analyst.,4.0,6.0,6.0,3.0,4.0,0.0,6.0,4


### Finding the Top 10 Similar Profiles to a new user

In [26]:
# Appending the new profile data
des_cluster = des_cluster.append(new_profile, sort=False)

# Fitting the vectorizer to the Bios
cluster_x = vectorizer.fit_transform(des_cluster['Bios'])

# Creating a new DF that contains the vectorized words
cluster_v = pd.DataFrame(cluster_x.toarray(), index=des_cluster.index, columns=vectorizer.get_feature_names())

# Joining the Vectorized DF to the previous DF and dropping columns
des_cluster = des_cluster.join(cluster_v).drop(['Bios', 'Cluster #'], axis=1)

des_cluster

Unnamed: 0,Movies,TV,Religion,Music,Sports,Books,Politics,advocate,aficionado,alcohol,...,twitteraholic,typical,unapologetic,wannabe,web,webaholic,winning,writer,zombie,zombieaholic
6,5.0,4.0,7.0,5.0,0.0,8.0,4.0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
14,0.0,4.0,3.0,9.0,6.0,4.0,5.0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
17,1.0,4.0,8.0,1.0,3.0,5.0,3.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
46,4.0,9.0,7.0,6.0,0.0,6.0,2.0,1,0,0,...,0,1,0,0,0,0,0,0,1,0
52,3.0,6.0,8.0,9.0,3.0,1.0,3.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
62,6.0,4.0,9.0,2.0,1.0,2.0,9.0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
74,3.0,7.0,8.0,2.0,1.0,0.0,7.0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
85,2.0,8.0,2.0,5.0,4.0,9.0,1.0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
88,8.0,0.0,8.0,1.0,4.0,3.0,5.0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
89,4.0,6.0,6.0,3.0,4.0,0.0,6.0,1,0,0,...,0,0,0,0,0,0,0,0,1,0


#### Correlations to find similar profiles

In [33]:
# Finding the Top 10 similar or correlated users to the new user
user_n = new_profile.index[0]

# Trasnposing the DF so that we are correlating with the index(users) and finding the correlation
corr = des_cluster.T.corrwith(des_cluster.loc[user_n])

# Creating a DF with the Top 10 most similar profiles
top_10_sim = corr.sort_values(ascending=False)[1:11]

538     0.845239
5336    0.808115
5972    0.805976
3311    0.805842
1314    0.796451
1367    0.785936
2387    0.784476
3685    0.762458
4087    0.760820
3364    0.757635
dtype: float64

### Top 10 Similar profiles

In [22]:
raw_df.loc[top_10_sim.index]

Unnamed: 0,Bios,Movies,TV,Religion,Music,Sports,Books,Politics
538,Typical bacon aficionado. Internet guru. Food lover. Music enthusiast. Twitter geek. Social media ninja.,1,3,1,2,8,0,0
5336,Evil social media ninja. Food expert. Internetaholic. Entrepreneur. Passionate beer advocate. Certified reader. Analyst.,1,5,0,1,9,5,1
5972,Communicator. Bacon fanatic. Proud writer. Alcohol lover. Friendly internet fanatic.,1,4,1,5,9,1,5
3311,Zombieaholic. Food lover. Pop culture expert. Hardcore social media scholar. Music maven.,2,4,3,0,7,2,6
1314,Certified alcohol specialist. Introvert. General social media aficionado. Music maven. Beer advocate. Incurable gamer.,0,1,1,2,8,4,1
1367,Typical food expert. Freelance coffee ninja. Social media junkie. General beer evangelist.,5,3,2,1,9,0,1
2387,Hardcore social media enthusiast. General internet fanatic. Tv trailblazer. Avid web advocate. Music scholar. Zombie guru.,0,3,5,3,9,5,4
3685,Extreme student. Social media buff. Twitter nerd. Organizer. Beer lover. Travel fanatic.,0,4,3,6,8,1,4
4087,Gamer. Zombie enthusiast. Social media guru. Tv practitioner. Evil music fanatic. Student.,1,1,5,2,8,5,3
3364,Total social media evangelist. Devoted internet practitioner. Zombie scholar.,0,3,2,4,8,5,6


### Saving the Classification Model
For future use

In [23]:
from joblib import dump

dump(svm, "clf_model.joblib")

['clf_model.joblib']

## Conclusion on the Two Different Approaches
The results for both approaches are the same.  The new profile ends up in the same cluster whether it is clustered or classified to be there.