In [1]:
import pandas as pd
import numpy as np
import csv

In [2]:
# Load the dataset in which each profile's description is stored as one single column.
csv_path = 'cupid-small.csv'
data = pd.read_csv(csv_path)
data.head(n=2)

Unnamed: 0,age,body_type,drinks,drugs,education,ethnicity,height,income,job,location,last_online,orientation,sex,sign,religion,smokes,status,description
0,22,a little extra,socially,never,working on college/university,"asian, white",75.0,-1,transportation,"south san francisco, california",2012-06-28-20-30,straight,m,gemini,agnosticism and very serious about it,sometimes,single,about me:<br />\n<br />\ni would love to think...
1,35,average,often,sometimes,working on space camp,white,70.0,80000,hospitality / travel,"oakland, california",2012-06-29-21-41,straight,m,cancer,agnosticism but not too serious about it,no,single,i am a chef: this is what that means.<br />\n1...


In [3]:
# List the distinct ages that occur in the dataset.
data["age"].unique()

array([ 22,  35,  38,  23,  29,  32,  31,  24,  37,  28,  30,  39,  33,
        26,  27,  20,  25,  40,  36,  21,  34,  43,  46,  41,  42,  45,
        18,  55,  50,  59,  44,  48,  54,  51,  62,  52,  19,  58,  66,
        53,  63,  47,  49,  61,  60,  57,  56,  65,  64,  68, 110,  69,
        67])

In [4]:
#categorizing ages.
def age_category(row):
    """Return the age-bucket label for one profile row.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the key ``"age"`` (e.g. a DataFrame row).

    Returns
    -------
    str
        ``"18-25"`` for ages below 26, ``"25-35"`` for ages from 26 up to
        (but not including) 36, and ``"36 and above"`` otherwise.

    Notes
    -----
    The thresholds are contiguous so that fractional ages (e.g. 25.5) land in
    the bucket of their whole-year age instead of falling through to
    ``"36 and above"``. A NaN age fails both comparisons and is therefore
    labelled ``"36 and above"``.
    """
    age = row["age"]
    if age < 26:
        return "18-25"
    if age < 36:
        # The label reads "25-35", but this bucket holds ages 26 through 35.
        return "25-35"
    return "36 and above"
# Label every row with its age bucket, then store the labels as a new column.
age_labels = data.apply(age_category, axis=1)
data["age_category"] = age_labels
data.head(5)

Unnamed: 0,age,body_type,drinks,drugs,education,ethnicity,height,income,job,location,last_online,orientation,sex,sign,religion,smokes,status,description,age_category
0,22,a little extra,socially,never,working on college/university,"asian, white",75.0,-1,transportation,"south san francisco, california",2012-06-28-20-30,straight,m,gemini,agnosticism and very serious about it,sometimes,single,about me:<br />\n<br />\ni would love to think...,18-25
1,35,average,often,sometimes,working on space camp,white,70.0,80000,hospitality / travel,"oakland, california",2012-06-29-21-41,straight,m,cancer,agnosticism but not too serious about it,no,single,i am a chef: this is what that means.<br />\n1...,25-35
2,38,thin,socially,don't know,graduated from masters program,don't know,68.0,-1,don't know,"san francisco, california",2012-06-27-09-10,straight,m,pisces but it doesn&rsquo;t matter,don't know,no,available,"i'm not ashamed of much, but writing public te...",36 and above
3,23,thin,socially,don't know,working on college/university,white,71.0,20000,student,"berkeley, california",2012-06-28-14-22,straight,m,pisces,don't know,no,single,i work in a library and go to school. . . read...,18-25
4,29,athletic,socially,never,graduated from college/university,"asian, black, other",66.0,-1,artistic / musical / writer,"san francisco, california",2012-06-27-21-26,straight,m,aquarius,don't know,no,single,hey how's it going? currently vague on the pro...,25-35


In [5]:
# Number of profiles in each age bucket.
data["age_category"].value_counts()

25-35           11933
36 and above     6890
18-25            6177
Name: age_category, dtype: int64

In [6]:
#giving the user an option to choose their preferred age.
import ipywidgets as widgets
from IPython.display import display

# Age buckets the user can pick from, plus an opt-out choice that disables age filtering.
age_options = ["18-25", "25-35", "36 and above"] + ["don't care"]
# Distinct values of the `sex` column, in order of first appearance.
sex_list = [value for value in data["sex"].unique()]

In [7]:
# Build one dropdown per preference and render them both.
choose_age = widgets.Dropdown(description='Age range:', options=age_options)
choose_sex = widgets.Dropdown(description='Preferd sex:', options=sex_list)
display(choose_age, choose_sex)
# Note: after picking values in the dropdowns, do NOT re-run this cell -- it
# creates fresh widgets, so the selection would be lost. Just move on to the next cell.

Dropdown(description='Age range:', options=('18-25', '25-35', '36 and above', "don't care"), value='18-25')

Dropdown(description='Preferd sex:', options=('m', 'f'), value='m')

In [8]:
# Capture the current dropdown selections as plain values.
chosen_age, chosen_sex = choose_age.value, choose_sex.value
print(chosen_age, chosen_sex)

18-25 m


In [9]:
# Always restrict to the chosen sex; add the age-bucket condition only when
# the user actually picked a bucket.
row_filter = data['sex'] == chosen_sex
if chosen_age != "don't care":
    row_filter = (data['age_category'] == chosen_age) & row_filter
filtered_data = data[row_filter]

In [10]:
# Preview the first three profiles that passed the filter.
filtered_data.iloc[:3]

Unnamed: 0,age,body_type,drinks,drugs,education,ethnicity,height,income,job,location,last_online,orientation,sex,sign,religion,smokes,status,description,age_category
0,22,a little extra,socially,never,working on college/university,"asian, white",75.0,-1,transportation,"south san francisco, california",2012-06-28-20-30,straight,m,gemini,agnosticism and very serious about it,sometimes,single,about me:<br />\n<br />\ni would love to think...,18-25
3,23,thin,socially,don't know,working on college/university,white,71.0,20000,student,"berkeley, california",2012-06-28-14-22,straight,m,pisces,don't know,no,single,i work in a library and go to school. . . read...,18-25
12,24,don't know,often,don't know,don't know,white,72.0,-1,entertainment / media,"san francisco, california",2012-05-28-21-28,straight,m,taurus,other,don't know,single,bang my shit bang,18-25


In [11]:
# Number of profiles that passed the filter.
filtered_data.shape[0]

3770

In [12]:
import re

def clean_text_list(text_list):
    """Clean every text in ``text_list`` and return the results as a new list.

    Each item is normalised by ``clean_text``, the same function used for the
    user's own description, so the corpus and the query are always
    preprocessed identically.

    Parameters
    ----------
    text_list : iterable of str
        Raw description texts.

    Returns
    -------
    list of str
        One cleaned string per input item, in the same order.
    """
    # ``clean_text`` is resolved when this function is *called*, so it only
    # has to be defined before the first call, not before this definition.
    return [clean_text(text) for text in text_list]



In [13]:
#Cleaning the input description of user.
def clean_text(text):
    """Normalise a free-text description for TF-IDF vectorisation.

    Steps, in order: lowercase; replace HTML tags with a space; decode HTML
    entities; drop every character that is neither an ASCII letter nor
    whitespace; collapse whitespace runs to a single space; trim the ends.

    Parameters
    ----------
    text : str
        Raw description text, possibly containing HTML markup.

    Returns
    -------
    str
        The cleaned text: lowercase ASCII letters separated by single spaces.
    """
    import html  # local import keeps this cell self-contained

    text = text.lower()
    # Strip any HTML tag, not only <br>. Requiring a letter after "<" (or "</")
    # keeps plain-text uses such as "<3" from being treated as a tag.
    text = re.sub(r"</?[a-z][^>]*>", " ", text)
    # Decode entities (&amp;, &rsquo;, ...) so their names do not survive as
    # junk letters once the non-alphabetic characters are removed below.
    text = html.unescape(text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = re.sub(r"\s+", " ", text)

    return text.strip()

In [14]:
# only selecting the descriptions of people the user has filtered in the dataset.
# A missing description becomes an empty string rather than the literal text
# "nan", which would otherwise enter the TF-IDF vocabulary as a real word.
# The list keeps one entry per row of `filtered_data`, in row order.
descriptions = [
    "" if pd.isna(description) else str(description)
    for description in filtered_data['description']
]

In [15]:

# Clean every selected description, then report how many there are.
cleaned_descriptions = clean_text_list(descriptions)
print(len(cleaned_descriptions))


3770


For the similarity analysis I will be using TfidfVectorizer. TF (term frequency) measures how often a word occurs within a document, while IDF (inverse document frequency) measures how rare that word is across the whole collection of documents. The vectorizer combines these two metrics to create a numerical vector for each document. Words that appear frequently within a document but are rare across the corpus receive higher weights, emphasizing their significance. This numerical representation is essential for machine learning tasks involving text, such as document classification and information retrieval, as it captures the relative importance of words in distinguishing and analyzing documents.

In [16]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# Fit TF-IDF on the cleaned descriptions of the filtered profiles.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(cleaned_descriptions)

#Calculating the similarity between the input string and all descriptions. The metric we are using here is cosine similarity.
input_string = input("please ener your description.")
cleaned_input = clean_text(input_string)
input_vector = vectorizer.transform([cleaned_input])
similarity_scores = cosine_similarity(input_vector, tfidf_matrix)[0]

#Getting the indices of the top 5 matching descriptions (best match first).
top_indices = np.argsort(similarity_scores)[-5:][::-1]
# A score of 0 means the profile shares no known word with the input, so it is
# not a match at all; without this filter, arbitrary profiles would be shown
# whenever the input has little or no overlap with the corpus.
top_indices = top_indices[similarity_scores[top_indices] > 0]

if len(top_indices) == 0:
    print("No matching descriptions found. Try describing yourself with different words.")
else:
    print(f"Top {len(top_indices)} matching descriptions: \n")
    for index in top_indices:
        print(cleaned_descriptions[index]+"\n")

please ener your description.my name is sulav. I like to have sex.
Top 5 matching descriptions: 

just tryna have some fun on here so lets get it poppin school and working out well youll see sex friends fam money music and more sex the marines sex food chilling is lol you tryna do something

working on the degree and searching for career jobs art and design school and sports long eye lashes i am into too many movies i own about of them as far as music goes i will listen to just about anything sex sports art sex sports architecture psychology astrophysics philosophy thinking about what i should do that night i am using this site to find a sex partner with no strings attached you are attractive and want sex

i can be harsh at times but its honesty i like to have fun and go out but i never object to staying in and hanging out sex smile food sex alcohol my family money heart the future out partying and having a good time you know youre sexy because self confidence is in my book

i am alway