# Problem Statement

#### A recently launched online dating site has assigned you the duty of playing Cupid and matching two lovebirds. As a Machine Learning expert, you are required to build a sophisticated model that predicts the match percentage between its users based on multiple attributes, such as their identifiers, preferences, and interests.

# Data Preprocessing:

### Bios Analysis:

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from nltk import FreqDist
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

In [None]:
# Load the raw user table and discard the username column in a single chain.
data = pd.read_csv(
    "../input/hackerearth-love-in-the-time-of-screens/data.csv"
).drop(columns=['username'])
data

In [None]:
lemmatizer = WordNetLemmatizer()

# Stopwords are built once (as a set, for O(1) membership tests) instead of
# being re-read from the NLTK corpus on every call to tokenize().
_STOPWORDS = frozenset(stopwords.words('english')).union([
    "i'm", 'i', "i'd", "i've", 'im', 'ive', 'like', 'also', 'would',
    "i'll", 'year', 'old', 'ago',
])

# One-pass character table: '-' becomes a space, the listed punctuation
# marks and all digits are deleted.
_CHAR_TABLE = str.maketrans('-', ' ', '.,!_1234567890')


def tokenize(text):
    """
    Turn a raw bio into a list of lemmatized, stopword-free tokens.

    Steps: lower-case the text, strip the punctuation/digits listed in
    _CHAR_TABLE (hyphens become spaces), split on any run of whitespace,
    drop stopwords, then lemmatize what is left.

    Parameters
    ----------
    text : str or None/NaN
        The bio text. A missing bio (None or a float NaN) yields [].

    Returns
    -------
    list of str
        The lemmatized tokens, in their original order.
    """
    # A missing value read from the CSV arrives as float NaN; it has no words.
    if text is None or isinstance(text, float):
        return []

    cleaned = text.lower().translate(_CHAR_TABLE)

    # split() with no argument collapses any run of whitespace, so no empty
    # tokens are produced.
    # NOTE(review): stopwords are filtered BEFORE lemmatizing, so e.g. "years"
    # survives as "year" even though "year" is a stopword - confirm intent.
    return [lemmatizer.lemmatize(token)
            for token in cleaned.split()
            if token not in _STOPWORDS]


data['bio'] = data['bio'].apply(tokenize)

In [None]:
# Display the bio column to inspect the result of the tokenization step.
data['bio']

In [None]:
# Flatten every tokenized bio into one long list of words
words = [token for bio in data['bio'] for token in bio]

# The vocabulary is simply the set of distinct tokens
total_vocab = set(words)
print("Number of unique words: ", len(total_vocab))

# Count how often each word occurs across all bios
word_freq = FreqDist(words)
word_freq.most_common(100)

# Bar chart of the 25 most frequent words
plt.style.use('ggplot')
plt.figure(figsize=[15,5])

top_25_words = word_freq.most_common(25)
plt.bar(*zip(*top_25_words))
plt.xticks(rotation=75)
plt.title('Most Frequently Used Words in Bios')

In [None]:
# Association measures used to score the bigrams
bigram_meas = BigramAssocMeasures()

# Collect bigrams from the flat word list.
# NOTE(review): `words` is the concatenation of all bios, so a bigram can
# straddle the end of one bio and the start of the next - confirm this is
# acceptable.
bio_finder = BigramCollocationFinder.from_words(words)

# Score every bigram by its raw frequency
bio_score = bio_finder.score_ngrams(bigram_meas.raw_freq)

# Human-readable "first second" label for each bigram
bigram_list = [pair[0] + ' ' + pair[1] for pair, score in bio_score]

# Matching list of frequency scores
bigram_scores = [score for pair, score in bio_score]

# (label, score) pairs, in the order returned by score_ngrams
bigrams = list(zip(bigram_list, bigram_scores))

# Bar chart of the first 25 bigrams
plt.style.use('bmh')
plt.figure(figsize=(15,5))

first_25_bigrams = bigrams[:25]
plt.bar(*zip(*first_25_bigrams))
plt.xticks(rotation=75)
plt.title('25 Most Common Bigrams')


### Categorical Encoding:

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
"""
Here we process only those columns whose same value between 
two people will favour the match making. Like interest, hobbies, etc.
Same values of Gender, orientation, etc are not favourable.
So we don't process them.
"""

# Keep the object-typed columns, minus identifiers, free text and the
# attributes where equal values do not indicate a better match.
data_categorical = data.select_dtypes('object').drop(
    columns=['user_id', 'sex', 'orientation', 'bio', 'location', 'location_preference']
)


In [None]:
# One shared encoder instance; it is re-fitted for every column
labelencoder = LabelEncoder()

# Build the encoded frame in one go: each categorical column is replaced by
# the integer codes that LabelEncoder assigns to its categories.
data_encoded = pd.DataFrame({
    col: labelencoder.fit_transform(data_categorical[col])
    for col in data_categorical
})

data_encoded

### Concatenation with remaining columns:

In [None]:
"""
Now we concatenate the remaining numerical columns with the encoded dataframe
"""

# Numeric columns go first, followed by the label-encoded columns
numeric_part = data[['age', 'height', 'education_level']]
data_encoded = pd.concat([numeric_part, data_encoded], axis=1)
data_encoded

### Scaling of Encoded Columns:

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Min-max scaler: rescales every column independently
scaler = MinMaxScaler()

# Fit on the encoded frame and get the scaled values back as an array
scaled_values = scaler.fit_transform(data_encoded)

# Wrap the array in a DataFrame again, reusing the original labels
data_encoded = pd.DataFrame(data=scaled_values,
                            index=data_encoded.index,
                            columns=data_encoded.columns)

data_encoded

# Adding New Features:

### Bios Vectorization:

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Instantiate the vectorizer of choice (CountVectorizer with its default
# settings); it is fitted on the bio strings further down in this cell.
vectorizer = CountVectorizer()

# Bio column contains lists instead of strings. Convert them to string
def listtostring(lis):
    """
    Join a list of words into one space-separated string.

    Parameters
    ----------
    lis : iterable of str
        The tokens of a single bio.

    Returns
    -------
    str
        The tokens separated by single spaces; "" for an empty input.
    """
    # str.join builds the result in one pass instead of re-copying the
    # growing string on every iteration.
    return " ".join(lis)

# Turn every token list back into a single string, as the vectorizer expects
data['bio'] = data['bio'].apply(listtostring)

# Fit the vectorizer to bios
bios_vect = vectorizer.fit_transform(data['bio'])

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Create dataframe for vectorized bios (one column per vocabulary word)
bios_df = pd.DataFrame(bios_vect.toarray(), columns=feature_names)

# Sort the columns in descending order of their sums, i.e. most used words first
bios_df = bios_df.reindex(bios_df.sum().sort_values(ascending=False).index, axis=1)

# Concatenate the encoded dataframe with the vectorized bios
data_final = pd.concat([data_encoded,bios_df], axis=1)

data_final

### Principal Component Analysis:

In [None]:
from sklearn.decomposition import PCA

# Instantiate PCA keeping every available component
pca = PCA()

# Fit and transform the final dataframe
data_pca = pca.fit_transform(data_final)

# PCA builds each component from ALL columns of data_final; it does not pick
# a subset of columns, so the column order has no influence on the result.
# What is capped is the number of components: at most
# min(n_samples, n_features) of them exist.
total_variance = pca.explained_variance_ratio_.cumsum()

# Plot cumulative explained variance against the number of components kept.
# The x-axis is derived from the array itself so the lengths always agree.
plt.style.use('bmh')
plt.figure(figsize=(14,4))
plt.plot(range(1, len(total_variance) + 1), total_variance)
plt.title("No. of Features accounting for % of Variance")

# total_variance is non-decreasing, so searchsorted returns the index of the
# first component whose cumulative variance is >= 95%.
first_idx_95 = int(np.searchsorted(total_variance, .95))

# Number of components whose cumulative variance is already >= 95%
n_for_95 = len(total_variance) - first_idx_95

# A component COUNT is the index + 1. Using the bare index (as before) keeps
# one component too few and ends up just below the 95% target. Clamp to the
# number of available components in case 95% is never reached numerically.
n_to_reach_95 = min(first_idx_95 + 1, len(total_variance))

# Print number of features required to retain 95% variance
print("Number of features: {}\nTotal variance: {}".format(n_to_reach_95,total_variance[n_to_reach_95 - 1]))

# Reducing the dataset to number of features determined before
pca = PCA(n_components=n_to_reach_95)

# Fit and transform the dataset to specified number of features.
# NOTE(review): data_pca is not used by the correlation step visible below,
# which works on data_final - confirm whether that is intended.
data_pca = pca.fit_transform(data_final)

# Print the variance ratio after dataset is reduced
print("Achieved Variance ratio: {}".format(pca.explained_variance_ratio_.cumsum()[-1]))

### Correlation:

In [None]:
# corr() correlates columns, so the frame is transposed first: the result is
# a square user-by-user correlation matrix built from the rows of data_final.
users_as_columns = data_final.transpose()
data_corr = users_as_columns.corr()
data_corr

In [None]:
# Persist the raw (unscaled) correlation matrix
data_corr.to_csv(path_or_buf='corr.csv')

In [None]:
# Rescale the correlations to eliminate any negative values generated by corr().
# NOTE(review): MinMaxScaler rescales each column independently, so the
# result is presumably no longer symmetric - confirm this is acceptable.
scaled_corr = scaler.fit_transform(data_corr)

data_corr = pd.DataFrame(data=scaled_corr,
                         index=data_corr.index,
                         columns=data_corr.columns)

data_corr

# Final Processing:

In [None]:
# Independent NumPy copy of the scaled matrix, so that in-place edits made to
# corr_arr do not touch data_corr.
corr_arr = np.array(data_corr, copy=True)
corr_arr

In [None]:
# Zero out every pair of users that cannot be a match.
#
# This replaces a pure-Python double loop that called data.iloc several times
# for each of the n*n pairs with a handful of vectorized array operations.

# same_sex[i, j] is True when users i and j have the same value in 'sex'
sexes = data['sex'].to_numpy()
same_sex = sexes[:, None] == sexes[None, :]

# Per-user orientation flags
is_straight = data['orientation'].eq('straight').to_numpy()
is_gay = data['orientation'].eq('gay').to_numpy()

# rejects[i, j] is True when user i's own orientation rules out user j:
#   * a straight person should not be matched with a person of the same sex
#   * a gay person should not be matched with a person of the opposite sex
#   * a bisexual person (or any other orientation value) rules out nobody
# The rule is applied per person, so mixed-orientation pairs (for example a
# straight and a gay user of the same sex) are excluded as well; the former
# loop only handled pairs in which both users had the same orientation.
rejects = (is_straight[:, None] & same_sex) | (is_gay[:, None] & ~same_sex)

# A match needs both sides, so the pair is blocked if either one rejects
blocked = rejects | rejects.T

# A person can't match with himself: block the diagonal too
np.fill_diagonal(blocked, True)

corr_arr[blocked] = 0
        
        
        

In [None]:
# Display the matrix after the impossible pairs have been set to zero.
corr_arr

In [None]:
# Label both axes with the user ids and express the scores as percentages
user_ids = data['user_id'].tolist()
final_mat = pd.DataFrame(corr_arr, index=user_ids, columns=user_ids) * 100

In [None]:
# Name both axes 'user_id', then write the match matrix to disk
final_mat.columns.name = 'user_id'
final_mat.index.name = 'user_id'
final_mat.to_csv(path_or_buf='Final.csv')

In [None]:
# Display the final user-by-user match percentage matrix.
final_mat