# Treating categorical and continuous variables separately.

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import hamming

In [2]:
og_sheet = pd.read_excel('find_my_buddy.csv.xlsx', sheet_name='Buddy Sheet')

In [3]:
og_sheet['Full Name'].drop(522)

0      Sachin Haldavanekar
1            Sanket Saurav
2               Viha Bidre
3          Sougata Dafader
4       Athul Muralidharan
              ...         
517    Ankur Bambharoliya 
518          Sanket Mathur
519             Rohit Naik
520       Deepak Chandwani
521               Nivedita
Name: Full Name, Length: 522, dtype: object

In [4]:
og_sheet

Unnamed: 0,Full Name,Email address,Gender,Hometown,Currently living in,Your Decision,Looking for a room mate?,Course,"Please Enter your Branch (If it was not listed in drop down, else leave blank)",Are you open to other branch room mate?,...,I am looking for a room mate,Your Hobbies,Facebook profile link (optional),i20 amount,VISA status,Planned VISA interview date,VISA consulate - city,Do you need a Flight mate?,"If yes, then Flight Date?",Any other comments ?
0,Sachin Haldavanekar,sachinhaldavanekar@gmail.com,Male,Mumbai,Mumbai,"Yes, Enrolled (Paid $100 ) & Finalized.",Yes,MS Computer Science,,Yes,...,Who can cook sometimes,,https://m.facebook.com/sachinhaldavanekar,43.0,Yet to schedule,2017-06-06,Mumbai,Yes,2017-08-22,
1,Sanket Saurav,snkt087@gmail.com,Male,Patna,Bangalore,Enrolled (Paid $100 ),Yes,MS Computer Science,,Yes,...,Who can cook sometimes,"Gaming, Travelling",https://www.facebook.com/reachsanket,43.0,Yet to schedule,2017-06-02,Chennai,No,2017-08-22,Go Huskies..!!
2,Viha Bidre,bidreviha@yahoo.com,Female,Davanagere,Bangalore,"Yes, Enrolled (Paid $100 ) & Finalized.",Yes,MS Computer Science,,Yes,...,Who can cook sometimes,"Photography, travelling, DIY art.",,43.0,Yet to schedule,2017-05-15,Bangalore,Yes,2017-08-24,
3,Sougata Dafader,sougatadafader@gmail.com,Male,Kolkata,Kolkata,"Yes, Enrolled (Paid $100 ) & Finalized.",Yes,MS Computer Science,,Yes,...,Who can cook sometimes,"Badminton, TT, football, egames like dota, Cs",facebook.com/sdafader,43.0,Yet to schedule,2017-05-29,Kolkata,Yes,2017-08-16,Looking for roommates.
4,Athul Muralidharan,athulmurali@gmail.com,Male,Chennai,Chennai,"Yes, Enrolled (Paid $100 ) & Finalized.",No,MS Computer Science,,Yes,...,Who can cook sometimes,Watching movies & making music,https://www.facebook.com/athul.karthik.3,43.0,Yet to schedule,2017-05-19,Chennai,Yes,2017-08-21,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
518,Sanket Mathur,sanket0024@gmail.com,Male,Jaipur,Jaipur,"Yes, Enrolled (Paid $100 ) & Finalized.",Yes,MS Computer Science,,No,...,Who can cook sometimes,,https://www.facebook.com/sanket.mathur24,43.0,Yet to schedule,2017-07-07,Delhi,No,NaT,
519,Rohit Naik,rohit.naik.0708@gmail.com,Male,Navi Mumbai,Navi Mumbai,"Yes, Enrolled (Paid $100 ) & Finalized.",Yes,MS Information Systems,,Yes,...,Who can cook sometimes,,,,Yet to schedule,2017-07-11,Mumbai,No,NaT,
520,Deepak Chandwani,deepakchandwani92@gmail.com,Male,Raipur,Raipur,Enrolled (Paid $100 ),Yes,MS Information Systems,,Yes,...,Who can cook sometimes,,,43.0,Yet to schedule,2017-07-12,Mumbai,No,NaT,
521,Nivedita,niveditac79@gmail.com,Female,Indore,Indore,"Yes, Enrolled (Paid $100 ) & Finalized.",Yes,MS Engineering Management,,Yes,...,Who can cook sometimes,,,,Accepted,NaT,,No,NaT,


In [5]:
meta_data = pd.read_csv('budd_new_22_6_18.csv', index_col= 0)

In [6]:
meta_data.columns

Index(['gender', 'current_city', 'open_to_other_branch', 'work_ex',
       'dist_from_uni', 'rent_budget', 'alcohol', 'smoking', 'food_pref',
       'cul_skills', 'looking_for_roommate', 'MS Biotechnology',
       'MS Civil Engineering', 'MS Computer Science',
       'MS Computer Systems Engineering', 'MS Data Science',
       'MS Electrical and Computer Engineering', 'MS Energy Systems',
       'MS Engineering Management', 'MS Industrial Engineering',
       'MS Information Assurance and Cyber Security', 'MS Information Systems',
       'MS Mechanical Engineering', 'MS Project Management', 'Others',
       'hall_yes_no', 'max_ppr', 'bhk_1', 'bhk_2', 'bhk_3', 'bhk_4', 'studio'],
      dtype='object')

In [7]:
meta_data = meta_data.drop(labels=['looking_for_roommate', 'Others'], axis = 1)

In [8]:
name_list = list(og_sheet['Full Name'].drop(522))

In [9]:
name_list

['Sachin Haldavanekar',
 'Sanket Saurav',
 'Viha Bidre',
 'Sougata Dafader',
 'Athul Muralidharan',
 'Megha Suvarna ',
 'Vishnu',
 'Srashti Kasliwal',
 'Balakumaran',
 'Pranav',
 'Harshil Navin Dhruve',
 'Shravan kumar reddy ',
 'Tejas Harish',
 'Rohan Taori',
 'Z',
 'Mrunal Ghanwat',
 'Varun Sundar Rabindranath',
 'Kaustubh Shivdikar',
 'Pratik Devikar',
 'Saumil Patel',
 'Keerthan Bharadwaj',
 'Sreejith',
 'Rushaabh Saliya',
 'Srinidhi Shankar',
 'Harsh Sheth',
 'Nipun Midha',
 'Varad',
 'Chaitanya Joshi',
 'Cyril Mathew ',
 'VYOM RAJ',
 'Bharathwaj',
 'Sibendu Dey',
 'Akshat Shukla',
 'Pragadeesh Suresh babu',
 'Samruddhi Pai Kakode',
 'Gaurav Gandhi',
 'VENKATA SAI TARUN UPPALA',
 'Grishma Thakkar',
 'Abhinn Ankit',
 'Chirayu Desai',
 'Abhishek Ahuja',
 'Ninad Kanetkar',
 'Prasanna Kumar Challa',
 'Sagar',
 'Saurabh',
 'Aman Mankar',
 'Anup Srivastava',
 'ganesh surada',
 'Naman Bhargava',
 'Vikas Singh',
 'Prashant Reddy',
 'Poojith Shankar Shetty',
 'Prashant Reddy',
 'Vipul Shar

In [10]:
meta_data

Unnamed: 0,gender,current_city,open_to_other_branch,work_ex,dist_from_uni,rent_budget,alcohol,smoking,food_pref,cul_skills,...,MS Information Systems,MS Mechanical Engineering,MS Project Management,hall_yes_no,max_ppr,bhk_1,bhk_2,bhk_3,bhk_4,studio
0,0,0,0,3.5,3.0,550,1,1,0,0,...,0,0,0,0,3.0,1,1,0,0,0
1,0,1,0,3.5,3.0,550,0,0,0,1,...,0,0,0,0,3.0,1,1,1,1,0
2,1,1,0,1.8,3.0,550,0,1,0,0,...,0,0,0,0,2.0,1,1,1,0,0
3,0,2,0,3.5,5.0,500,0,0,0,0,...,0,0,0,1,3.0,1,1,1,1,0
4,0,3,0,2.0,10.0,700,0,1,1,0,...,0,0,0,0,3.0,1,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517,0,0,0,3.0,10.0,550,0,0,0,1,...,0,0,0,1,4.0,1,1,1,1,1
518,0,27,1,3.0,3.0,500,0,1,0,0,...,0,0,0,0,2.0,1,1,1,1,1
519,0,23,0,4.0,10.0,500,2,1,2,0,...,1,0,0,0,2.0,0,1,1,0,0
520,0,11,0,2.0,5.0,500,1,1,1,0,...,1,0,0,0,3.0,1,1,0,0,0


## Test area

In [11]:
print("Enter Index No. to get Recommendation for that person ")
print("Note: number should be between 1 to 520")
x=int(input())

Enter Index No. to get Recommendation for that person 
Note: number should be between 1 to 520
100


In [12]:
test_person = meta_data.iloc[[x]]  

In [13]:
test_person

Unnamed: 0,gender,current_city,open_to_other_branch,work_ex,dist_from_uni,rent_budget,alcohol,smoking,food_pref,cul_skills,...,MS Information Systems,MS Mechanical Engineering,MS Project Management,hall_yes_no,max_ppr,bhk_1,bhk_2,bhk_3,bhk_4,studio
100,0,5,0,0.0,1.0,500,0,0,0,0,...,0,0,0,0,4.0,0,1,0,0,0


In [14]:
meta_data[meta_data['gender'] == 0].iloc[[57]]

Unnamed: 0,gender,current_city,open_to_other_branch,work_ex,dist_from_uni,rent_budget,alcohol,smoking,food_pref,cul_skills,...,MS Information Systems,MS Mechanical Engineering,MS Project Management,hall_yes_no,max_ppr,bhk_1,bhk_2,bhk_3,bhk_4,studio
65,0,0,0,2.0,3.0,700,0,1,1,0,...,0,0,0,0,4.0,0,1,0,0,0


In [15]:
def get_cont_cat(dataframe, var_type):
    '''
    Function to split and return the dataframe by the variable type needed.
    
    I/P:
    'dataframe': whole data as a pandas dataframe
    'var_type': variable type, "cont" or "cat", string
    
    O/P:
    'dataframe': dataframe with only columns of the needed variable type
    '''
    
    # Convert any series to dataframe
    if not isinstance(dataframe, pd.DataFrame):
        print('ip is not dataframe')
    cont_cols = ['work_ex', 'dist_from_uni', 'rent_budget']
    
    
    if var_type == 'cont':
        return dataframe[cont_cols]
    
    
    elif var_type == 'cat':
        return dataframe.drop(labels = cont_cols, axis = 1)
    
    
    else: raise ValueError('Variable type should be either "cont" or "cat"')

In [16]:
test_p_cont = np.array(get_cont_cat(test_person, 'cont'))

In [17]:
db_cont = np.array(get_cont_cat(meta_data, 'cont'))

In [18]:
test_p_cont.shape

(1, 3)

In [19]:
db_cont.shape

(522, 3)

In [20]:
test_p_cont

array([[  0.,   1., 500.]])

In [21]:
euclidean_distances(test_p_cont, db_cont).shape

(1, 522)

In [22]:
def get_cont_dist(person, database, metric):
    
    '''
    Function that returns a distance matrix for continuous features of the data.
    Feature normalization happens in here.
    I/P:
    'person': 1 x 3 numpy array of 1 person
    'database': (total data length) x 3 numpy array
    'metric': distance metric to be used for computation. One of: euclidean.
    
    O/P:
    'distance_matrix': 1 x (total data length) numpy array of distances
    '''

    to_std = np.vstack((person, database))
    
    all_std = StandardScaler().fit_transform(to_std)
    person_std = all_std[0,:].reshape(1,-1)
    database_std = all_std[1:,:]
    
    if metric == 'euclidean':
        cont_distance_matrix = euclidean_distances(person_std, database_std)
        return cont_distance_matrix

In [23]:


def get_cat_dist(person, database, metric):
    '''
    Function that returns a distance matrix for categorical features of the data.
    I/P:
    'person': 1 x 3 numpy array of 1 person
    'database': (total data length) x 3 numpy array
    'metric': distance metric to be used for computation. One of: hamming
    
    O/P:
    'distance_matrix': 1 x (total data length) numpy array of distances
    '''
    cat_distance_matrix = []
    if metric == 'hamming':
        database_df = pd.DataFrame(database)
        for index, c_row in database_df.iterrows():
            cat_distance_matrix.append(hamming(person, c_row))
    return(np.array(cat_distance_matrix)) 


In [24]:
test_cat = get_cont_cat(test_person, 'cat')
database_cat = get_cont_cat(meta_data[meta_data['gender'] == 0], 'cat')
get_cat_dist(test_cat.to_numpy().ravel(), database_cat.to_numpy(), 'hamming')[57]


0.14814814814814814

In [25]:
#get_cat_dist(test_cat, get_cont_cat(meta_data[meta_data['gender'] == 0], 'cat'), 'hamming')[57]

test_cat_array = get_cont_cat(test_person, 'cat').to_numpy().ravel()
database_cat_array = get_cont_cat(meta_data[meta_data['gender'] == 0], 'cat').to_numpy()
get_cat_dist(test_cat_array, database_cat_array, 'hamming')[57]


0.14814814814814814

In [26]:
#get_cat_dist(test_cat, get_cont_cat(meta_data, 'cat'), 'hamming')
test_cat_array = get_cont_cat(test_person, 'cat').to_numpy().ravel()
database_cat_array = get_cont_cat(meta_data, 'cat').to_numpy()
get_cat_dist(test_cat_array, database_cat_array, 'hamming')


array([0.22222222, 0.25925926, 0.25925926, 0.25925926, 0.2962963 ,
       0.18518519, 0.22222222, 0.37037037, 0.11111111, 0.14814815,
       0.14814815, 0.14814815, 0.11111111, 0.22222222, 0.25925926,
       0.2962963 , 0.18518519, 0.2962963 , 0.2962963 , 0.14814815,
       0.22222222, 0.22222222, 0.25925926, 0.18518519, 0.2962963 ,
       0.22222222, 0.25925926, 0.25925926, 0.2962963 , 0.33333333,
       0.14814815, 0.22222222, 0.18518519, 0.2962963 , 0.14814815,
       0.33333333, 0.18518519, 0.33333333, 0.25925926, 0.2962963 ,
       0.14814815, 0.14814815, 0.18518519, 0.18518519, 0.14814815,
       0.14814815, 0.22222222, 0.2962963 , 0.25925926, 0.14814815,
       0.18518519, 0.18518519, 0.18518519, 0.18518519, 0.25925926,
       0.18518519, 0.33333333, 0.18518519, 0.07407407, 0.22222222,
       0.25925926, 0.25925926, 0.25925926, 0.18518519, 0.22222222,
       0.14814815, 0.22222222, 0.37037037, 0.14814815, 0.22222222,
       0.2962963 , 0.22222222, 0.18518519, 0.25925926, 0.25925

In [27]:

test_cat = get_cont_cat(test_person, 'cat').to_numpy().ravel()
test_cat2 = get_cont_cat(meta_data.iloc[[34]], 'cat').to_numpy().ravel()


In [28]:
hamming(test_cat, test_cat2)

0.14814814814814814

In [29]:
def findRoommate(new_person, database, n_roommates, alpha, beta):
    # Split data by gender to reduce computations
    database_g = database[database['gender'] == new_person.iloc[0]['gender']]
    name_g = [name_list[i] for i in list(database_g.index)]

    # Split new datapoint into continuous and categorical sets
    new_person_cont = get_cont_cat(new_person, 'cont').to_numpy().flatten()
    new_person_cat = get_cont_cat(new_person, 'cat').to_numpy().flatten()

    # Split database into continuous and categorical sets
    database_cont = get_cont_cat(database_g, 'cont').to_numpy()
    database_cat = get_cont_cat(database_g, 'cat').to_numpy()

    # Get distances for both continuous and categorical sets
    dist_cont = get_cont_dist(new_person_cont, database_cont, 'euclidean')
    dist_cat = get_cat_dist(new_person_cat, database_cat, 'hamming')

    # Create final distance matrix of weighted average
    final_dist = alpha*dist_cont + beta*dist_cat

    
    # Sort the distance matrix to get top n roommates
    top_n_matches = np.argsort(final_dist)[0][1 : n_roommates + 1]
    
    
    top_n_dict = {"index": top_n_matches.tolist(),
                  "name": [name_g[j] for j in top_n_matches]}

     # Print the top n matches in index:name format
    for i in range(len(top_n_matches)):
        print(f'{top_n_matches[i]}:{name_g[top_n_matches[i]]}')
        
    #print Details
    print("\n")
    print(test_person,"\n\n")
    
    for i in range(len(top_n_matches)):
        print(top_n_matches[i])
        print(meta_data.iloc[[i]],"\n")
    
    return top_n_dict

In [30]:
findRoommate(test_person, meta_data, 5, 1, 1)

267:Aditya Kalele
234:Mitul Pala
49:Karan Tyagi
182:Hasher Hamza
301:Kaustubh Mathkar


     gender  current_city  open_to_other_branch  work_ex  dist_from_uni  \
100       0             5                     0      0.0            1.0   

     rent_budget  alcohol  smoking  food_pref  cul_skills  ...  \
100          500        0        0          0           0  ...   

     MS Information Systems  MS Mechanical Engineering  MS Project Management  \
100                       0                          0                      0   

     hall_yes_no  max_ppr  bhk_1  bhk_2  bhk_3  bhk_4  studio  
100            0      4.0      0      1      0      0       0  

[1 rows x 30 columns] 


267
   gender  current_city  open_to_other_branch  work_ex  dist_from_uni  \
0       0             0                     0      3.5            3.0   

   rent_budget  alcohol  smoking  food_pref  cul_skills  ...  \
0          550        1        1          0           0  ...   

   MS Information Systems  MS M

{'index': [267, 234, 49, 182, 301],
 'name': ['Aditya Kalele',
  'Mitul Pala',
  'Karan Tyagi',
  'Hasher Hamza',
  'Kaustubh Mathkar']}

In [31]:
test_person.iloc[0]['gender']

0.0

In [32]:
meta_data.gender.value_counts() 
#Gender
#Male   0
#Female 1

0    401
1    121
Name: gender, dtype: int64

In [33]:
meta_data.shape

(522, 30)