In [0]:
import numpy as np
import os
# Kaggle API credentials are intentionally NOT stored in this notebook: a key
# written into a cell is exposed to everyone who can read the file. Provide
# KAGGLE_USERNAME / KAGGLE_KEY through the environment (or a kaggle.json file)
# before running, and regenerate any key that was previously committed here.
missing_kaggle_vars = [name for name in ('KAGGLE_USERNAME', 'KAGGLE_KEY') if not os.environ.get(name)]
if missing_kaggle_vars:
    # Only warn: the kaggle CLI can still authenticate from ~/.kaggle/kaggle.json.
    print('Kaggle credentials not set in the environment: ' + ', '.join(missing_kaggle_vars)
          + ' (the kaggle CLI will fall back to ~/.kaggle/kaggle.json if it exists)')


!kaggle datasets download -d devjyotichandra/glove6b50dtxt # API command copied from the dataset page on Kaggle; fetches glove6b50dtxt.zip

glove6b50dtxt.zip: Skipping, found more recently modified local copy (use --force to force download)


In [0]:
!unzip -q '/content/glove6b50dtxt.zip'

In [0]:
def read_vecs(glove_file):
    """
    Load GloVe-style word vectors from a whitespace-separated text file.

    Each non-blank line is expected to hold a token followed by its vector components.

    Arguments:
        glove_file -- path to the embeddings text file (read as UTF-8)

    Returns:
        words -- set of all tokens found in the file
        word_to_vec_map -- dictionary mapping each token to a float64 numpy array
    """
    word_to_vec_map = {}

    with open(glove_file, 'r', encoding='UTF-8') as f:
        for line in f:
            fields = line.strip().split()
            # Blank lines (e.g. an empty line at the end of the file) carry no
            # token; indexing fields[0] on them would raise IndexError.
            if not fields:
                continue
            word_to_vec_map[fields[0]] = np.array(fields[1:], dtype=np.float64)

    # The vocabulary is exactly the set of dictionary keys.
    words = set(word_to_vec_map)
    return words, word_to_vec_map



In [0]:
# Load the GloVe vectors (the file name suggests 50 dimensions). NOTE: later cells rebind `words` to lists of word pairs, so only `word_to_vec_map` keeps its meaning further down.
words, word_to_vec_map = read_vecs('/content/glove.6B.50d.txt')

In [0]:
def cosine_similarity(u, v):
    """
    Cosine similarity reflects the degree of similarity between u and v.

    Arguments:
        u -- a word vector of shape (n,)
        v -- a word vector of shape (n,)

    Returns:
        similarity -- dot(u, v) / (||u|| * ||v||); 0.0 when either vector has zero length
    """
    # Dot product between u and v
    dot = np.dot(u, v)

    # L2 norms of u and v
    norm_u = np.sqrt(np.sum(np.square(u), axis=0))
    norm_v = np.sqrt(np.sum(np.square(v), axis=0))

    # A zero vector has no direction; dividing by a zero norm would yield NaN
    # (plus a RuntimeWarning), so report "no similarity" instead.
    denominator = norm_u * norm_v
    if denominator == 0:
        return 0.0

    return dot / denominator

  

In [0]:
def complete_analogy(word_a, word_b, word_c, word_to_vec_map):
    """
    Solve the analogy "a is to b as c is to ____" over the embedding vocabulary.

    Arguments:
    word_a -- a word, string
    word_b -- a word, string
    word_c -- a word, string
    word_to_vec_map -- dictionary that maps words to their corresponding vectors.

    Returns:
    best_word -- the word (other than the three inputs) whose offset from v_c has the
                 highest cosine similarity with v_b - v_a; None if no candidate was selected
    """
    # Embeddings are looked up by lower-cased token
    word_a = word_a.lower()
    word_b = word_b.lower()
    word_c = word_c.lower()

    e_a = word_to_vec_map[word_a]
    e_b = word_to_vec_map[word_b]
    e_c = word_to_vec_map[word_c]

    # Offset the answer should reproduce, computed once for the whole scan
    target_offset = e_b - e_a
    excluded = (word_a, word_b, word_c)

    best_word = None
    max_cosine_sim = -100  # starts below the [-1, 1] range of cosine similarity

    # Scan the whole vocabulary for the best-matching offset
    for w, e_w in word_to_vec_map.items():
        # The input words themselves are never valid answers
        if w in excluded:
            continue

        cosine_sim = cosine_similarity(target_offset, e_w - e_c)
        if cosine_sim > max_cosine_sim:
            max_cosine_sim, best_word = cosine_sim, w

    return best_word


In [0]:
def neutralize(word, g, word_to_vec_map):
    """
    Remove the component of a word's embedding that lies along the bias axis.

    The result is the projection of the embedding onto the subspace orthogonal
    to g, so it is (numerically) zero along the bias direction.

    Arguments:
        word -- string indicating the word to debias
        g -- numpy-array with the same shape as the word vectors, the bias axis (such as gender)
        word_to_vec_map -- dictionary mapping words to their corresponding vectors.

    Returns:
        e_debiased -- neutralized word vector representation of the input "word"
    """
    embedding = word_to_vec_map[word]

    # Projection of the embedding onto g: (e . g) * g / (g . g)
    along_bias = np.dot(embedding, g)
    bias_component = (along_bias * g) / np.dot(g, g)

    # What remains after removing that projection is orthogonal to g
    return embedding - bias_component


In [0]:
def equalize(pair, bias_axis, word_to_vec_map):
    """
    Equalize a pair of words about the bias axis.

    Both returned vectors share the same component orthogonal to the bias axis
    (that of the pair's midpoint); their bias-axis components are the centred,
    rescaled bias components of the original embeddings.

    Arguments:
    pair -- pair of strings of gender specific words to debias, e.g. ("actress", "actor")
    bias_axis -- numpy-array with the same shape as the word vectors, the bias axis, e.g. gender
    word_to_vec_map -- dictionary mapping words to their corresponding vectors

    Returns
    e_1 -- equalized word vector corresponding to the first word
    e_2 -- equalized word vector corresponding to the second word
    """
    first_word, second_word = pair
    e_w1 = word_to_vec_map[first_word]
    e_w2 = word_to_vec_map[second_word]

    axis_sq_norm = np.dot(bias_axis, bias_axis)

    def along_axis(vec):
        # Projection of vec onto the bias axis
        return (np.dot(vec, bias_axis) * bias_axis) / axis_sq_norm

    # Midpoint of the pair, split into its bias-axis part and the orthogonal remainder
    mu = (e_w1 + e_w2) / 2
    mu_B = along_axis(mu)
    mu_orth = mu - mu_B

    # Bias-axis components of the two original embeddings
    e_w1B = along_axis(e_w1)
    e_w2B = along_axis(e_w2)

    # Shared scale factor sqrt(|1 - ||mu_orth||^2|) applied to the centred bias components
    scale = np.sqrt(np.abs(1 - np.dot(mu_orth, mu_orth)))
    corrected_e_w1B = scale * (e_w1B - mu_B) / np.linalg.norm(e_w1 - mu_orth - mu_B)
    corrected_e_w2B = scale * (e_w2B - mu_B) / np.linalg.norm(e_w2 - mu_orth - mu_B)

    # Recombine each corrected bias component with the common orthogonal part
    return corrected_e_w1B + mu_orth, corrected_e_w2B + mu_orth


Gender Debiasing - Neutralizing

In [0]:
# Earlier candidate list, kept for reference:
#words = ['receptionist', 'scientist', 'professor', 'lawyer', 'entrepreneur']
# Words whose cosine similarity with the woman - man direction is printed before and after neutralizing
words_observations = ['teacher', 'wrestler', 'athlete', 'paralegal', 'tycoon']

In [0]:
# Baseline: similarity of each observed word with the woman - man direction
print('before Debiasing')
gender_direction = word_to_vec_map['woman'] - word_to_vec_map['man']
for word in words_observations:
  similarity = cosine_similarity(word_to_vec_map[word], gender_direction)
  print('Cosine Similarity of word ' + word + ' and g is: ' + str(similarity))

before Debiasing
Cosine Similarity of word teacher and g is: 0.17920923431825664
Cosine Similarity of word wrestler and g is: -0.038690746063892216
Cosine Similarity of word athlete and g is: 0.06914822891263249
Cosine Similarity of word paralegal and g is: 0.3014173349954554
Cosine Similarity of word tycoon and g is: -0.19188610612411847


In [0]:
# Overwrite each observed word's embedding with its neutralized version, then re-measure
print('after Debiasing')
for word in words_observations:
  gender_direction = word_to_vec_map['woman'] - word_to_vec_map['man']
  word_to_vec_map[word] = neutralize(word, gender_direction, word_to_vec_map)
  similarity = cosine_similarity(word_to_vec_map[word], gender_direction)
  print('Cosine Similarity of word ' + word + ' and g is: ' + str(similarity))
  


after Debiasing
Cosine Similarity of word teacher and g is: -4.238781742793106e-18
Cosine Similarity of word wrestler and g is: -4.336719188638501e-18
Cosine Similarity of word athlete and g is: -2.4978280272612416e-17
Cosine Similarity of word paralegal and g is: -2.2433671065701604e-17
Cosine Similarity of word tycoon and g is: -6.877016649377774e-18


Gender Debiasing - Equalizing

In [0]:
# Words probed with the analogy man : word :: woman : ? in the next cell
word_observations = ['master', 'businessman', 'doctor','policeman', 'salesman', 'engineer']

In [0]:
# Baseline: answer of the analogy man : word :: woman : ? for every observed word
print('before Debiasing')
for word in word_observations:
  female_match = complete_analogy('man', word, 'woman', word_to_vec_map)
  print('Gender Bias female word for ' + word + ' is: ' + female_match)

before Debiasing
Gender Bias female word for master is: diploma
Gender Bias female word for businessman is: businesswoman
Gender Bias female word for doctor is: nurse
Gender Bias female word for policeman is: wounding
Gender Bias female word for salesman is: saleswoman
Gender Bias female word for engineer is: technician


In [0]:

# Equalize each pair about the woman - man axis, store the new vectors in
# word_to_vec_map, and re-run the analogy on the first word of the pair
print('after Debiasing')
words = [['master', 'mistress'], ['businessman', 'businesswoman'], ['doctor', 'gynecologist'], ['policeman', 'policewoman'], ['salesman', 'saleswoman'], ['engineer', 'architect']]
for word in words:
  first, second = word
  gender_direction = word_to_vec_map['woman'] - word_to_vec_map['man']
  e1, e2 = equalize((first, second), gender_direction, word_to_vec_map)
  word_to_vec_map[first], word_to_vec_map[second] = e1, e2
  female_match = complete_analogy('man', first, 'woman', word_to_vec_map)
  print('Gender Bias female word for ' + first + ' is: ' + female_match)


after Debiasing
Gender Bias female word for master is: mistress
Gender Bias female word for businessman is: businesswoman
Gender Bias female word for doctor is: gynecologist
Gender Bias female word for policeman is: policewoman
Gender Bias female word for salesman is: saleswoman
Gender Bias female word for engineer is: architect


Relations Debiasing

In [0]:
# Words probed with the analogy man : word :: woman : ?. Capitalized entries ('Dad', 'Men') are lower-cased inside complete_analogy; 'uncle' is listed twice, so it is reported twice.
word_observations = ['he', 'his', 'son','male', 'boy', 'father', 'uncle', 'monastery', 'husband', 'Dad', 'Men', 'grandpa', 'grandson', 'uncle', 'brother']

In [0]:
# Baseline: answer of the analogy man : word :: woman : ? for every observed word
print('before Debiasing')
for word in word_observations:
  female_match = complete_analogy('man', word, 'woman', word_to_vec_map)
  print('Gender Bias female word for ' + word + ' is: ' + female_match)

before Debiasing
Gender Bias female word for he is: she
Gender Bias female word for his is: her
Gender Bias female word for son is: daughter
Gender Bias female word for male is: female
Gender Bias female word for boy is: girl
Gender Bias female word for father is: daughter
Gender Bias female word for uncle is: niece
Gender Bias female word for monastery is: convent
Gender Bias female word for husband is: wife
Gender Bias female word for Dad is: mom
Gender Bias female word for Men is: women
Gender Bias female word for grandpa is: grandma
Gender Bias female word for grandson is: granddaughter
Gender Bias female word for uncle is: niece
Gender Bias female word for brother is: daughter


In [0]:
# Second batch of words for the same analogy probe; 'brother' also appeared in the previous list
word_observations = ['brothers', 'brother', 'boyfriend', 'fatherhood', 'gentleman', 'grandfather', 'nephew', 'king', 'prince', 'schoolboy']

In [0]:
# Baseline: answer of the analogy man : word :: woman : ? for every observed word
print('before Debiasing')
for word in word_observations:
  female_match = complete_analogy('man', word, 'woman', word_to_vec_map)
  print('Gender Bias female word for ' + word + ' is: ' + female_match)

before Debiasing
Gender Bias female word for brothers is: avett
Gender Bias female word for brother is: daughter
Gender Bias female word for boyfriend is: girlfriend
Gender Bias female word for fatherhood is: motherhood
Gender Bias female word for gentleman is: gentlewoman
Gender Bias female word for grandfather is: granddaughter
Gender Bias female word for nephew is: niece
Gender Bias female word for king is: queen
Gender Bias female word for prince is: princess
Gender Bias female word for schoolboy is: 16-year-old


In [0]:

# Equalize each relation pair about the woman - man axis, store the new vectors
# in word_to_vec_map, and re-run the analogy on the first word of the pair
print('after Debiasing')
words = [['father', 'mother'], ['uncle', 'aunt'], ['brother', 'sister'], ['brothers', 'sisters'], ['grandfather', 'grandmother'], ['schoolboy', 'schoolgirl']]
for word in words:
  first, second = word
  gender_direction = word_to_vec_map['woman'] - word_to_vec_map['man']
  e1, e2 = equalize((first, second), gender_direction, word_to_vec_map)
  word_to_vec_map[first], word_to_vec_map[second] = e1, e2
  female_match = complete_analogy('man', first, 'woman', word_to_vec_map)
  print('Gender Bias female word for ' + first + ' is: ' + female_match)


after Debiasing
Gender Bias female word for father is: brother
Gender Bias female word for uncle is: cousin
Gender Bias female word for brother is: cousin
Gender Bias female word for brothers is: sons
Gender Bias female word for grandfather is: grandson
Gender Bias female word for schoolboy is: schoolgirl


States & Capital Debiasing

In [0]:
# States to probe, and the reference (state, capital) pair defining the analogy matching[0] : matching[1] :: state : ?
word_observations = ['rajasthan', 'maharashtra', 'karnataka','kerala', 'gujarat', 'bihar', 'orissa']
matching = ['telangana', 'hyderabad']

In [0]:
# Baseline for the analogy matching[0] : matching[1] :: state : ?, i.e. the capital predicted for each state
print('before Debiasing')
for word in word_observations:
  # Label fixed: it used to read "Gender Bias female word for", copied from the gender cells
  print('Capital of ' + word + ' is: ' + complete_analogy(matching[0], matching[1], word, word_to_vec_map))

before Debiasing
Gender Bias female word for rajasthan is: bangalore
Gender Bias female word for maharashtra is: bangalore
Gender Bias female word for karnataka is: bangalore
Gender Bias female word for kerala is: bangalore
Gender Bias female word for gujarat is: mumbai
Gender Bias female word for bihar is: bangalore
Gender Bias female word for orissa is: bangalore


In [0]:

# Equalize each (state, capital) pair about the hyderabad - telangana axis, store
# the new vectors in word_to_vec_map, and re-run the capital analogy for the state
print('after Debiasing')
words = [['rajasthan', 'jaipur'], ['maharashtra', 'mumbai'], ['karnataka', 'bangalore'], ['gujarat', 'gandhinagar'], ['bihar', 'patna'], ['orissa', 'bhubaneswar']]
for word in words:
  state, capital = word
  capital_direction = word_to_vec_map['hyderabad'] - word_to_vec_map['telangana']
  e1, e2 = equalize((state, capital), capital_direction, word_to_vec_map)
  word_to_vec_map[state], word_to_vec_map[capital] = e1, e2
  predicted = complete_analogy('telangana', 'hyderabad', state, word_to_vec_map)
  print('Capital of ' + state + ' is: ' + predicted)


after Debiasing
Capital of rajasthan is: jaipur
Capital of maharashtra is: mumbai
Capital of karnataka is: bangalore
Capital of gujarat is: gandhinagar
Capital of bihar is: patna
Capital of orissa is: bhubaneswar


Language Debiasing

In [0]:
# States to probe, and the reference (state, language) pair defining the analogy matching[0] : matching[1] :: state : ?
word_observations = ['rajasthan', 'maharashtra', 'karnataka','kerala', 'gujarat', 'bihar', 'orissa']
matching = ['telangana', 'telugu']

In [0]:
# Baseline for the analogy matching[0] : matching[1] :: state : ?, i.e. the language predicted for each state
print('before Debiasing')
for word in word_observations:
  # Label fixed: it used to read "Gender Bias female word for", copied from the gender cells
  print('Language of ' + word + ' is: ' + complete_analogy(matching[0], matching[1], word, word_to_vec_map))

before Debiasing
Gender Bias female word for rajasthan is: kannada
Gender Bias female word for maharashtra is: malayalam
Gender Bias female word for karnataka is: malayalam
Gender Bias female word for kerala is: kannada
Gender Bias female word for gujarat is: malayalam
Gender Bias female word for bihar is: malayalam
Gender Bias female word for orissa is: malayalam


In [0]:

# Equalize each (state, language) pair about the telugu - telangana axis, store
# the new vectors in word_to_vec_map, and re-run the language analogy for the state
print('after Debiasing')
words = [['rajasthan', 'rajasthani'], ['maharashtra', 'marathi'], ['karnataka', 'kannada'], ['gujarat', 'gujarati'], ['bihar', 'hindi'], ['kerala', 'malayalam'], ['orissa', 'oriya']]
for word in words:
  e1, e2 = equalize((word[0], word[1]), word_to_vec_map['telugu'] - word_to_vec_map['telangana'], word_to_vec_map)
  word_to_vec_map[word[0]] = e1
  word_to_vec_map[word[1]] = e2
  # Label fixed: this analogy predicts a language, but the text said "Capital of"
  print('Language of ' + word[0] + ' is: ' + complete_analogy('telangana', 'telugu', word[0], word_to_vec_map))


after Debiasing
Capital of rajasthan is: rajasthani
Capital of maharashtra is: marathi
Capital of karnataka is: kannada
Capital of gujarat is: gujarati
Capital of bihar is: hindi
Capital of kerala is: malayalam
Capital of orissa is: oriya


Verifying Doctor Resumes

In [0]:
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
import numpy as np
import re
import itertools
import nltk

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


# Fetch the NLTK resources named below; the tuple of the four return values is displayed as the cell output
nltk.download('stopwords'),nltk.download('porter_test'), nltk.download('punkt'), nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package porter_test to /root/nltk_data...
[nltk_data]   Package porter_test is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


(True, True, True, True)

In [0]:
# Two near-identical sample resumes that differ in the candidate's name; the male version also has an extra closing sentence.
male_resume = 'John is a passionate doctor with extensive experience in internal medicine and hospital settings. Adpat in properly diagnosing and strategizing for the best treatment plans for patients. Jhon is experienced in counseling patients on preventative care and positive life style changes. Bringing forth an empathetic and professional attitude, committed to providing patients with the best care possible'
female_resume = 'Mary is a passionate doctor with extensive experience in internal medicine and hospital settings. Adpat in properly diagnosing and strategizing for the best treatment plans for patients. Mary is experienced in counseling patients on preventative care and positive life style changes.'

# English stop words from NLTK. NOTE(review): not referenced by the vectorizer in the following cell, which is built with stop_words=None.
stopset = set(stopwords.words('english'))


In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD

# The corpus must be a list of documents. list(male_resume) split the text into
# single characters, none of which survive TfidfVectorizer's default token pattern
# (tokens need at least two word characters), so fitting failed with an empty vocabulary.
resume_corpus = [male_resume, female_resume]

vectoriser = TfidfVectorizer(stop_words = None, use_idf=True, ngram_range=(1,3), decode_error="ignore")
X = vectoriser.fit_transform(resume_corpus)

# Latent semantic analysis on the TF-IDF matrix; the following cell reads lsa.components_,
# so the fit can no longer stay disabled inside a string literal. The component count is
# capped by the matrix size because a corpus this small cannot support 20 components.
lsa = TruncatedSVD(n_components = min(20, *X.shape), n_iter = 100)
lsa.fit(X);

In [0]:
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when available (converted to plain Python strings so the printed tuples stay readable)
if hasattr(vectoriser, 'get_feature_names_out'):
  terms = vectoriser.get_feature_names_out().tolist()
else:
  terms = vectoriser.get_feature_names()

# For every LSA component, print the 20 terms with the largest weights
for i, comp in enumerate(lsa.components_):
  termsinComp = zip(terms, comp)
  sortedTerms = sorted(termsinComp, key = lambda x:x[1], reverse=True)[:20]
  print("article " + str(i) + ":")
  for term in sortedTerms:
    print(term)
  print(" ")