Manipulate the file to store it in a DataFrame object

# Imports

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
from nltk.corpus import wordnet
from nltk.stem.snowball import SnowballStemmer

In [None]:
def flatten(t):
    """Return a single list holding the items of every sub-iterable in *t*, in order."""
    flat = []
    for sublist in t:
        flat.extend(sublist)
    return flat

In [None]:
# Extract the data from csv (read_csv already returns a DataFrame)
df = pd.read_csv('../def.csv')

# Create stemmer - porter works better than english
stemmer2 = SnowballStemmer("porter")

# Save the dataframe in different variables and remove the useless columns:
# take row 0 from column "P1" onwards, drop empty cells and strip commas.
emotion_defs = df.loc[0,"P1":].dropna().replace(',','', regex=True)
# person_defs = df.loc[1,"P1":].dropna().replace(',','', regex=True)
# revenge_defs = df.loc[2,"P1":].dropna().replace(',','', regex=True)
# brick_def = df.loc[3,"P1":].dropna().replace(',','', regex=True)

# lists_of_defs = [emotion_defs, person_defs, revenge_defs, brick_def]

# Load the stop words once, as a set: calling stopwords.words() for every token
# re-reads the corpus each time and then scans the resulting list linearly.
# With no argument, stopwords.words() returns the stop words of every language.
# NOTE(review): the membership test is case-sensitive, so capitalised tokens
# (e.g. "The") are presumably kept — confirm whether lower-casing is wanted.
all_stop_words = set(stopwords.words())

# Create a list of all the defs after stop word removing and stemming:
# one list of stemmed tokens per definition.
list_defs = []
for definition in emotion_defs:
    # e.g. ['Range', 'of', 'concepts', 'human', 'beings', 'feel', 'in', 'certain', 'situations']
    text_tokens = word_tokenize(definition)
    list_defs.append(
        [stemmer2.stem(token) for token in text_tokens if token not in all_stop_words]
    )

print(list_defs)

# Similarity with the word2vec method

In [None]:
def word2vec(word):
    """Describe *word* as a character-frequency vector.

    Returns a 3-tuple ``(counts, chars, norm)``: a Counter of the characters
    in *word*, the set of distinct characters, and the Euclidean length of
    the count vector.
    """
    from collections import Counter
    from math import sqrt

    # how often each character occurs
    char_counts = Counter(word)
    # the distinct characters, kept separately for fast intersections
    distinct_chars = set(char_counts)

    # Euclidean norm of the count vector
    squared_total = 0
    for count in char_counts.values():
        squared_total += count * count
    norm = sqrt(squared_total)

    return char_counts, distinct_chars, norm

def cosdis(v1, v2):
    """Return the cosine of the angle between two character-count vectors.

    *v1* and *v2* are ``(counts, chars, norm)`` tuples as produced by
    ``word2vec``: a mapping of character -> count, the set of distinct
    characters, and the Euclidean length of the count vector.

    Despite the name, the value is a similarity: identical count vectors
    give 1 and vectors with no character in common give 0. If either vector
    has zero length (an empty word) the result is 0.0 rather than a
    ZeroDivisionError.
    """
    # An empty word has no direction to compare; treat it as "nothing in common".
    if not v1[2] or not v2[2]:
        return 0.0
    # which characters are common to the two words?
    common = v1[1].intersection(v2[1])
    # dot product over the shared characters, divided by both vector lengths
    return sum(v1[0][ch]*v2[0][ch] for ch in common)/v1[2]/v2[2]


In [None]:
# Compare every stemmed token of the first definition with every token of the second
list_A = list_defs[0]
list_B = list_defs[1]

threshold = 0.80     # if needed (not applied yet: every pair is printed)

# Vectorise the tokens of the second definition once instead of once per pair.
vectors_B = [(word, word2vec(word)) for word in list_B]

for key in list_A:
    key_vector = word2vec(key)
    for word, word_vector in vectors_B:
        # No try/except here: neither word2vec nor cosdis can raise IndexError,
        # so the previous handler only hid the fact that nothing was being caught.
        res = cosdis(word_vector, key_vector)
        print("The cosine similarity between : {} and : {} is: {}".format(word, key, res*100))
        # if res > threshold:
        #     print("Found a word with cosine distance > 80 : {} with original word: {}".format(word, key))

In [247]:
from collections import Counter
import math

# Token-frequency vectors of the two definitions being compared
counterA, counterB = Counter(list_A), Counter(list_B)


def counter_cosine_similarity(c1, c2):
    """Return the cosine similarity of two term-count mappings.

    *c1* and *c2* map terms to counts (e.g. ``collections.Counter``); a term
    missing from one mapping counts as 0 there. The result is the dot product
    of the two count vectors divided by the product of their Euclidean
    lengths. If either mapping has zero length (empty, or all counts 0) the
    result is 0.0 rather than a ZeroDivisionError.
    """
    terms = set(c1).union(c2)
    dotprod = sum(c1.get(k, 0) * c2.get(k, 0) for k in terms)
    magA = math.sqrt(sum(c1.get(k, 0)**2 for k in terms))
    magB = math.sqrt(sum(c2.get(k, 0)**2 for k in terms))
    # A zero-length vector shares nothing with anything; avoid dividing by zero.
    if magA == 0 or magB == 0:
        return 0.0
    return dotprod / (magA * magB)

# Show both token-count vectors, then their cosine similarity as a percentage
for counter in (counterA, counterB):
    print(counter)
similarity_pct = counter_cosine_similarity(counterA, counterB) * 100
print(similarity_pct)


Counter({'rang': 1, 'concept': 1, 'human': 1, 'be': 1, 'feel': 1, 'certain': 1, 'situat': 1})
Counter({'someth': 1, 'feel': 1})
26.726124191242434


# Similarity with wordnet

In [None]:
# Calculate similarity with wordnet
# NOTE(review): the loop never uses the definitions themselves — every pass
# reports the same fixed 'hello' / 'selling' pair. The lookups and the
# similarity are therefore computed once, outside the loop; the printed output
# is unchanged (one report per entry of list_defs).
syn1 = wordnet.synsets('hello')[0]
syn2 = wordnet.synsets('selling')[0]
wup_score = syn1.wup_similarity(syn2)

for _ in list_defs:
    print ("hello name :  ", syn1.name())
    print ("selling name :  ", syn2.name())

    print(wup_score)

# First version of similarity - not working because of the divisor at the end
'''sim = 0
for item in list_defs:
    doc1 = nlp(item)
    for item2 in list_defs[1:]:
        doc2 = nlp(item2)
        sim = sim + doc1.similarity(doc2)
    
sim = sim / (len(list_defs)*len(list_defs))
print(sim)'''

'''for list in lists_of_defs:
    no_stop = []
    no_stop_defs = {}
    for definition in list:
        # print(definition)
        text_tokens = word_tokenize(definition)
        tokens_without_sw = [word for word in text_tokens if not word in stopwords.words()]
        no_stop.append(tokens_without_sw)
        # print(tokens_without_sw)
    bricks_without_sw = no_stop'''
    
# emotion_defs:

# for definition in emotion_defs:
#         text_tokens = word_tokenize(definition)
#         tokens_without_sw = [word for word in text_tokens if not word in stopwords.words()]
#         no_stop.append(tokens_without_sw)
#         # print(tokens_without_sw)

# list_def_brick = ' '.join([str(item) for item in bricks_without_sw])

# print(list_def_brick) 

# Remove stopwords

## First Version

In [None]:
# Import stopwords with nltk.
from nltk.corpus import stopwords
stop = stopwords.words('english')
# Set copy for fast membership tests; `stop` itself stays a list.
stop_lookup = set(stop)

df = pd.read_csv('../def2.csv')


def _drop_stop_words(cell):
    """Return *cell* without English stop words; non-string cells (e.g. NaN) are returned unchanged."""
    if not isinstance(cell, str):
        return cell
    return ' '.join(word for word in cell.split() if word not in stop_lookup)


# Exclude stopwords cell by cell. DataFrame.apply hands each *column* (a Series)
# to the callable, and a Series has no .split(), so the cleaning is mapped over
# the cells of every column. One pass is enough: repeating it changes nothing.
df = df.apply(lambda column: column.map(_drop_stop_words))

# NOTE(review): no 'no_stop_words' column is ever created in this cell, so the
# whole cleaned frame is printed instead — confirm which column was intended.
print(df)

In [None]:
# Fetch the stop-word corpus *before* reading it: stopwords.words() needs the
# corpus to be present locally.
nltk.download('stopwords')
stop = stopwords.words('english')

# Column by column, keep only the cells whose whole value is not a stop word.
# NOTE(review): this compares entire cells, not the words inside them — confirm
# that this is the intended filtering.
df = df.apply(lambda x: [item for item in x if item not in stop])

print(df)

In [None]:
# Load the definitions CSV into a dataframe;
# reset_index makes sure indexes pair with number of rows
df = pd.read_csv('../def.csv').reset_index()

# Print the '0' and '1' fields of every row
for _, row in df.iterrows():
    print(row['0'], row['1'])

# 2 Stem/lemmatize every sentence
# 3 Compute the similarity between all the definitions of a single category
'''for line in f:
    print(line)'''
