In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score

from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import FunctionTransformer, MaxAbsScaler, LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

from nltk.tokenize import word_tokenize

import xgboost as xgb

import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid")

df = pd.read_csv("data/training_set.csv")
df_test = pd.read_csv("data/validation_set.csv")

df = df.sample(1000)
df_test = df_test.sample(200)

df.drop_duplicates(inplace=True)
df.dropna(inplace=True)

In [5]:
# process descriptions
descriptions = df[['Description']].reset_index()

# Generate mapping between descriptions and indexes
indices = pd.Series(descriptions.index, index=descriptions['Description']).drop_duplicates()

sample_description = np.random.choice(descriptions['Description'])
sample_description

'HYDERABAD, India (AP) — At least 27 people were killed and dozens injured Tuesday in a stampede during a Hindu religious'

# Tf-idf similarity matrix

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel 

def get_n_most_similar(description, cosine_sim, indices, n=10):
    # Get index of movie that matches title
    idx = indices[description]
    # Sort the movies based on the similarity scores
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Get the scores for n most similar
    sim_scores = sim_scores[1:n+1]
    # Get most similar indices
    similar_indices = [i[0] for i in sim_scores]
    # Return the top 10 most similar 
    return descriptions['Description'].iloc[similar_indices]

# Initialize the TfidfVectorizer 
tfidf = TfidfVectorizer(stop_words='english')

# Construct the TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(descriptions['Description'])

# Generate the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [7]:
get_n_most_similar(sample_description, cosine_sim, indices)

227    More than 300 people were killed in the attack...
567    Rebekah Gregory was among those injured during...
554           "We’re not as few as people think we are.”
814    More than half of U.S. women who die in gun ho...
358    It might be useful to remember what other thre...
366    It seems that everyone has taken a side follow...
677    Once it became clear Tuesday night that Donald...
743    Even before Donald Trump’s big win in New York...
709    India has broken into the world top-five defen...
208    "We have hundreds of card partners and dozens ...
Name: Description, dtype: object

# Gensim pretrained word embedding similarities 

In [11]:
from collections import defaultdict
from gensim import corpora
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

documents = list(descriptions['Description'])

# remove common words and tokenize
stoplist = set(stopwords.words('english'))

texts = [
    [word for word in word_tokenize(document) if word not in stoplist]
    for document in documents
]

# remove words that appear only once ------------
# this can be done with CountVectorizer I think
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [
    [token for token in text if frequency[token] > 1]
    for text in texts
]
# ------------------------------------------------

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

In [19]:
from gensim import models, similarities

lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=15)

def get_n_most_similar(doc, lsi, documents, n=10):
    vec_bow = dictionary.doc2bow(doc.lower().split())
    vec_lsi = lsi[vec_bow]  # convert the query to LSI space

    # transform corpus to LSI space and index it
    index = similarities.MatrixSimilarity(lsi[corpus])  

    # perform a similarity query against the corpus
    sims = sorted(enumerate(index[vec_lsi]), key=lambda item: -item[1])[:n]
    for doc_position, doc_score in sims:
        print(doc_score, documents[doc_position])
    return sims

get_n_most_similar("500 were killed", lsi, documents, n=4)

0.9037965 If we judge parents for putting their kids at very low risk, we could jail them for serving solid food (the child could choke!) letting them walk down stairs (the child could fall!) or permitting them to join a sport (concussions!).
0.811677 Going green just got easier!
0.78923297 They grow up so fast!
0.7798173 Better Call Saul!


[(835, 0.9037965), (90, 0.811677), (373, 0.78923297), (11, 0.7798173)]

# Spacy pretrained word embeddings

In [28]:
import pandas as pd
import spacy
from spacy.lang.nb.stop_words import STOP_WORDS as no_stopwords

nlp = spacy.load("en_core_web_md")

nlp.Defaults.stop_words |= set(no_stopwords)
nlp.Defaults.stop_words |= set(["aker", "bp","akerbp"])

def get_n_most_similar(text, texts, n=10):
    doc = nlp(text)
    similarities = pd.DataFrame({"text": texts})
    similarities["similarity"] = similarities.text.apply(lambda x: doc.similarity(nlp(x)))
    similarities = similarities.sort_values("similarity", ascending=False)
    return similarities.head(n)
    
texts = list(descriptions['Description'])
get_n_most_similar("500 people were killed. They tragically died in a storm.", texts, n=10)

  similarities["similarity"] = similarities.text.apply(lambda x: doc.similarity(nlp(x)))


Unnamed: 0,text,similarity
698,"The aircraft, which carried 122 soldiers, fami...",0.924842
305,Statements from the two cops involved are so s...,0.924387
867,There were no reports of injuries or deaths.,0.917251
901,"Unlike the police, firefighters do not have a ...",0.914312
550,"At least 36 people died in the fire, and as ma...",0.91333
788,After gunfire broke out on the anniversary of ...,0.911703
619,Another officer had been telling me about a re...,0.911656
900,"MakeSchool, which was originally named MakeGam...",0.909648
744,When you see that the suicide rate has increas...,0.908378
196,I use to think that not being friends with you...,0.905788
