In [None]:
import pandas as pd
# Notebook display settings: cap cell text at 75 characters and show at most 7 rows per frame
pd.options.display.max_colwidth = 75
pd.set_option('display.max_rows', 7)

# Jaro Similarity
<img src="images/jaro.png" width="80%">

https://en.wikipedia.org/wiki/Jaro–Winkler_distance

# Levenshtein Distance
<img src="images/levenshtein.png" width="80%">

https://en.wikipedia.org/wiki/Levenshtein_distance

## Let's Try It!

In [None]:
# jellyfish renamed `jaro_distance` to `jaro_similarity`, and newer releases may not
# ship the old name at all. Prefer the new function and alias it to the name used
# throughout this notebook; fall back to the old name on older installations.
try:
    from jellyfish import levenshtein_distance, jaro_similarity as jaro_distance
except ImportError:
    from jellyfish import levenshtein_distance, jaro_distance

In [None]:
# Levenshtein distance between a real domain and a look-alike that swaps 'oo' for '00'
levenshtein_distance('google.com', 'g00gle.com')

In [None]:
# Jaro score for the same pair of domains.
# NOTE(review): despite the function name this appears to be a similarity (higher = closer
# match), consistent with the "Jaro Similarity" section title above — confirm.
jaro_distance('google.com', 'g00gle.com')

## More!

In [None]:
# Score a handful of word pairs with both metrics and show them side by side
words_df = pd.DataFrame(
    [
        (u'google.com', u'google.com'),
        (u'google.com', u'g00gle.com'),
        (u'google.com', u'google.badguy.com'),
        (u'google.com', u'malware.ru'),
        (u'bit', u'bot'),
        (u'bitly.bit', u'bitly.bot'),
    ],
    columns=['word1', 'word2'],
)
words_df['Levenshtein'] = [
    levenshtein_distance(left, right)
    for left, right in zip(words_df['word1'], words_df['word2'])
]
words_df['Jaro'] = [
    jaro_distance(left, right)
    for left, right in zip(words_df['word1'], words_df['word2'])
]
words_df

## Finding Needles in a Haystack
<img src="images/dataset.png" width="50%">

In [None]:
# Load the dataset, replace missing values with empty strings and keep only the
# columns used in the rest of the notebook.
# The trailing .copy() makes `df` an independent frame, so adding columns to it later
# (e.g. df['jaro'] = ...) does not trigger pandas' SettingWithCopyWarning.
df = (
    pd.read_csv('./data/dataset_medium.csv', dtype={'port': str, 'subdomains': str})
    .fillna('')[['label', 'url', 'uri', 'subdomains']]
    .copy()
)
df

### Let's look for malicious bank-related links
<img src="images/bank.png" alt="my img" align="center" width="10%" />

In [None]:
# Malicious rows whose subdomain mentions "bank"
df.loc[(df['label'] == 'malicious') & df['subdomains'].str.contains('bank'),
       ['uri', 'subdomains', 'url']]

<img src="images/needles_to_haystack.png" alt="my img" align="left" width="50%" />

In [None]:
# Score every URI against one known-bad URI, then list the 7 highest-scoring rows
malicious_uri = '/account/verification/D50M74890M8414B93618/qes.php'
df['jaro'] = [jaro_distance(uri, malicious_uri) for uri in df['uri']]
df[['label', 'url', 'jaro']].sort_values('jaro', ascending=False).head(7)

# TF-IDF: How important is this word relative to everything else?
<img src="images/charlie.jpg" alt="my img" align="middle" width="50%" />

In [None]:
# Load TF-IDF Library
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

## Let's run TF-IDF on just the malicious dataset

In [None]:
# Keep only the rows labelled malicious, with just the label and URL columns
df_malicious = df.loc[df['label'] == 'malicious', ['label', 'url']]
df_malicious.head()

## Fit the TF-IDF vectorizer on the malicious URLs

In [None]:
# Build the vectorizer: ignore terms found in more than half of the URLs
# or in fewer than 0.1% of them
vectorizer = TfidfVectorizer(min_df=0.001, max_df=0.5)
# Learn vocabulary and IDF weights from the malicious URLs and vectorize them in one pass
features_transformed = vectorizer.fit_transform(df_malicious['url'])
# Term -> column-index mapping learned during the fit
vocab = vectorizer.vocabulary_

In [None]:
# Inspect the fitted vocabulary (vectorizer.vocabulary_ captured in the previous cell)
vocab

## Top-scoring TF-IDF terms in the malicious dataset

In [None]:
def top_tfidf(vectorizer, fit_transform_result, topn=20):
    """Print the `topn` terms with the highest total TF-IDF score.

    Parameters
    ----------
    vectorizer : fitted vectorizer
        Must expose `get_feature_names_out()` (current scikit-learn) or the
        older `get_feature_names()`.
    fit_transform_result : matrix
        The matrix returned by the vectorizer; its columns are summed
        (`sum(axis=0)`) to get one total score per term.
    topn : int, default 20
        Number of terms to print.

    Returns
    -------
    None. One line per term is written to stdout, highest score first.
    """
    # get_feature_names() was removed from recent scikit-learn releases in favour of
    # get_feature_names_out(); support both so the notebook runs on old and new versions.
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()

    # Column sums come back 2-D for sparse/matrix inputs; flatten to one score per term.
    totals = np.asarray(fit_transform_result.sum(axis=0)).ravel()

    ranked = sorted(zip(terms, totals), key=lambda pair: pair[1], reverse=True)
    for term, score in ranked[:topn]:
        print("{0:50} Score: {1:.3f}".format(term, score))

top_tfidf(vectorizer, features_transformed, topn=20)

## Redo TF-IDF for Everything

In [None]:
# Re-fit the same vectorizer, this time on every URL in the dataset
features_transformed = vectorizer.fit_transform(df['url'])
vocab = vectorizer.vocabulary_
# Matrix dimensions first, then the 25 highest-scoring terms
print(features_transformed.shape)
top_tfidf(vectorizer, features_transformed, topn=25)

<img src="images/cosine.png" alt="my img" align="center" width="35%" />

In [None]:
# Load cosine_similarity function
from sklearn.metrics.pairwise import cosine_similarity

# Helper function
def print_top_n(result, df, top_n=5):
    """Return the `top_n` highest-scoring rows of `df`, best match first.

    Parameters
    ----------
    result : 2-D array
        Similarity scores; only the first row (`result[0]`) is read, with one
        score per row position of `df`.
    df : pandas.DataFrame
        Frame with a 'url' column, looked up by position.
    top_n : int, default 5
        Number of rows to return. Values <= 0 yield an empty frame; values
        larger than the number of scores return every row.

    Returns
    -------
    pandas.DataFrame
        Columns 'cos_dist' (the score taken from `result`) and 'url', ordered
        by descending score with a fresh 0..n-1 index.
    """
    if top_n <= 0:
        return pd.DataFrame(columns=['cos_dist', 'url'])

    scores = result[0]
    # argsort is ascending: reverse it, then keep the first top_n positions
    best = scores.argsort()[::-1][:top_n]
    return pd.DataFrame({'cos_dist': scores[best],
                         'url': df['url'].iloc[best].values})

In [None]:
# First few malicious rows whose URL mentions "paypal"
df.loc[(df['label'] == 'malicious') & df['url'].str.contains('paypal'),
       ['url', 'subdomains']].head()

In [None]:
# Row position of the query URL.
# NOTE(review): 950311 is hard-coded — presumably one of the paypal rows listed above;
# confirm against the dataset.
idx = 950311
# Compare that single row (the idx:idx+1 slice selects exactly one row) against every row of the matrix
result = cosine_similarity(features_transformed[idx:(idx+1)], features_transformed)
print_top_n(result, df)

# Machine Learning with Naive Bayes


<img src="images/naive.jpg" alt="my img" align="middle" width="20%" />

<img src="images/needlestack.png" alt="my img" align="middle" width="80%" />

In [None]:
# Use the rows from position 900000 onward as the ML dataset and check the class balance
ml_dataset = df.iloc[900000:]
ml_dataset['label'].value_counts()

In [None]:
from sklearn.naive_bayes import GaussianNB
# TF-IDF: fit a fresh vectorizer on the ML subset only.
# This rebinds the notebook-level `vectorizer` and `vocab` used by earlier cells.
vectorizer = TfidfVectorizer(max_df=.5, min_df=.001)
X_data = vectorizer.fit_transform(ml_dataset.url)
vocab = vectorizer.vocabulary_

# Naive Bayes Classifier
# X_data is converted with .toarray() before being passed to fit().
# NOTE(review): `clf` is whatever fit() returns — presumably the same estimator as `cls`; confirm.
cls = GaussianNB()
clf = cls.fit(X_data.toarray(), ml_dataset.label)

In [None]:
def predict(url, vocab, clf, vectorizer=None):
    """Classify a single URL with a trained classifier.

    Parameters
    ----------
    url : str
        The URL to classify.
    vocab : dict
        Term -> column-index mapping; used only when `vectorizer` is None.
    clf : classifier
        Object exposing `predict(X)`; its first prediction is returned.
    vectorizer : fitted vectorizer, optional
        When given, the URL is vectorized with `vectorizer.transform`, so the
        feature weights match the ones the classifier was trained on. Pass the
        vectorizer that produced the training matrix.

    Returns
    -------
    The first element of `clf.predict(...)` for the vectorized URL.

    Notes
    -----
    Without `vectorizer`, a new TfidfVectorizer is fitted on the single input
    URL (the original behaviour). The column layout then follows `vocab`, but
    the IDF weights are derived from that one document rather than from the
    training corpus.
    """
    documents = [url]
    if vectorizer is None:
        # Legacy path kept for existing callers: fit on the one-document sample.
        sample_tfidf = TfidfVectorizer(vocabulary=vocab).fit_transform(documents)
    else:
        # Reuse the training-time vocabulary and IDF weights.
        sample_tfidf = vectorizer.transform(documents)
    return clf.predict(sample_tfidf.toarray())[0]

In [None]:
url = 'http://000webhostapp.php/wp-content/plugins/ubh/wells/gzjzty=/myaccount/emailaccess/login'
predict(url, vocab, clf)

In [None]:
url = 'https://www.youtube.com/watch?v=svlEfxTyJQE'
predict(url, vocab, clf)