In [1]:
import pandas as pd
from nltk.corpus import stopwords
import warnings
pd.set_option('display.max_colwidth', 100)
warnings.filterwarnings('ignore')

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
corpus = ['The cat got chased by dog',
         'The cat jumped on a tree',
         'The dog kept barking at the door']

### Count Vectorizer

In [3]:
count_vec = CountVectorizer().fit(corpus)
bag_of_words = count_vec.transform(corpus)

In [4]:
count_vec.vocabulary_.keys()

dict_keys(['the', 'cat', 'got', 'chased', 'by', 'dog', 'jumped', 'on', 'tree', 'kept', 'barking', 'at', 'door'])

In [5]:
dictionary_inverted = {v: k for k, v in count_vec.vocabulary_.items()}
vocab = []
for i in sorted (dictionary_inverted.keys()):  
     vocab.append(dictionary_inverted[i])
print(vocab)

['at', 'barking', 'by', 'cat', 'chased', 'dog', 'door', 'got', 'jumped', 'kept', 'on', 'the', 'tree']


In [6]:
bag_of_words.toarray()

array([[0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1],
       [1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 2, 0]], dtype=int64)

## N-Grams

In [7]:
count_vec = CountVectorizer(ngram_range=(1,2)).fit(corpus)
bag_of_words = count_vec.transform(corpus)

In [8]:
count_vec.vocabulary_.keys()

dict_keys(['the', 'cat', 'got', 'chased', 'by', 'dog', 'the cat', 'cat got', 'got chased', 'chased by', 'by dog', 'jumped', 'on', 'tree', 'cat jumped', 'jumped on', 'on tree', 'kept', 'barking', 'at', 'door', 'the dog', 'dog kept', 'kept barking', 'barking at', 'at the', 'the door'])

In [9]:
dictionary_inverted = {v: k for k, v in count_vec.vocabulary_.items()}
vocab = []
for i in sorted (dictionary_inverted.keys()):  
     vocab.append(dictionary_inverted[i])
print(vocab)

['at', 'at the', 'barking', 'barking at', 'by', 'by dog', 'cat', 'cat got', 'cat jumped', 'chased', 'chased by', 'dog', 'dog kept', 'door', 'got', 'got chased', 'jumped', 'jumped on', 'kept', 'kept barking', 'on', 'on tree', 'the', 'the cat', 'the dog', 'the door', 'tree']


### TF-IDF

In [10]:
tfidf_vec = TfidfVectorizer().fit(corpus)
bag_of_words = tfidf_vec.transform(corpus)

In [11]:
dictionary_inverted = {v: k for k, v in count_vec.vocabulary_.items()}
vocab = []
for i in sorted (dictionary_inverted.keys()):  
     vocab.append(dictionary_inverted[i])
print(vocab)

['at', 'at the', 'barking', 'barking at', 'by', 'by dog', 'cat', 'cat got', 'cat jumped', 'chased', 'chased by', 'dog', 'dog kept', 'door', 'got', 'got chased', 'jumped', 'jumped on', 'kept', 'kept barking', 'on', 'on tree', 'the', 'the cat', 'the dog', 'the door', 'tree']


In [12]:
bag_of_words.toarray()

array([[0.        , 0.        , 0.4711101 , 0.35829137, 0.4711101 ,
        0.35829137, 0.        , 0.4711101 , 0.        , 0.        ,
        0.        , 0.27824521, 0.        ],
       [0.        , 0.        , 0.        , 0.38376993, 0.        ,
        0.        , 0.        , 0.        , 0.50461134, 0.        ,
        0.50461134, 0.29803159, 0.50461134],
       [0.40914568, 0.40914568, 0.        , 0.        , 0.        ,
        0.31116583, 0.40914568, 0.        , 0.        , 0.40914568,
        0.        , 0.48329606, 0.        ]])

### Hashing Trick

In [13]:
hashvectorizer = HashingVectorizer(n_features=2**4)
hashvector = hashvectorizer.fit_transform(corpus)

In [14]:
hashvector.toarray()

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        -0.40824829,  0.        ,  0.40824829,  0.        , -0.40824829,
         0.        ,  0.40824829,  0.        ,  0.40824829, -0.40824829,
         0.        ],
       [ 0.        ,  0.57735027,  0.        ,  0.        ,  0.        ,
         0.57735027,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        , -0.57735027,
         0.        ],
       [ 0.33333333,  0.        ,  0.        ,  0.        ,  0.        ,
        -0.33333333,  0.        , -0.33333333,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        , -0.33333333, -0.66666667,
        -0.33333333]])

### News Recommendation

In [15]:
df = pd.read_excel('news.xlsx')
df.head(2)

Unnamed: 0,KeyDoc,Article,ArticlePlainText,ArticlePostDate,Headline,KeyArticle,KeyInstn,BogusNewsCodeText,IndustryShortName
0,517795,StartHTML:0000000088 EndHTML:0000004765 StartFragment:0000000181 EndFragment:0000004745 <html> <...,The cover of Friday's Wall Street Journal had a story on the emerging bear market in the NASDAQ ...,2019-05-23 12:54:00,Bad Day!,517795.0,,Industry News,Financials
1,519985,Version:0.9 StartHTML:0000000105 EndHTML:0000001479 StartFragment:0000000341 EndFragment:000...,"Peoples Heritage Finl Group($13.8B), Portland, ME, has received Fed approval to acquire Banknort...",2019-05-23 01:06:00,Peoples Heritage Gets Fed Okay For BankNorth Buy,519985.0,1022029.0,Mergers & Acquisitions: Approvals,Bank


In [16]:
df.set_index('Headline', inplace = True)

In [17]:
def train():
    
    #TF-IDF with n-gram range 1-3 and removing stop words. Document frequency cutoff
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')

    tfidf_matrix = tf.fit_transform(df['ArticlePlainText'])

    #Building a cosine similarity matrix
    cosine_similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)
    
    return cosine_similarities

In [18]:
def recommendations(headline, model, top_n=10):
    
    recommended_news = []
    
    indices = pd.Series(df.index)
    
    # gettin the index of the news that matches the name
    idx = indices[indices == headline].index[0]

    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(model[idx]).sort_values(ascending = False)

    # getting the indexes of the 10 most similar news except itself
    top_10_indexes = list(score_series.iloc[1:11].index)
    
    # populating the list with the names of the top 10 matching news
    for i in top_10_indexes:
        recommended_news.append([df.ix[i, 'ArticlePostDate'].date(), list(df.index)[i]])
        
    return pd.DataFrame(recommended_news, columns=['Date', 'Headline'])

In [19]:
model = train()
df_rec = recommendations('Whitney Holding Buying American Bank in Houston', model)
df_rec.head(10)

Unnamed: 0,Date,Headline
0,2019-05-23,Whitney Holding Corp. Buying Prattville Financial for $40.5M
1,2019-09-25,Whitney Buying Privately Held First Ascension for $22.1M in Stock
2,2021-12-02,Whitney completes $22.1M First Ascension purchase
3,2020-02-05,FNB Corp. Buying First Union Branches in Southwest Virginia
4,2019-10-11,Compass Buying FirsTier Corp. for $127M in Stock
5,2020-12-29,Whitney Earnings Up 5% in 3Q at $17.1M
6,2019-05-21,Allegiant to Buy Equality for $27.17M in Stock
7,2019-10-08,BB&T Buying FirstSpartan Financial Corp. in $103.9M Stock Swap
8,2020-12-20,Expected Earnings Releases for Oct. 12
9,2020-05-08,Capital City Bank Group Agrees to Acquire First Bankshares of West Point


In [20]:
df_rec = recommendations('Whitney Holding Buying American Bank in Houston', model)
df_rec.head(10)

Unnamed: 0,Date,Headline
0,2019-05-23,Whitney Holding Corp. Buying Prattville Financial for $40.5M
1,2019-09-25,Whitney Buying Privately Held First Ascension for $22.1M in Stock
2,2021-12-02,Whitney completes $22.1M First Ascension purchase
3,2020-02-05,FNB Corp. Buying First Union Branches in Southwest Virginia
4,2019-10-11,Compass Buying FirsTier Corp. for $127M in Stock
5,2020-12-29,Whitney Earnings Up 5% in 3Q at $17.1M
6,2019-05-21,Allegiant to Buy Equality for $27.17M in Stock
7,2019-10-08,BB&T Buying FirstSpartan Financial Corp. in $103.9M Stock Swap
8,2020-12-20,Expected Earnings Releases for Oct. 12
9,2020-05-08,Capital City Bank Group Agrees to Acquire First Bankshares of West Point
