#Model creation

##Import packages

In [None]:
import re
from os import listdir
import numpy
import nltk
from gensim import corpora
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from google.colab import drive

# Download the complete NLTK data collection ('all' selects every package).
# NOTE(review): the code visible in this notebook only reads the English
# "stopwords" corpus; confirm no other cell needs more before narrowing this.
nltk.download('all')
# Mount Google Drive at the relative path "content". The article path used
# later starts with /content/content/MyDrive, which presumably reflects a
# working directory of /content - TODO confirm.
drive.mount("content")

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package biocreative_ppi is already up-to-date!
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package brown_tei to /root/nltk_data...
[nltk_data]    |   Package brown_tei is already up-to-date!
[nltk_data]    | Downloading package cess_cat to /root/nltk_data...
[nltk_data]    |   Package cess_cat is already up-to-date!
[nltk_data]    | Downloading package cess_esp to /root/nltk_data...
[nltk_data]    |   Package cess_esp is already up-to-date!
[nltk_data]    | Downloading packag

##Preprocess files

Loads the sports news corpus

In [None]:
# Folder that holds one plain-text file per news article.
loc = "/content/content/MyDrive/data science/IREI/Profile BR/articles"

# Read every article into memory. Each file is opened in a `with` block so its
# handle is closed as soon as the text has been read, instead of being left
# open until garbage collection.
# NOTE: positions in this list are used as document tags by the Doc2Vec model
# and to look articles up again when querying. `listdir` returns entries in
# arbitrary order, so a saved model only matches `articles` if the listing
# order is the same as when the model was trained.
articles = []
for txt in listdir(loc):
    with open(loc + "/" + txt, encoding="utf8", errors="ignore") as handle:
        articles.append(handle.read())

For every article in the articles folder, the code tokenizes and stems every word (this may take 1-2 minutes). This step is not necessary, as you can use the model already trained and delivered with the assignment.

In [None]:
# Shared preprocessing tools, created once and reused for every document.
stemmer = PorterStemmer()
stop_words = set(stopwords.words("english"))
wpt = nltk.WordPunctTokenizer()


def clean_word(word):
    """Return the Porter stem of *word*, or None when the word is discarded.

    A word is discarded when it has two characters or fewer, when it is an
    English stop word, or when its stem is an English stop word.
    """
    if len(word) <= 2 or word in stop_words:
        return None

    stem = stemmer.stem(word)
    # The stem itself may collapse onto a stop word, so check it as well.
    return None if stem in stop_words else stem

def clean_doc(doc):
    """Turn raw text into a list of cleaned, stemmed tokens.

    Every character that is not an ASCII letter or whitespace is removed, the
    text is stripped and lowercased, then split with the module-level
    tokenizer `wpt`. Each token is passed through `clean_word`; tokens for
    which it returns None are left out.

    Args:
        doc: The text to preprocess.

    Returns:
        The list of cleaned tokens, in their original order.
    """
    # The flags must be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so passing them positionally would cap the number of
    # removals at the flags' integer value and apply no flags at all.
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I | re.A)
    doc = doc.strip().lower()

    # Tokenize, clean each token and keep only the ones that survive.
    cleaned = (clean_word(token) for token in wpt.tokenize(doc))
    return [token for token in cleaned if token is not None]

# Preprocess every loaded article; corpus[i] holds the tokens of articles[i].
corpus = list(map(clean_doc, articles))

##Embedding

This step is not necessary, as you can use the model already trained and delivered with the assignment.

Tags the document and creates an embedding vector of size 100 with a window of 3 (parameters chosen based on different trials) and saves the model for later use.

In [None]:
from gensim.models.doc2vec import TaggedDocument, Doc2Vec

# Tag each preprocessed article with its position in `corpus`, so a document
# tag can later be used as an index.
tagged_corpus = [
    TaggedDocument(words=tokens, tags=[index])
    for index, tokens in enumerate(corpus)
]

# Create the model, build its vocabulary from the tagged documents, train it
# for the configured number of epochs and save it to disk for later use.
model = Doc2Vec(vector_size=100, window=3, min_count=2, epochs=100)
model.build_vocab(tagged_corpus)
model.train(
    tagged_corpus,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)
model.save("myDoc2Vec.model")

#Querying

##Validation profiles

In [None]:
# Validation profiles: maps a user name to that user's list of interests.
# Each interest is itself a list of one or more keywords.
profiles = {
    "Antonio": [["baseball"]],
    "Javier": [["cristiano"]],
    "Pablo": [
        ["basketball", "nba", "lebron"],
        ["tennis"],
    ],
}

## Loading and Using the model

Loads the saved model, preprocesses the profiles and retrieves the top 3 most relevant articles for each interest. A user may have multiple interests, and each interest may combine several keywords that narrow down the topic.

In [None]:
# Load the model saved to disk by the training step.
model = Doc2Vec.load("myDoc2Vec.model")


def _describe_interest(keywords):
    """Join *keywords* for display as "a", "a and b" or "a, b and c"."""
    *leading, final = keywords
    if not leading:
        return final
    return ", ".join(leading) + " and " + final


for user, interests in profiles.items():
    for interest in interests:
        # An interest without keywords has nothing to describe or to query.
        if not interest:
            continue

        string_interest = _describe_interest(interest)
        print(
            f"\033[1m{user} I know you like {string_interest}\033[0m"
            ". You should probably check out the following articles:\n"
        )

        # Apply the same preprocessing as the training corpus; clean_doc
        # strips the punctuation and stop words from the display string.
        clean_interest = clean_doc(string_interest)

        inferred_vector = model.infer_vector(clean_interest)
        # NOTE(review): gensim 4 exposes the document vectors as `model.dv`
        # and deprecates `docvecs` - confirm against the installed version.
        sims = model.docvecs.most_similar([inferred_vector], topn=3)
        # Each result is a (document tag, similarity) pair; best match first.
        sims.sort(key=lambda pair: pair[1], reverse=True)

        for rank, (tag, score) in enumerate(sims, start=1):
            print(
                f"{rank}. You have at least a "
                f"\033[1m{int(score * 100)}% match:\033[0m"
            )
            # Document tags are positions in `articles`.
            print(articles[tag])
            print("\n")
    


[1mAntonio I know you like baseball[0m. You should probably check out the following articles:

1. You have at least a [1m60% match:[0m
Jon Paul Morosi is a national MLB writer for FOXSports.com. He previously covered baseball for the Detroit Free Press and Seattle Post-Intelligencer. He began his journalism career at the Bay City Times in his native Michigan. Follow him on Twitter.
    		        			  		        		 	  		        		 	
   	Free agent closer Rafael Soriano agreed to a two-year, $28 million contract with the Washington Nationals Tuesday, major-league sources confirmed to FOXSports.com. Now Boras can focus his attention on finding jobs for starter Kyle Lohse and outfielder Michael Bourn.
  	Soriano is now the highest-paid reliever in baseball at $14 million per year, according to salary data at Cot’s Baseball Contracts. In fact, this deal represents the second-highest annual salary for a relief pitcher in baseball history, behind Mariano Rivera’s $15 million per year from 