# HW3 - Group 31

## Libraries

In [None]:
import os
import csv
from bs4 import BeautifulSoup
import pandas as pd
import requests as rq
import time
import random
import unicodedata
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import *
import string
import re
from math import *
import json
from collections import defaultdict
from scipy import spatial
import heapq
from IPython.core.display import HTML
from IPython.display import display
import webbrowser
import os

### 1.1) Get the list of movies

In this step we get the list of all movies urls in the movies html files. Each member of the group used this code to get the list of movies and then html files.

In [None]:
def get_movieList(path):
    """Load the table of movie URLs from ``movies1.html`` inside *path*.

    Parameters
    ----------
    path : str
        Directory that contains ``movies1.html``.

    Returns
    -------
    pandas.DataFrame
        The first HTML table found in the file, without its ``Id`` column.
    """
    # os.path.join keeps the lookup working on every OS; the previous
    # hard-coded "\\" separator only produced a valid path on Windows.
    html_file = os.path.join(path, "movies1.html")
    # read_html returns one DataFrame per <table>; the movie list is the first.
    movies = pd.read_html(html_file)[0]
    return movies.drop(columns='Id')
# Directory that holds the movies html file; the current working directory is used.
path = os.getcwd() #The address of directory where Movies.html files exist
# DataFrame of movie URLs read from the html file found in `path`.
movies = get_movieList(path)  #this function will give us list of movies urls in the html file of movies which exist in the path address   

### 1.2) Crawl Wikipedia

Now, we crawl each wikipedia page to get html files

In [None]:
def save_html(movies):
    """Download every page listed in ``movies.URL`` and save it as HTML.

    Page number ``i`` is written to ``article_<i>.html`` in the current
    working directory.

    Parameters
    ----------
    movies : pandas.DataFrame
        Table with a ``URL`` column, indexed 0..len(movies)-1.
    """
    for i in range(len(movies)):
        url = movies.URL[i]
        try:
            response = rq.get(url)
        except rq.exceptions.RequestException as e:
            # If the request failed (e.g. we got blocked), wait about
            # 20 minutes and retry once.
            print(e)
            time.sleep(20*60 + 30)
            response = rq.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        # Explicit UTF-8: the platform default encoding cannot represent
        # every character found in the pages, and the files are read back
        # as UTF-8 later on.
        with open('article_' + str(i) + '.html', 'w', encoding='utf-8') as f:
            f.write(str(soup))
        # Random pause of 1-5 seconds between two requests.
        time.sleep(random.choice(range(1, 6)))
                   
# Download and store the html page of every movie listed in `movies`.
save_html(movies)

### 1.3) Parse downloaded pages

In this step, we parse the HTML pages, extract the specific information we want, and save it as TSV files.

## 2) search engine

###  preprocessing

#### All the TSV Files were preprocessed by :
1) Tokenization

2) Removing stop words

3) Removing punctuation

4) Stemming

5) Removing [] , ""


In [None]:

# Stop-word set and stemmer are built on first use and then reused, because
# clean() is called once per document and once per query.
_CLEAN_RESOURCES = {}


def clean(text):
    """Normalise *text* into a list of stemmed tokens.

    Steps: lower-casing, tokenization, stop-word removal, punctuation
    removal, splitting on non-word characters, Porter stemming and finally
    dropping tokens of length 1.

    Parameters
    ----------
    text : str
        Raw text (document body or user query).

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    if not _CLEAN_RESOURCES:
        _CLEAN_RESOURCES['stop_words'] = set(stopwords.words('english'))
        _CLEAN_RESOURCES['stemmer'] = PorterStemmer()
    stop_words = _CLEAN_RESOURCES['stop_words']
    stemmer = _CLEAN_RESOURCES['stemmer']

    words = word_tokenize(text.lower())  # divide the text into substrings
    kept = [
        w for w in words
        if w not in stop_words and w not in string.punctuation
    ]

    # Keep only the word-character runs of each token. This also strips
    # quotes, backticks and brackets, so no separate pass is needed to
    # remove '' and `` afterwards.
    pieces = []
    for word in kept:
        pieces += re.findall(r'\w+', word)

    stems = [stemmer.stem(piece) for piece in pieces]
    return [stem for stem in stems if len(stem) > 1]

### 2.1) Conjunctive query

#### 2.1.1) creating index

In this section, we first create a vocabulary dictionary with all the words in our documents. The keys of this dictionary are term ids (integers stored as strings) and the values are words.
Another dictionary we create is docwords, which maps each document to the list of all words in that document.
A third dictionary, tsvs, holds the parsed fields of each document's TSV file, including the intro and plot sections.
We save these dictionaries as JSON files to use later in our code.

In [None]:
# this function save an object to desired path as a json file
def savetojson(pathfile, obj):
    """Serialise *obj* as JSON into the file *pathfile* (UTF-8).

    Non-ASCII characters are written as-is instead of being escaped.

    Parameters
    ----------
    pathfile : str
        Destination file; it is created or overwritten.
    obj : object
        Any JSON-serialisable object.
    """
    # The with-block closes the file, so no explicit close() is needed.
    with open(pathfile, "w", encoding="utf-8") as out_file:
        json.dump(obj, out_file, ensure_ascii=False)
        
        
def get_vocab_index(path, n_docs=30000):
    """Build and save the vocabulary, the per-document words and the raw TSVs.

    Reads ``TSV/article_<i>.tsv`` for ``i`` in ``range(n_docs)`` and writes
    three JSON files below *path*:

    * ``tsvs.json``              - document number -> parsed TSV row
    * ``WORDS/DocWords.json``    - ``document_<i>`` -> cleaned tokens
    * ``WORDS/vocabulary.json``  - term id (as a string) -> word

    Parameters
    ----------
    path : str
        Base directory containing the ``TSV`` and ``WORDS`` folders.
    n_docs : int, optional
        Number of documents to index (default 30000).
    """
    docwords = dict()    # maps each document to the words it contains
    tsvs = dict()        # maps each document number to its TSV row
    unique_words = set()

    for i in range(n_docs):
        tsv_file = os.path.join(path, "TSV", "article_" + str(i) + ".tsv")
        tsv = None
        with open(tsv_file, encoding="utf-8") as fd:
            rd = csv.reader(fd, delimiter="\t", quotechar='"')
            for row in rd:
                if row:
                    tsv = row  # the last non-empty row is kept
        if tsv is None:
            # An empty file must not silently reuse the previous document's row.
            print("Skipping empty file: " + tsv_file)
            continue

        text = ' '.join([tsv[1], tsv[2]])  # intro and plot of the tsv file
        tsvs[i] = tsv
        cleared = clean(text)

        docwords['document_' + str(i)] = cleared
        unique_words.update(cleared)

    # Sorting makes the term ids reproducible from one run to the next.
    vocabulary = {str(term_id): word
                  for term_id, word in enumerate(sorted(unique_words))}

    savetojson(os.path.join(path, "tsvs.json"), tsvs)
    savetojson(os.path.join(path, "WORDS", "DocWords.json"), docwords)
    savetojson(os.path.join(path, "WORDS", "vocabulary.json"), vocabulary)

Now, we should create inverted_index which points each term_id to the documents that contains that word. First we load vocabulary json file that we created in the previous step

In [None]:
def get_inverted_index(path):
    """Build the inverted index and save it as ``WORDS/Inverted_index.json``.

    The index maps each term id to the list of documents containing that
    term. Both inputs are read from the JSON files ``WORDS/vocabulary.json``
    and ``WORDS/DocWords.json`` below *path*.

    Parameters
    ----------
    path : str
        Base directory containing the ``WORDS`` folder.
    """
    words_dir = os.path.join(path, "WORDS")

    with open(os.path.join(words_dir, "vocabulary.json"), encoding="utf-8") as fd:
        vocabulary = json.load(fd)
    # Loaded here so the function does not depend on a global `docwords`
    # that may not exist yet when it is called.
    with open(os.path.join(words_dir, "DocWords.json"), encoding="utf-8") as fd:
        docwords = json.load(fd)

    # We need word -> term id, i.e. the vocabulary with keys and values swapped.
    reverse_voc = {word: term_id for term_id, word in vocabulary.items()}

    inverted = defaultdict(list)
    for doc, words in docwords.items():
        # dict.fromkeys removes repeated words while keeping their order, so
        # each document is appended at most once per term without scanning
        # the posting list.
        for word in dict.fromkeys(words):
            inverted[reverse_voc[word]].append(doc)

    savetojson(os.path.join(words_dir, "Inverted_index.json"), inverted)

#### 2.1.2) execute query

First, we get the query from user and replace each word with the term_id. If the word did not exist in vocabulary dictionary we assign NA to it

In [None]:
def get_query():
    """Prompt the user for a query and return it as a list of cleaned tokens."""
    raw_query = input("Insert your query: ")
    cleaned_query = clean(raw_query)
    return cleaned_query

def get_query_index(query):
    """Translate the words of a cleaned query into term ids.

    Parameters
    ----------
    query : list of str
        Cleaned query tokens.

    Returns
    -------
    list of str
        One entry per token: its term id, or ``'NA'`` when the word is not
        part of the vocabulary.
    """
    # `reverse_voc` (word -> term id) already answers "is this word known?",
    # so scanning all vocabulary values for every token is unnecessary.
    return [reverse_voc.get(word, 'NA') for word in query]

In this step, we should find the documents that contain all words of the query

In [None]:
def execute_query(query):
    """Return the documents that contain every word of *query*.

    Returns a set of document names on success. An explanatory string is
    returned instead when the query is empty or when one of its words does
    not exist in the vocabulary.
    """
    if len(query) == 0:
        return 'Please, insert text in your search'

    postings = []
    for term_id in get_query_index(query):
        # A word missing from the vocabulary cannot match any document,
        # so the search stops right away.
        if term_id == 'NA':
            return "No match for your query"
        postings.append(set(inverted_index[term_id]))

    return set.intersection(*postings)

In this part we create some functions that we need to run and show the results

In [None]:
def Linked_URL(val):
    """Wrap *val* in an HTML anchor so the url is clickable in the output."""
    # target _blank makes the link open in a new window
    anchor_template = '<a target="_blank" href="{}">{}</a>'
    return anchor_template.format(val, val)
    
def replacer(val):
    """Escape every ``$`` in *val* with a backslash.

    Used for the Intro column, where a bare ``$`` would otherwise be
    interpreted by the displayer.
    """
    # '\\$' is the explicit spelling of backslash + dollar; the former '\$'
    # is an invalid escape sequence that newer Python versions warn about.
    return val.replace('$', '\\$')

def Run_SE1():
    """Run the first search engine and show the matches in the browser.

    Asks the user for a query, collects id, title, intro and Wikipedia url
    of every matching document, writes them as an HTML table to
    ``display.html`` inside ``path`` and opens that file in a new tab.
    When the query cannot be executed, the explanatory message is printed
    and nothing is displayed.
    """
    matches = execute_query(get_query())
    if isinstance(matches, str):
        # execute_query answers with a message instead of a set of documents
        # for empty or unmatched queries; iterating it would crash.
        print(matches)
        return

    results = []
    for file in matches:
        docid = file.split('_')[1]
        tsv = newdict[docid]
        results.append([docid, tsv[0], tsv[1], Movies[docid]])
    df = pd.DataFrame(results, columns=['Id', 'Title', 'Intro', 'Wikipedia Url'])

    # Both formatters go into a single call so that one format() call cannot
    # reset the formatter installed by the other.
    styler = df.style.format({'Wikipedia Url': Linked_URL, 'Intro': replacer})
    # Recent pandas versions expose to_html(); older ones only have render().
    to_html = getattr(styler, 'to_html', None) or styler.render
    message = to_html()

    filename = os.path.join(path, 'display.html')
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(message)

    webbrowser.open_new_tab(filename)  # for showing the results in the browser

In [None]:
def get_results(query):
    """Return the documents matching *query* as a DataFrame.

    Parameters
    ----------
    query : list of str
        Cleaned query tokens.

    Returns
    -------
    pandas.DataFrame
        Columns ``Id``, ``Title``, ``Intro`` and ``Wikipedia Url``; empty
        when the query is empty or has no match.
    """
    columns = ['Id', 'Title', 'Intro', 'Wikipedia Url']
    matches = execute_query(query)
    if isinstance(matches, str):
        # execute_query reports empty/unmatched queries with a message;
        # iterating over that string would crash on file.split('_')[1].
        print(matches)
        return pd.DataFrame(columns=columns)

    results = []
    for file in matches:
        docid = file.split('_')[1]
        tsv = newdict[docid]
        results.append([docid, tsv[0], tsv[1], Movies[docid]])
    return pd.DataFrame(results, columns=columns)

### 2.2) Conjunctive query & Ranking score

In this part we should give scores based on cosine similarity


In [None]:
def cosine_similarity(a, b):
    """Return the cosine similarity of vectors *a* and *b*.

    Computed as one minus the cosine distance given by scipy.
    """
    distance = spatial.distance.cosine(a, b)
    return 1 - distance

#### 2.2.1) Inverted index

What we need now is to calculate the IDF and the TF-IDF, according to the formulas:
- $TF = \frac{N_{(x,y)}}{N_{(*,y)}}$
- $IDF = \log\left(1 + \frac{D}{D_x}\right)$

Where:
- $N_{(x,y)}$ is the number of times that the word $X$ is in the document $D_y$;
- $N_{(*,y)}$ is the total number of the words in the document;
- $D$ is the total number of documents;
- $D_x$ is the number of documents in which the word $X$ appears at least once.

In [None]:
# Load the inverted index (term id -> documents containing the term) from disk.
with open(path + "\\WORDS\\Inverted_index.json", encoding = "utf-8") as fd:
    inverted_index = json.load(fd)

In [None]:
# Load the per-document word lists from the JSON file saved earlier.
with open(path + "\\WORDS\\DocWords.json", encoding="utf-8") as docwords_file:
    docwords = json.load(docwords_file)

In [None]:
# Build the TF-IDF weighted inverted index:
#   IDFs[term]          -> IDF of the term
#   inv_ind_tfIDF[term] -> list of (document, tf-idf rounded to 3 decimals)
# NOTE(review): `vocabulary` (term id -> word) is read as a global here and is
# not loaded in the cells above; confirm it is in scope before running.
IDFs = dict()
inv_ind_tfIDF = defaultdict(list)
n_docs = len(docwords)  # D: total number of documents
for term, postings in inverted_index.items():
    # first we calculate the IDF of the term id
    IDFs[term] = log(1 + n_docs / len(postings))
    word = vocabulary[term]
    for doc in postings:
        words = docwords[doc]
        tf = words.count(word) / len(words)
        inv_ind_tfIDF[term].append((doc, round(tf * IDFs[term], 3)))

In [None]:
# Persist the TF-IDF weighted inverted index as a JSON file.
savetojson(path + "\\WORDS\\TfIdf_inv_index.json", inv_ind_tfIDF)

#### 2.2.2)Execute the query

## Defining a new score

In this step we define the variables on which the new score is based. We decided to use the release year, the length of the movie (run time), the budget and the number of stars, as these variables seem to be the most important to most users. First we ask the user for a query and for their preferences; then, for each document returned by the first search engine, we compute for each variable a score between 0 and 1 that measures how close the document is to the user's preference. Finally we take the mean of these scores and use a heap structure to find the 10 documents with the highest scores.

In [None]:
#getting query from user
def _ask_yes_no(question):
    """Ask a [Y/N] question; return True only when the answer is 'y'."""
    return input(question).strip().lower() == "y"


def get_query_SE3():
    """Ask the user for a query and for the optional ranking preferences.

    Returns
    -------
    tuple
        ``(query, q)`` where ``query`` is the list of cleaned query tokens
        and ``q`` is a dict with the keys ``'year'``, ``'Runtime'``,
        ``'starring'`` and ``'Budget'``. A preference the user did not
        specify is stored as ``'NA'``.
    """
    query = clean(input("insert your query : "))
    q = dict()

    if _ask_yes_no("Do you want to specify the release year ? [Y/N] : "):
        q["year"] = input("Please, specify the release date : ")
    else:
        q["year"] = 'NA'

    if _ask_yes_no("Do you want to specify the length of the movie? [Y/N] : "):
        Runtime = input("Please, specify the length of the movie : ")
        # Ask again instead of returning an error message: callers unpack
        # the result as (query, q), which a plain string would break.
        while not re.search(r'\d', Runtime):
            print('Please, enter a valid runtime.')
            Runtime = input("Please, specify the length of the movie : ")
        q['Runtime'] = Runtime
    else:
        q["Runtime"] = 'NA'

    if _ask_yes_no("Is number of stars an important factor for you? [Y/N] : "):
        starring = input("Please, specify if you're looking for a big or small cast [B/S]: ")
        # Normalised so that 'b' / 's' are understood like 'B' / 'S'.
        q["starring"] = starring.strip().upper()
    else:
        q["starring"] = 'NA'

    if _ask_yes_no("Is movie budget an important factor for you? [Y/N] : "):
        q['Budget'] = input("Please, specify the budget of the movie you're looking for : ")
    else:
        q['Budget'] = 'NA'

    return query, q

Now we should execute our search engine with the query

In [None]:
_SE3_COLUMNS = ['Title', 'Intro', 'Wikipedia Url', 'Score']


def _first_int(text):
    """Return the first run of digits of *text* as an int, or None."""
    match = re.search(r'\d+', text)
    return int(match.group(0)) if match else None


def _parse_budget(text):
    """Convert a budget such as '$20 million' or '$1,500,000' to an int.

    Returns None when no amount can be extracted from *text*.
    """
    if re.search(r'mil', text):
        match = re.search(r'\d+[\.|\,]*\d*', text)
        if match is None:
            return None
        try:
            return int(float(match.group(0).replace(',', '.')) * 10**6)
        except ValueError:
            return None
    grouped = re.search(r'(\d+[,.])+\d+', text)
    if grouped:
        # separators are treated as thousands separators
        return int(grouped.group(0).replace(',', '').replace('.', ''))
    return _first_int(text)  # plain amount such as '$500000'


def _tsv_field(tsv, position):
    """Return ``tsv[position]``, or 'NA' when the row is too short."""
    try:
        return tsv[position]
    except IndexError:
        return 'NA'


def _movie_features(tsv):
    """Extract cast size, release year, runtime and budget from a TSV row.

    Every feature is an int, or None when it is missing or unparsable.
    """
    features = dict()

    cast = _tsv_field(tsv, 6)
    if cast == 'NA':
        features['Starring'] = None
    else:
        features['Starring'] = len(cast.replace('\n', '').strip(',').split(',,'))

    features['Release Year'] = None
    year = re.search(r'\d{4}', _tsv_field(tsv, 8))
    if year:
        features['Release Year'] = int(year.group(0))

    # Some movies have their running time expressed in reels; the conversion
    # to minutes is not unique, so only values given in minutes are used.
    features['Runtime'] = None
    runtime = re.search(r'\d+.*', _tsv_field(tsv, 9))
    if runtime and 'min' in runtime.group(0):
        features['Runtime'] = _first_int(runtime.group(0))

    features['Budget'] = None
    budget = re.search(r'\$.*', _tsv_field(tsv, 12))
    if budget:
        features['Budget'] = _parse_budget(budget.group(0))

    return features


def search_engine3(query=None, q=None):
    """Rank the documents matching a query by closeness to user preferences.

    The candidates are the documents returned by the first search engine.
    Each one gets four scores between 0 and 1 (runtime, release year,
    budget, cast size); the final score is their mean. The 10 best
    documents are written to ``display.html`` inside ``path``, opened in
    the browser and returned.

    Parameters
    ----------
    query : list of str, optional
        Cleaned query tokens. When ``query`` or ``q`` is omitted, both are
        asked interactively through ``get_query_SE3``.
    q : dict, optional
        Preferences with the keys ``'year'``, ``'Runtime'``, ``'starring'``
        (``'B'`` for a big cast, ``'S'`` for a small one) and ``'Budget'``;
        ``'NA'`` means "not specified".

    Returns
    -------
    pandas.DataFrame
        Up to 10 rows indexed by document id, with the columns ``Title``,
        ``Intro``, ``Wikipedia Url`` and ``Score``. Empty when the query
        cannot be executed.
    """
    no_result = pd.DataFrame(columns=_SE3_COLUMNS)

    if query is None or q is None:
        answer = get_query_SE3()
        if isinstance(answer, str):  # get_query_SE3 reported invalid input
            print(answer)
            return no_result
        query, q = answer

    # running the first search engine to get all query-related documents
    matches = execute_query(query)
    if isinstance(matches, str):  # empty query or unknown word
        print(matches)
        return no_result

    features = {doc: _movie_features(newdict[doc.split('_')[1]])
                for doc in matches}

    wanted_runtime = _first_int(q.get('Runtime', 'NA'))
    wanted_year = _first_int(q.get('year', 'NA'))
    wanted_budget = _parse_budget(q.get('Budget', 'NA'))
    cast_preference = q.get('starring', 'NA').strip().upper()

    # Only known cast sizes define the range; missing values are left out so
    # they cannot distort the normalisation.
    cast_sizes = [f['Starring'] for f in features.values()
                  if f['Starring'] is not None]
    mincast = min(cast_sizes) if cast_sizes else 0
    maxcast = max(cast_sizes) if cast_sizes else 0

    scores = []
    for doc, f in features.items():
        # A criterion scores 0 when the user did not specify it or when the
        # document has no usable value for it.
        run_score = 0
        if wanted_runtime is not None and f['Runtime'] is not None:
            run_score = exp(-(wanted_runtime - f['Runtime'])**2 / 100)

        year_score = 0
        if wanted_year is not None and f['Release Year'] is not None:
            year_score = exp(-abs(f['Release Year'] - wanted_year) / 10)

        budget_score = 0
        if wanted_budget is not None and f['Budget'] is not None:
            budget_score = exp(-abs(wanted_budget - f['Budget']) / 10**5)

        starring_score = 0
        if f['Starring'] is not None and maxcast > mincast:
            if cast_preference == 'B':    # big cast: more stars score higher
                starring_score = (f['Starring'] - mincast) / (maxcast - mincast)
            elif cast_preference == 'S':  # small cast: fewer stars score higher
                starring_score = (maxcast - f['Starring']) / (maxcast - mincast)

        mean_score = 1/4 * (run_score + year_score + budget_score + starring_score)
        scores.append((mean_score, doc))

    # heap-based selection of the 10 best scores
    best = heapq.nlargest(10, scores)

    table = dict()
    for score, doc in best:
        docid = doc.split('_')[1]
        table[docid] = list(newdict[docid][0:2]) + [Movies[docid], score]

    if table:
        df = pd.DataFrame.from_dict(table, orient='index', columns=_SE3_COLUMNS)
    else:
        df = no_result

    # Both formatters go into a single call so that one format() call cannot
    # reset the formatter installed by the other.
    styler = df.style.format({'Wikipedia Url': Linked_URL, 'Intro': replacer})
    # Recent pandas versions expose to_html(); older ones only have render().
    to_html = getattr(styler, 'to_html', None) or styler.render
    message = to_html()

    filename = os.path.join(path, 'display.html')
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(message)
    webbrowser.open_new_tab(filename)

    return df

## Bonus - Create a co-stardom network

In [None]:
import networkx as nx
import matplotlib.pyplot as plt
from Functions import *
import seaborn as sns

First we should insert a query and get the results from the third search engine

In [None]:
# Interactively ask for the query and for the optional ranking preferences.
(query, q) = get_query_SE3()

In [None]:
# Hard-coded example: already-cleaned query tokens plus the preference dict,
# overriding whatever was typed in the previous cell.
query, q = (['orc', 'elv'],
 {'year': '1995', 'Runtime': 'NA', 'starring': 'NA', 'Budget': '20 milions'})
# NOTE(review): search_engine3 is called with two arguments and its return
# value is used - presumably this is the version imported from Functions; confirm.
result = search_engine3(query, q)

In [None]:
# Index of the result table; its values are used below as keys of newdict.
result.index

In [None]:
# Look at the stored row of one document (id '14018') as an example.
newdict['14018'] #from the functions import, newdict is the dictionary of all tsv's

In [None]:
# Distinct actors starring in the movies returned by the search engine,
# in order of first appearance. They are the nodes of the network.
stars = []
for ind in result.index:
    tsv = newdict[str(ind)]
    if tsv[6] == 'NA':
        # no starring information: 'NA' must not become an actor
        continue
    for i in tsv[6].replace('\n', '').strip(',').split(',,'):  # stars of a movie
        if i and i not in stars:
            stars.append(i)

Now we have the nodes of our network. In order to create the edges, we first make a list of all possible pairs of actors.

In [None]:
# All unordered pairs of distinct actors: each actor is paired with every
# actor that comes after it in `stars`.
duel_stars = [
    (first, second)
    for position, first in enumerate(stars)
    for second in stars[position + 1:]
]

In this step we check which of the pairs created in the previous step appear together in at least 2 documents.

In [None]:
# The cast of every retrieved movie is parsed once, instead of once per pair.
casts = [
    set(newdict[str(j)][6].replace('\n', '').strip(',').split(',,'))
    for j in result.index
]

edge = []      # pairs of actors that star together in at least 2 movies
nodes = set()  # actors that belong to at least one edge
for x, y in duel_stars:  # for each pair of actors
    counter = 0
    for cast in casts:  # search in the retrieved movies
        if x in cast and y in cast:
            counter += 1
            if counter == 2:
                edge.append((x, y))
                nodes.add(x)
                nodes.add(y)
                break  # threshold reached, no need to look further

In [None]:
# Show the actors that are part of at least one edge.
nodes

Now we have Nodes and Edges. It's time to create a network

In [None]:
# Build the co-stardom graph: one node per actor, one edge per pair of
# actors found in `edge`, then draw it and save the picture.
G = nx.Graph()
G.add_nodes_from(stars)
G.add_edges_from(edge)
nx.draw(G, with_labels=True)  # with_labels is a boolean flag
plt.savefig("co-stardom.png", format = 'png')
plt.show()

to have better visualization

In [None]:
# Same graph with a hand-made layout: actors without edges are stacked in
# two columns on the sides, connected actors are scattered in between.
G = nx.Graph()

G.add_nodes_from(stars)
G.add_edges_from(edge)
pos = dict()

c = 0  # number of isolated actors placed so far

for i in set(stars) - nodes:
    c += 1
    # alternate between the right column (x = 250) and the left one (x = 0)
    pos[i] = ((c % 2) * 250, c * 15)

# Height of the area used for connected actors. The lower bound keeps the
# range non-empty when there is no isolated actor at all (c == 0).
height = max(c * 15, 2)
for i in nodes:
    pos[i] = (random.choice(range(50, 200)), random.choice(range(1, height)))
nx.draw(G, pos, with_labels=True, font_color = 'red', font_size = 13, font_weight = 'bold')
plt.xlim([-70, 320])
plt.savefig("co-stardom.png")

## Analysis

Let's do some analysis on this network. We can plot the degree distribution, which shows how many nodes have each number of connections.

In [None]:
# Degree distribution: for every degree found in the graph, the number of
# nodes having that degree.
all_degrees = [val for (node, val) in G.degree()]
# Sorted so that the line of the plot runs from left to right; the
# iteration order of a set is not guaranteed.
unique_degrees = sorted(set(all_degrees))
count = [all_degrees.count(i) for i in unique_degrees]
plt.plot(unique_degrees, count, "yo-")
plt.xlabel("Degree")
plt.ylabel("Number of nodes")
plt.show()

We can calculate the degree centrality of each node to see which actors were most involved with other actors (the most influential ones).

In [None]:
# Degree centrality of every node of the graph.
nx.degree_centrality(G)

In [None]:
# Actors ordered from the most to the least central; the centrality is
# computed once and reused as the sort key.
centrality = nx.degree_centrality(G)
sorted(centrality, key=centrality.get, reverse=True)