# Libraries 
The most important libraries needed for this project

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import spacy
import string
import gensim
import operator
import re

# Loading the data
We will now load the comics CSV data into a DataFrame and take a quick look at its columns and contents.

In [None]:
# Location of the Marvel comics dataset
comics_csv_path = '../input/marvel-comic-books/Marvel_Comics.csv'

# Read the dataset and preview its first rows
df_comics = pd.read_csv(comics_csv_path)
df_comics.head()

# Cleaning and preprocessing of our data
Data preprocessing is one of the most important steps in text analytics. The goal is to remove any unwanted words or characters that serve human readability, but will not contribute to better results for our model.

The following function uses regular expressions to match patterns of unwanted text and remove/replace them.

In [None]:
from spacy.lang.en.stop_words import STOP_WORDS

# Punctuation characters and English stopwords used to filter tokens
punctuations = string.punctuation
stop_words = STOP_WORDS

# English spaCy pipeline used for tokenization and lemmatization
spacy_nlp = spacy.load('en_core_web_sm')

#Function to clean and process data

def _clean_description_text(sentence):
    """Return *sentence* with digits, markup lines and punctuation removed.

    The order of the substitutions matters: the line-removal patterns look
    for two consecutive single quotes, so single quotes are stripped only
    after those patterns have run.
    """
    # Drop digits and any word that contains a digit.
    sentence = re.sub(r"\w*\d\w*", "", sentence)

    # Collapse runs of spaces so the line patterns below see a single space.
    sentence = re.sub(r" +", " ", sentence)

    # Drop whole lines that begin with special characters (": ''..." and
    # "!..."). This must happen while the quotes are still present.
    sentence = re.sub(r"\n: ''.*", "", sentence)
    sentence = re.sub(r"\n!.*", "", sentence)
    sentence = re.sub(r"^:''.*", "", sentence)

    # Remove the remaining single quotation marks.
    sentence = re.sub(r"'", "", sentence)

    # Replace line breaks with spaces.
    sentence = re.sub(r"\n", " ", sentence)

    # Replace every remaining punctuation character with a space.
    sentence = re.sub(r"[^\w\s]", " ", sentence)

    return sentence


def spacy_tokenizer(sentence):
    """Clean *sentence* and return its list of lemmatized, filtered tokens.

    The text is cleaned with regular expressions, passed through the
    module-level ``spacy_nlp`` pipeline, lower-cased and lemmatized. Tokens
    that are stopwords, punctuation, or at most 2 characters long are
    discarded.

    Args:
        sentence: Raw text to process. Non-string values (for example the
            NaN that pandas uses for missing cells) yield an empty list.

    Returns:
        A list of token strings.
    """
    # Missing descriptions arrive as non-string values; there is nothing to tokenize.
    if not isinstance(sentence, str):
        return []

    doc = spacy_nlp(_clean_description_text(sentence))

    # Lower-case, strip and lemmatize. "-PRON-" is a placeholder lemma for
    # pronouns (presumably emitted by older spaCy versions - verify); in that
    # case fall back to the lower-cased surface form.
    lemmas = [
        word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_
        for word in doc
    ]

    # Keep only tokens longer than 2 characters that are neither stopwords
    # nor punctuation.
    return [
        lemma
        for lemma in lemmas
        if lemma not in stop_words and lemma not in punctuations and len(lemma) > 2
    ]

Next, we apply the cleaning and preprocessing function to the "issue_description" column of the comics data. The cleaned and tokenized data is saved in a new column.

In [None]:
print('Cleaning and Tokenizing...')
%time df_comics['issue_description_tokenized'] = df_comics['issue_description'].map(lambda x: spacy_tokenizer(x))
df_comics.head()

We store the tokenized column in a separate variable to simplify later operations and to quickly inspect its values.

In [None]:
# Keep the tokenized descriptions under a shorter name and preview the first five
marvel_comic_plot = df_comics['issue_description_tokenized']
marvel_comic_plot.head(5)

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Flatten all token lists into one array and count how often each token occurs
all_tokens = np.concatenate(marvel_comic_plot)
token_counts = pd.Series(all_tokens).value_counts()

# Only the 100 most frequent tokens are drawn
series = token_counts[:100]
wordcloud = WordCloud(background_color='black').generate_from_frequencies(series)

# Render the word cloud without axes
plt.figure(figsize=(15, 15), facecolor=None)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

# Building a dictionary of words
In the next step, we will build a dictionary, in which all unique words will be given identifiers, and their frequencies will be recorded. It is worth noting that we use the gensim library to build the dictionary. In gensim, words are referred to as "tokens," and the index of each word in the dictionary is called ID

In [None]:
from gensim import corpora

#creating a word dictionary
%time dictionary = corpora.Dictionary(marvel_comic_plot)

#list of several words that can be further removed
stoplist = set('hello and if this can would should could tell ask stop come go')
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]
dictionary.filter_tokens(stop_ids)

Once the dictionary is created, content-neutral words and additional stopwords are removed.

In [None]:
# Print the first 50 dictionary entries (token ids 0-49) as [token, token-id] pairs.
# The outer list wrapper is kept so the printed structure stays the same.
dict_tokens = [[[token, token_id] for token_id, token in dictionary.items() if token_id < 50]]
print (dict_tokens)


# Feature Extraction (Bag of Words).
The bag-of-words (BoW) model, is a way of extracting features from text to use them in modeling, such as in machine learning algorithms. It is a representation that describes the occurrence of words in a document. It includes two things

1. a dictionary of known words
2. a measure of the presence of known words.

The doc2bow method iterates through all the words in the text, if the word already exists in the set, it increases the frequency count, otherwise it inserts the word into the set and sets the frequency count to 1.

In [None]:
# Convert every tokenized description into its bag-of-words representation
corpus = [dictionary.doc2bow(tokens) for tokens in marvel_comic_plot]

# Translate token ids back to words for the first three documents
word_frequencies = [
    [(dictionary[token_id], count) for token_id, count in doc_bow]
    for doc_bow in corpus[:3]
]

print(word_frequencies)


The results above show the vocabulary together with the frequency of each word.

# Building the Tf-Idf and LSI models
Tf-Idf stands for Term Frequency-Inverse Document Frequency. It is a commonly used NLP model that helps identify the most important words in each document in a collection. Once the Tf-Idf model is built, the Tf-Idf-weighted corpus is passed to the LSI model, where we also set the number of topics (features) to extract.

In [None]:
# Fit the Tf-Idf weighting on the bag-of-words corpus (%time reports how long the statement takes)
%time comic_tfidf_model = gensim.models.TfidfModel(corpus, id2word=dictionary)
# Train the LSI model on the Tf-Idf-weighted corpus, requesting 300 topics
%time comic_lsi_model = gensim.models.LsiModel(comic_tfidf_model[corpus], id2word=dictionary, num_topics=300)

It is a good idea to serialize and store the collection locally so that it can be easily retrieved when needed.

In [None]:
# Write the Tf-Idf-weighted corpus to the file 'comic_tfidf_model_mm'
%time gensim.corpora.MmCorpus.serialize('comic_tfidf_model_mm', comic_tfidf_model[corpus])
# Write the LSI projection of the Tf-Idf-weighted corpus to the file 'comic_lsi_model_mm'
%time gensim.corpora.MmCorpus.serialize('comic_lsi_model_mm',comic_lsi_model[comic_tfidf_model[corpus]])

In [None]:
# Reload both corpora from the files they were serialized to
comic_lsi_corpus = gensim.corpora.MmCorpus('comic_lsi_model_mm')
comic_tfidf_corpus = gensim.corpora.MmCorpus('comic_tfidf_model_mm')

# Print a summary of each loaded corpus, one per line
print(comic_tfidf_corpus, comic_lsi_corpus, sep='\n')

In [None]:
from gensim.similarities import MatrixSimilarity

# Build the similarity index over the LSI corpus; the feature count is taken from the corpus itself
%time comic_index = MatrixSimilarity(comic_lsi_corpus, num_features = comic_lsi_corpus.num_terms)

# Semantic search
Having an index of comics initialized and loaded, we can use it to find similar comics

When we enter a query, the model returns the matching comic book titles along with a "Relevance" score, the degree of similarity expressed as a percentage. The higher the score, the more similar the query is to the document in the index.

Below is a helper function to search the index, sort and return the results

In [None]:
from operator import itemgetter

def search_similar_comics(search_term, num_results=5):
    """Return the comics most similar to *search_term* as a DataFrame.

    The query is tokenized with ``spacy_tokenizer``, converted to a
    bag-of-words vector, weighted with the Tf-Idf model, projected into LSI
    space and compared against ``comic_index``.

    Note: this sets ``comic_index.num_best`` on the shared index object.

    Args:
        search_term: Free-text query.
        num_results: Maximum number of comics to return (a positive
            integer). Defaults to 5.

    Returns:
        A DataFrame with the columns 'Relevance' (similarity as a
        percentage, rounded to 2 decimals), 'Comic Title' and 'Comic Plot',
        ordered from most to least relevant.
    """
    query_bow = dictionary.doc2bow(spacy_tokenizer(search_term))
    query_tfidf = comic_tfidf_model[query_bow]
    query_lsi = comic_lsi_model[query_tfidf]

    # Ask the index for at most `num_results` (document, similarity) pairs.
    comic_index.num_best = num_results

    matches = sorted(comic_index[query_lsi], key=itemgetter(1), reverse=True)

    # The index reports document positions in corpus order, which is the row
    # order of df_comics, so rows are looked up by position rather than label.
    comic_names = [
        {
            'Relevance': round((score * 100), 2),
            'Comic Title': df_comics['comic_name'].iloc[doc_position],
            'Comic Plot': df_comics['issue_description'].iloc[doc_position],
        }
        for doc_position, score in matches[:num_results]
    ]

    return pd.DataFrame(comic_names, columns=['Relevance', 'Comic Title', 'Comic Plot'])

In [None]:
# Find the comics most similar to the query 'Shield'
search_similar_comics('Shield')

In [None]:
# Find the comics most similar to the query 'God'
search_similar_comics('God')

In [None]:
# Find the comics most similar to the query 'Wolverine'
search_similar_comics('Wolverine')