In [33]:
import pandas as pd

anime_data = pd.read_csv('./Dataset/animes.csv')

In [37]:
print(anime_data['synopsis'])

0        Following their participation at the Inter-Hig...
1        Music accompanies the path of the human metron...
2        The Abyss—a gaping chasm stretching down into ...
3        "In order for something to be obtained, someth...
4        After helping revive the legendary vampire Kis...
                               ...                        
19306    Cocona is an average middle schooler living wi...
19307    While visiting the National Library, junior-hi...
19308    Years ago, all of the ghosts in a haunted scho...
19309    Inuyasha and company have finally destroyed Na...
19310    The year is Universal Century 0093. Char Aznab...
Name: synopsis, Length: 19311, dtype: object


In [34]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords')

# Tokenize the text into words
anime_data['synopsis'] = anime_data['synopsis'].apply(nltk.word_tokenize)

# Convert all words to lowercase
anime_data['synopsis'] = anime_data['synopsis'].apply(lambda x: [str(word).lower() for word in x])

# Remove stop words
stop_words = set(stopwords.words('english'))
anime_data['synopsis'] = anime_data['synopsis'].apply(lambda x: [word for word in x if word not in stop_words])

# Stem the words
ps = PorterStemmer()
anime_data['synopsis'] = anime_data['synopsis'].apply(lambda x: [ps.stem(word) for word in x])


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dellg\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


TypeError: expected string or bytes-like object

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

def preprocess_text(text):
    text = str(text)
    text = text.lower()
    tokens = word_tokenize(text)
    
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    stemmer = SnowballStemmer('english')
    tokens = [stemmer.stem(token) for token in tokens]
    
    preprocessed_text = ' '.join(tokens)
    preprocessed_text.lower()
    
    return preprocessed_text

# anime_data['title'] = anime_data['title'].apply(preprocess_text)
anime_data['synopsis'] = anime_data['synopsis'].apply(preprocess_text)
print("Preprocessing Done!")

In [None]:
inverted_index = {}

for idx, doc in anime_data.iterrows():
    for word in doc['synopsis']:
        if word not in inverted_index:
            inverted_index[word] = []
        inverted_index[word].append(idx)


In [None]:
inverted_index

In [None]:
def boolean_search(query):
    query_words = query.split()
    docs = set()
    
    for word in query_words:
        if word not in inverted_index:
            continue
        
        word_docs = set(inverted_index[word])
        if 'not' in query_words and word_docs:
            word_docs = set(range(len(anime_data))) - word_docs
        
        docs = docs.union(word_docs)
    
    return anime_data.iloc[list(docs)]


In [None]:
anime_data['synopsis'][0]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(anime_data['synopsis'].apply(lambda x: ' '.join(x)))

def similarity_search(query):
    query_tfidf = tfidf.transform([query])
    similarity_scores = cosine_similarity(query_tfidf, tfidf_matrix)[0]
    sorted_indices = similarity_scores.argsort()[::-1]
    return anime_data.iloc[sorted_indices]


In [None]:
def phrase_search(query):
    query_words = query.split()
    docs = set()
    
    for i in range(len(query_words) - 1):
        word1 = query_words[i]
        word2 = query_words[i + 1]
        
        if word1 not in inverted_index or word2 not in inverted_index:
            continue
        
        word1_docs = set(inverted_index[word1])
        word2_docs = set(inverted_index[word2])
        phrase_docs = word1_docs.intersection(word2_docs)
        
        for doc_idx in phrase_docs:
            doc_words = anime_data.iloc[doc_idx]['synopsis']
            for j in range(len(doc_words) - 1):
                if doc_words[j] == word1 and doc_words[j+1] == word2:
                    docs.add(doc_idx)
                    break
    
    return anime_data.iloc[list(docs)]


In [None]:
query = "high school"
results = phrase_search(query)
