In [1]:
#Import Packages
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk import TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

## Loading Data

In [2]:
anime_df = pd.read_csv("anime.csv")
test_df = pd.read_csv("test.csv")
train_df = pd.read_csv("train.csv")

#make copies of each csv file
anime_copy = anime_df.copy()
test_copy = test_df.copy()
train_copy = train_df.copy()


In [87]:
anime_copy.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [88]:
train_copy.head()

Unnamed: 0,user_id,anime_id,rating
0,1,11617,10
1,1,11757,10
2,1,15451,10
3,2,11771,10
4,3,20,8


In [3]:
def check_data(df):
    """
    -Checks for missing values and duplicates
    -Deletes duplicates
    -handles missing values through imputation method
    """
    duplicates = df.duplicated().sum()
    print(f"total number of duplicate rows: {duplicates}")
    
    null_values = df.isnull().sum()
    print("Missing values in each column: ")
    print(null_values)
    
    return 

In [90]:
check_data(anime_copy)

total number of duplicate rows: 0
Missing values in each column: 
anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64


In [91]:
check_data(train_copy)

total number of duplicate rows: 1
Missing values in each column: 
user_id     0
anime_id    0
rating      0
dtype: int64


## Data Cleaning

In [4]:
#code here
def handle_missing_values(df):
    """Handles missing values in the dataframe:
    -Fills missing values in categorical columns with the most frequent value.
    -Fills missing values in the 'rating' column with the mean.

    """
     #Delete duplicates
    df = df.drop_duplicates()
     
    #Categorical data is filled with the most frequent value
    for column in ['genre','type']:
        most_frequent_value = df[column].mode()[0]
        df[column].fillna(most_frequent_value, inplace=True)
        
    #Fill missing values in rating with the mean
    mean_value = df['rating'].mean()
    df['rating'].fillna(mean_value, inplace=True)
    
    #convert episodes to numeric and fill missing values with the median
    df['episodes'] = pd.to_numeric(df['episodes'],errors='coerce') 
    median_value = df['episodes'].median()
    df['episodes'].fillna(median_value, inplace=True)
    
    return df

In [5]:
anime_clean = handle_missing_values(anime_copy)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(most_frequent_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['rating'].fillna(mean_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting v

In [6]:
anime_clean.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1.0,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64.0,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51.0,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24.0,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51.0,9.16,151266


In [7]:
check_data(anime_clean)

total number of duplicate rows: 0
Missing values in each column: 
anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64


In [8]:
def clean_text(text):
        
        #check that the text is a string and check if it's an NaN
        if not isinstance(text,str) or pd.isna(text):
                return ""
        
        #lowercase the text
        text = text.lower()
        
        #remove punctuation
        text = text.translate(str.maketrans('','',string.punctuation))
        
        #remove numerical values
        text = re.sub(r'd\+','',text)
        
        #Tokenize the text
        tokens = word_tokenize(text) 
        
        #Remove stop words
        stop_words = set(stopwords.words('english'))
        tokens = [word for word in tokens if word not in stop_words]
        
        #Lemmatization
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
        
        #Remove special characters and extra whitespaces
        tokens = [re.sub(r'\W+', '',word) for word in tokens]
        
        #Filter out empty strings and single letter words
        tokens = [word for word in tokens if word and len(word) > 1]
            
        # Ensure that words are written as individual words
        distinct_tokens = set(tokens)
        
        # Join the tokens back together
        text = ' '.join(distinct_tokens)
        
        
        return text

In [9]:
df = anime_clean
text_columns = ['name', 'genre', 'type']
for column in text_columns:
    df[column] = df[column].astype(str).apply(clean_text) 


In [10]:
df.columns

Index(['anime_id', 'name', 'genre', 'type', 'episodes', 'rating', 'members'], dtype='object')

df merge with train csv and drop the other rating and then split 

In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# Split genres into individual columns and handle NaN values
df['genre'] = df['genre'].astype(str).str.split()
all_genres = set([genre for sublist in df['genre'] if isinstance(sublist, list) for genre in sublist])

for genre in all_genres:
    df[genre] = df['genre'].apply(lambda x: 1 if isinstance(x, list) and genre in x else 0)

# One-hot encode the 'type' column
df_encoded = pd.get_dummies(df, columns=['type'])

# Drop original 'genre' column from the new dataset (since it was one-hot encoded)
df_encoded.drop('genre', axis=1, inplace=True)

# Normalize numerical features (episodes, rating)
df_encoded['episodes'] = (df_encoded['episodes'] - df_encoded['episodes'].min()) / (df_encoded['episodes'].max() - df_encoded['episodes'].min())
df_encoded['rating'] = (df_encoded['rating'] - df_encoded['rating'].min()) / (df_encoded['rating'].max() - df_encoded['rating'].min())

# Create feature matrix and calculate cosine similarity
features = df_encoded.drop(columns=['anime_id', 'name', 'members']) # Dropped incorrect ';'
similarity_matrix = cosine_similarity(features)

# Function to get recommendations
def get_recommendations(anime_name, similarity_matrix, df_encoded, top_n=3):
    try:  # Try to find the anime
        idx = df_encoded[df_encoded['name'] == anime_name].index[0]
        similar_indices = similarity_matrix[idx].argsort()[::-1][1:top_n+1]
        return df_encoded['name'].iloc[similar_indices]
    except IndexError:  # Catch the error if the anime is not found
        print(f"Anime '{anime_name}' not found in the dataset.")
        return []  # Or return a default recommendation list


recommendations = get_recommendations("fullmetal alchemist brotherhood", similarity_matrix, df_encoded)  # Corrected anime name
print(recommendations)

200     fullmetal alchemist
101      magic kingdom magi
268    labyrinth magic magi
Name: name, dtype: object


**Lets pickle the Similarity Matrix, the dataframe and the reccomender function**

In [12]:
import pickle

# Save similarity matrix
with open('similarity_matrix.pkl', 'wb') as f:
    pickle.dump(similarity_matrix, f)

# Save dataframe
with open('anime_data.pkl', 'wb') as f:
    pickle.dump(df, f)

# Optionally, you can also save the recommendation function
with open('recommendation_function.pkl', 'wb') as f:
    pickle.dump(get_recommendations, f)  
