In [717]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [703]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rajgo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [704]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rajgo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [630]:
drama = pd.read_csv('drama_movie_df.csv')

In [631]:
horror = pd.read_csv('horror_713_movies_df.csv')

In [632]:
horror2 = pd.read_csv('horror_before_713_movie_df.csv')

In [633]:
mystery = pd.read_csv('mystery_movie_df.csv')

In [634]:
thriller = pd.read_csv('thriller_movie_df.csv')

In [635]:
scifi = pd.read_csv('scifi_movie_df.csv')

In [636]:
war = pd.read_csv('war_movie_df.csv')

In [760]:
action = pd.read_csv('action_movie_df.csv')

In [761]:
comedy = pd.read_csv('comedy_movie_df.csv')

In [762]:
fantasy = pd.read_csv('fantasy_movie_df.csv')

In [763]:
documentary = pd.read_csv('documentary_movie_df.csv')

In [764]:
romance1 = pd.read_csv('romance_movie_df1.csv')

In [None]:
western = pd.read_csv('western_movie_df.csv')

In [637]:
all_movies = pd.concat([drama, horror, horror2, mystery, thriller, scifi, war], axis=0)

In [638]:
all_movies.drop_duplicates(subset=['Movie ID'], inplace=True)


In [639]:
all_movies = all_movies.sample(frac=1).reset_index(drop=True)

In [640]:
df = all_movies 

### Data Cleaning

In [710]:
df.columns

Index(['Movie ID', 'Title', 'IMDB Rating', 'Directors', 'Writers', 'Stars',
       'Storyline', 'Origin Countries', 'Languages', 'Budget',
       'Gross Worldwide', 'Runtime', 'Genres', 'IMDB Ratings', 'tags',
       'processed_tags'],
      dtype='object')

In [641]:
df.drop(columns=['Unnamed: 0'], inplace=True)

In [642]:
df['Writers'] = df['Writers'].str.replace('noneWriters', '')

In [643]:
df['Stars'] = df['Stars'].str.replace('noneStars', '')


In [644]:
df.drop(columns=['Release Date'], inplace=True)

In [648]:
df.fillna('', inplace=True)

### Converting the tag columns to string datatype

In [649]:
df['Writers'] = df['Writers'].astype(str)
df['Directors'] = df['Directors'].astype(str)
df['Stars'] = df['Stars'].astype(str)
df['Languages'] = df['Languages'].astype(str)
df['Title'] = df['Title'].astype(str)
df['Origin Countries'] = df['Origin Countries'].astype(str)
df['Storyline'] = df['Storyline'].astype(str)
df['Genres'] = df['Genres'].astype(str)
df['IMDB Ratings'] = df['IMDB Rating'].astype(str)

### removing \u200b from writers, directors, stars

In [650]:
df['Writers'] = df['Writers'].str.replace('\u200b', '')
df['Directors'] = df['Directors'].str.replace('\u200b','')
df['Stars'] = df['Stars'].str.replace('\u200b','')

### Join names together

In [654]:
df['Directors'] = df['Directors'].apply(lambda names: ''.join([name.replace(' ', '') for name in names]))
df['Writers'] = df['Writers'].apply(lambda names: ''.join([name.replace(' ', '') for name in names]))
df['Stars'] = df['Stars'].apply(lambda names: ''.join([name.replace(' ', '') for name in names]))
df['Origin Countries'] = df['Origin Countries'].apply(lambda names: ''.join([name.replace(' ', '') for name in names]))

### Cleaning the text of Writers, Directors, Stars, Origin Countries

In [656]:
def clean_text(text):
    cleaned_text = text.replace(',', ' ')
    return cleaned_text

# Apply clean_writers function to 'Writers' column
df['Writers'] = df['Writers'].apply(clean_text)
df['Directors'] = df['Directors'].apply(clean_text)
df['Stars'] = df['Stars'].apply(clean_text)
df['Origin Countries'] = df['Origin Countries'].apply(clean_text)

### 


### Creating tags column the important columns

In [695]:
df['tags'] =  df['Directors'] + ' ' + df['Languages'] + ' ' + df['Genres'] + ' ' + df['Writers'] + ' ' + df['Stars'] + ' ' + df['Origin Countries'] + ' ' + df['Storyline']


### Removing the commas from tags

In [696]:
def remove_commas(text):
    return text.replace(',', '').replace('-', '')

In [697]:
df['tags'] = df['tags'].apply(remove_commas)

### converting the tags to lowecase

In [698]:
df['tags'] = df['tags'].str.lower()

### Remove extra spaces from the string

In [699]:
def clean_text(text):
    # Remove commas
    text = text.replace(',', '')
    # Remove leading and trailing whitespace
    text = text.strip()
    # Replace multiple consecutive spaces with a single space
    text = re.sub(r'\s+', ' ', text)
    return text

# Apply the clean_text function to the 'tags' column
df['tags'] = df['tags'].apply(clean_text)

In [709]:
len(df['tags'][0])

264

### Removing stopwords from the string

In [706]:
# Preprocessing function to remove stop words
def preprocess_text_remove_stopwords(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize the text
    words = text.split()
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]
    # Join words back into a single string
    processed_text = ' '.join(filtered_words)
    return processed_text

# Apply the preprocessing function to the 'tags' column
df['processed_tags'] = df['tags'].apply(preprocess_text_remove_stopwords)

In [708]:
len(df['processed_tags'][0])

228

In [711]:
df.rename(columns = {'Movie ID':'movie_id', 
                     'Title':'movie_title',
                     'IMDB Rating':'imdb_rating',
                     'Directors':'directors',
                     'Writers':'writers',
                     'Stars':'stars',
                     'Storyline':'storyline',
                     'Origin Countries':
                     'origin_countries',
                     'Language':'language',
                     'Budget':'budget',
                     'Gross Worldwide':'gross_worldwide',
                     'Runtime':'runtime',
                     'Genres':'genres',}, inplace = True)

In [724]:
df.columns

Index(['movie_id', 'movie_title', 'imdb_rating', 'directors', 'writers',
       'stars', 'storyline', 'origin_countries', 'Languages', 'budget',
       'gross_worldwide', 'runtime', 'genres', 'IMDB Ratings', 'tags',
       'processed_tags'],
      dtype='object')

In [756]:
df.head(1)

Unnamed: 0,movie_id,movie_title,imdb_rating,directors,writers,stars,storyline,origin_countries,Languages,budget,gross_worldwide,runtime,genres,IMDB Ratings,tags,processed_tags
0,tt11762434,Cosmic Sin,2.5,EdwardDrake,EdwardDrake CoreyLarge,FrankGrillo BruceWillis BrandonThomasLee,Seven rogue soldiers launch a preemptive strike against a newly discovered alien civilization in the hopes of ending an interstellar war before it starts.,UnitedStates,English,,"$349,757",1 hour 28 minutes,"Action, Sci-Fi",2.5,edwarddrake english action scifi edwarddrake coreylarge frankgrillo brucewillis brandonthomaslee unitedstates seven rogue soldiers launch a preemptive strike against a newly discovered alien civilization in the hopes of ending an interstellar war before it starts.,edwarddrake english action scifi edwarddrake coreylarge frankgrillo brucewillis brandonthomaslee unitedstates seven rogue soldiers launch preemptive strike newly discovered alien civilization hopes ending interstellar war starts


### IFIDF vectorizing the words

In [718]:
# Vectorize the 'processed_tags' column
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['processed_tags'])

In [719]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [726]:
print(tfidf_matrix.toarray())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [727]:
print(tfidf_vectorizer.get_feature_names_out())


['000mile' '0068' '007' ... 'öznur' 'ørjangamst' 'ümitacar']


## 1.

In [749]:
import pandas as pd

# Function to get movie recommendations based on content similarity using movie title
def get_content_based_recommendations(movie_title, cosine_sim=cosine_sim):
    # Find the movie ID based on the title
    movie_id = df[df['movie_title'] == movie_title]['movie_id'].values[0]
    
    # Get the index of the movie that matches the movie ID
    idx = df[df['movie_id'] == movie_id].index[0]
    
    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]  # Exclude the first entry since it will be the movie itself
    
    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    
    # Return the top 10 most similar movies, filtering by similar genres
    movie_genres = df[df['movie_title'] == movie_title]['genres'].values[0]
    similar_movies = df.iloc[movie_indices]
    
    # Filter similar movies by genres
    similar_movies_filtered = similar_movies[similar_movies['genres'].apply(lambda x: any(genre in x for genre in movie_genres))]
    
    
    # Return movie titles and processed tags of the top similar movies as a DataFrame
    return similar_movies_filtered[['movie_title', 'processed_tags']]



In [759]:

# Example usage
similar_movies_df = get_content_based_recommendations('Inside Out 2')
print(similar_movies_df)

                          movie_title  \
6849                       Inside Out   
14337                     Thundercats   
13032    The Secret World of Arrietty   
15296                         Wildcat   
2608                    Baby Driver 2   
6056                             Otis   
3316                     Isle of Dogs   
5205                        The Abyss   
11322  The Strongest Man in the World   
15303                     Toy Story 5   

                                                                                                                                                                                                                                                                                                                                                                                                                     processed_tags  
6849                                                                                                                 petedocte