In [1]:
import pandas as pd

# Location of the movies table
file_path = r"D:\movies.dat"

# Parse with "::" as the separator (handled by the Python engine); the file is
# read without a header row, so the three column names are supplied here
movies = pd.read_csv(
    file_path,
    sep="::",
    engine="python",
    encoding="ISO-8859-1",
    header=None,
    names=["MovieID", "Title", "Genres"],
)

# Preview the first five movies
movies.head()


Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [2]:
import pandas as pd

# Location of the ratings table
file_path = r"D:\ratings.dat"

# Parse with "::" as the separator (handled by the Python engine); the file is
# read without a header row, so the four column names are supplied here
ratings = pd.read_csv(
    file_path,
    sep="::",
    engine="python",
    encoding="ISO-8859-1",
    header=None,
    names=['UserID', 'MovieID', 'Rating', 'Timestamp'],
)

# Preview the first five ratings
ratings.head()


Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
import pandas as pd

# Specify the file path
file_path = r"D:\users.dat"

# Load the file with the "::" delimiter and explicit column names.
# Zip-code is forced to text: it is an identifier, not a quantity, and the
# preview of this table shows a value such as 2460, i.e. a ZIP code whose
# leading zero does not survive numeric parsing.
users = pd.read_csv(
    file_path,
    sep="::",
    engine="python",
    encoding="ISO-8859-1",
    header=None,
    names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'],
    dtype={'Zip-code': str},
)

# Display the first few rows
users.head()


Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [4]:
# First, merge 'ratings' with 'movies' on 'MovieID'.
# validate="many_to_one" makes pandas raise a MergeError if 'MovieID' is not
# unique in 'movies', instead of silently duplicating rating rows.
recomendation_data = pd.merge(
    ratings, movies, on='MovieID', how='inner', validate='many_to_one'
)

# Next, merge the resulting dataset with 'users' on 'UserID', again requiring
# that each 'UserID' appears only once on the right-hand side.
recomendation_data = pd.merge(
    recomendation_data, users, on='UserID', how='inner', validate='many_to_one'
)

# Display the first few rows of the merged dataset
print(recomendation_data.head())

# Check the shape of the final dataset
print(f"Merged data shape: {recomendation_data.shape}")

   UserID  MovieID  Rating  Timestamp                                   Title  \
0       1     1193       5  978300760  One Flew Over the Cuckoo's Nest (1975)   
1       1      661       3  978302109        James and the Giant Peach (1996)   
2       1      914       3  978301968                     My Fair Lady (1964)   
3       1     3408       4  978300275                  Erin Brockovich (2000)   
4       1     2355       5  978824291                    Bug's Life, A (1998)   

                         Genres Gender  Age  Occupation Zip-code  
0                         Drama      F    1          10    48067  
1  Animation|Children's|Musical      F    1          10    48067  
2               Musical|Romance      F    1          10    48067  
3                         Drama      F    1          10    48067  
4   Animation|Children's|Comedy      F    1          10    48067  
Merged data shape: (1000209, 10)


In [5]:
# Drop the variables that are not needed.
# The result is rebound instead of using inplace=True, and errors="ignore"
# skips columns that are already gone, so re-running this cell no longer
# raises a KeyError.
recomendation_data = recomendation_data.drop(
    columns=['Timestamp', 'Zip-code', 'Age', 'Occupation', 'Gender'],
    errors='ignore',
)
# Verify the remaining columns
recomendation_data.head()

Unnamed: 0,UserID,MovieID,Rating,Title,Genres
0,1,1193,5,One Flew Over the Cuckoo's Nest (1975),Drama
1,1,661,3,James and the Giant Peach (1996),Animation|Children's|Musical
2,1,914,3,My Fair Lady (1964),Musical|Romance
3,1,3408,4,Erin Brockovich (2000),Drama
4,1,2355,5,"Bug's Life, A (1998)",Animation|Children's|Comedy


In [6]:
# Count the rows that are exact duplicates of an earlier row
recomendation_data_values = recomendation_data.duplicated().sum()
print(recomendation_data_values)

0


In [7]:
# Count the missing values in each column
recomendation_data_values = recomendation_data.isna().sum()
print(recomendation_data_values)

UserID     0
MovieID    0
Rating     0
Title      0
Genres     0
dtype: int64


In [8]:
# Preview the first five rows of the merged data
recomendation_data.head(5)

Unnamed: 0,UserID,MovieID,Rating,Title,Genres
0,1,1193,5,One Flew Over the Cuckoo's Nest (1975),Drama
1,1,661,3,James and the Giant Peach (1996),Animation|Children's|Musical
2,1,914,3,My Fair Lady (1964),Musical|Romance
3,1,3408,4,Erin Brockovich (2000),Drama
4,1,2355,5,"Bug's Life, A (1998)",Animation|Children's|Comedy


In [12]:
# Preview the last five rows of the merged data
recomendation_data.tail(5)

Unnamed: 0,UserID,MovieID,Rating,Title,Genres
1000204,4211,3791,2,Footloose (1984),Drama
1000205,4211,3806,3,MacKenna's Gold (1969),Western
1000206,4211,3840,4,Pumpkinhead (1988),Horror
1000207,4211,3766,2,Missing in Action (1984),Action|War
1000208,4211,3834,2,Bronco Billy (1980),Adventure|Drama|Romance


In [14]:
# Draw a fixed-seed random sample of 5000 rows to work with
sampled_data = recomendation_data.sample(
    n=5000,
    random_state=42,
)

In [24]:
# Replace the sampled rows' original index labels with a fresh 0..n-1 range
sampled_data = (
    sampled_data
    .reset_index(drop=True)
)


In [31]:
# Preview the first five sampled rows
sampled_data.head(5)

Unnamed: 0,UserID,MovieID,Rating,Title,Genres
0,3466,1968,5,"Breakfast Club, The (1985)",Comedy|Drama
1,5437,1610,4,"Hunt for Red October, The (1990)",Action|Thriller
2,770,445,3,Fatal Instinct (1993),Comedy
3,889,2696,2,"Dinner Game, The (Le Dîner de cons) (1998)",Comedy
4,2203,2013,5,"Poseidon Adventure, The (1972)",Action|Adventure


# Content-based filtering

In [25]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# Convert Genres into a TF-IDF matrix.
# Each Genres value is a '|'-separated list of labels, so every run of
# non-pipe characters is treated as one token. The default word pattern would
# break labels at punctuation instead (e.g. "Children's" becomes "children").
tfidf = TfidfVectorizer(token_pattern=r"[^|]+", max_features=5000)  # Cap vocabulary at 5000 terms
tfidf_matrix = tfidf.fit_transform(sampled_data['Genres'])
# Apply dimensionality reduction.
# random_state pins the randomized SVD solver so that the reduced matrix, and
# therefore the recommendations, are reproducible across kernel restarts.
svd = TruncatedSVD(n_components=10, random_state=42)
reduced_tfidf_matrix = svd.fit_transform(tfidf_matrix)

In [26]:
# Compute pairwise cosine similarity between all rows of the reduced matrix.
# Passing a single array compares its rows against themselves, which is what
# the duplicated second argument did. dense_output=False is dropped because
# it only matters for sparse inputs and wrongly suggested a sparse result:
# the SVD output is a dense array, so cosine_sim is a dense square ndarray.
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(reduced_tfidf_matrix)

In [27]:
# Function to recommend movies
def recommend_content_based(movie_title, sampled_data, cosine_sim, top_n=5):
    """Return the titles most similar to ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Exact value to look up in ``sampled_data["Title"]``.
    sampled_data : pandas.DataFrame
        Frame with a "Title" column. Row *position* ``i`` of the frame must
        correspond to row/column ``i`` of ``cosine_sim``.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix, indexed by row position.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        "Title" values of the selected rows (their index labels are kept),
        ordered from most to least similar. The queried title itself is never
        returned and each title appears at most once.

    Raises
    ------
    IndexError
        If ``movie_title`` does not occur in ``sampled_data["Title"]``.
    """
    titles = sampled_data["Title"]

    # Locate the query by row position, not by index label, so the lookup into
    # cosine_sim stays correct even when the frame's index is not 0..n-1.
    positions = (titles == movie_title).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise IndexError(f"Movie title {movie_title!r} not found in sampled_data")
    query_pos = int(positions[0])

    row = cosine_sim[query_pos]
    if hasattr(row, "toarray"):  # scipy sparse row: densify before ranking
        row = row.toarray().ravel()

    # Rank every row position by similarity, best first. The sort is stable,
    # so rows with equal scores keep their original order.
    ranked = sorted(range(len(row)), key=lambda pos: row[pos], reverse=True)

    # Walk the ranking and skip the query title and titles already picked.
    # Simply dropping the first ranked entry is not enough: rows that tie with
    # the query may sort ahead of it, and the same title can occupy many rows.
    seen = {movie_title}
    picked = []
    for pos in ranked:
        if len(picked) >= top_n:
            break
        title = titles.iloc[pos]
        if title in seen:
            continue
        seen.add(title)
        picked.append(pos)

    return titles.iloc[picked]

In [33]:

# Show the recommendations for one example title
print(
    recommend_content_based(
        movie_title="Fatal Instinct (1993)",
        sampled_data=sampled_data,
        cosine_sim=cosine_sim,
    )
)

3     Dinner Game, The (Le Dîner de cons) (1998)
14                           Animal House (1978)
24                        Shower (Xizhao) (1999)
33                     Addams Family, The (1991)
52                               Snow Day (2000)
Name: Title, dtype: object
