In [3]:
import pandas as pd
import numpy as np
# Read the two TMDB 5000 CSV files (credits and movies) from the local
# dataset/ folder; both paths are relative to the working directory.
df1 = pd.read_csv('dataset/tmdb_5000_credits.csv')
df2 = pd.read_csv('dataset/tmdb_5000_movies.csv')

In [4]:
# Relabel the credits columns positionally so the join key is called 'id'.
# NOTE(review): the second label is spelled 'tittle' (sic) -- presumably so it
# does not clash with the 'title' column that df2 is indexed by later; confirm
# against the CSV headers before "fixing" the spelling.
df1.columns = ['id', 'tittle', 'cast', 'crew']

# Attach the credits columns to the movies table (merge defaults to an inner join).
df2 = pd.merge(df2, df1, on='id')

In [5]:
# Peek at the first five plot overviews (head() defaults to five rows).
df2['overview'].head()

0    In the 22nd century, a paraplegic Marine is di...
1    Captain Barbossa, long believed to be dead, ha...
2    A cryptic message from Bond’s past sends him o...
3    Following the death of District Attorney Harve...
4    John Carter is a war-weary, former military ca...
Name: overview, dtype: object

# Data processing

## Process plot descriptions

In [6]:
# Build a TF-IDF representation of every movie overview.
from sklearn.feature_extraction.text import TfidfVectorizer

# Missing overviews become empty strings before vectorising.
df2['overview'] = df2['overview'].fillna('')

# English stop words ('the', 'a', ...) are dropped from the vocabulary.
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and produce the document-term matrix in one step.
tfidf_matrix = tfidf.fit_transform(df2['overview'])

# Display the matrix shape as the cell output.
tfidf_matrix.shape

(4803, 20978)

In [7]:
# linear_kernel returns plain dot products between rows. TfidfVectorizer
# L2-normalises each row by default, so these dot products are the cosine
# similarities between overviews.
from sklearn.metrics.pairwise import linear_kernel

cosine_sim = linear_kernel(X=tfidf_matrix, Y=tfidf_matrix)

In [8]:
# Reverse lookup: movie title -> df2 row label.
# Series.drop_duplicates() compares the Series *values*; here the values are
# df2's row labels, so it cannot remove repeated titles, which live in the
# index. De-duplicate on the index instead, keeping the first occurrence, so
# that indices[title] is a single scalar for every title.
indices = pd.Series(df2.index, index=df2['title'])
indices = indices[~indices.index.duplicated(keep='first')]

## Process credits, genres and keywords

In [9]:
# These columns are parsed from their string form into Python objects so the
# extractor functions can iterate over them.
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'genres']


def _parse_literal(value):
    """Parse a stringified literal; return any non-string value unchanged."""
    # Only strings are parsed: literal_eval raises on values that are already
    # parsed (re-running this cell) and on missing values such as NaN.
    return literal_eval(value) if isinstance(value, str) else value


for feature in features:
    df2[feature] = df2[feature].apply(_parse_literal)

In [10]:
def get_director(x):
    """Return the name of the first crew entry whose job is 'Director'.

    Args:
        x: A list (or tuple) of dicts with 'job' and 'name' keys. Any other
            value (e.g. NaN for a missing crew) is treated as missing data.

    Returns:
        The 'name' of the first entry with job == 'Director', or np.nan when
        there is no such entry or the input is missing/malformed.
    """
    # Same guard as get_list: missing/malformed input yields the missing-value
    # marker instead of a TypeError from iterating a non-iterable.
    if not isinstance(x, (list, tuple)):
        return np.nan
    for member in x:
        # .get() tolerates entries that lack a 'job' or 'name' key.
        if isinstance(member, dict) and member.get('job') == 'Director':
            return member.get('name', np.nan)
    return np.nan

In [11]:
def get_list(x, limit=3):
    """Return the 'name' of the first `limit` entries of a list of dicts.

    Args:
        x: A list of dicts, each carrying a 'name' key. Anything that is not
            a list is treated as missing/malformed data.
        limit: Maximum number of names to return. Defaults to 3, the cut-off
            that used to be hard-coded; pass None to return every name.

    Returns:
        A list of names in their original order, or [] when x is not a list.
    """
    if not isinstance(x, list):
        return []
    # Slice before extracting so entries past the cut-off are never touched.
    return [entry['name'] for entry in x[:limit]]

In [12]:
# Derive the director from the crew entries.
df2['director'] = df2['crew'].map(get_director)

# Reduce each remaining metadata column to a short list of names.
features = ['cast', 'keywords', 'genres']
for feature in features:
    df2[feature] = df2[feature].map(get_list)

In [13]:
# Show the extracted metadata for the first three movies.
df2.head(3)[['title', 'cast', 'director', 'keywords', 'genres']]

Unnamed: 0,title,cast,director,keywords,genres
0,Avatar,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]",James Cameron,"[culture clash, future, space war]","[Action, Adventure, Fantasy]"
1,Pirates of the Caribbean: At World's End,"[Johnny Depp, Orlando Bloom, Keira Knightley]",Gore Verbinski,"[ocean, drug abuse, exotic island]","[Adventure, Fantasy, Action]"
2,Spectre,"[Daniel Craig, Christoph Waltz, Léa Seydoux]",Sam Mendes,"[spy, based on novel, secret agent]","[Action, Adventure, Crime]"


In [14]:
def clean_data(x):
    """Lower-case text and remove its spaces, e.g. 'Sam Worthington' -> 'samworthington'.

    A list is cleaned element by element, a single string is cleaned directly,
    and any other value (such as a missing director) yields ''.
    """
    def _squash(text):
        return text.replace(" ", "").lower()

    if isinstance(x, str):
        return _squash(x)
    if isinstance(x, list):
        return list(map(_squash, x))
    return ''

In [15]:
# Normalise every metadata column with clean_data and print a five-row sample of each.
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    df2[feature] = df2[feature].map(clean_data)
    print(df2[feature].head())

0    [samworthington, zoesaldana, sigourneyweaver]
1       [johnnydepp, orlandobloom, keiraknightley]
2        [danielcraig, christophwaltz, léaseydoux]
3        [christianbale, michaelcaine, garyoldman]
4      [taylorkitsch, lynncollins, samanthamorton]
Name: cast, dtype: object
0       [cultureclash, future, spacewar]
1       [ocean, drugabuse, exoticisland]
2       [spy, basedonnovel, secretagent]
3    [dccomics, crimefighter, terrorist]
4        [basedonnovel, mars, medallion]
Name: keywords, dtype: object
0        jamescameron
1       goreverbinski
2           sammendes
3    christophernolan
4       andrewstanton
Name: director, dtype: object
0           [action, adventure, fantasy]
1           [adventure, fantasy, action]
2             [action, adventure, crime]
3                 [action, crime, drama]
4    [action, adventure, sciencefiction]
Name: genres, dtype: object


In [16]:
def create_soup(x):
    """Concatenate a row's keywords, cast, director and genres into one space-separated string.

    x is indexed by 'keywords', 'cast' and 'genres' (iterables of strings) and
    by 'director' (a single string); the pieces are joined in that order.
    """
    parts = [
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    ]
    return ' '.join(parts)
# Build the combined metadata string for every movie, one row at a time.
df2['soup'] = df2.apply(create_soup, axis='columns')

In [None]:
# Print the first five rows of the cleaned metadata together with the soup string.
print(df2.head()[['id', 'title', 'cast', 'director', 'keywords', 'genres', 'soup']])

       id                                     title  \
0   19995                                    Avatar   
1     285  Pirates of the Caribbean: At World's End   
2  206647                                   Spectre   
3   49026                     The Dark Knight Rises   
4   49529                               John Carter   

                                            cast          director  \
0  [samworthington, zoesaldana, sigourneyweaver]      jamescameron   
1     [johnnydepp, orlandobloom, keiraknightley]     goreverbinski   
2      [danielcraig, christophwaltz, léaseydoux]         sammendes   
3      [christianbale, michaelcaine, garyoldman]  christophernolan   
4    [taylorkitsch, lynncollins, samanthamorton]     andrewstanton   

                              keywords                               genres  \
0     [cultureclash, future, spacewar]         [action, adventure, fantasy]   
1     [ocean, drugabuse, exoticisland]         [adventure, fantasy, action]   
2     [spy,

In [None]:
import pandas as pd
import numpy as np

# Load the ratings dataset
ratings = pd.read_csv('dataset/ratings.csv')

# Inspect the basic structure of the data
print(ratings.head())
# info() prints its report itself and returns None, so wrapping it in print()
# would emit an extra "None" line after the report.
ratings.info()
