In [1]:
import os, string, warnings
import numpy as np
import pandas as pd
import math

import nltk, re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
from collections import Counter

from scipy.spatial.distance import cosine
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.filterwarnings('ignore')

## Loading data

In [2]:
keywords = pd.read_csv('data/keywords.csv', sep = '|')
movies = pd.read_csv('data/movie.csv', sep = '|')
plot_summary = pd.read_csv('data/plotsummary.csv', sep = '|')

## Processing the data for matrix 1

In [3]:
keywords.columns = ['keyword_id', 'keyword', 'movie_id']

This function applies the following NLP proccesses to a list of words:
* Remove the stop words
* Remove the punctuation signs
* Apply lemmatizer
* Remove short words (least than 2 letters)
* Remove words than are actually numbers

In [4]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
def clean_lemma(words):
    no_dash = [w.replace('-', ' ') for w in words]
    stop_free = " ".join([i for i in no_dash if i.lower() not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch.lower() not in exclude)    
    # Using lemmatizer
    normalized = [lemma.lemmatize(word) for word in punc_free.split()]
    
    # Check again for stop words
    normalized = [w for w in normalized if w not in stop]
    # Adding the len filter
    normalized = [w for w in normalized if len(w) > 2]
    # Remove digits
    normalized = [w for w in normalized if w[0].isdigit() == False]
    
    return normalized

For each movie id, clean the bag of words

In [5]:
keywords_clean = pd.DataFrame(columns = ['keyword', 'movie_id'])
for movie in set(keywords['movie_id']):
    keywords_subset = keywords[keywords['movie_id'] == movie]
    kw = list(keywords_subset['keyword'])
    clean_kw = list(set(clean_lemma(kw)))
    movie_list = [movie] * len(clean_kw)
    
    sub_df = pd.DataFrame({
        'keyword': clean_kw,
        'movie_id': movie_list
    })
    
    keywords_clean = pd.concat([keywords_clean, sub_df], ignore_index = True)
    
    

In [6]:
keywords_clean.head(10)

Unnamed: 0,keyword,movie_id
0,share,tt0364725
1,wash,tt0364725
2,girl,tt0364725
3,punctuation,tt0364725
4,gmc,tt0364725
5,credit,tt0364725
6,cheerleader,tt0364725
7,man,tt0364725
8,loose,tt0364725
9,pepsi,tt0364725


Each keyword is counted, and the ones that only appear 1 or 2 times are removed

In [7]:
keyword_count = keywords_clean.groupby(['keyword']).count()
keyword_count = keyword_count.reset_index()
keyword_count = keyword_count[keyword_count['movie_id'] > 2]

In [8]:
use_kw = list(keyword_count['keyword'])

In [9]:
keywords_clean = keywords_clean[keywords_clean['keyword'].isin(use_kw)]

For each movie id, gather all the tokens into one line

In [10]:
keywords_join = pd.DataFrame(columns = ['movie_id', 'keyword'])
for movie in set(keywords_clean['movie_id']):
    keywords_subset = keywords_clean[keywords_clean['movie_id'] == movie]
    kw = list(keywords_subset['keyword'])
    
    kw = ' '.join(kw)
    
    sub_df = pd.DataFrame({
        'keyword': kw,
        'movie_id': [movie]
    })
    
    keywords_join = pd.concat([keywords_join, sub_df], ignore_index = True)
    
keywords_join = keywords_join.set_index('movie_id')

In [11]:
keywords_join.head(5)

Unnamed: 0_level_0,keyword
movie_id,Unnamed: 1_level_1
tt0364725,share wash girl punctuation gmc credit cheerle...
tt0300471,combat thames brother chaplin murder swordsman...
tt0409847,building combat brother run gunfight drunkenne...
tt0190865,morse cigarette brother helicopter sister drun...
tt0308506,movie orphanage title adult orphan rhyme shoe ...


## Constructing cosine similarity matrix 1

For the cosine similarity, I will use TfidfVectorizer from sklearn library. 

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer(min_df=1)
tfidf = vect.fit_transform(keywords_join['keyword'])

In [13]:
cosine_matrix = (tfidf * tfidf.T).A

Converting the cosine matrix from the SKLearn function to a dataframe

In [14]:
cosine_df = pd.DataFrame(cosine_matrix, index = keywords_join.index, columns = keywords_join.index)

In [15]:
first_matrix = cosine_df

In [16]:
first_matrix.head(5)

movie_id,tt0364725,tt0300471,tt0409847,tt0190865,tt0308506,tt0974661,tt0280590,tt0166813,tt0302640,tt0104036,...,tt0286788,tt0078721,tt0335266,tt0376105,tt0427152,tt0078346,tt0078446,tt0418689,tt0072890,tt0120591
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0364725,1.0,0.05398,0.024851,0.004317,0.034169,0.065966,0.08036,0.015693,0.025709,0.024532,...,0.008135,0.030568,0.041708,0.025706,0.027652,0.032203,0.015851,0.019195,0.040835,0.022228
tt0300471,0.05398,1.0,0.088796,0.026247,0.01695,0.028624,0.044284,0.057351,0.053518,0.038483,...,0.046612,0.013026,0.051872,0.010262,0.01251,0.05857,0.013955,0.064069,0.035738,0.036988
tt0409847,0.024851,0.088796,1.0,0.067276,0.004384,0.039035,0.042468,0.157044,0.052435,0.064396,...,0.03252,0.020316,0.034761,0.014019,0.024694,0.101083,0.014075,0.132407,0.061243,0.057272
tt0190865,0.004317,0.026247,0.067276,1.0,0.003336,0.036272,0.038259,0.039081,0.043385,0.024035,...,0.027489,0.024881,0.013561,0.005642,0.001454,0.045477,0.032394,0.061793,0.022994,0.034849
tt0308506,0.034169,0.01695,0.004384,0.003336,1.0,0.10333,0.002885,0.00238,0.048987,0.00217,...,0.002676,0.003082,0.001802,0.0,0.045775,0.032867,0.01525,0.0,0.002216,0.001969


## Processing the data for matrix 2

In [17]:
plot_summary.columns = ['plot_id', 'movie_id', 'contents']
movies.columns = [i.replace(' ', '') for i in list(movies.columns)]

summary_df = plot_summary[['movie_id', 'contents']]
synopsis_df = movies[['movie_id', 'synopsis']]
synopsis_df = synopsis_df.fillna(value = '')


Cleaning the contents column values from the plot_summary table and the synopsis values from the movies table

In [18]:
%%time
for i in range(len(summary_df)):
    words = summary_df['contents'][i].split()
    clean_summary = clean_lemma(words)
    clean_summary = ' '.join(clean_summary)
    
    summary_df['contents'][i] = clean_summary  
    

Wall time: 5min 57s


In [19]:
%%time
for i in range(len(synopsis_df)):
    words = synopsis_df['synopsis'][i].split()
    clean_synopsis = clean_lemma(words)
    clean_synopsis = ' '.join(clean_synopsis)
    
    synopsis_df['synopsis'][i] = clean_synopsis  

Wall time: 10.4 s


Combining the contents and sypnosis into one table

In [20]:
list_movies = set(movies['movie_id'])

In [21]:
plot_join = pd.DataFrame(columns = ['movie_id', 'plot'])
for movie in list_movies:
    plot_subset_1 = summary_df[summary_df['movie_id'] == movie]
    plot_subset_2 = synopsis_df[synopsis_df['movie_id'] == movie]
    plot = list(plot_subset_1['contents']) + list(plot_subset_2['synopsis'])
    plot = ' '.join(plot)
    
    sub_df = pd.DataFrame({
        'movie_id': [movie],
        'plot': plot
    })
    
    plot_join = pd.concat([plot_join, sub_df], ignore_index = True)
    

plot_join = plot_join.set_index('movie_id')

In [22]:
plot_join.head(10)

Unnamed: 0_level_0,plot
movie_id,Unnamed: 1_level_1
tt0364725,White Goodman owner founder Globo Gym would lo...
tt0300471,Chinese rebel murder Chons estranged father es...
tt0409847,Old West lone cowboy lead uprising terror beyo...
tt0190865,high adrenaline tale young climber Peter Garre...
tt0308506,Calvin friend live orphanage find old shoe fad...
tt0974661,Mike ODonnell top world star high school baske...
tt0280590,Longfellow Deeds small town pizzeria owner poe...
tt0166813,mustang stallion Spirit grows proudly succeed ...
tt0302640,Jessica Spencer hottest popular girl high scho...
tt0104036,unlikely kind friendship develops Fergus Irish...


### Calculating cosine matrix 2

As in matrix 1 section, calculate the cosine similarity matrix

In [23]:
vect = TfidfVectorizer(min_df=1)
tfidf = vect.fit_transform(plot_join['plot'])
cosine_matrix = (tfidf * tfidf.T).A

In [24]:
cosine_df = pd.DataFrame(cosine_matrix, index = plot_join.index, columns = plot_join.index)

In [25]:
second_matrix = cosine_df

In [26]:
second_matrix.head(5)

movie_id,tt0364725,tt0300471,tt0409847,tt0190865,tt0308506,tt0974661,tt0280590,tt0166813,tt0302640,tt0104036,...,tt0286788,tt0078721,tt0335266,tt0376105,tt0427152,tt0078346,tt0078446,tt0418689,tt0072890,tt0120591
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0364725,1.0,0.007467,0.009565,0.062698,0.002251,0.026799,0.006987,0.009162,0.008956,0.005152,...,0.012842,0.018209,0.007886,0.009273,0.00698,0.015876,0.009242,0.01111,0.015944,0.015279
tt0300471,0.007467,1.0,0.01672,0.010085,0.005715,0.017214,0.007357,0.015392,0.009068,0.012825,...,0.022259,0.012792,0.011969,0.00702,0.009367,0.016766,0.011519,0.014739,0.008586,0.011285
tt0409847,0.009565,0.01672,1.0,0.014905,0.00576,0.025629,0.013035,0.034388,0.013067,0.008682,...,0.008631,0.014842,0.018443,0.012828,0.017951,0.046457,0.012289,0.019215,0.009347,0.022014
tt0190865,0.062698,0.010085,0.014905,1.0,0.005917,0.02689,0.012147,0.029016,0.022323,0.010087,...,0.004651,0.018065,0.02197,0.004854,0.017448,0.024327,0.013485,0.012706,0.009363,0.02917
tt0308506,0.002251,0.005715,0.00576,0.005917,1.0,0.049932,0.004227,0.013374,0.012879,0.004255,...,0.0061,0.005547,0.007041,0.004481,0.006187,0.014205,0.006968,0.00154,0.002809,0.005717


## Q1.

### List the ten closest movies for the top 10 movies (by Ranking in Movie) by using the first matrix

Getting the top 10 movies by ranking and preparing dataframe for output

In [27]:
top_movies = movies.head(10)

In [28]:
cm_df = pd.DataFrame(columns = ['movie_name', 'movie_id', '10 closest movies'])
mn_list = mi_list = cm_list = ['']*10
cm_df['movie_name'] = mn_list
cm_df['movie_id'] = mi_list
cm_df['10 closest movies'] = cm_list

Getting the closest movies by using cosine similarity in the first matrix

In [29]:
movies_list = movies[['movie_id', 'name']]
for i in range(len(top_movies)):
    
    movie_id = top_movies['movie_id'][i]
    cm_df['movie_id'][i] = movie_id  
    cm_df['movie_name'][i] = top_movies['name'][i]    

    closest_movies = list(pd.DataFrame(first_matrix[movie_id].sort_values(ascending = False)).head(11).index[1:10])
    closest_movies = tuple([movies_list[movies_list['movie_id'] == m]['name'].values[0] for m in closest_movies])
    
    cm_df['10 closest movies'][i] = closest_movies   
    

Printing the output

In [30]:
cm_df_1 = cm_df
cm_df_1

Unnamed: 0,movie_name,movie_id,10 closest movies
0,Avatar,tt0499549,"(The Last Samurai, Dances with Wolves, Aliens,..."
1,Titanic,tt0120338,"(Poseidon, The Notebook, American Beauty, Shak..."
2,The Avengers,tt0848228,"(Iron Man 2, Thor, Captain America: The First ..."
3,The Dark Knight,tt0468569,"(Batman Forever, Batman Begins, The Dark Knigh..."
4,Star Wars: Episode I - The Phantom Menace,tt0120915,"(Star Wars: Episode II - Attack of the Clones,..."
5,Star Wars,tt0076759,(Star Wars: Episode V - The Empire Strikes Bac...
6,The Dark Knight Rises,tt1345836,"(The Dark Knight, G.I. Joe: Retaliation, Batma..."
7,Shrek 2,tt0298148,"(Shrek Forever After, Shrek the Third, Shrek, ..."
8,E.T. the Extra-Terrestrial,tt0083866,"(Aliens, Halloween, Home Alone, Paul, Edward S..."
9,Pirates of the Caribbean: Dead Man's Chest,tt0383574,"(Pirates of the Caribbean: At World's End, Pir..."


In [31]:
for t in cm_df_1['10 closest movies']:
    print(t)

('The Last Samurai', 'Dances with Wolves', 'Aliens', 'Man of Steel', 'The Last Airbender', 'The Avengers', 'Underworld: Awakening', 'Oblivion', 'Prometheus')
('Poseidon', 'The Notebook', 'American Beauty', 'Shakespeare in Love', 'In Time', 'The Poseidon Adventure', 'Troy', 'Pretty in Pink', 'West Side Story')
('Iron Man 2', 'Thor', 'Captain America: The First Avenger', 'Iron Man', 'Man of Steel', 'G.I. Joe: Retaliation', 'Transformers: Dark of the Moon', 'Iron Man 3', 'John Carter')
('Batman Forever', 'Batman Begins', 'The Dark Knight Rises', 'Batman', 'Takers', 'The Green Hornet', 'G.I. Joe: Retaliation', 'RoboCop', 'A Good Day to Die Hard')
('Star Wars: Episode II - Attack of the Clones', 'Star Wars: Episode III - Revenge of the Sith', 'Star Wars: Episode VI - Return of the Jedi', 'Star Wars: Episode V - The Empire Strikes Back', 'Star Wars', 'Star Wars: The Clone Wars', 'Star Trek: The Motion Picture', 'Man of Steel', 'Star Trek: Insurrection')
('Star Wars: Episode V - The Empire St

## Q2.

### List the ten closest movies for the top 10 movies (by Ranking in Movie) by using the second matrix

The process is similar to Q1, except here we have to use second cosine similarity matrix

In [32]:
cm_df = pd.DataFrame(columns = ['movie_name', 'movie_id', '10 closest movies'])
mn_list = mi_list = cm_list = ['']*10
cm_df['movie_name'] = mn_list
cm_df['movie_id'] = mi_list
cm_df['10 closest movies'] = cm_list

In [33]:
movies_list = movies[['movie_id', 'name']]
for i in range(len(top_movies)):
    
    movie_id = top_movies['movie_id'][i]
    cm_df['movie_id'][i] = movie_id  
    cm_df['movie_name'][i] = top_movies['name'][i]    

    closest_movies = list(pd.DataFrame(second_matrix[movie_id].sort_values(ascending = False)).head(11).index[1:10])
    closest_movies = tuple([movies_list[movies_list['movie_id'] == m]['name'].values[0] for m in closest_movies])
    
    cm_df['10 closest movies'][i] = closest_movies   
    

Printing the output

In [34]:
cm_df_2 = cm_df
cm_df_2

Unnamed: 0,movie_name,movie_id,10 closest movies
0,Avatar,tt0499549,"(The Guardian, It's Complicated, Sweet Home Al..."
1,Titanic,tt0120338,"(Silent Hill, The American, The Mirror Has Two..."
2,The Avengers,tt0848228,"(Thor, Dogma, Iron Man 2, The Incredible Hulk,..."
3,The Dark Knight,tt0468569,"(Batman, Batman Forever, Batman Begins, Metal ..."
4,Star Wars: Episode I - The Phantom Menace,tt0120915,"(Star Wars: Episode II - Attack of the Clones,..."
5,Star Wars,tt0076759,(Star Wars: Episode V - The Empire Strikes Bac...
6,The Dark Knight Rises,tt1345836,"(Batman Begins, The Dark Knight, Bruce Almight..."
7,Shrek 2,tt0298148,"(Shrek Forever After, Shrek, Shrek the Third, ..."
8,E.T. the Extra-Terrestrial,tt0083866,"(Bedazzled, The Happening, Hannah and Her Sist..."
9,Pirates of the Caribbean: Dead Man's Chest,tt0383574,"(Pirates of the Caribbean: At World's End, Pir..."


In [35]:
for t in cm_df_2['10 closest movies']:
    print(t)

('The Guardian', "It's Complicated", 'Sweet Home Alabama', 'Wall Street: Money Never Sleeps', 'Cowboys & Aliens', 'Must Love Dogs', 'The Blues Brothers', 'Keeping the Faith', 'Not Another Teen Movie')
('Silent Hill', 'The American', 'The Mirror Has Two Faces', 'The Shining', 'Ladder 49', 'State of Play', 'Crazy, Stupid, Love.', 'Speed', 'Firewall')
('Thor', 'Dogma', 'Iron Man 2', 'The Incredible Hulk', 'Iron Man', 'Captain America: The First Avenger', 'Hulk', 'Iron Man 3', 'MASH')
('Batman', 'Batman Forever', 'Batman Begins', 'Metal jaket', 'Batman Returns', 'The Dark Knight Rises', 'Batman & Robin', 'Beverly Hills Cop II', 'Bruce Almighty')
('Star Wars: Episode II - Attack of the Clones', 'Star Wars: Episode III - Revenge of the Sith', 'Star Wars: The Clone Wars', 'Star Wars', 'Star Wars: Episode VI - Return of the Jedi', 'Star Wars: Episode V - The Empire Strikes Back', 'Star Trek: Insurrection', 'The Queen', 'Mirror Mirror')
('Star Wars: Episode V - The Empire Strikes Back', 'Star W

## Q3.

### Which recommendation results look more reasonable and why? Provide academic references if possible.

At first glance, the recommendations look similar; however, when I calculated the percentages of same recommended movies in both approaches, the results showed otherwise:

In [46]:
for i in range(10):
    rm1 = cm_df_1['10 closest movies'][i]
    rm2 = cm_df_2['10 closest movies'][i]
    
    p = 0
    for r in rm1:
        if r in rm2: 
            p += 1
    p = p/10
    
    print(cm_df_1['movie_name'][i] + ': ' + str(p))
        

Avatar: 0.0
Titanic: 0.0
The Avengers: 0.5
The Dark Knight: 0.4
Star Wars: Episode I - The Phantom Menace: 0.7
Star Wars: 0.6
The Dark Knight Rises: 0.3
Shrek 2: 0.3
E.T. the Extra-Terrestrial: 0.0
Pirates of the Caribbean: Dead Man's Chest: 0.2


For instance, for the two movies, Avatar and Titanic, the recommendations were as follows:

In [54]:
pd.DataFrame([list(cm_df_1['10 closest movies'][0]), list(cm_df_2['10 closest movies'][0])], index= ['From matrix 1', 'From matrix 2']).T

Unnamed: 0,From matrix 1,From matrix 2
0,The Last Samurai,The Guardian
1,Dances with Wolves,It's Complicated
2,Aliens,Sweet Home Alabama
3,Man of Steel,Wall Street: Money Never Sleeps
4,The Last Airbender,Cowboys & Aliens
5,The Avengers,Must Love Dogs
6,Underworld: Awakening,The Blues Brothers
7,Oblivion,Keeping the Faith
8,Prometheus,Not Another Teen Movie


In [55]:
pd.DataFrame([list(cm_df_1['10 closest movies'][1]), list(cm_df_2['10 closest movies'][1])], index= ['From matrix 1', 'From matrix 2']).T

Unnamed: 0,From matrix 1,From matrix 2
0,Poseidon,Silent Hill
1,The Notebook,The American
2,American Beauty,The Mirror Has Two Faces
3,Shakespeare in Love,The Shining
4,In Time,Ladder 49
5,The Poseidon Adventure,State of Play
6,Troy,"Crazy, Stupid, Love."
7,Pretty in Pink,Speed
8,West Side Story,Firewall


Looking at the results one can tell that the first approach, using the keywords table, takes high-level characteristics of the movie (e.g. genre, places, people, situations) which can lead to a vague representation of the movie. For instance, for the first movie (Avatar), being a fantasy movie about aliens fighting in war, the first recommender systems output movies about aliens or fantasy wars. However, the movie is more than war and aliens, since there is a storyline about a complicated love situation involved, therefore, the second systems output movies with similar story line, even though there might not be aliens or wars. There are cases, however, that both systems yielded very similar results, like the case of Star War, as shown below.

In [56]:
pd.DataFrame([list(cm_df_1['10 closest movies'][4]), list(cm_df_2['10 closest movies'][4])], index= ['From matrix 1', 'From matrix 2']).T

Unnamed: 0,From matrix 1,From matrix 2
0,Star Wars: Episode II - Attack of the Clones,Star Wars: Episode II - Attack of the Clones
1,Star Wars: Episode III - Revenge of the Sith,Star Wars: Episode III - Revenge of the Sith
2,Star Wars: Episode VI - Return of the Jedi,Star Wars: The Clone Wars
3,Star Wars: Episode V - The Empire Strikes Back,Star Wars
4,Star Wars,Star Wars: Episode VI - Return of the Jedi
5,Star Wars: The Clone Wars,Star Wars: Episode V - The Empire Strikes Back
6,Star Trek: The Motion Picture,Star Trek: Insurrection
7,Man of Steel,The Queen
8,Star Trek: Insurrection,Mirror Mirror


In [57]:
pd.DataFrame([list(cm_df_1['10 closest movies'][5]), list(cm_df_2['10 closest movies'][5])], index= ['From matrix 1', 'From matrix 2']).T

Unnamed: 0,From matrix 1,From matrix 2
0,Star Wars: Episode V - The Empire Strikes Back,Star Wars: Episode V - The Empire Strikes Back
1,Star Wars: Episode VI - Return of the Jedi,Star Wars: Episode VI - Return of the Jedi
2,Star Wars: Episode I - The Phantom Menace,Star Wars: Episode III - Revenge of the Sith
3,Star Wars: Episode II - Attack of the Clones,Star Wars: Episode II - Attack of the Clones
4,Star Wars: Episode III - Revenge of the Sith,Step Up 3D
5,Star Wars: The Clone Wars,Star Wars: Episode I - The Phantom Menace
6,Man of Steel,The Dukes of Hazzard
7,Spaceballs,Star Wars: The Clone Wars
8,The Fifth Element,Mission to Mars


Furthermore, which approach is better depends on the person who is looking for the recommendation. In the example of Avatar, one might like the movie because of the aliens and wars, other one might like it because of the complicated love story and story telling. But, since the keywords extracted from the plot can be very similar depending on the NLP processes that one applies to it and given that manually picking the keywords is time consuming and can prompt to error, since the keywords are selected by different people and they can have different view about the abstraction of the movie, the second approach can extract both the high-level characteristic of the movie as well as certain level of abstraction as seen in the plot. For such reasons, the second approach is more reasonable.