# Upload the dataset

# Reading dataset

In [None]:
import ast

import pandas as pd

# Read the movie metadata CSV from the working directory and preview its first two rows.
movies = pd.read_csv(filepath_or_buffer='movies.csv')
movies.head(n=2)

In [None]:
# Print each column's name, non-null count and dtype to see which columns have gaps.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4809 entries, 0 to 4808
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   title                 4809 non-null   object 
 1   original_title        4809 non-null   object 
 2   tagline               3965 non-null   object 
 3   overview              4806 non-null   object 
 4   genres                4809 non-null   object 
 5   cast                  4809 non-null   object 
 6   director              4779 non-null   object 
 7   keywords              4809 non-null   object 
 8   original_language     4809 non-null   object 
 9   popularity            4809 non-null   float64
 10  production_companies  4809 non-null   object 
 11  production_countries  4809 non-null   object 
 12  release_date          4808 non-null   object 
 13  revenue               4809 non-null   int64  
 14  runtime               4807 non-null   float64
 15  spoken_languages     

# Dropping columns that are not required

In [None]:
# Text columns kept for the rest of the notebook.
required_columns = ['title', 'original_title', 'tagline', 'keywords', 'overview', 'genres', 'cast', 'director']

# Take an explicit copy: later cells assign to columns of `movies`, and doing that on a
# selection of the original frame can raise SettingWithCopyWarning.
movies = movies[required_columns].copy()

In [None]:
# Count the missing values in every remaining column.
movies.isnull().sum()

title               0
original_title      0
tagline           844
keywords            0
overview            3
genres              0
cast                0
director           30
dtype: int64

In [None]:
# Replace missing text with a single space so the string concatenation that builds
# `content` later on never yields NaN. Rebinding the name (instead of inplace=True)
# avoids mutating a frame that was produced by column selection.
movies = movies.fillna(' ')

# Confirm that no missing values remain.
movies.isna().sum()

title             0
original_title    0
tagline           0
keywords          0
overview          0
genres            0
cast              0
director          0
dtype: int64

In [None]:
# Inspect every field of the first movie to see how each column is stored.
movies.iloc[0, :]

title                                                        Avatar
original_title                                               Avatar
tagline                                 Enter the World of Pandora.
keywords          ['culture clash', 'future', 'space war', 'spac...
overview          In the 22nd century, a paraplegic Marine is di...
genres            ['Action', 'Adventure', 'Fantasy', 'Science Fi...
cast              ['Sam Worthington', 'Zoe Saldana', 'Sigourney ...
director                                              James Cameron
Name: 0, dtype: object

# Create movie CONTENT

In [None]:
def _strip_spaces_from_items(raw):
    """Return the entries of a list-valued cell with their inner spaces removed.

    ``raw`` is either a real list or the text form of one, e.g.
    "['Sam Worthington', 'Zoe Saldana']". Text is parsed into a list first, because
    iterating over a string walks it character by character rather than entry by entry.
    A blank string is treated as an empty list.
    """
    if isinstance(raw, str):
        raw = ast.literal_eval(raw) if raw.strip() else []
    return [str(item).replace(" ", "") for item in raw]


# Collapse multi-word entries ("Sam Worthington" -> "SamWorthington") so that each
# genre, keyword and cast member becomes a single token.
for list_column in ('genres', 'keywords', 'cast'):
    movies[list_column] = movies[list_column].apply(_strip_spaces_from_items)

# `director` holds one plain name per row, so the spaces are removed directly.
movies['director'] = movies['director'].str.replace(" ", "", regex=False)

## Converting lists to strings

In [None]:
# Turn each per-row collection into one comma-separated string.
for list_column in ('genres', 'keywords', 'cast'):
    movies[list_column] = movies[list_column].apply(
        lambda items: ','.join(str(item) for item in items)
    )

In [None]:
# Build one space-separated text blob per movie: title, overview, keywords, cast, director.
# NOTE(review): `genres` and `tagline` are kept and cleaned in earlier cells but are not
# part of this blob — confirm whether that is intentional.
movies['content'] = movies['title'].str.cat(
    movies[['overview', 'keywords', 'cast', 'director']],
    sep=' ',
)
movies['content']

0       Avatar In the 22nd century, a paraplegic Marin...
1       Pirates of the Caribbean: At World's End Capta...
2       Spectre A cryptic message from Bond’s past sen...
3       The Dark Knight Rises Following the death of D...
4       John Carter John Carter is a war-weary, former...
                              ...                        
4804    El Mariachi El Mariachi just wants to play his...
4805    Newlyweds A newlywed couple's honeymoon is upe...
4806    Signed, Sealed, Delivered "Signed, Sealed, Del...
4807    Shanghai Calling When ambitious New York attor...
4808    My Date with Drew Ever since the second grade ...
Name: content, Length: 4809, dtype: object

# Natural Language Processing

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn every movie's `content` text into a TF-IDF vector; max_features=1000 caps the
# vocabulary size.
content_corpus = movies['content'].values
vectorizer = TfidfVectorizer(max_features=1000)
movie_vectors = vectorizer.fit_transform(content_corpus)

# Cosine Similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of movie vectors; rows and columns
# follow the row order of `movie_vectors`.
similarity = cosine_similarity(movie_vectors)

In [None]:
# Wrap the similarity matrix in a DataFrame purely for a readable preview.
similarity_df = pd.DataFrame(data=similarity)
similarity_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,4769,4770,4771,4772,4773,4774,4775,4776,4777,4778,4779,4780,4781,4782,4783,4784,4785,4786,4787,4788,4789,4790,4791,4792,4793,4794,4795,4796,4797,4798,4799,4800,4801,4802,4803,4804,4805,4806,4807,4808
0,1.000000,0.120720,0.054727,0.181850,0.157633,0.169431,0.085322,0.119674,0.038074,0.041976,0.059856,0.124835,0.030252,0.097009,0.091383,0.138101,0.085341,0.079639,0.091711,0.133738,0.090037,0.062529,0.089699,0.068654,0.065944,0.111964,0.214429,0.270058,0.011245,0.171417,0.075153,0.071986,0.132333,0.060288,0.119584,0.109617,0.176813,0.085004,0.103113,0.068666,...,0.107381,0.061106,0.045187,0.046509,0.056639,0.045939,0.065185,0.058425,0.020405,0.121979,0.152649,0.085769,0.024867,0.091521,0.136437,0.059574,0.066321,0.033706,0.035877,0.099176,0.132461,0.135410,0.030727,0.113159,0.054479,0.029395,0.029669,0.059890,0.061162,0.137434,0.058914,0.054707,0.115616,0.085792,0.100569,0.086665,0.050144,0.106024,0.062859,0.047848
1,0.120720,1.000000,0.099872,0.124117,0.216898,0.046506,0.091782,0.235887,0.078293,0.137685,0.127333,0.106837,0.237146,0.149621,0.219803,0.201105,0.174839,0.137342,0.268327,0.232996,0.192888,0.090521,0.190494,0.088833,0.071821,0.124582,0.231614,0.135855,0.104890,0.190082,0.066411,0.113662,0.171761,0.090056,0.110659,0.212955,0.173911,0.141232,0.156721,0.143715,...,0.119625,0.098157,0.047318,0.070987,0.093188,0.046853,0.141048,0.104784,0.068325,0.154538,0.085329,0.046291,0.086622,0.091154,0.217753,0.161084,0.082485,0.062445,0.103759,0.124093,0.210150,0.234326,0.094350,0.167323,0.135678,0.059018,0.102940,0.075767,0.131434,0.236768,0.053258,0.116830,0.112313,0.166890,0.213747,0.102234,0.085246,0.151463,0.137537,0.065207
2,0.054727,0.099872,1.000000,0.073028,0.045837,0.046143,0.034967,0.101910,0.048610,0.047152,0.049427,0.389960,0.039629,0.089307,0.078866,0.097377,0.093032,0.167796,0.088988,0.106811,0.069212,0.051025,0.120624,0.084180,0.027499,0.067228,0.049404,0.105447,0.013511,0.235121,0.054870,0.017735,0.055744,0.041872,0.025214,0.153334,0.122325,0.094632,0.075334,0.060203,...,0.078743,0.060413,0.043266,0.040737,0.148464,0.082823,0.093168,0.044497,0.023230,0.058373,0.032152,0.027017,0.027781,0.052648,0.111098,0.086375,0.045601,0.033183,0.035927,0.045017,0.113709,0.066407,0.047323,0.053414,0.048472,0.082711,0.036185,0.051839,0.042255,0.066012,0.062993,0.013283,0.033992,0.043828,0.104362,0.079478,0.026120,0.088255,0.041129,0.043192
3,0.181850,0.124117,0.073028,1.000000,0.126776,0.082809,0.146062,0.160151,0.103276,0.093932,0.094969,0.121914,0.053928,0.139591,0.114880,0.196758,0.162537,0.107855,0.119002,0.270752,0.123451,0.083295,0.157697,0.102335,0.108779,0.153828,0.128315,0.125018,0.065484,0.198598,0.129796,0.179356,0.129369,0.079085,0.055013,0.130066,0.208011,0.121181,0.154094,0.129518,...,0.068309,0.096618,0.118633,0.081257,0.174665,0.060290,0.108142,0.125764,0.041201,0.167479,0.067859,0.066746,0.036474,0.106684,0.148441,0.085852,0.076874,0.035836,0.014500,0.051038,0.167149,0.121133,0.081147,0.182192,0.076485,0.046036,0.028939,0.073540,0.056472,0.135917,0.026014,0.058746,0.084078,0.108530,0.109721,0.128345,0.082814,0.141403,0.088217,0.128880
4,0.157633,0.216898,0.045837,0.126776,1.000000,0.080019,0.157128,0.254060,0.063199,0.131756,0.160363,0.123707,0.153126,0.091188,0.151604,0.174659,0.141338,0.174551,0.134740,0.241111,0.175647,0.218845,0.105409,0.101539,0.117110,0.159782,0.290831,0.300271,0.181005,0.149675,0.107633,0.143126,0.178919,0.122182,0.068005,0.161358,0.220728,0.124111,0.077594,0.163682,...,0.118678,0.086691,0.109924,0.064024,0.060497,0.030801,0.101190,0.118962,0.043376,0.184496,0.114104,0.090255,0.049184,0.099242,0.210898,0.101411,0.122975,0.049317,0.019676,0.150661,0.203025,0.214430,0.082783,0.166682,0.073896,0.052092,0.028844,0.069655,0.094886,0.141131,0.082457,0.087538,0.105537,0.097895,0.201148,0.151762,0.058291,0.127700,0.119581,0.084958
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4804,0.086665,0.102234,0.079478,0.128345,0.151762,0.060347,0.093782,0.199868,0.098748,0.071068,0.125729,0.117847,0.073381,0.099324,0.147280,0.137117,0.079683,0.162798,0.152471,0.172448,0.209651,0.201673,0.149577,0.054854,0.079942,0.132442,0.042746,0.135414,0.010730,0.129543,0.120574,0.047877,0.118047,0.054432,0.039912,0.143454,0.093190,0.096889,0.085606,0.093239,...,0.115344,0.105412,0.168340,0.062309,0.061095,0.056504,0.149727,0.157080,0.027860,0.159930,0.149398,0.041236,0.077366,0.115341,0.188501,0.124409,0.138674,0.091892,0.108476,0.176359,0.178224,0.202647,0.039542,0.100644,0.113648,0.053723,0.046110,0.062354,0.149136,0.176348,0.053671,0.084961,0.114460,0.066319,0.230197,1.000000,0.020742,0.072689,0.149257,0.105118
4805,0.050144,0.085246,0.026120,0.082814,0.058291,0.053500,0.060803,0.066664,0.016161,0.031257,0.038759,0.086928,0.040282,0.097944,0.063808,0.131943,0.058856,0.041326,0.030915,0.166069,0.083054,0.041312,0.142982,0.050411,0.024796,0.034777,0.045404,0.066482,0.058251,0.099761,0.072700,0.104966,0.048759,0.055353,0.076728,0.087003,0.070433,0.082536,0.051043,0.065153,...,0.031194,0.096037,0.036132,0.067812,0.055219,0.015113,0.085574,0.053493,0.056441,0.069048,0.104029,0.039691,0.014359,0.099101,0.131833,0.089393,0.030104,0.019964,0.000000,0.032036,0.093439,0.052330,0.120349,0.188217,0.085393,0.137272,0.000000,0.041971,0.064531,0.148194,0.008118,0.019514,0.051268,0.088724,0.054618,0.020742,1.000000,0.097607,0.065763,0.034715
4806,0.106024,0.151463,0.088255,0.141403,0.127700,0.128241,0.147168,0.124013,0.081597,0.090569,0.109426,0.093811,0.117010,0.158537,0.144487,0.153589,0.164062,0.146395,0.125773,0.229458,0.071477,0.075452,0.142271,0.093620,0.072056,0.110802,0.152960,0.138778,0.077711,0.159152,0.088546,0.109873,0.124788,0.058661,0.064564,0.116037,0.167807,0.200574,0.154950,0.162009,...,0.089591,0.114511,0.057017,0.076656,0.139606,0.045011,0.099140,0.073066,0.070028,0.196994,0.144111,0.100217,0.082081,0.136138,0.202741,0.108656,0.144135,0.033430,0.016016,0.058033,0.245380,0.117191,0.130024,0.134692,0.122148,0.034934,0.016656,0.069960,0.036178,0.174014,0.082691,0.070278,0.118665,0.130066,0.092227,0.072689,0.097607,1.000000,0.129344,0.064583
4807,0.062859,0.137537,0.041129,0.088217,0.119581,0.070877,0.122555,0.107464,0.097023,0.069589,0.179353,0.112635,0.071046,0.084944,0.174254,0.152160,0.073052,0.100628,0.107242,0.140603,0.138777,0.104780,0.071998,0.046831,0.101860,0.109561,0.046412,0.103640,0.026617,0.124597,0.085435,0.091308,0.108414,0.060274,0.047569,0.276750,0.118736,0.125432,0.107720,0.404399,...,0.126313,0.066926,0.073773,0.068784,0.072302,0.034091,0.075181,0.157091,0.040643,0.126965,0.079291,0.027053,0.079912,0.103852,0.159610,0.092587,0.092392,0.128132,0.034037,0.113449,0.162608,0.173403,0.064832,0.113197,0.087630,0.031366,0.042112,0.039607,0.146157,0.100473,0.026976,0.097445,0.091604,0.112067,0.144768,0.149257,0.065763,0.129344,1.000000,0.149881


# Recommending Movies

In [None]:
def recommend(movie, top_n=5):
    """Print the titles of the movies most similar to ``movie``.

    Parameters
    ----------
    movie : str
        Exact title to look up in ``movies['title']``. If several rows share the
        title, the first matching row is used.
    top_n : int, default 5
        Number of recommendations to print.

    Raises
    ------
    IndexError
        If no row of ``movies`` has the given title.
    """
    # Look the title up by row position, because `similarity` is indexed by position.
    matches = (movies['title'] == movie).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"No movie titled {movie!r} was found in the dataset")
    movie_pos = int(matches[0])

    # Similarity of the requested movie to every movie in the dataset.
    scores = similarity[movie_pos]

    # Exclude the requested movie by position instead of assuming it sorts first:
    # another row with an equal score could otherwise take its place and the movie
    # would end up being recommended to itself.
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(scores) if pos != movie_pos),
        key=lambda pair: pair[1],
        reverse=True,
    )[:max(top_n, 0)]

    for pos, _score in ranked:
        print(movies['title'].iloc[pos])

In [None]:
# List the first ten titles to pick a valid input for `recommend`.
movies['title'].head(10)

0                                      Avatar
1    Pirates of the Caribbean: At World's End
2                                     Spectre
3                       The Dark Knight Rises
4                                 John Carter
5                                Spider-Man 3
6                                     Tangled
7                     Avengers: Age of Ultron
8      Harry Potter and the Half-Blood Prince
9          Batman v Superman: Dawn of Justice
Name: title, dtype: object

In [None]:
# Example: print the recommendations for one of the titles listed above.
recommend('Harry Potter and the Half-Blood Prince')

Harry Potter and the Goblet of Fire
Harry Potter and the Order of the Phoenix
Harry Potter and the Philosopher's Stone
Da Sweet Blood of Jesus
Harry Potter and the Chamber of Secrets
