In [95]:
import numpy as np
import pandas as pd
import ast

The ast module is a powerful tool for parsing and processing Python code. It allows you to work with the syntactic structure of code, making it useful for tasks like code analysis, transformation, and safe evaluation of Python expressions. The examples demonstrate how to parse code into an AST, use ast.literal_eval for safe evaluation, and create a function to process and extract data from JSON-like strings.

In [96]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [97]:
movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [98]:
movies.shape

(4803, 20)

In [99]:
credits.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [100]:
# merge the credit and movies csv
movies = movies.merge(credits, on = 'title')

In [101]:
movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,206647,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,49026,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,49529,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [102]:
movies.shape

(4809, 23)

## Important things which i need are
* genres
* id
* keywords
* title
* overview
* cast
* crew

In [103]:
# i just need few coloums so this code is for including few coloumns and exluding other columns
movies = movies[['id','title','overview','cast','crew','genres','keywords']]

In [104]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4809 entries, 0 to 4808
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        4809 non-null   int64 
 1   title     4809 non-null   object
 2   overview  4806 non-null   object
 3   cast      4809 non-null   object
 4   crew      4809 non-null   object
 5   genres    4809 non-null   object
 6   keywords  4809 non-null   object
dtypes: int64(1), object(6)
memory usage: 263.1+ KB


In [105]:
movies.shape

(4809, 7)

In [106]:
movies['crew']

0       [{"credit_id": "52fe48009251416c750aca23", "de...
1       [{"credit_id": "52fe4232c3a36847f800b579", "de...
2       [{"credit_id": "54805967c3a36829b5002c41", "de...
3       [{"credit_id": "52fe4781c3a36847f81398c3", "de...
4       [{"credit_id": "52fe479ac3a36847f813eaa3", "de...
                              ...                        
4804    [{"credit_id": "52fe44eec3a36847f80b280b", "de...
4805    [{"credit_id": "52fe487dc3a368484e0fb013", "de...
4806    [{"credit_id": "52fe4df3c3a36847f8275ecf", "de...
4807    [{"credit_id": "52fe4ad9c3a368484e16a36b", "de...
4808    [{"credit_id": "58ce021b9251415a390165d9", "de...
Name: crew, Length: 4809, dtype: object

In [107]:
movies.head()

Unnamed: 0,id,title,overview,cast,crew,genres,keywords
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...","[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":..."


## to find if there are any null values in this dataset and add them 

In [108]:
movies.isnull().sum()

id          0
title       0
overview    3
cast        0
crew        0
genres      0
keywords    0
dtype: int64

In [109]:
# drop all the null parts inthis dataset
movies.dropna(inplace=True)

In [110]:
# rechecking 
movies.isnull().sum()

id          0
title       0
overview    0
cast        0
crew        0
genres      0
keywords    0
dtype: int64

In [111]:
# checking for duplicate data

In [112]:
movies.duplicated().sum()

0

### The code movies.iloc[0].genres is used to access a specific element in the movies DataFrame:

* movies: The DataFrame containing movie-related data.
* iloc[0]: Uses integer-location based indexing to access the first row (index 0) of the DataFrame.
* genres: Accesses the genres column value of the specified row.

In [113]:
movies.iloc[5].genres

'[{"id": 14, "name": "Fantasy"}, {"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}]'

In [114]:
import ast

def convert(obj):
    # Initialize an empty list to store the extracted names
    L = []
    # Evaluate the string representation of a list of dictionaries and iterate over it
    for i in ast.literal_eval(obj):
        # Append the 'name' value from each dictionary to the list
        L.append(i['name'])
    # Return the list of names
    return L

### the code which i worte here is used before again using it will not work  but i will use this fuction on Keywords


In [115]:
movies['genres'] = movies['genres'].apply(convert)

In [116]:
movies['keywords'] = movies['keywords'].apply(convert)

In [117]:
movies.head()

Unnamed: 0,id,title,overview,cast,crew,genres,keywords
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...","[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...","[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel..."


before it was in different format which we did not needed then we used function then convert it into over usable part 

The convert3 function is specifically designed to extract up to three 'name' values from a string representation of a list of dictionaries. This can be useful for limiting the amount of data processed or displayed at any given time, such as in a preview or summary view of larger datasets. The function stops processing after extracting three names, ensuring efficient and controlled data extraction.

In [118]:
def convert3(obj):
    # Initialize an empty list to store the extracted names
    L = []
    # Initialize a counter to keep track of the number of names extracted
    counter = 0
    # Evaluate the string representation of a list of dictionaries and iterate over it
    for i in ast.literal_eval(obj):
        if counter != 3:
            # Append the 'name' value from each dictionary to the list
            L.append(i['name'])
            # Increment the counter
            counter += 1
        else:
            # Break the loop if 3 names have been extracted
            break
    # Return the list of names
    return L



In [119]:
movies['cast'] = movies['cast'].apply(convert3)

In [120]:
movies.head()

Unnamed: 0,id,title,overview,cast,crew,genres,keywords
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Johnny Depp, Orlando Bloom, Keira Knightley]","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Daniel Craig, Christoph Waltz, Léa Seydoux]","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...","[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Christian Bale, Michael Caine, Gary Oldman]","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...","[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Taylor Kitsch, Lynn Collins, Samantha Morton]","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel..."


The fetch_director function processes a string representation of a list of dictionaries to find and return the name of the director. Here's a detailed breakdown and comments:

In [121]:
def fetch_director(obj):
    # Initialize an empty list to store the director's name
    L = []
    # Evaluate the string representation of a list of dictionaries and iterate over it
    for i in ast.literal_eval(obj):
        # Check if the job is 'Director'
        if i['job'] == 'Director':
            # Append the director's name to the list
            L.append(i['name'])
            # Break the loop after finding the first director
            break
    # Return the list containing the director's name
    return L

In [122]:
movies['crew'] = movies['crew'].apply(fetch_director)

In [123]:
movies.head()

Unnamed: 0,id,title,overview,cast,crew,genres,keywords
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron],"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski],"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes],"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan],"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton],"[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel..."


In [125]:
movies['overview'][0]

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.'

The following code snippet applies a lambda function to each element in the 'overview' column of the 'movies' DataFrame. It splits each string value into a list of words, and assigns the resulting list back to the 'overview' column.


### Explanation:

1. **`movies['overview']`**: Selects the 'overview' column of the 'movies' DataFrame.

2. **`.apply(lambda x: x.split())`**: Applies a lambda function to each element in the 'overview' column. The lambda function `lambda x: x.split()` splits each string `x` into a list of words using whitespace as the separator.

3. **`movies['overview'] = ...`**: Assigns the resulting lists of words back to the 'overview' column, effectively replacing the original strings with lists of words.

### Comment:

This code transforms the 'overview' column from containing strings to containing lists of words. This can be useful for text processing tasks, such as tokenization or counting word frequencies.


In [126]:
movies['overview'] = movies['overview'].apply(lambda x:x.split())

# Understanding Lambda Functions in Python

Lambda functions, also known as anonymous functions, are small, inline functions defined using the `lambda` keyword. They offer a concise way to create functions without the need for a formal function definition.

## Key Characteristics of Lambda Functions

1. **Anonymous:** Lambda functions do not have a name, making them suitable for short, one-time-use operations.
2. **Single Expression:** Lambda functions can only contain a single expression, which is evaluated and returned.
3. **Arguments:** Lambda functions can take any number of arguments, but they can only have one expression.
4. **Short-lived:** Lambda functions are typically used for short periods and in places where creating a named function would be overkill.

## Syntax of a Lambda Function

The general syntax of a lambda function is as follows:

```python
lambda arguments: expression


In [129]:
add = lambda x,y: x+y

print(add(2,3))

5


In [132]:
# mapping with map()
numbers = [1,2,3,4,5]
squared = list(map(lambda x:x**2,numbers))
print(squared)

[1, 4, 9, 16, 25]


In [136]:
# filtering with filter()
numbers = [1,2,3,4,5,6]
even = list(filter(lambda x:x % 2 == 0, numbers))
print(even)

[2, 4, 6]


In [142]:
# sorting with sorted()
words = ['banana','apple','cherry', ]
sorted_words = sorted(words, key=lambda x:len(x))
print(sorted_words)

['apple', 'banana', 'cherry']


# Explanation of Sorting List of Words by Length

Here's an explanation of the provided code snippet:

## Code Explanation:

1. **`words = ['banana', 'apple', 'cherry']`**: Initializes a list named `words` containing three strings.

2. **`sorted_words = sorted(words, key=lambda x: len(x))`**: Sorts the `words` list based on the length of each word. 
   - The `sorted()` function is used with a custom key function specified by the lambda expression `lambda x: len(x)`. 
   - This lambda function takes a word `x` as input and returns its length using the `len()` function.
   - The `key` parameter in the `sorted()` function determines the sorting criterion, which in this case is the length of each word.

3. **`print(sorted_words)`**: Prints the sorted list of words.


# Resume


In [143]:
movies.head()

Unnamed: 0,id,title,overview,cast,crew,genres,keywords
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron],"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski],"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes],"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan],"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton],"[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel..."


# Explanation of String Replacement in DataFrame Columns

Here's an explanation of the provided code snippet:

## Code Explanation:

The code snippet applies a lambda function to each element in specific columns of the 'movies' DataFrame. This lambda function replaces spaces (" ") in each string with an empty string ("").

```python
movies['genres'] = movies['genres'].apply(lambda x: [i.replace(" ", "") for i in x])



# Breakdown of String Replacement in DataFrame Column

Let's break down the components of the provided code snippet:

### 1. `movies['genres']`:
   - Selects the 'genres' column of the 'movies' DataFrame.

### 2. `.apply(lambda x: [i.replace(" ", "") for i in x])`:
   - Applies a lambda function to each element in the 'genres' column.
   - The lambda function iterates over each element `i` in the list `x` (which represents each element of the 'genres' column).
   - Within the lambda function, `i.replace(" ", "")` replaces spaces (" ") with an empty string ("") for each element `i`.
   - This results in a list comprehension that generates a new list where each element has spaces removed.

### Purpose:
The purpose of this operation is to standardize the format of data within the 'genres' column by removing spaces from the strings. This can help in ensuring consistency and facilitating further analysis or processing of the data.

### Summary:
The provided code snippet demonstrates a method for removing spaces from strings within a DataFrame column. By applying a lambda function to each element of the column, spaces are replaced with empty strings, resulting in a standardized format for the data.


In [147]:
movies['genres'] = movies['genres'].apply(lambda x:[i.replace(" ","") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
movies['cast'] = movies['cast'].apply(lambda x:[i.replace(" ","") for i in x])
movies['crew'] = movies['crew'].apply(lambda x:[i.replace(" ","") for i in x])

In [148]:
movies.head()

Unnamed: 0,id,title,overview,cast,crew,genres,keywords
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski],"[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes],"[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan],"[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton],"[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p..."


## `making a single column` called  `tags` which will have overview, genres, keywords, cast, crew

In [149]:
movies['tags'] = movies['overview']+ movies['genres']+ movies['keywords']+ movies['cast']+ movies['crew']

In [150]:
movies.shape

(4806, 8)

In [151]:
movies.head()

Unnamed: 0,id,title,overview,cast,crew,genres,keywords,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski],"[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes],"[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan],"[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton],"[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[John, Carter, is, a, war-weary,, former, mili..."


In [153]:
# making new file which will include just id, title, and tags
new_df = movies[['id','title','tags']]

In [154]:
new_df

Unnamed: 0,id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili..."
...,...,...,...
4804,9367,El Mariachi,"[El, Mariachi, just, wants, to, play, his, gui..."
4805,72766,Newlyweds,"[A, newlywed, couple's, honeymoon, is, upended..."
4806,231617,"Signed, Sealed, Delivered","[""Signed,, Sealed,, Delivered"", introduces, a,..."
4807,126186,Shanghai Calling,"[When, ambitious, New, York, attorney, Sam, is..."


# Breakdown of Joining Strings in DataFrame Column

Let's break down the components of the provided code snippet:

## 1. `new_df['tags']`:
   - Selects the 'tags' column of the DataFrame `new_df`.

## 2. `.apply(lambda x: " ".join(x))`:
   - Applies a lambda function to each element in the 'tags' column.
   - The lambda function takes each list `x` from the 'tags' column and joins its elements into a single string using the `join()` method.
   - The `join()` method concatenates all elements of the list with spaces between them, creating a single string.

### Purpose:
The purpose of this operation is to concatenate the elements of each list within the 'tags' column into a single string, making it easier to work with and analyze textual data.

### Summary:
The provided code snippet demonstrates a method for joining the elements of lists within a DataFrame column into a single string. By applying a lambda function to each element of the column, the lists are transformed into strings, facilitating further data processing and analysis.


In [156]:
new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))


In [157]:
new_df.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


In [158]:
new_df['tags'][0]

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Action Adventure Fantasy ScienceFiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d SamWorthington ZoeSaldana SigourneyWeaver JamesCameron'

# Breakdown of Converting Strings to Lowercase in DataFrame Column

Let's break down the components of the provided code snippet:

## 1. `new_df['tags']`:
   - Selects the 'tags' column of the DataFrame `new_df`.

## 2. `.apply(lambda x: x.lower())`:
   - Applies a lambda function to each element in the 'tags' column.
   - The lambda function `lambda x: x.lower()` converts each string `x` to lowercase using the `lower()` method.
   - This ensures that all strings in the 'tags' column are converted to lowercase.

### Purpose:
The purpose of this operation is to standardize the case of the strings within the 'tags' column by converting them all to lowercase. This can help in ensuring consistency when comparing or analyzing textual data.


In [161]:
new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())


In [162]:
new_df.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


# Breakdown of CountVectorizer in scikit-learn

Let's break down the components of the provided code snippet:

## 1. `from sklearn.feature_extraction.text import CountVectorizer`:
   - Imports the `CountVectorizer` class from scikit-learn, which is a tool for converting a collection of text documents into a matrix of token counts.

## 2. `cv = CountVectorizer(max_features=5000, stop_words='english')`:
   - Initializes a `CountVectorizer` object `cv` with the following parameters:
       - `max_features=5000`: Specifies the maximum number of features (i.e., unique words) to be extracted from the text data. Only the top `max_features` ordered by term frequency across the corpus will be considered.
       - `stop_words='english'`: Specifies that common English stopwords (e.g., 'the', 'is', 'and') should be removed from the text data before tokenization. This helps in improving the quality of the features extracted by the vectorizer.

### Purpose and Usage:

The purpose of using `CountVectorizer` is to convert a collection of text documents into a matrix representation, where each row corresponds to a document and each column corresponds to a unique word in the vocabulary. The cell values indicate the frequency of each word in the corresponding document.

- **Feature Extraction:** It is commonly used for feature extraction in natural language processing (NLP) tasks, such as text classification, sentiment analysis, and document clustering.
- **Text Preprocessing:** It handles tokenization, stopword removal, and vectorization of text data, making it suitable for processing raw text data before feeding it into machine learning models.

By specifying parameters like `max_features` and `stop_words`, you can customize the behavior of the `CountVectorizer` according to the requirements of your specific task.


In [164]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english')

# Explanation of Code Snippet

This code snippet performs the following operations:

1. `cv.fit_transform(new_df['tags'])`: Fits the `CountVectorizer` object `cv` to the 'tags' column of the DataFrame `new_df` and transforms the text data into a sparse matrix representation.

2. `.toarray()`: Converts the sparse matrix representation into a dense array.

3. `.shape`: Returns the shape of the dense array, which represents the number of rows (documents) and columns (features) in the transformed matrix.

In summary, this code calculates the shape of the matrix obtained after transforming the text data in the 'tags' column using the `CountVectorizer`.


In [165]:
cv.fit_transform(new_df['tags']).toarray().shape

(4806, 5000)

In [168]:
vectors = cv.fit_transform(new_df['tags']).toarray()

In [169]:
vectors[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [170]:
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      dtype=object)

cv.get_feature_names_out() retrieves the feature names (i.e., vocabulary) learned by the CountVectorizer during the fitting process. Each feature name corresponds to a unique token (word) extracted from the text data. This method returns an array containing all the feature names in the same order as their indices in the feature matrix.

In summary, cv.get_feature_names_out() provides access to the vocabulary learned by the CountVectorizer, which consists of the unique words present in the text data.

In [171]:
import nltk 

NLTK is a leading platform for building Python programs to work with human language data. It provides easy-to-use interfaces to over 50 corpora and lexical resources, such as WordNet. Additionally, NLTK includes a suite of text processing libraries for classification, tokenization, stemming, tagging, parsing, and semantic reasoning, among other tasks.

In [172]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [174]:
def stem(text):
    L = []  # Initialize an empty list to store the stemmed words
    
    for i in text.split():  # Iterate over each word in the input text after splitting it into a list of words
        y.append(ps.stem(i))  # Stem each word using a stemming algorithm and append the stemmed word to the list L
        
    return " ".join(Y)  # Join the stemmed words in the list L into a single string separated by spaces and return the result


# Breakdown of Stemming Function

Let's break down the components of the provided code snippet:

## 1. `def stem(text):`
   - Defines a function named `stem` that takes a single parameter `text`, representing the input text to be stemmed.

## 2. `L = []:`
   - Initializes an empty list `L` to store the stemmed words.

## 3. `for i in text.split():`
   - Iterates over each word in the input text after splitting it into a list of words.

## 4. `y.append(ps.stem(i)):`
   - Stems each word `i` using a stemming algorithm (presumably from the `nltk.stem` module) and appends the stemmed word to the list `L`.

## 5. `return " ".join(Y):`
   - Joins the stemmed words in the list `L` into a single string separated by spaces and returns the result.

### Purpose:

The purpose of this function is to perform stemming on the input text, which involves reducing words to their base or root form (e.g., "running" becomes "run"). Stemming helps in standardizing words and reducing variations, which can be useful for tasks like text classification and information retrieval.

### Comments:

- It seems there is a typo in the code (`y.append(ps.stem(i))`). The variable `y` is not defined; it should be `L.append(ps.stem(i))` instead.
- Additionally, the stemming algorithm used (`ps.stem`) should be imported from the `nltk.stem` module, typically as `PorterStemmer` or `SnowballStemmer`.


In [175]:
new_df['tags'][0]

'in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d samworthington zoesaldana sigourneyweaver jamescameron'

In [176]:
from sklearn.metrics.pairwise import cosine_similarity


# sklearn.metrics.pairwise.cosine_similarity

The `cosine_similarity` function is a part of the `sklearn.metrics.pairwise` module in scikit-learn, which is a widely used machine learning library in Python.

## Purpose:
The `cosine_similarity` function computes the cosine similarity between vectors in a given dataset. Cosine similarity is a measure of similarity between two non-zero vectors of an inner product space that measures the cosine of the angle between them. It is widely used in various machine learning tasks, such as recommendation systems, text mining, and clustering.

## Function Signature:
```python
cosine_similarity(X, Y=None, dense_output=True)


## Parameters:
- `X`: {array-like, sparse matrix} of shape `(n_samples_X, n_features)`
   - Array-like or sparse matrix containing the vectors whose cosine similarity is to be computed.

- `Y`: {array-like, sparse matrix} of shape `(n_samples_Y, n_features)`, default=None
   - Optional array-like or sparse matrix containing additional vectors. If provided, the cosine similarity between each vector in `X` and each vector in `Y` is computed. If not provided, the function computes the cosine similarity of vectors in `X` against themselves.

- `dense_output`: bool, default=True
   - Boolean flag indicating whether to return a dense matrix (`True`) or a sparse matrix (`False`) as the output.

## Returns:
- If `Y` is provided: Returns an array-like or sparse matrix of shape `(n_samples_X, n_samples_Y)` containing the cosine similarities between each vector in `X` and each vector in `Y`.
- If `Y` is not provided: Returns an array-like or sparse matrix of shape `(n_samples_X, n_samples_X)` containing the cosine similarities of vectors in `X` against themselves.


In [177]:
similarity = cosine_similarity(vectors)

# Explanation of Code Snippet

This code snippet performs sorting on a list of tuples and selects the top 5 most similar items.

## Code Explanation:

- **`sorted(...)`:** Sorts the input iterable (a list of tuples in this case) based on a specified key function.

- **`list(enumerate(similarity[0]))`:** Enumerates the elements of the first row (index 0) of the `similarity` array, converting it into a list of tuples where each tuple contains the index and the corresponding similarity value.

- **`reverse=True`:** Specifies that the sorting should be done in descending order.

- **`key=lambda x: x[1]`:** Specifies the key function for sorting, which extracts the second element of each tuple (the similarity value) to use as the sorting criterion.

- **`[1:6]`:** Selects the elements from index 1 (inclusive) to index 5 (exclusive), effectively excluding the first element (which corresponds to the similarity of the first item with itself) and selecting the top 5 most similar items.

## Purpose:
The purpose of this code is to find the indices of the top 5 most similar items to the first item in a similarity matrix/array, based on their cosine similarity scores.

## Example:
For example, if `similarity` is a 2D array representing cosine similarity scores between items, `similarity[0]` would give the similarity scores of the first item with all other items. This code then sorts these similarity scores and returns the indices of the top 5 most similar items.


In [179]:
sorted(list(enumerate(similarity[0])),reverse=True, key=lambda x:x[1])[1:6]

[(539, 0.26089696604360174),
 (1194, 0.2581988897471611),
 (507, 0.25302403842552984),
 (260, 0.25110592822973776),
 (1216, 0.24944382578492943)]

In [180]:
def recommend(movie):
    # Find the index of the given movie in the DataFrame
    movie_index = new_df[new_df['title'] == movie].index[0]
    
    # Get the cosine similarity scores of the given movie with all other movies
    distances = similarity[movie_index]
    
    # Sort the movies based on their similarity scores in descending order and select the top 5
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]
    
    # Print the titles of the recommended movies
    for i in movies_list:
        print(new_df.iloc[i[0]].title)


# Explanation of Movie Recommendation Function

This function `recommend(movie)` provides recommendations for movies similar to a given movie.

## Function Overview:

```python
def recommend(movie):
    # Find the index of the given movie in the DataFrame
    movie_index = new_df[new_df['title'] == movie].index[0]
    
    # Get the cosine similarity scores of the given movie with all other movies
    distances = similarity[movie_index]
    
    # Sort the movies based on their similarity scores in descending order and select the top 5
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]
    
    # Print the titles of the recommended movies
    for i in movies_list:
        print(new_df.iloc[i[0]].title)


# Testing the code 


In [181]:
recommend('Batman')

Batman
Batman & Robin
The Dark Knight Rises
Batman Begins
Batman Returns


In [184]:
# some errors this is error
recommend('American Pie')

Stuart Little 2
Paddington
Stuart Little
Ted
The Motel


# This model is not perfect but can be usable it behave good in some instance and other instance it creates problems