In [4]:
import pandas as pd
# Read the raw movie table; on_bad_lines='skip' tells pandas to skip rows it cannot parse.
df = pd.read_csv(filepath_or_buffer="movies.csv", on_bad_lines='skip')

# Show the table dimensions together with the available column names.
(df.shape, df.columns)

((4803, 24),
 Index(['index', 'budget', 'genres', 'homepage', 'id', 'keywords',
        'original_language', 'original_title', 'overview', 'popularity',
        'production_companies', 'production_countries', 'release_date',
        'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
        'vote_average', 'vote_count', 'cast', 'crew', 'director'],
       dtype='object'))

In [5]:
# Keep the identifier, the title, and the five descriptive columns.
selected_columns = ['id', 'title', 'genres', 'keywords', 'overview', 'cast', 'director']
df = df[selected_columns]

# Preview the first rows of the reduced frame.
df.head()


Unnamed: 0,id,title,genres,keywords,overview,cast,director
0,19995,Avatar,Action Adventure Fantasy Science Fiction,culture clash future space war space colony so...,"In the 22nd century, a paraplegic Marine is di...",Sam Worthington Zoe Saldana Sigourney Weaver S...,James Cameron
1,285,Pirates of the Caribbean: At World's End,Adventure Fantasy Action,ocean drug abuse exotic island east india trad...,"Captain Barbossa, long believed to be dead, ha...",Johnny Depp Orlando Bloom Keira Knightley Stel...,Gore Verbinski
2,206647,Spectre,Action Adventure Crime,spy based on novel secret agent sequel mi6,A cryptic message from Bond’s past sends him o...,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,Sam Mendes
3,49026,The Dark Knight Rises,Action Crime Drama Thriller,dc comics crime fighter terrorist secret ident...,Following the death of District Attorney Harve...,Christian Bale Michael Caine Gary Oldman Anne ...,Christopher Nolan
4,49529,John Carter,Action Adventure Science Fiction,based on novel mars medallion space travel pri...,"John Carter is a war-weary, former military ca...",Taylor Kitsch Lynn Collins Samantha Morton Wil...,Andrew Stanton


In [6]:
# Text columns that get normalised in this cell.
text_cols = ['genres', 'keywords', 'overview', 'cast', 'director']

# For each column: replace missing values with an empty string first (so they
# do not become the literal text "nan"), then coerce to str and lowercase.
for col in text_cols:
    df[col] = df[col].fillna('').astype(str).str.lower()

# Preview the normalised columns.
df[text_cols].head()


Unnamed: 0,genres,keywords,overview,cast,director
0,action adventure fantasy science fiction,culture clash future space war space colony so...,"in the 22nd century, a paraplegic marine is di...",sam worthington zoe saldana sigourney weaver s...,james cameron
1,adventure fantasy action,ocean drug abuse exotic island east india trad...,"captain barbossa, long believed to be dead, ha...",johnny depp orlando bloom keira knightley stel...,gore verbinski
2,action adventure crime,spy based on novel secret agent sequel mi6,a cryptic message from bond’s past sends him o...,daniel craig christoph waltz l\u00e9a seydoux ...,sam mendes
3,action crime drama thriller,dc comics crime fighter terrorist secret ident...,following the death of district attorney harve...,christian bale michael caine gary oldman anne ...,christopher nolan
4,action adventure science fiction,based on novel mars medallion space travel pri...,"john carter is a war-weary, former military ca...",taylor kitsch lynn collins samantha morton wil...,andrew stanton


In [7]:
# Step 4: split each text column into a list of whitespace-separated tokens
def _split_into_tokens(value):
    """Return ``value.split()`` for strings; pass any other value through unchanged.

    Passing non-strings through keeps this cell safe to re-run: calling
    ``.str.split()`` on values that are already lists would replace them
    with NaN and silently wipe the column.
    """
    return value.split() if isinstance(value, str) else value


# One loop instead of five copy-pasted assignments; same columns, same order.
for col in ['genres', 'keywords', 'overview', 'cast', 'director']:
    df[col] = df[col].map(_split_into_tokens)

df[['genres', 'keywords', 'cast', 'director']].head()


Unnamed: 0,genres,keywords,cast,director
0,"[action, adventure, fantasy, science, fiction]","[culture, clash, future, space, war, space, co...","[sam, worthington, zoe, saldana, sigourney, we...","[james, cameron]"
1,"[adventure, fantasy, action]","[ocean, drug, abuse, exotic, island, east, ind...","[johnny, depp, orlando, bloom, keira, knightle...","[gore, verbinski]"
2,"[action, adventure, crime]","[spy, based, on, novel, secret, agent, sequel,...","[daniel, craig, christoph, waltz, l\u00e9a, se...","[sam, mendes]"
3,"[action, crime, drama, thriller]","[dc, comics, crime, fighter, terrorist, secret...","[christian, bale, michael, caine, gary, oldman...","[christopher, nolan]"
4,"[action, adventure, science, fiction]","[based, on, novel, mars, medallion, space, tra...","[taylor, kitsch, lynn, collins, samantha, mort...","[andrew, stanton]"


In [8]:
# Step 5: concatenate the per-column token lists row by row, in this order
token_columns = ['genres', 'keywords', 'overview', 'cast', 'director']
merged_tokens = df[token_columns[0]]
for column_name in token_columns[1:]:
    # Adding two Series of lists concatenates the lists element-wise.
    merged_tokens = merged_tokens + df[column_name]

# Collapse each token list into one space-separated string
df['tags'] = merged_tokens.map(" ".join)

df[['title', 'tags']].head()


Unnamed: 0,title,tags
0,Avatar,action adventure fantasy science fiction cultu...
1,Pirates of the Caribbean: At World's End,adventure fantasy action ocean drug abuse exot...
2,Spectre,action adventure crime spy based on novel secr...
3,The Dark Knight Rises,action crime drama thriller dc comics crime fi...
4,John Carter,action adventure science fiction based on nove...


In [9]:
# Final table: identifier, display title, and the combined tag string.
output_columns = ['id', 'title', 'tags']
final_df = df.loc[:, output_columns]

# Show a preview alongside the table dimensions.
(final_df.head(), final_df.shape)


(       id                                     title  \
 0   19995                                    Avatar   
 1     285  Pirates of the Caribbean: At World's End   
 2  206647                                   Spectre   
 3   49026                     The Dark Knight Rises   
 4   49529                               John Carter   
 
                                                 tags  
 0  action adventure fantasy science fiction cultu...  
 1  adventure fantasy action ocean drug abuse exot...  
 2  action adventure crime spy based on novel secr...  
 3  action crime drama thriller dc comics crime fi...  
 4  action adventure science fiction based on nove...  ,
 (4803, 3))

In [10]:
# Write the processed table to disk; index=False leaves the row index out of the file.
final_df.to_csv(path_or_buf="movies_final.csv", index=False)


### ✅ Day 3 — Feature Engineering Completed

- Selected important columns: id, title, genres, keywords, overview, cast, director  
- Filled missing values in the text columns (genres, keywords, overview, cast, director) and converted them to lowercase; `title` is left unchanged  
- Split text columns into lists of tokens  
- Combined genres, keywords, overview, cast, and director into a single `tags` column  
- Created final dataframe with columns: id, title, tags  
- Saved processed dataset as `movies_final.csv` for model building in Day 4
