In [None]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer 
# MultiLabelBinarizer is used because each row can carry several labels of the same category (e.g. multiple genres)


In [None]:
# Load the cleaned dataset, not the original raw one, since the workflow is
# split into modular steps.
df = pd.read_csv(filepath_or_buffer='cleaned_data.csv')


In [None]:
# Feature engineering: encode the multi-label Genre column as 0/1 indicator columns.

# Number of distinct raw genre strings. This counts whole strings per row
# (e.g. "Action, Drama"), not individual genres, so the value is large.
print(df['Genre'].nunique())

# Split each row's comma-separated genre string into a list of labels:
#   - strip() removes surrounding spaces from every label
#   - dict.fromkeys() drops repeats within the row while keeping their order
#     (a dict cannot hold the same key twice)
# Repeats are only removed per row, not across the dataset; the binarizer
# below takes care of creating one column per unique label.
df['Genre_list'] = df['Genre'].apply(
    lambda genres: list(dict.fromkeys(part.strip() for part in genres.split(',')))
)

# Number of distinct genres attached to each row.
df['genre_count'] = df['Genre_list'].apply(len)
print(df['genre_count'])

# fit_transform() returns a bare numpy array, so the column names are taken
# from mlb_genre.classes_, where the unique labels found during fitting are
# stored. The frame is given df's own index so that the column-wise concat
# below lines rows up even when df does not have a default 0..n-1 index.
mlb_genre = MultiLabelBinarizer()
Genre_encoded = pd.DataFrame(
    mlb_genre.fit_transform(df['Genre_list']),
    columns=mlb_genre.classes_,
    index=df.index,
)
df = pd.concat([df, Genre_encoded], axis=1)

# Keep only the first occurrence of any repeated column name so that
# re-running this cell does not pile up duplicate genre columns.
df = df.loc[:, ~df.columns.duplicated()]

print(Genre_encoded.columns.duplicated().any())  # should be False

print(Genre_encoded)
print(mlb_genre.classes_)

In [None]:
# Director encoding: one-hot columns for the most frequent directors, plus
# target, "other" and frequency encodings.

# Names of the 50 most frequent directors (.index holds the labels).
top = df['Director'].value_counts().nlargest(50).index.tolist()

# One-hot encode only the top directors, building every column in a single
# DataFrame. Adding columns to df one by one in a loop fragments memory, and
# creating a dummy column for every director in the data only to keep 50 of
# them is wasted work.
direncode = pd.DataFrame(
    {name: df['Director'].eq(name).astype(int) for name in top},
    index=df.index,
)

print(df['Rating'])

# Target encoding: mean Rating per director, tried alongside frequency
# encoding to see which gives better results.
# NOTE(review): the mean is taken over every row of df, so each row's own
# Rating feeds its encoding. If a train/test split happens later this leaks
# the target; consider fitting the means on training rows only.
directorMean = df.groupby('Director')['Rating'].mean()
df['directorTargetEnc'] = df['Director'].map(directorMean)
print(df['directorTargetEnc'])

# Flag rows whose director is not one of the top directors.
df['Director_other'] = (~df['Director'].isin(top)).astype(int)

# Frequency encoding: each director's share of all rows.
# (The column name spelling is kept as-is because other steps may read it.)
fq = df['Director'].value_counts(normalize=True)
df['Director_frequnency'] = df['Director'].map(fq)

# Append all one-hot columns to the dataframe in one concat.
df = pd.concat([df, direncode], axis=1)

# Keep the first occurrence of any repeated column name, e.g. columns left
# over from an earlier run of this cell.
df = df.loc[:, ~df.columns.duplicated()]

print(df.columns[df.columns.duplicated()])  # check that no duplicate column names remain



In [None]:
# Actor encoding: order-insensitive 0/1 indicator columns for the most
# frequent actors across the three actor columns.

# Combine Actor 1/2/3 into a single list column for each row.
actor_name_cols = df[['Actor 1', 'Actor 2', 'Actor 3']]
df['actors'] = actor_name_cols.values.tolist()

# explode() yields one entry per (row, actor), repetitions included, which is
# what is counted to find the 50 most frequent actors.
# Note: `top` is reused here and now holds actor names (an Index).
allActors = df['actors'].explode()
top = allActors.value_counts().nlargest(50).index

# Drop repeated column names left over from earlier steps.
df = df.loc[:, ~df.columns.duplicated()]

# Build every indicator column in one DataFrame instead of inserting them one
# at a time. A row gets 1 when the actor appears in any of the three actor
# columns, regardless of position.
dumies = pd.DataFrame(
    {
        f'actors_{actor}': actor_name_cols.eq(actor).any(axis=1).astype('int64')
        for actor in top
    },
    index=df.index,
)

# Concatenate all indicator columns at once. Indicator columns produced by a
# previous run of this cell are dropped first, so re-running does not leave
# duplicate `actors_*` columns in df (and in the saved CSV).
df = pd.concat([df.drop(columns=dumies.columns, errors='ignore'), dumies], axis=1)

new_actor_cols = list(filter(lambda col: col.startswith('actors_'), df.columns))

print(len(new_actor_cols))  # How many columns?
print(len(set(new_actor_cols)))  # Are there duplicates in names?

print(df.columns.duplicated().sum())  # Should be 0 ideally

# Number of rows each top actor appears in, most frequent first.
actor_counts = df[new_actor_cols].sum().sort_values(ascending=False)






In [None]:
# Save the encoded dataframe to CSV, leaving out the row index.
df.to_csv(path_or_buf="cleaned_encoded.csv", index=False)