# Import des modules et chargement du dataframe

In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing as pp, decomposition as dc

In [None]:
# Load the cleaned dataset and replace every missing value with an empty
# string, then print the column summary.
df = pd.read_csv("cleaned_data.csv").fillna("")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4794 entries, 0 to 4793
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   movie_title                4794 non-null   object 
 1   director_name              4794 non-null   object 
 2   num_critic_for_reviews     4794 non-null   float64
 3   duration                   4794 non-null   float64
 4   director_facebook_likes    4794 non-null   float64
 5   actor_3_facebook_likes     4794 non-null   float64
 6   actor_2_name               4794 non-null   object 
 7   actor_1_facebook_likes     4794 non-null   float64
 8   gross                      4794 non-null   float64
 9   genres                     4794 non-null   object 
 10  actor_1_name               4794 non-null   object 
 11  num_voted_users            4794 non-null   int64  
 12  cast_total_facebook_likes  4794 non-null   int64  
 13  actor_3_name               4794 non-null   objec

# Suppression des colonnes inutiles à l'entraînement

In [None]:
# Columns that are removed before training.
columns_to_drop = [
    "movie_title",
    "movie_imdb_link",
    "content_rating",
    "gross_filled_with_median",
    "budget_filled_with_median",
]
df = df.drop(columns=columns_to_drop)

# Séparation des colonnes par type

In [None]:
# Print a summary restricted to the object-dtype columns.
object_columns = df.select_dtypes(include=["object"])
object_columns.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4794 entries, 0 to 4793
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   director_name  4794 non-null   object
 1   actor_2_name   4794 non-null   object
 2   genres         4794 non-null   object
 3   actor_1_name   4794 non-null   object
 4   actor_3_name   4794 non-null   object
 5   plot_keywords  4794 non-null   object
 6   language       4794 non-null   object
 7   country        4794 non-null   object
 8   age_category   4794 non-null   object
dtypes: object(9)
memory usage: 337.2+ KB


In [None]:
# Partition the frame according to how each group of columns is encoded later:
#   - df_textual : object columns except the two pipe-delimited ones
#   - df_numeric : numeric columns
#   - df_genres / df_keywords : the pipe-delimited multi-valued columns
df_textual = df.select_dtypes(include="object").drop(columns=["genres", "plot_keywords"])
df_numeric = df.select_dtypes(include="number")

# Take explicit copies: these two frames are assigned into further down, and
# assigning into a bare slice of `df` raises pandas' SettingWithCopyWarning.
df_genres = df[["genres"]].copy()
df_keywords = df[["plot_keywords"]].copy()

# Encodage

## Colonnes textuelles

In [None]:
# df_textual
# Single-valued categorical columns: each distinct value becomes one
# indicator column named "<column>_<value>".
cols_textual = [
    "director_name",
    "actor_1_name",
    "actor_2_name",
    "actor_3_name",
    "language",
    "country",
    "age_category",
]

enc_textual = pp.OneHotEncoder(sparse_output=False)

one_hot_encoded_data_textual = enc_textual.fit_transform(df_textual[cols_textual])
features_names_textual = enc_textual.get_feature_names_out(cols_textual)

# Carry the source index over (as the genres encoding does) so that the
# column-wise concatenation with the other encoded frames aligns row by row.
df_textual_encoded = pd.DataFrame(
    one_hot_encoded_data_textual,
    columns=features_names_textual,
    index=df_textual.index,
)

## Colonnes numériques

In [None]:
# df_numeric
# Columns that receive a log transform before scaling.
cols_numeric_to_transform = ["num_critic_for_reviews",
                             "director_facebook_likes",
                             "actor_3_facebook_likes",
                             "actor_1_facebook_likes",
                             "num_voted_users",
                             "cast_total_facebook_likes",
                             "num_user_for_reviews",
                             "actor_2_facebook_likes",
                             "movie_facebook_likes",
                             "gross",
                             "budget"]

# Work on a copy so df_numeric itself is left untouched.
df_numeric_transformed = df_numeric.copy()

# Use the logarithm to reduce the skewness of the values. log1p(x) computes
# log(1 + x) directly, which stays accurate for values close to zero.
df_numeric_transformed[cols_numeric_to_transform] = np.log1p(df_numeric_transformed[cols_numeric_to_transform])

# Standardise every numeric column (log-transformed or not); keep the column
# names and the row index so the frame aligns with the other encoded frames.
scaler = pp.StandardScaler()
df_numeric_transformed = pd.DataFrame(
    scaler.fit_transform(df_numeric_transformed),
    columns=df_numeric_transformed.columns,
    index=df_numeric_transformed.index,
)


## Colonnes à séparer

In [None]:
# df_genres
# Split the pipe-delimited string into a list, then explode it so that every
# (movie, genre) pair sits on its own row. The movie's index label is repeated
# on each of its rows. The split has to be applied to the same column that is
# exploded: exploding a column of plain strings is a no-op, and the encoder
# would then learn whole combinations such as "Mystery|Thriller" as a single
# category. df_genres itself is not modified, so the cell can be re-run.
genres_exploded = df_genres["genres"].str.split("|").explode().to_frame()

enc_genres = pp.OneHotEncoder(sparse_output=False)

one_hot_encoded_data_genres = enc_genres.fit_transform(genres_exploded)
feature_names_genres = enc_genres.get_feature_names_out(["genres"])

df_genres_encoded = pd.DataFrame(
    one_hot_encoded_data_genres,
    columns=feature_names_genres,
    index=genres_exploded.index,
)

# Collapse back to one row per movie by summing the indicator rows that share
# the same index label.
df_genres_encoded = df_genres_encoded.groupby(df_genres_encoded.index).sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_genres.loc[:, "genres"] = df_genres["genres"].str.split("|")


In [None]:
# df_keywords
# Split the pipe-delimited string into a list, then explode it so that every
# (movie, keyword) pair sits on its own row; the movie's index label is
# repeated on each of its rows. df_keywords itself is not modified, so the
# cell can be re-run.
keywords_exploded = df_keywords["plot_keywords"].str.split("|").explode().to_frame()

# NOTE(review): sparse_output=False materialises a dense array of shape
# (exploded rows, distinct keywords); consider a sparse output if memory
# becomes a concern.
enc_keywords = pp.OneHotEncoder(sparse_output=False)

# Fit on the keyword column itself (not on the genres) so that the resulting
# columns are "plot_keywords_<keyword>".
one_hot_encoded_data_keywords = enc_keywords.fit_transform(keywords_exploded)
feature_names_keywords = enc_keywords.get_feature_names_out(["plot_keywords"])

df_keywords_encoded = pd.DataFrame(
    one_hot_encoded_data_keywords,
    columns=feature_names_keywords,
    index=keywords_exploded.index,
)

# Collapse back to one row per movie by summing the indicator rows that share
# the same index label.
df_keywords_encoded = df_keywords_encoded.groupby(df_keywords_encoded.index).sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_keywords.loc[:, "plot_keywords"] = df_keywords["plot_keywords"].str.split("|")


# Concaténation des dataframes

In [None]:
# Assemble the feature matrix by placing the four encoded frames side by side;
# rows are matched on their index labels.
encoded_blocks = [
    df_textual_encoded,
    df_numeric_transformed,
    df_genres_encoded,
    df_keywords_encoded,
]
df_encoded = pd.concat(encoded_blocks, axis="columns")
df_encoded

Unnamed: 0,director_name_,director_name_A. Raven Cruz,director_name_Aaron Hann,director_name_Aaron Schneider,director_name_Aaron Seltzer,director_name_Abel Ferrara,director_name_Adam Brooks,director_name_Adam Carolla,director_name_Adam Goldberg,director_name_Adam Green,...,genres_Mystery|Thriller,genres_Mystery|Western,genres_Romance,genres_Romance|Sci-Fi|Thriller,genres_Sci-Fi,genres_Sci-Fi|Thriller,genres_Thriller,genres_Thriller|War,genres_Thriller|Western,genres_Western
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4789,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4790,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4791,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4792,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Réduction de dimension avec la PCA

In [None]:
# A fractional n_components asks PCA to keep as many components as needed to
# explain that share (here 95 %) of the variance.
pca = dc.PCA(n_components=0.95)
df_pca = pca.fit_transform(df_encoded)

# Label the projected columns PC1, PC2, ... in component order.
component_labels = [f'PC{rank}' for rank in range(1, df_pca.shape[1] + 1)]
df_preprocessed = pd.DataFrame(df_pca, columns=component_labels)

df_preprocessed

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC2395,PC2396,PC2397,PC2398,PC2399,PC2400,PC2401,PC2402,PC2403,PC2404
0,-3.802174,-2.692924,-1.131585,0.264511,3.190559,-0.739857,0.211588,-0.286955,0.601809,-0.842412,...,0.041635,0.009795,0.003354,0.015910,0.001525,0.009207,-0.012693,0.042670,-0.013251,-0.016500
1,-4.772900,-0.614090,0.808033,-0.857052,1.003409,1.100208,1.555663,0.254965,0.373681,0.205412,...,-0.007978,0.014900,0.010198,0.005582,0.011372,0.006914,0.001506,-0.021333,0.008248,0.009307
2,-2.935003,-1.412923,-1.942175,0.474058,2.416977,-0.702014,0.329273,-0.217745,-0.339259,0.886364,...,0.021410,-0.027779,-0.011726,-0.003532,0.013362,0.022517,-0.032055,-0.019335,0.012103,-0.002361
3,-6.980701,-0.936806,-0.103729,1.292603,0.444591,2.257654,-0.322197,0.378269,0.442904,-1.391594,...,-0.009648,0.000575,0.011874,0.005220,0.004815,0.000079,0.030159,0.002895,0.004608,-0.023445
4,-2.336121,-1.620767,-1.791077,-0.322834,0.701191,1.250605,-0.231826,-0.588186,-0.135520,-0.840286,...,-0.007495,0.022275,0.031822,-0.040544,-0.012178,0.087387,0.031546,0.050091,0.032827,0.017584
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4789,4.935560,1.418683,0.488220,3.391850,-1.308475,-1.424447,0.307629,-0.365095,1.097452,-1.169199,...,-0.012745,0.014528,-0.021849,-0.026089,-0.020936,-0.003936,0.000358,-0.014212,-0.022204,-0.041207
4790,2.805571,2.091862,0.510929,0.732437,1.612661,0.484276,0.012953,2.232365,-1.024234,-0.515168,...,0.041601,-0.010403,0.052973,-0.038905,0.043116,-0.047261,0.015592,-0.001381,-0.014925,-0.028176
4791,10.067781,-1.710436,-0.989198,-0.100150,-0.096851,-0.882329,-1.436930,2.141839,2.858181,-0.605962,...,0.025061,0.049891,0.014604,-0.021651,0.024803,0.014948,0.028480,-0.025507,-0.037607,-0.016626
4792,2.497162,2.359794,-0.266969,2.037687,0.496257,-0.223464,1.270817,-0.908257,-1.761074,-1.213863,...,-0.043133,-0.045754,-0.028984,0.013568,-0.001717,0.018660,-0.029337,0.008803,-0.026257,0.031036


# Sauvegarde du fichier une fois le prétraitement terminé

In [None]:
# Persist the PCA-projected features as a gzip-compressed CSV. The row index
# is written as the first column (to_csv default).
df_preprocessed.to_csv(
    path_or_buf="preprocessed_data.csv.gz",
    compression="gzip",
)