In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import missingno as msno
import plotly.express as px
from wordcloud import WordCloud
import plotly.graph_objects as go
import plotly.figure_factory as ff
import plotly.express as px 
import collections
from mpl_toolkits.mplot3d import Axes3D
from plotly.offline import iplot , plot 
from plotly.subplots import make_subplots
plt.style.use('ggplot')
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the merged movie dataset and drop the columns that none of the visible cells read.
# NOTE(review): this is an absolute, machine-specific path — the notebook only runs
# where that exact file exists; consider a configurable data directory.
merge = "C:\\Users\\559816\\Documents\\Project2\\Projet-Data-IA\\Data\\merged_final.csv"
unused_columns = ['titleId', 'poster_path', 'backdrop_path', 'nconst_director']
df = pd.read_csv(merge, sep=',').drop(columns=unused_columns)

In [None]:
# Headline figures: number of distinct films, primary genres and directors.
#
# To lay the three indicators out in columns in Streamlit:
#
# col1, col2, col3 = st.columns(3)
# col1.plotly_chart(fig1, use_container_width=True)
# col2.plotly_chart(fig2, use_container_width=True)
# col3.plotly_chart(fig3, use_container_width=True)


def _make_indicator(value, title):
    """Build a figure that displays ``value`` as a single light-blue number.

    Parameters
    ----------
    value : number
        The figure to display.
    title : str
        Caption shown with the number.

    Returns
    -------
    plotly.graph_objects.Figure
        An 800x300 figure using the 'plotly_dark' template.
    """
    indicator = go.Figure(go.Indicator(
        mode="number",
        value=value,
        title=title,
        number={'font': {'color': 'lightblue'}},
    ))
    indicator.update_layout(width=800, height=300, template='plotly_dark')
    return indicator


unique_film_count = df['title'].nunique()
unique_genre1_count = df['genre1'].nunique()
unique_directors_count = df['Director_name'].nunique()

# One helper call per indicator instead of three copy-pasted figure/layout stanzas.
fig1 = _make_indicator(unique_film_count, "Films")
fig2 = _make_indicator(unique_genre1_count, "Genres")
fig3 = _make_indicator(unique_directors_count, "Réalisateurs")

for indicator_fig in (fig1, fig2, fig3):
    indicator_fig.show()

In [None]:
# Exclude films dated 2024. `df` is rebound here, so every later cell
# works on the filtered frame.
df = df.loc[df['startYear'] != 2024]

# Number of films per release year, drawn as a line chart
yearly_counts = (
    df.groupby('startYear')
    .size()
    .reset_index(name='count')
)
axis_labels = {'startYear': 'Année', 'count': 'Nombre de films'}

fig = px.line(
    yearly_counts,
    x='startYear',
    y='count',
    markers=True,
    labels=axis_labels,
    title='Nombre de films par année',
)
fig.update_traces(line_color='#c63256')
fig.update_layout(template='plotly_dark')
fig.show()

In [None]:
# Genres: one word cloud per genre column, drawn side by side.
# Each cloud is sized by how often every genre value occurs in that column.
plt.figure(figsize=(14, 6))

genre_panels = (
    ('genre1', 'Genres principaux'),
    ('genre2', 'Genres secondaires'),
)
for position, (column, panel_title) in enumerate(genre_panels, start=1):
    plt.subplot(1, 2, position)
    data = df[column].value_counts().to_dict()
    wc = WordCloud(
        width=2000,
        height=1000,
        random_state=1,
        background_color='#191919',
        colormap='rainbow',
    ).generate_from_frequencies(data)
    plt.imshow(wc)
    plt.title(panel_title, fontsize=20)
    plt.axis('off')

plt.tight_layout()
plt.show()


In [None]:
# Number of films per genre, primary and secondary genres combined.
#
# A film counts once for each distinct genre it carries: when genre1 and genre2
# are identical the film is counted a single time, and missing genres are ignored.
# This is computed with column-wise operations instead of a row-wise `df.apply`
# that built one Series per film.
# NOTE(review): genres with exactly equal counts may come out in a different
# relative order than with the previous row-wise implementation.
primary_genre = df['genre1']
# Blank out the secondary genre where it merely repeats the primary one.
secondary_genre = df['genre2'].where(df['genre2'] != primary_genre)
genre_counts = pd.concat([primary_genre, secondary_genre]).dropna().value_counts()

# Bars are placed at integer positions and labelled with the genre names.
index = genre_counts.index
x = list(range(len(index)))
bar_width = 0.8

fig = go.Figure()
fig.add_trace(go.Bar(
    x=x,
    y=genre_counts.values,
    width=bar_width,
    name='Total genre counts'
))
fig.update_layout(
    xaxis=dict(
        tickmode='array',
        tickvals=x,
        ticktext=index,
        tickangle=45
    ),
    legend=dict(title='Légende'),
    xaxis_title='Genre',
    yaxis_title='Quantité',
    title='Quantité de films par genre',
    template='plotly_dark'
)
fig.show()

In [None]:
# Evolution of film genres per decade (animated bar chart, one frame per decade)

df['startYear'] = df['startYear'].astype(int)
# .assign() builds a new frame, so the decade column is no longer written into
# a slice of `df` (which triggered pandas' SettingWithCopy warning).
df_filtered = df[df['startYear'] <= 2023].assign(
    decade=lambda films: (films['startYear'] // 10) * 10
)
df_genre_count = df_filtered.groupby(['decade', 'genre1']).size().reset_index(name='total_genre')

fig = px.bar(df_genre_count,
             x='genre1',
             y='total_genre',
             color='genre1',
             animation_frame='decade',
             title='Evolution des genres de film par décennie',
             category_orders={'decade': list(range(df_genre_count['decade'].min(), 2024, 10))})

# Fixed y range (10 % headroom over the largest count) so bars stay comparable across frames.
fig.update_layout(yaxis_range=[0, df_genre_count['total_genre'].max() * 1.1], template = 'plotly_dark')

animation_settings = {
    "frame": {"duration": 2000, "redraw": True},
    "fromcurrent": True,
    "transition": {"duration": 500, "easing": "quadratic-in-out"},
}
# These settings used to be defined but never applied. Wire them into the play
# button that plotly express generates; the guard covers the case where no
# animation controls were created.
if fig.layout.updatemenus:
    fig.layout.updatemenus[0].buttons[0].args[1].update(animation_settings)
fig.show()

In [None]:
# Distribution of the average ratings.
# TODO: the y axis still needs to be changed (note carried over from the original cell).

ratings = df['averageRating']

fig = go.Figure()

fig.add_trace(go.Histogram(
    x=ratings,
    marker_color='#c63256',
    name='Note moyenne',
    xbins=dict(
        # Series.min()/max() skip missing values, whereas the builtin min()/max()
        # give order-dependent results as soon as a NaN is present.
        start=ratings.min(),
        end=ratings.max(),
        size=0.5  # Adjust the bin size as needed
    )
))
fig.update_layout(
    title='Distribution des notes moyennes',
    xaxis_title='Note moyenne',
    yaxis_title='Nombre de films',
    template='plotly_dark'
)
fig.show()


In [None]:
# Rating histogram (left) next to the five most frequent rating values (right).

# value_counts() already sorts by decreasing frequency and ignores missing ratings;
# it replaces the hand-rolled Counter -> Series -> sort_values chain.
topscore = (
    df['averageRating']
    .value_counts()
    .head(5)
    .to_frame(name='Count')
)

# Highlight the most frequent rating, grey out the other four.
colors = ['lightslategray',] * 5
colors[0] = '#33C7FF'

fig1 = make_subplots(rows=1, cols=2)
trace0 = go.Histogram(x=df['averageRating'], showlegend=False)
trace1 = go.Bar(x=topscore.index, y=topscore.Count, marker_color=colors, showlegend=False)
# add_trace(row=, col=) is the supported replacement for the deprecated append_trace.
fig1.add_trace(trace0, row=1, col=1)
fig1.add_trace(trace1, row=1, col=2)
fig1.update_layout(height=600, width=920, title_text="Distribution des notes moyennes", template='plotly_dark')
fig1.show()

In [None]:
# Two tables side by side: the 15 best-rated films released from 2022 onwards,
# and the 15 best-rated films of the whole dataset.

# Source column -> display column; the insertion order is also the table column order.
display_names = {
    'title': 'Titre',
    'startYear': 'Année de sortie',
    'Director_name': 'Réalisateur',
    'genre1': 'Genre principal',
    'averageRating': 'Note moyenne'
}


def _top_rated(movies, n=15):
    """Return the ``n`` rows of ``movies`` with the highest 'averageRating'.

    The result is sorted by decreasing rating and its columns are renamed
    according to ``display_names``.
    """
    best = movies.sort_values(by=['averageRating'], ascending=False).head(n)
    return best.rename(columns=display_names)


def _movies_table(movies):
    """Build the plotly table trace for a frame produced by ``_top_rated``."""
    return go.Table(
        header=dict(values=['Title', 'Release Year', 'Director', 'Category', 'Rating'],
                    fill_color='Khaki',
                    height=30,
                    align='left',
                    font=dict(color='black', size=13)),
        cells=dict(values=[movies[column] for column in display_names.values()],
                   fill_color='lavender',
                   height=30,
                   align='left',
                   font=dict(color='black', size=12))
    )


# Recent movies
recent_movies = _top_rated(df[df['startYear'] >= 2022])
# Top 15 movies
top15movies = _top_rated(df)

# Create subplot figure; table traces need 'domain'-type subplot cells.
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=("15 meilleurs films récents", "Les 15 meilleurs films dans IMDb"),
    specs=[[{'type': 'domain'}, {'type': 'domain'}]]
)
fig.add_trace(_movies_table(recent_movies), row=1, col=1)
fig.add_trace(_movies_table(top15movies), row=1, col=2)
fig.update_layout(height=800, width=1500, template='plotly_dark')
fig.show()


In [None]:
# Directors: word cloud of every director (left) next to a pie chart of the
# 15 directors with the most films (right).

# Count films per director once and reuse the result for both charts.
director_counts = df['Director_name'].value_counts()

# Generate WordCloud
data = director_counts.to_dict()
wc = WordCloud(width= 2000, height = 1000, random_state=1, background_color='#191919', colormap='tab20c').generate_from_frequencies(data)

# Create the pie chart using Plotly
top_directors = director_counts.head(15)
pie_fig = px.pie(
    names=top_directors.index,
    values=top_directors.values,
    title='Les 15 réalisateurs les plus prolifiques',
    labels={'names': 'Réalisateur', 'values': 'Nombre de films'}
)
pie_fig.update_traces(textposition='outside', textinfo='percent+label', marker=dict(line=dict(color='#000000', width=2)))
pie_fig.update_layout(template='plotly_dark')

# Create a subplot figure with adjusted column widths
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=("Réalisateurs", "Les 15 réalisateurs les plus prolifiques"),
    specs=[[{"type": "image"}, {"type": "domain"}]],
    column_widths=[0.65, 0.35]
)
# Add the WordCloud image. to_array() hands the rendered pixels to plotly directly,
# so no PNG encode/decode round trip (and no empty matplotlib figure) is needed.
fig.add_trace(
    go.Image(z=wc.to_array()),
    row=1, col=1
)
# Add the pie chart: only the traces of pie_fig are copied, its own layout is not.
for trace in pie_fig.data:
    fig.add_trace(trace, row=1, col=2)

fig.update_layout(height=600, width=1700, showlegend=True, template='plotly_dark')
fig.show()

In [None]:
# Split the comma-separated actor/actress names into lists, then build one row per name.
# Only string cells are split, so re-running this cell leaves already-split lists
# untouched; `.str.split` on a column of lists would replace every value with NaN.
# NOTE(review): names are split on ',' without stripping whitespace — confirm the
# source data has no space after the commas.
df['Actors_Actresses'] = df['Actors_Actresses'].map(
    lambda names: names.split(',') if isinstance(names, str) else names
)
df_actors = df.explode('Actors_Actresses')

In [None]:
# The 15 actors/actresses appearing in the most films, with their film counts
count_actors = df_actors['Actors_Actresses'].value_counts().head(15)

actor_bars = go.Bar(
    x=count_actors.index,
    y=count_actors.values,
    marker_color='#c63256',
)
fig = go.Figure(data=[actor_bars])

fig.update_layout(
    title='Acteurs et actrices les plus populaires',
    xaxis=dict(title='Acteur/Actrice', tickfont=dict(size=10)),
    yaxis_title='Nombre de films',
    height=500,
    margin=dict(l=100, r=20, t=50, b=50),
    template='plotly_dark',
    font=dict(size=10),
)
fig.show()


In [None]:
# Top 5 most prolific actors/actresses per decade (animated, one frame per decade)

# Decade of release, e.g. 1994 -> 1990. This adds a column to `df` itself.
df['decade'] = df['startYear'] // 10 * 10
df_exploded = df.explode('Actors_Actresses')

# Films per (decade, actor) pair
actor_counts_by_decade = (
    df_exploded
    .groupby(['decade', 'Actors_Actresses'])
    .size()
    .reset_index(name='film_count')
)
# Order by decade, then by decreasing film count
top_actors_by_decade = actor_counts_by_decade.sort_values(
    by=['decade', 'film_count'],
    ascending=[True, False],
)
# Keep the first five rows of every decade
top_actors_by_decade = top_actors_by_decade.groupby('decade').head(5)

y_upper = top_actors_by_decade['film_count'].max() + 5
bar_labels = {'film_count': 'Nombre de films', 'Actors_Actresses': 'Acteur/Actrice'}
fig = px.bar(
    top_actors_by_decade,
    x='Actors_Actresses',
    y='film_count',
    color='Actors_Actresses',
    animation_frame='decade',
    range_y=[0, y_upper],
    title='Top 5 acteurs/actrices les plus prolifiques par décennie',
    labels=bar_labels,
    height=600,
)

# Set the frame duration used by the play button to 1800 ms
fig.layout.updatemenus[0].buttons[0].args[1]['frame']['duration'] = 1800

fig.update_layout(
    xaxis={'categoryorder': 'total descending'},
    showlegend=False,
    template='plotly_dark',
)
fig.show()

In [None]:
# Breakdown of the ten most frequent production companies, as a pie chart
production_companies = df['production_companies_name'].value_counts().head(10)

pie_labels = {'names': 'Maisons de production', 'values': 'Pourcentage'}
fig = px.pie(
    names=production_companies.index,
    values=production_companies.values,
    title='Répartition des 10 maisons de production les plus populaires',
    labels=pie_labels,
)

# Percentage and company name outside each slice, with a black outline around slices
fig.update_traces(
    textposition='outside',
    textinfo='percent+label',
    marker_line_color='#000000',
    marker_line_width=2,
)
fig.update_layout(template='plotly_dark')

fig.show()