# Tableau de Répartition des Bandes de Fréquence
## Projet Mastercamp Data

Pierre Bonnin

Romain Caussignac

Antoine Combaldieu

Alice Guillou

Mehdy Michalak

Jules Sucrot

# Importation et nettoyage de la Data

In [1]:
import streamlit as st
import numpy as np
import plotly.graph_objects as go
import pandas as pd
import matplotlib.pyplot as plt
import folium
from folium.plugins import HeatMap, MarkerCluster
import streamlit as st
import plotly.express as px

In [3]:
# Load the raw export (semicolon-separated CSV) directly from its GitHub URL.
DATA_URL = "https://github.com/PRAJAM/Projet/raw/jules/Export_TER_juin2023_FIX_SafwanChendeb.csv"
df = pd.read_csv(DATA_URL, sep=";")


# DATA CLEAN

Enlever les valeurs nulles ou trop petites de Largeur de bande

In [4]:
# Keep only rows whose bandwidth value is present and strictly greater than 1000
# (the column name indicates kHz).
has_bandwidth = df['ASS_LGBD_KHZ'].notna()
wide_enough = df['ASS_LGBD_KHZ'] > 1000
df2 = df[has_bandwidth & wide_enough]


Enlever les antennes qui ont été supprimées

In [None]:
# Collect the registration numbers that carry at least one 'SUP' movement,
# then discard every row (whatever its movement code) sharing one of them.
is_removal = df2['MVT_CODE'] == 'SUP'
enreg_to_supp = df2.loc[is_removal, 'N° ENREG']
was_removed = df2['N° ENREG'].isin(enreg_to_supp)
df2 = df2[~was_removed]


Garder seulement les ajouts d'antenne

In [None]:
# Keep only the rows whose movement code is exactly 'ADD'.
df2 = df2.loc[df2['MVT_CODE'].eq('ADD')]

Enlever les MOD

In [None]:
# Discard rows whose movement code is 'MOD'.
# NOTE(review): if the ADD-only filter of the previous cell has already run,
# no 'MOD' row can remain and this line changes nothing.
df2 = df2.loc[df2['MVT_CODE'].ne('MOD')]

Garder seulement les dernières modifications de chaque antenne

In [None]:
# Parse the CAF date from df2's own column (day/month/year strings).
# Reading it from `df` instead only works through index alignment and breaks
# as soon as `df` no longer contains every row of `df2` (e.g. once `df` is
# down-sampled): the missing rows would silently become NaT.
df2['Date CAF'] = pd.to_datetime(df2['Date CAF'], format='%d/%m/%Y')

# Sort chronologically and keep, for each registration number, the most recent row.
df2 = df2.sort_values('Date CAF').drop_duplicates(subset="N° ENREG", keep='last')

Échantillonnage des données (10 %)

In [5]:
# Draw a reproducible random 10% sample of the raw frame `df`.
# NOTE(review): `df2` was built from `df` in an earlier cell, so this does not
# shrink `df2` itself -- confirm which frame was meant to be sampled.
sample_fraction = 0.1
df = df.sample(random_state=1, frac=sample_fraction)

TEST

In [None]:
# Quick visual check: show the first ten rows of the cleaned frame.
df2.iloc[:10]

# Visualisation des données

## Graph du nombre d'antennes ajoutées par mois et du nombre d'antennes en service par mois

In [7]:
st.title("Analyse des Antennes")

# Drop every registration number that has a 'SUP' movement, then keep only
# the 'ADD' rows. Keeping only 'ADD' already excludes 'MOD' rows, so no
# separate 'MOD' filter is needed.
enreg_to_supp = df2.loc[df2['MVT_CODE'] == 'SUP', 'N° ENREG']
df2 = df2[~df2['N° ENREG'].isin(enreg_to_supp)]
# .copy() yields an independent frame so the column assignments below do not
# write into a slice of another DataFrame.
df2 = df2[df2['MVT_CODE'] == 'ADD'].copy()

# Read the dates from df2 itself. Assigning a Series taken from `df` aligns on
# the index; because `df` is down-sampled in an earlier cell, every df2 row
# absent from that sample would silently get NaT and vanish from the counts.
df2['Date CAF'] = pd.to_datetime(df2['Date CAF'], format='%d/%m/%Y')

# Keep the most recent row per registration number.
df2 = df2.sort_values('Date CAF').drop_duplicates(subset="N° ENREG", keep='last')
df2['YearMonth'] = df2['Date CAF'].dt.to_period('M')

# Number of rows per month, in chronological order.
monthly_counts = df2['YearMonth'].value_counts().sort_index().reset_index()
monthly_counts.columns = ['YearMonth', 'Number of Antennas']

# Periods are converted to strings so they can be used as a plot axis.
monthly_counts['YearMonth'] = monthly_counts['YearMonth'].astype(str)

DeltaGenerator()

Plot du graph

## Graph du nombre d'antennes en service par mois

Création d'un nouveau Dataframe pour les données des antennes par mois

In [7]:
# Make sure the CAF date is a datetime, order rows chronologically and derive
# the monthly period used as grouping key.
df2['Date CAF'] = pd.to_datetime(df2['Date CAF'], format='%d/%m/%Y')
df2 = df2.sort_values(by='Date CAF')
df2['YearMonth'] = df2['Date CAF'].dt.to_period('M')

# Monthly number of 'ADD' and 'SUP' rows.
# NOTE(review): earlier cells restrict df2 to 'ADD' rows; if they ran,
# sups_per_month is empty and the in-service curve equals the cumulative
# additions -- confirm whether the raw movements should be used here.
movement = df2['MVT_CODE']
adds_per_month = df2[movement == 'ADD'].groupby('YearMonth').size()
sups_per_month = df2[movement == 'SUP'].groupby('YearMonth').size()

# One row per month; months missing on one side count as zero.
service_df_corrected = pd.DataFrame(
    {'Adds': adds_per_month, 'Sups': sups_per_month}
).fillna(0)

# Running totals, and their difference as the number of antennas in service.
service_df_corrected = service_df_corrected.assign(**{
    'Cumulative Adds': lambda frame: frame['Adds'].cumsum(),
    'Cumulative Sups': lambda frame: frame['Sups'].cumsum(),
    'Antennas in Service': lambda frame: frame['Cumulative Adds'] - frame['Cumulative Sups'],
})

# Expose the month as a regular string column for plotting.
service_df_corrected = service_df_corrected.reset_index()
service_df_corrected['YearMonth'] = service_df_corrected['YearMonth'].astype(str)

Traçage de la figure avec plotly

In [8]:
st.title("Analyse des Antennes")

# Chart 1: number of antennas added per month.
additions_axis_label = "Nombre d'antennes ajoutées"
fig = px.line(
    monthly_counts,
    x="YearMonth",
    y="Number of Antennas",
    title="Évolution du nombre d'ajout d'antennes par mois",
    markers=True,
    labels={"YearMonth": "Mois", "Number of Antennas": additions_axis_label},
)
fig.update_layout(
    xaxis_title="Mois",
    yaxis_title=additions_axis_label,
    xaxis_tickangle=-45,
)
st.plotly_chart(fig)

# Chart 2: number of antennas in service per month.
in_service_axis_label = "Nombre d'antennes en service"
fig_corrected = px.line(
    service_df_corrected,
    x="YearMonth",
    y="Antennas in Service",
    title="Nombre d'antennes en service par mois",
    markers=True,
    labels={"YearMonth": "Mois", "Antennas in Service": in_service_axis_label},
)
fig_corrected.update_layout(
    xaxis_title="Mois",
    yaxis_title=in_service_axis_label,
    xaxis_tickangle=-45,
)
st.plotly_chart(fig_corrected)

2024-07-03 03:30:12.847 
  command:

    streamlit run C:\Users\sucro\AppData\Roaming\Python\Python312\site-packages\ipykernel_launcher.py [ARGUMENTS]


DeltaGenerator()

Conversion des coordonnées (degrés, minutes, secondes) en degrés décimaux

In [None]:
# Vectorized conversion of DMS (degrees / minutes / seconds) coordinates to decimal degrees
def dms_to_dd(degrees, minutes, seconds, orientation):
    """Convert DMS coordinates to signed decimal degrees.

    Args:
        degrees: Series of degree values.
        minutes: Series of minute values (divided by 60).
        seconds: Series of second values (divided by 3600).
        orientation: Series of orientation letters; entries equal to 'W' or
            'S' produce a negated result, any other value leaves it as is.

    Returns:
        A new Series of decimal degrees; the inputs are not modified.
    """
    magnitude = degrees + minutes / 60 + seconds / 3600
    flip_sign = orientation.isin(['W', 'S'])
    # Keep the value where no flip is requested, use its negation elsewhere.
    return magnitude.where(~flip_sign, -magnitude)

# Build decimal latitude / longitude columns from their DMS component columns.
lat_parts = ['PT_LAT_DEG', 'PT_LAT_MIN', 'PT_LAT_SEC', 'PT_LAT_ORIENT']
lon_parts = ['PT_LONG_DEG', 'PT_LONG_MIN', 'PT_LONG_SEC', 'PT_LONG_ORIENT']
df2['latitude'] = dms_to_dd(*(df2[column] for column in lat_parts))
df2['longitude'] = dms_to_dd(*(df2[column] for column in lon_parts))

# Drop the DMS components together with a few other columns.
# NOTE(review): 'BASE' and 'MVT_CODE' are removed here, yet later cells group
# by 'BASE' and filter on 'MVT_CODE' -- confirm the intended cell order.
obsolete_columns = lat_parts + lon_parts + [
    'BASE', 'MVT_CODE', 'N° ENREG', 'TER_ANT_ANG', 'TER_ANT_AZM_MAX',
]
df2 = df2.drop(columns=obsolete_columns)

Suppression des valeurs aberrantes et inutiles

In [None]:
# Keep rows whose frequency is strictly below 50 000 000 (kHz per the column name).
frequency_ok = df2['ASS_FRQ_KHZ'] < 50000000

# Keep rows whose bandwidth is strictly below 200 000 (kHz per the column name).
bandwidth_ok = df2['ASS_LGBD_KHZ'] < 200000

# Keep points inside the bounding box whose corners are
# (51.691467, -6.106181) and (41.924975, 8.307881), bounds included.
in_bounding_box = (
    df2['latitude'].between(41.924975, 51.691467)
    & df2['longitude'].between(-6.106181, 8.307881)
)

df2 = df2[frequency_ok & bandwidth_ok & in_bounding_box]


Map des antennes en France par année

In [None]:
@st.cache_data
def get_df_by_year(df_sorted):
    """Build, for each year, the rows to display for that year.

    For a given year the result is the concatenation of:
      * rows of earlier years whose 'MVT_CODE' does not contain 'SUP'
        (missing codes are kept), followed by
      * rows of that year whose 'MVT_CODE' is 'ADD' or 'MOD'.

    Args:
        df_sorted: DataFrame with at least the 'Annee' and 'MVT_CODE' columns.

    Returns:
        dict mapping each distinct 'Annee' value to its DataFrame.
    """
    years = df_sorted['Annee']
    movement = df_sorted['MVT_CODE']
    is_add_or_mod = movement.isin(['ADD', 'MOD'])
    not_removed = ~movement.str.contains('SUP', na=False)

    def rows_for(year):
        earlier = df_sorted[(years < year) & not_removed]
        current = df_sorted[(years == year) & is_add_or_mod]
        return pd.concat([earlier, current])

    return {year: rows_for(year) for year in years.unique()}

# `df2` is a DataFrame, not a function: calling it raises TypeError, so it is
# used directly.
df_sorted = df2
# get_df_by_year and the selector below need an 'Annee' column.
# NOTE(review): when it is missing it is derived from the year of 'Date CAF'
# -- confirm this is the intended definition.
if 'Annee' not in df_sorted.columns:
    df_sorted = df_sorted.assign(Annee=df_sorted['Date CAF'].dt.year)
df_by_year = get_df_by_year(df_sorted)

# Show the map in Streamlit
st.title('Carte des antennes avec HeatMap et clusters par année')

selected_year = st.selectbox("Sélectionner une année", df_sorted['Annee'].unique())
df_year = df_by_year[selected_year]

# Sample the rows to reduce the number of plotted points (fraction is adjustable).
sample_fraction = 0.1
df_sample = df_year.sample(frac=sample_fraction, random_state=1)

# Folium map centred on France, with light tiles.
m = folium.Map(location=[46.603354, 1.888334], zoom_start=6, tiles='CartoDB positron')

# Markers are grouped into clusters for readability.
marker_cluster = MarkerCluster().add_to(m)

# Extract the coordinates once instead of iterating over DataFrame rows twice.
points = df_sample[['latitude', 'longitude']].values.tolist()

for lat, lon in points:
    folium.Marker(location=[lat, lon]).add_to(marker_cluster)

# Heat-map input: every point gets the same intensity of 1.
heat_data = [[lat, lon, 1] for lat, lon in points]
HeatMap(heat_data, radius=15, blur=10, max_zoom=12, min_opacity=0.4).add_to(m)

# Render the map in Streamlit.
# NOTE(review): `st_folium` is not imported in the visible source; it
# presumably comes from the `streamlit_folium` package -- add the import.
st_folium(m, width=800, height=600)

Largeur de bande et services

In [None]:
# Cast the kHz frequency and bandwidth columns to integers.
for khz_column in ('ASS_FRQ_KHZ', 'ASS_LGBD_KHZ'):
    df2[khz_column] = df2[khz_column].astype(int)

# Same quantities divided by 1000 (MHz columns).
df2['ASS_FRQ_MHZ'] = df2['ASS_FRQ_KHZ'] / 1000
df2['ASS_LGBD_MHZ'] = df2['ASS_LGBD_KHZ'] / 1000

# End frequency: frequency plus bandwidth.
df2['END_FRQ_MHZ'] = df2['ASS_FRQ_MHZ'] + df2['ASS_LGBD_MHZ']

# Aggregate per year, service ('Code CAF') and assignee ('BASE'): number of
# rows and summed bandwidth, with clearer column names.
# NOTE(review): 'BASE' is dropped by an earlier cell -- confirm the cell order.
df2['Year'] = df2['Date CAF'].dt.year
grouped = (
    df2.groupby(['Year', 'Code CAF', 'BASE'])
    .agg(
        Num_Bands=('ASS_FRQ_MHZ', 'count'),
        Total_Bandwidth_MHz=('ASS_LGBD_MHZ', 'sum'),
    )
    .reset_index()
    .rename(columns={'Code CAF': 'Service', 'BASE': 'Affectataire'})
)

# Stacked bars: total bandwidth per year, coloured by service.
fig_bar = px.bar(
    grouped,
    x='Year',
    y='Total_Bandwidth_MHz',
    color='Service',
    barmode='stack',
    hover_data=['Affectataire', 'Num_Bands'],
    title='Largeur de Bande Totale par Année et par Service/Affectataire',
)
fig_bar.update_layout(xaxis=dict(tickformat='d'), yaxis=dict(tickformat=','))
st.plotly_chart(fig_bar)

# Pie chart: share of the total bandwidth per service.
fig_pie = px.pie(
    grouped,
    values='Total_Bandwidth_MHz',
    names='Service',
    title='Répartition de la Largeur de Bande par Service',
)
st.plotly_chart(fig_pie)


Machine learning

In [None]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Numerical columns, and categorical columns that must be label-encoded first.
numerical_cols = ['ASS_FRQ_KHZ', 'ASS_LGBD_KHZ',
                  'latitude', 'longitude']
categorical_cols = ['Code CAF']

# Label-encode each categorical column on a copy of df2.
label_encoder = LabelEncoder()
df_encoded = df2.copy()
for col in categorical_cols:
    encoded_values = label_encoder.fit_transform(df_encoded[col])
    df_encoded[col] = encoded_values

# Standardize on a second copy. The same scaler object is re-fitted for each
# group of columns, so the groups are scaled independently of one another.
scaler = StandardScaler()
df_standardized = df_encoded.copy()
for column_group in (numerical_cols, categorical_cols):
    df_standardized[column_group] = scaler.fit_transform(df_standardized[column_group])
df_standardized['Date CAF'] = scaler.fit_transform(df_standardized[['Date CAF']])

# Manual rescaling of two columns after standardization.
# NOTE(review): the constants 3, 2 and 1.5 look like hand-tuned feature
# weights / offsets -- confirm their rationale.
df_standardized['Code CAF'] = df_standardized['Code CAF'] / 3
df_standardized['Date CAF'] = df_standardized['Date CAF'] / 2 + 1.5

In [None]:
# Box plot of every column of the standardized frame, drawn on a 12x8 figure,
# to compare the spread of the columns after scaling.
df_standardized.boxplot(figsize=(12, 8))

Matrice de corrélation

In [None]:
# Correlation matrix of the standardized data, drawn as an annotated heat map.
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(10, 8))
correlation_matrix = df_standardized.corr()
heatmap_style = dict(annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
sns.heatmap(correlation_matrix, **heatmap_style)

KMeans

In [None]:
from sklearn.cluster import MiniBatchKMeans

# Fit one model per candidate number of clusters (1 to 10) and record its
# inertia (within-cluster sum of squares).
cluster_counts = range(1, 11)
wcss = []
for n_clusters in cluster_counts:
    kmeans = MiniBatchKMeans(n_clusters=n_clusters, random_state=42).fit(df_standardized)
    wcss.append(kmeans.inertia_)

# Elbow plot: inertia against the number of clusters.
plt.figure(figsize=(10, 6))
plt.plot(cluster_counts, wcss, marker='o', linestyle='--')
plt.title('Méthode du Coude pour K-Means')
plt.xlabel('Nombre de Clusters')
plt.ylabel('WCSS (Inertia)')
plt.show()


In [None]:
from sklearn.cluster import KMeans

# Evaluated but not displayed: only the last expression of a cell is shown.
df_standardized.head()

# K-Means with 5 clusters and a fixed seed, fitted on the standardized frame.
kmeans_settings = {'n_clusters': 5, 'random_state': 42}
kmeans = KMeans(**kmeans_settings)
kmeans.fit(df_standardized)

In [None]:
# Attach each row's cluster label, then draw one box plot per attribute,
# grouped by cluster.
cluster_labels = kmeans.labels_
df_standardized['cluster'] = cluster_labels
df_standardized.boxplot(figsize=(12, 8), by='cluster')


In [None]:
# Scatter plot of frequency against CAF date, coloured by 'Code CAF'.
import plotly.express as px

fig = px.scatter(
    data_frame=df2,
    x='Date CAF',
    y='ASS_FRQ_KHZ',
    color="Code CAF",
)
fig.show()

In [None]:
# Scatter plot of frequency against CAF date, coloured by K-Means cluster label.
import plotly.express as px

cluster_labels = kmeans.labels_
fig = px.scatter(
    data_frame=df2,
    x='Date CAF',
    y='ASS_FRQ_KHZ',
    color=cluster_labels,
)
fig.show()

KNN

In [None]:
# KNN classification of 'Code CAF'
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Features come from the standardized frame, the target from the
# un-standardized df2.
knn_features = ['ASS_FRQ_KHZ', 'ASS_LGBD_KHZ', 'longitude', 'latitude', 'Date CAF']
X = df_standardized[knn_features]
y = df2['Code CAF']

# 80 % / 20 % train / test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 3-nearest-neighbours classifier.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Accuracy on the held-out set.
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

# F1 score, averaged with class-support weights.
f1 = f1_score(y_test, y_pred, average='weighted')
print('F1 Score:', f1)

# Confusion matrix of true against predicted labels.
confusion = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:\n', confusion)



Random Forest

In [None]:
# Random forest regression to predict the frequency
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Features come from the standardized frame, the target from the
# un-standardized df2.
rf_features = ['ASS_LGBD_KHZ', 'longitude', 'latitude', 'Date CAF']
X = df_standardized[rf_features]
y = df2['ASS_FRQ_KHZ']
n_features = X.shape[1]

# 80 % / 20 % train / test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Forest of 100 trees.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Mean squared error on the held-out set.
y_pred = rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print('Mean Squared Error:', mse)

# Feature importances, their spread across the individual trees, and the
# feature positions ordered from most to least important.
importances = rf.feature_importances_
per_tree_importances = [tree.feature_importances_ for tree in rf.estimators_]
std = np.std(per_tree_importances, axis=0)
indices = np.argsort(importances)[::-1]

# Print the ranking (features are identified by their position in X).
print("Feature ranking:")
for rank, feature_idx in enumerate(indices, start=1):
    print("%d. feature %d (%f)" % (rank, feature_idx, importances[feature_idx]))

# Bar chart of the importances with the per-tree standard deviation as error bars.
bar_positions = range(n_features)
plt.figure()
plt.title("Feature importances")
plt.bar(bar_positions, importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(bar_positions, indices)
plt.xlim([-1, n_features])
plt.show()

# Predicted values against true values.
plt.scatter(y_test, y_pred)
plt.xlabel('True Values')
plt.ylabel('Predictions')
plt.show()
