In [6]:
import pandas as pd

# Read data from Excel file with header starting from the second row
df = pd.read_excel('Dummy Customer Compliant Data.xlsx', header=1)

# Assuming your date column is named 'source creation date'
# Convert the 'source creation date' column to datetime format
df['source creation date'] = pd.to_datetime(df['SOURCE CREATION DATE'], format='%d/%m/%Y')

# Extract year, month, and day into separate columns
df['year'] = df['source creation date'].dt.year
df['month'] = df['source creation date'].dt.month
df['day'] = df['source creation date'].dt.day

# Print the DataFrame with separated columns
print(df)



       INCIDENT N°     PROTECTIONS  \
0        212472461             NaN   
1        212472456             NaN   
2        212467991             NaN   
3        212468029             NaN   
4        212465847             NaN   
...            ...             ...   
10665    128458648             NaN   
10666    128458565             NaN   
10667    128441430             NaN   
10668    128407887  LUP Q000331834   
10669    128405012             NaN   

                                      ASSIGNED TO TOPICS  \
0                                                    NaN   
1                                                    NaN   
2                                                    NaN   
3                                                    NaN   
4                                                    NaN   
...                                                  ...   
10665                                                NaN   
10666                                     SFE-B002222259   
106

In [7]:
print(df.columns)

Index(['INCIDENT N°', 'PROTECTIONS', 'ASSIGNED TO TOPICS',
       'TOPICS OF THE OPERATION', 'STATE', 'TRANSFERED FROM', 'TRANSFERRED TO',
       'Dummy VIN', 'GI COMMENT', 'TAGS',
       ...
       'FIRST DETECTION', 'LAST DETECTION', 'LAST UPDATE OF DETECTION PILOTE',
       'LAST UPDATE OF INCIDENT PILOTE', 'KEYWORDS', 'DETECTION QUERY',
       'source creation date', 'year', 'month', 'day'],
      dtype='object', length=141)


In [8]:
# Assuming your DataFrame is named 'df'
# Fill NaN values in 'year', 'month', and 'day' columns with zeros
df['year'] = df['year'].fillna(0).astype(int)
df['month'] = df['month'].fillna(0).astype(int)
df['day'] = df['day'].fillna(0).astype(int)

# Print the DataFrame to verify changes
print(df)


       INCIDENT N°     PROTECTIONS  \
0        212472461             NaN   
1        212472456             NaN   
2        212467991             NaN   
3        212468029             NaN   
4        212465847             NaN   
...            ...             ...   
10665    128458648             NaN   
10666    128458565             NaN   
10667    128441430             NaN   
10668    128407887  LUP Q000331834   
10669    128405012             NaN   

                                      ASSIGNED TO TOPICS  \
0                                                    NaN   
1                                                    NaN   
2                                                    NaN   
3                                                    NaN   
4                                                    NaN   
...                                                  ...   
10665                                                NaN   
10666                                     SFE-B002222259   
106

In [9]:
import pandas as pd

# Create a DataFrame with all months (1 to 12)
df = pd.DataFrame({'month': range(1, 13)})

# Define a function to map numerical month values to seasons
def map_season(month):
    if month in range(2, 7):
        return 'Summer'
    elif month in [10, 11, 12, 1]:
        return 'Monsoon'
    else:
        return 'Wind'

# Apply the function to create a new 'season' column
df['season'] = df['month'].apply(map_season)

# Print the DataFrame with 'season' column
print(df)


    month   season
0       1  Monsoon
1       2   Summer
2       3   Summer
3       4   Summer
4       5   Summer
5       6   Summer
6       7     Wind
7       8     Wind
8       9     Wind
9      10  Monsoon
10     11  Monsoon
11     12  Monsoon


In [15]:

print(df.columns)


Index(['INCIDENT N°', 'PROTECTIONS', 'ASSIGNED TO TOPICS',
       'TOPICS OF THE OPERATION', 'STATE', 'TRANSFERED FROM', 'TRANSFERRED TO',
       'Dummy VIN', 'GI COMMENT', 'TAGS',
       ...
       'BLOCKING FOR TEST ?', 'ALGO', 'DETECTION PILOT', 'INCIDENT PILOT',
       'FIRST DETECTION', 'LAST DETECTION', 'LAST UPDATE OF DETECTION PILOTE',
       'LAST UPDATE OF INCIDENT PILOTE', 'KEYWORDS', 'DETECTION QUERY'],
      dtype='object', length=137)


In [24]:
import pandas as pd

# Read data from Excel file with header starting from the second row
df = pd.read_excel('Dummy Customer Compliant Data.xlsx', header=1)
df['PIMOF'] = df['PIMOF'].replace({'yes': 1, 'no': 0})
# Assuming your date column is named 'source creation date'
# Convert the 'source creation date' column to datetime format
df['source creation date'] = pd.to_datetime(df['SOURCE CREATION DATE'], format='%d/%m/%Y')

# Extract year, month, and day into separate columns
df['year'] = df['source creation date'].dt.year
df['month'] = df['source creation date'].dt.month
df['day'] = df['source creation date'].dt.day

# Concatenate the new columns with the original DataFrame
df = pd.concat([df[['day', 'month', 'year']], df], axis=1)

# Print the DataFrame with separated columns
print(df.columns)

Index(['day', 'month', 'year', 'INCIDENT N°', 'PROTECTIONS',
       'ASSIGNED TO TOPICS', 'TOPICS OF THE OPERATION', 'STATE',
       'TRANSFERED FROM', 'TRANSFERRED TO',
       ...
       'FIRST DETECTION', 'LAST DETECTION', 'LAST UPDATE OF DETECTION PILOTE',
       'LAST UPDATE OF INCIDENT PILOTE', 'KEYWORDS', 'DETECTION QUERY',
       'source creation date', 'year', 'month', 'day'],
      dtype='object', length=144)


In [32]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Assuming you have already loaded the DataFrame and added the 'year', 'month', 'day', and 'season' columns
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Encode categorical variables to numeric values
X['PIMOF'] = label_encoder.fit_transform(X['PIMOF'])

# Scaling the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)





# Selecting columns for clustering
X = df[['INCIDENT N°', 'PIMOF', 'MIS', 'MILEAGE (KM)', 'day', 'month', 'year']]

# Ensure 'day', 'month', and 'year' columns are numeric
X['day'] = pd.to_numeric(X['day'], errors='coerce')
X['month'] = pd.to_numeric(X['month'], errors='coerce')
X['year'] = pd.to_numeric(X['year'], errors='coerce')

# Handling missing values
X.fillna(0, inplace=True)  # Replace missing values with 0 or any other appropriate value

# Scaling the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


# Selecting columns for clustering
X = df[['INCIDENT N°', 'PIMOF', 'MIS', 'MILEAGE (KM)', 'SOURCE CREATION DATE']]

# Handling missing values
X.fillna(0, inplace=True)  # Fill missing values with 0, assuming missing values indicate absence of information

# Scaling the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Initialize KMeans clustering with a chosen number of clusters
kmeans = KMeans(n_clusters=3)

# Fit KMeans clustering to the scaled data
kmeans.fit(X_scaled)

# Assign cluster labels to the original DataFrame
df['cluster_label'] = kmeans.labels_

# Print the DataFrame with cluster labels
print(df)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['PIMOF'] = label_encoder.fit_transform(X['PIMOF'])


ValueError: could not convert string to float: '21/01/2024'

In [34]:
print(df.head())



   INCIDENT N° PROTECTIONS ASSIGNED TO TOPICS TOPICS OF THE OPERATION STATE  \
0    212472461         NaN                NaN                     NaN     -   
1    212472456         NaN                NaN                     NaN     -   
2    212467991         NaN                NaN                     NaN     -   
3    212468029         NaN                NaN                     NaN     -   
4    212465847         NaN                NaN                     NaN     -   

   TRANSFERED FROM  TRANSFERRED TO  Dummy VIN GI COMMENT TAGS  ...  \
0              NaN             NaN          1        NaN  NaN  ...   
1              NaN             NaN          2        NaN  NaN  ...   
2              NaN             NaN          3        NaN  NaN  ...   
3              NaN             NaN          4        NaN  NaN  ...   
4              NaN             NaN          5        NaN  NaN  ...   

  BLOCKING FOR TEST ? ALGO DETECTION PILOT INCIDENT PILOT FIRST DETECTION  \
0                 NaN  NaN 

In [35]:
# Assuming df is your DataFrame and 'column_name' is the name of the column you want to inspect
column_head = df['SOURCE CREATION DATE'].head()
print(column_head)


0    NaN
1    NaN
2    NaN
3    NaN
4    NaN
Name: SOURCE CREATION DATE, dtype: object


In [38]:
# Convert 'SOURCE CREATION DATE' column to datetime format with specified format
df['SOURCE CREATION DATE'] = pd.to_datetime(df['SOURCE CREATION DATE'], format='%d/%m/%Y', errors='coerce')

# Extract day, month, and year into separate columns
df['day'] = df['SOURCE CREATION DATE'].dt.day
df['month'] = df['SOURCE CREATION DATE'].dt.month
df['year'] = df['SOURCE CREATION DATE'].dt.year

# Display the head of the new columns
print(df[['day', 'month', 'year']].head())
print(df[['day', 'month', 'year']].tail())



   day  month  year
0  NaN    NaN   NaN
1  NaN    NaN   NaN
2  NaN    NaN   NaN
3  NaN    NaN   NaN
4  NaN    NaN   NaN
        day  month    year
10665   4.0    1.0  2023.0
10666   NaN    NaN     NaN
10667   NaN    NaN     NaN
10668  28.0    3.0  2023.0
10669   4.0   11.0  2023.0


In [40]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Assuming df is your DataFrame with the specified columns
# Drop rows with NaN values as KMeans cannot handle them
df = df.dropna(subset=['day', 'month', 'year', 'INCIDENT N°', 'PIMOF', 'MIS', 'MILEAGE (KM)'])



from sklearn.preprocessing import LabelEncoder

# Label encode the 'PIMOF' column
label_encoder = LabelEncoder()
df['PIMOF'] = label_encoder.fit_transform(df['PIMOF'])

# Now you can proceed with scaling and clustering

# Selecting the columns for clustering
X = df[['INCIDENT N°', 'PIMOF', 'MIS', 'MILEAGE (KM)', 'day', 'month', 'year']]

# Scaling the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Initialize KMeans clustering with a chosen number of clusters
kmeans = KMeans(n_clusters=3)

# Fit KMeans clustering model to the data
kmeans.fit(X_scaled)

# Get cluster labels
cluster_labels = kmeans.labels_

# Add cluster labels to the DataFrame
df['cluster'] = cluster_labels

# Display the DataFrame with cluster labels
print(df[['INCIDENT N°', 'PIMOF', 'MIS', 'MILEAGE (KM)', 'day', 'month', 'year', 'cluster']].head())


    INCIDENT N°  PIMOF  MIS  MILEAGE (KM)   day  month    year  cluster
72    212429057      0   12         22636  21.0    1.0  2024.0        2
75    212429018      0   16          4923  18.0    1.0  2024.0        2
80    212420384      0   16          9124  19.0    1.0  2024.0        2
81    212420807      0   12         22191  19.0    1.0  2024.0        2
82    212420285      0    7         21356  18.0    1.0  2024.0        2


In [62]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Initialize an empty list to store silhouette scores
silhouette_scores = []

# Specify the range of clusters to try
max_clusters = 20
for i in range(2, max_clusters + 1):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    kmeans.fit(X_scaled)
    silhouette_scores.append(silhouette_score(X_scaled, kmeans.labels_))

# Find the index of the maximum silhouette score
optimal_index = silhouette_scores.index(max(silhouette_scores))

# Print silhouette scores for all clusters
for i, score in enumerate(silhouette_scores, start=2):
    print(f"Silhouette score for {i} clusters: {score}")

# Print the optimal number of clusters
optimal_clusters = optimal_index + 2  # Adding 2 because range starts from 2 clusters
print("Optimal number of clusters:", optimal_clusters)



Silhouette score for 2 clusters: 0.2304901607683982
Silhouette score for 3 clusters: 0.269564261551279
Silhouette score for 4 clusters: 0.3027371250700183
Silhouette score for 5 clusters: 0.2543481782077171
Silhouette score for 6 clusters: 0.2546911775492127
Silhouette score for 7 clusters: 0.26696702471488565
Silhouette score for 8 clusters: 0.2693973400465679
Silhouette score for 9 clusters: 0.27912564244759264
Silhouette score for 10 clusters: 0.2875960134021934
Silhouette score for 11 clusters: 0.29594068925571415
Silhouette score for 12 clusters: 0.28903570840256
Silhouette score for 13 clusters: 0.28566269130603666
Silhouette score for 14 clusters: 0.2707407457702138
Silhouette score for 15 clusters: 0.2672693626279463
Silhouette score for 16 clusters: 0.27272360093144893
Silhouette score for 17 clusters: 0.2693735169048412
Silhouette score for 18 clusters: 0.2741644330132871
Silhouette score for 19 clusters: 0.27410486514310456
Silhouette score for 20 clusters: 0.270346335665218

In [52]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Select columns for clustering
selected_columns = ['INCIDENT N°', 'PIMOF', 'MIS', 'MILEAGE (KM)', 'day', 'month', 'year']

# Subset the dataframe with selected columns
X = df[selected_columns]

# Drop rows with missing values
X.dropna(inplace=True)

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Initialize KMeans clustering
kmeans = KMeans(n_clusters=4, random_state=42)

# Fit the model to the scaled data
kmeans.fit(X_scaled)

# Get cluster labels
cluster_labels = kmeans.labels_

# Calculate silhouette score
silhouette_avg = silhouette_score(X_scaled, cluster_labels)
print("Silhouette Score:", silhouette_avg)

# Display cluster labels
print("Cluster Labels:")
print(cluster_labels)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.dropna(inplace=True)


Silhouette Score: 0.3027371250700183
Cluster Labels:
[1 1 1 ... 3 0 3]


In [56]:
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, confusion_matrix

# Assuming df is your DataFrame containing the data
# Selecting columns of interest
columns_of_interest = ['INCIDENT N°', 'PIMOF', 'MIS', 'MILEAGE (KM)', 'day', 'month', 'year']
X = df[columns_of_interest]

# Preprocessing the data
# Handling missing values (you can use other methods depending on your data)
X.fillna(0, inplace=True)  # Filling missing values with 0 for simplicity

# Scaling numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Applying DBSCAN clustering
eps = 0.5  # Adjust as needed
min_samples = 5  # Adjust as needed
dbscan = DBSCAN(eps=eps, min_samples=min_samples)
cluster_labels = dbscan.fit_predict(X_scaled)

# Evaluate clustering performance
silhouette_avg = silhouette_score(X_scaled, cluster_labels)
print("DBSCAN Silhouette Score:", silhouette_avg)

# Display confusion matrix
# Since DBSCAN does not require specifying the number of clusters, we'll use cluster labels directly
conf_matrix = confusion_matrix(cluster_labels, cluster_labels)
print("Confusion Matrix:")
print(conf_matrix)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(0, inplace=True)  # Filling missing values with 0 for simplicity


DBSCAN Silhouette Score: 0.06624978727494457
Confusion Matrix:
[[666   0   0 ...   0   0   0]
 [  0  77   0 ...   0   0   0]
 [  0   0   7 ...   0   0   0]
 ...
 [  0   0   0 ...   6   0   0]
 [  0   0   0 ...   0   5   0]
 [  0   0   0 ...   0   0   9]]


In [59]:
import numpy as np
from sklearn.decomposition import PCA

# Assuming X contains your data

# Initialize PCA
pca = PCA()

# Fit PCA
pca.fit(X)

# Find the number of components explaining a desired percentage of variance
desired_variance = 0.95  # Adjust as needed
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
n_components = np.argmax(cumulative_variance >= desired_variance) + 1
print(n_components)



1


In [61]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, confusion_matrix

# Assuming X contains your data with columns 'INCIDENT N°', 'PIMOF', 'MIS', 'MILEAGE (KM)', 'day', 'month', and 'year'

# Selecting columns for PCA
X_pca = X[['INCIDENT N°', 'PIMOF', 'MIS', 'MILEAGE (KM)', 'day', 'month', 'year']]

# Handling missing values if any
X_pca = X_pca.dropna()

# Standardizing the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_pca)

# Initialize PCA
pca = PCA()

# Fit PCA
pca.fit(X_scaled)

# Determine the number of components to explain 95% of the variance
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
n_components = np.argmax(cumulative_variance >= 0.95) + 1

# Reduce dimensionality to the determined number of components
pca = PCA(n_components=n_components)
X_pca_transformed = pca.fit_transform(X_scaled)

# Perform clustering on the PCA-transformed data
kmeans = KMeans(n_clusters=2, random_state=42)
cluster_labels = kmeans.fit_predict(X_pca_transformed)

# Evaluate clustering performance
silhouette_avg = silhouette_score(X_pca_transformed, cluster_labels)
print("PCA Silhouette Score:", silhouette_avg)

# Print confusion matrix
y_true = [0] * len(X_pca_transformed)  # Assuming we don't have ground truth labels
conf_matrix = confusion_matrix(y_true, cluster_labels)
print("Confusion Matrix:")
print(conf_matrix)



PCA Silhouette Score: 0.23049016076840076
Confusion Matrix:
[[3995 4518]
 [   0    0]]


In [63]:
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, confusion_matrix

# Assuming X contains your data with columns 'INCIDENT N°', 'PIMOF', 'MIS', 'MILEAGE (KM)', 'day', 'month', and 'year'

# Selecting columns for t-SNE
X_tsne = X[['INCIDENT N°', 'PIMOF', 'MIS', 'MILEAGE (KM)', 'day', 'month', 'year']]

# Handling missing values if any
X_tsne = X_tsne.dropna()

# Standardizing the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_tsne)

# Initialize t-SNE
tsne = TSNE(random_state=42)

# Fit t-SNE
X_tsne_transformed = tsne.fit_transform(X_scaled)

# Perform clustering on the t-SNE-transformed data
kmeans = KMeans(n_clusters=4, random_state=42)
cluster_labels = kmeans.fit_predict(X_tsne_transformed)

# Evaluate clustering performance
silhouette_avg = silhouette_score(X_tsne_transformed, cluster_labels)
print("t-SNE Silhouette Score:", silhouette_avg)

# Print confusion matrix
y_true = [0] * len(X_tsne_transformed)  # Assuming we don't have ground truth labels
conf_matrix = confusion_matrix(y_true, cluster_labels)
print("Confusion Matrix:")
print(conf_matrix)




t-SNE Silhouette Score: 0.38877115
Confusion Matrix:
[[2317 1171 2554 2471]
 [   0    0    0    0]
 [   0    0    0    0]
 [   0    0    0    0]]


In [68]:
pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [69]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, confusion_matrix
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

# Assuming X contains your data with columns 'INCIDENT N°', 'PIMOF', 'MIS', 'MILEAGE (KM)', 'day', 'month', and 'year'

# Selecting columns for autoencoder
X_autoencoder = X[['INCIDENT N°', 'PIMOF', 'MIS', 'MILEAGE (KM)', 'day', 'month', 'year']]

# Handling missing values if any
X_autoencoder = X_autoencoder.dropna()

# Standardizing the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_autoencoder)

# Define the autoencoder model
input_dim = X_autoencoder.shape[1]
encoding_dim = 3  # Adjust this number as needed
input_layer = Input(shape=(input_dim,))
encoded = Dense(encoding_dim, activation='relu')(input_layer)
decoded = Dense(input_dim, activation='sigmoid')(encoded)
autoencoder = Model(input_layer, decoded)

# Compile the model
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# Fit the autoencoder
autoencoder.fit(X_scaled, X_scaled, epochs=10, batch_size=32, shuffle=True, validation_split=0.2)

# Get the encoded representations of the data
encoder = Model(input_layer, encoded)
X_encoded = encoder.predict(X_scaled)

# Perform clustering on the encoded data
kmeans = KMeans(n_clusters=2, random_state=42)
cluster_labels = kmeans.fit_predict(X_encoded)

# Evaluate clustering performance
silhouette_avg = silhouette_score(X_encoded, cluster_labels)
print("Autoencoder Silhouette Score:", silhouette_avg)

# Print confusion matrix
y_true = [0] * len(X_encoded)  # Assuming we don't have ground truth labels
conf_matrix = confusion_matrix(y_true, cluster_labels)
print("Confusion Matrix:")
print(conf_matrix)


RuntimeError: module compiled against API version 0x10 but this version of numpy is 0xe

RuntimeError: module compiled against API version 0x10 but this version of numpy is 0xe

RuntimeError: module compiled against API version 0x10 but this version of numpy is 0xe

SystemError: initialization of _pywrap_checkpoint_reader raised unreported exception

In [77]:
import pandas as pd

# Read data from Excel file
data = pd.read_excel('Dummy Customer Compliant Data.xlsx',header=1)

# Convert 'SOURCE CREATION DATE' to datetime format and extract day, month, and year
data['SOURCE CREATION DATE'] = pd.to_datetime(data['SOURCE CREATION DATE'], format='%d/%m/%Y')
data['day'] = data['SOURCE CREATION DATE'].dt.day
data['month'] = data['SOURCE CREATION DATE'].dt.month
data['year'] = data['SOURCE CREATION DATE'].dt.year

# Now 'day', 'month', and 'year' columns are added to your DataFrame
print(data.columns)





Index(['INCIDENT N°', 'PROTECTIONS', 'ASSIGNED TO TOPICS',
       'TOPICS OF THE OPERATION', 'STATE', 'TRANSFERED FROM', 'TRANSFERRED TO',
       'Dummy VIN', 'GI COMMENT', 'TAGS',
       ...
       'INCIDENT PILOT', 'FIRST DETECTION', 'LAST DETECTION',
       'LAST UPDATE OF DETECTION PILOTE', 'LAST UPDATE OF INCIDENT PILOTE',
       'KEYWORDS', 'DETECTION QUERY', 'day', 'month', 'year'],
      dtype='object', length=140)


In [80]:
# Convert 'SOURCE CREATION DATE' column to datetime format with specified format
df['SOURCE CREATION DATE'] = pd.to_datetime(df['SOURCE CREATION DATE'], format='%d/%m/%Y', errors='coerce')

# Extract day, month, and year into separate columns
df['day'] = df['SOURCE CREATION DATE'].dt.day
df['month'] = df['SOURCE CREATION DATE'].dt.month
df['year'] = df['SOURCE CREATION DATE'].dt.year

# Display the head of the new columns
print(df[['day', 'month', 'year']].head())
print(df[['day', 'month', 'year']].tail())




    day  month  year
72   21      1  2024
75   18      1  2024
80   19      1  2024
81   19      1  2024
82   18      1  2024
       day  month  year
10660    4      1  2023
10664    4      1  2023
10665    4      1  2023
10668   28      3  2023
10669    4     11  2023


In [82]:
import pandas as pd

# Assuming df is your DataFrame with the existing columns
# Assuming you have already created 'day', 'month', and 'year' columns from 'SOURCE CREATION DATE'

# Concatenate the new columns with the existing DataFrame
df = pd.concat([df[['INCIDENT N°', 'PIMOF', 'MIS', 'MILEAGE (KM)']], df[['day', 'month', 'year']]], axis=1)

# Now df contains all the original columns along with the new 'day', 'month', and 'year' columns
print(df.head())  # Check the DataFrame to verify the concatenation


    INCIDENT N°  PIMOF  MIS  MILEAGE (KM)  day  month  year
72    212429057      0   12         22636   21      1  2024
75    212429018      0   16          4923   18      1  2024
80    212420384      0   16          9124   19      1  2024
81    212420807      0   12         22191   19      1  2024
82    212420285      0    7         21356   18      1  2024


In [87]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Load the data
data = pd.read_excel('Dummy Customer Compliant Data.xlsx', header=1)

# Select relevant columns
selected_columns = ['INCIDENT N°', 'PIMOF', 'MIS', 'MILEAGE (KM)', 'day', 'month', 'year']
X = data[selected_columns]

# Handle missing values if any
# For example, you can fill missing numerical values with mean and categorical values with mode
X.fillna(X.mean(), inplace=True)

# Train-test split
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

# Normalize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the architecture of the VAE model
class VAE(tf.keras.Model):
    def __init__(self, latent_dim):
        super(VAE, self).__init__()
        self.latent_dim = latent_dim
        self.encoder = tf.keras.Sequential([
            tf.keras.layers.Input(shape=(X_train_scaled.shape[1],)),
            tf.keras.layers.Dense(128, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(32, activation='relu'),
            tf.keras.layers.Dense(latent_dim + latent_dim),
        ])
        self.decoder = tf.keras.Sequential([
            tf.keras.layers.Input(shape=(latent_dim,)),
            tf.keras.layers.Dense(32, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(128, activation='relu'),
            tf.keras.layers.Dense(X_train_scaled.shape[1]),
        ])

    def sample(self, eps=None):
        if eps is None:
            eps = tf.random.normal(shape=(100, self.latent_dim))
        return self.decode(eps, apply_sigmoid=True)

    def encode(self, x):
        mean, logvar = tf.split(self.encoder(x), num_or_size_splits=2, axis=1)
        return mean, logvar

    def reparameterize(self, mean, logvar):
        eps = tf.random.normal(shape=mean.shape)
        return eps * tf.exp(logvar * .5) + mean

    def decode(self, z, apply_sigmoid=False):
        logits = self.decoder(z)
        if apply_sigmoid:
            probs = tf.sigmoid(logits)
            return probs
        return logits

# Define the loss function
def vae_loss(x, x_recon, mean, logvar):
    reconstruction_loss = tf.reduce_mean(tf.square(x - x_recon))
    kl_loss = -0.5 * tf.reduce_sum(1 + logvar - tf.square(mean) - tf.exp(logvar), axis=1)
    return reconstruction_loss + kl_loss

# Instantiate the VAE model
latent_dim = 2
vae = VAE(latent_dim)

# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
vae.compile(optimizer=optimizer, loss=vae_loss)

# Train the model
history = vae.fit(X_train_scaled, X_train_scaled, epochs=100, batch_size=32, validation_data=(X_test_scaled, X_test_scaled))

# Reconstruction error
X_pred = vae.predict(X_test_scaled)
reconstruction_error = np.mean(np.square(X_test_scaled - X_pred))
print("Reconstruction Error:", reconstruction_error)


RuntimeError: module compiled against API version 0x10 but this version of numpy is 0xe

SystemError: initialization of _pywrap_checkpoint_reader raised unreported exception

In [88]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LeakyReLU
from tensorflow.keras.optimizers import Adam

# Load data
data = pd.read_excel('Dummy Customer Compliant Data.xlsx', header=1)

# Select columns
selected_columns = ['INCIDENT N°', 'PIMOF', 'MIS', 'MILEAGE (KM)', 'day', 'month', 'year']
X = data[selected_columns]

# Handle missing values
X.fillna(X.mean(), inplace=True)

# Standardize data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define GAN architecture
generator = Sequential([
    Dense(128, input_shape=(latent_dim,), activation='relu'),
    Dense(64, activation='relu'),
    Dense(X_scaled.shape[1], activation='linear')  # Output layer
])

discriminator = Sequential([
    Dense(64, input_shape=(X_scaled.shape[1],), activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # Output layer
])

# Compile discriminator
discriminator.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

# Freeze discriminator layers during generator training
discriminator.trainable = False

# Combined model
gan_input = Input(shape=(latent_dim,))
gan_output = discriminator(generator(gan_input))
gan = Model(gan_input, gan_output)
gan.compile(optimizer=Adam(), loss='binary_crossentropy')

# Training parameters
batch_size = 32
epochs = 1000
latent_dim = 100

# Training loop
for epoch in range(epochs):
    noise = np.random.normal(0, 1, size=(batch_size, latent_dim))
    fake_data = generator.predict(noise)
    real_data = X_scaled[np.random.randint(0, X_scaled.shape[0], size=batch_size)]

    X_combined = np.concatenate([real_data, fake_data])
    y_combined = np.concatenate([np.ones((batch_size, 1)), np.zeros((batch_size, 1))])

    discriminator_loss = discriminator.train_on_batch(X_combined, y_combined)

    noise = np.random.normal(0, 1, size=(batch_size, latent_dim))
    y_gen = np.ones((batch_size, 1))

    generator_loss = gan.train_on_batch(noise, y_gen)

    if epoch % 100 == 0:
        print(f'Epoch: {epoch}, Discriminator Loss: {discriminator_loss}, Generator Loss: {generator_loss}')

# Evaluate the GAN
# You can generate samples using the generator and evaluate them based on your specific task

# Note: GANs for tabular data are not commonly used and may not perform well. Consider other approaches for tabular data generation.


RuntimeError: module compiled against API version 0x10 but this version of numpy is 0xe

SystemError: initialization of _pywrap_checkpoint_reader raised unreported exception

In [1]:
import pandas as pd

# Read data from Excel file
data = pd.read_excel('Dummy Customer Compliant Data.xlsx', header=1)

# Select columns of interest
columns_of_interest = ['INCIDENT N°', 'PIMOF', 'MIS', 'MILEAGE (KM)']

# Compute correlation matrix
correlation_matrix = data[columns_of_interest].corr()

# Display correlation matrix
print("Correlation Matrix:")
print(correlation_matrix)


Correlation Matrix:
              INCIDENT N°       MIS  MILEAGE (KM)
INCIDENT N°      1.000000  0.370103      0.177626
MIS              0.370103  1.000000      0.506810
MILEAGE (KM)     0.177626  0.506810      1.000000


In [8]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Read data from Excel file
data = pd.read_excel('Dummy Customer Compliant Data.xlsx', header=1)

# Select columns of interest
columns_of_interest = ['PIMOF', 'MIS', 'MILEAGE (KM)']

# Drop rows with missing values in any of the selected columns
data.dropna(subset=columns_of_interest, inplace=True)

# Convert non-numeric columns to numeric using LabelEncoder
label_encoders = {}
for column in columns_of_interest:
    if data[column].dtype == 'object':
        label_encoders[column] = LabelEncoder()
        data[column] = label_encoders[column].fit_transform(data[column])

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(data[columns_of_interest])

# Apply PCA
pca = PCA()
pca.fit(X_scaled)

# Get the explained variance ratio
explained_variance_ratio = pca.explained_variance_ratio_

print("Explained variance ratio:", explained_variance_ratio)


Explained variance ratio: [0.50235928 0.33329224 0.16434848]


In [12]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Read data from Excel file
data = pd.read_excel('Dummy Customer Compliant Data.xlsx', header=1)

# Select columns of interest
columns_of_interest = ['PIMOF', 'MIS', 'MILEAGE (KM)']

# Drop rows with missing values in any of the selected columns
data.dropna(subset=columns_of_interest, inplace=True)

# Convert non-numeric columns to numeric using LabelEncoder
label_encoders = {}
for column in columns_of_interest:
    if data[column].dtype == 'object':
        label_encoders[column] = LabelEncoder()
        data[column] = label_encoders[column].fit_transform(data[column])

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(data[columns_of_interest])

# Apply PCA
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Get the explained variance ratio
explained_variance_ratio = pca.explained_variance_ratio_

print("Explained variance ratio:", explained_variance_ratio)

# Calculate reconstruction error
X_reconstructed = pca.inverse_transform(X_pca)
reconstruction_error = ((X_scaled - X_reconstructed) ** 2).mean()

print("Reconstruction Error:", reconstruction_error)

# Get the PCA components
components = pd.DataFrame(pca.components_, columns=columns_of_interest)

print("PCA Components:")
print(components)


Explained variance ratio: [0.50235928 0.33329224 0.16434848]
Reconstruction Error: 9.856833730090884e-31
PCA Components:
      PIMOF       MIS  MILEAGE (KM)
0  0.022999  0.706782      0.707057
1  0.999592 -0.028224     -0.004302
2 -0.016915 -0.706868      0.707143


In [22]:
import pandas as pd
from itertools import combinations

# Read data from Excel file
data = pd.read_excel('Dummy Customer Compliant Data.xlsx', header=1)

# Select columns of interest
columns_of_interest = ['PIMOF', 'MIS', 'MILEAGE (KM)']

# Drop rows with missing values in any of the selected columns
data.dropna(subset=columns_of_interest, inplace=True)

# Define a function to generate frequent itemsets using the Apriori algorithm
def generate_frequent_itemsets(data, min_support):
    transactions = data.apply(lambda row: set(row.dropna()), axis=1)
    num_transactions = len(transactions)
    frequent_itemsets = {}
    all_items = set(data.columns)
    
    # Generate frequent itemsets of length 1
    candidates = [(item,) for item in all_items]
    frequent_itemsets[1] = {}
    for item in candidates:
        support = sum(1 for transaction in transactions if set(item).issubset(transaction)) / num_transactions
        if support >= min_support:
            frequent_itemsets[1][item] = support
    
    # Generate frequent itemsets of length > 1
    k = 2
    while frequent_itemsets[k - 1]:
        frequent_itemsets[k] = {}
        candidates = set()
        for itemset in frequent_itemsets[k - 1]:
            for item in all_items.difference(set(itemset)):
                candidate = tuple(sorted(list(itemset) + [item]))
                candidates.add(candidate)
        for candidate in candidates:
            support = sum(1 for transaction in transactions if set(candidate).issubset(transaction)) / num_transactions
            if support >= min_support:
                frequent_itemsets[k][candidate] = support
        k += 1
    
    return frequent_itemsets

# Apply Apriori algorithm
min_support = 0.1
frequent_itemsets = generate_frequent_itemsets(data[columns_of_interest], min_support)

# Print scores and matrix
print("Scores for Frequent Itemsets:")
for k, itemsets in frequent_itemsets.items():
    for itemset, support in itemsets.items():
        print(f"Support for {itemset}: {support}")



Scores for Frequent Itemsets:


In [24]:
import pandas as pd
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Read the dataset
data = pd.read_excel('Dummy Customer Compliant Data.xlsx', header=1)

# Select columns of interest
columns_of_interest = ['PIMOF', 'MIS', 'MILEAGE (KM)']

# Drop rows with missing values in the selected columns
data.dropna(subset=columns_of_interest, inplace=True)

# Convert non-numeric columns to numeric using LabelEncoder
label_encoders = {}
for column in columns_of_interest:
    if data[column].dtype == 'object':
        label_encoders[column] = LabelEncoder()
        data[column] = label_encoders[column].fit_transform(data[column])

# Fit Gaussian Mixture Model
gmm = GaussianMixture(n_components=3, random_state=42)
gmm.fit(data[columns_of_interest])

# Print the GMM scores
print("GMM Scores:")
print(gmm.score(data[columns_of_interest]))


GMM Scores:
-7.650043378734142


In [29]:
import pandas as pd
from sklearn.neighbors import KernelDensity
from sklearn.preprocessing import LabelEncoder

# Read data from Excel file
data = pd.read_excel('Dummy Customer Compliant Data.xlsx', header=1)

# Select columns of interest
columns_of_interest = ['PIMOF', 'MIS', 'MILEAGE (KM)']

# Drop rows with missing values in any of the selected columns
data.dropna(subset=columns_of_interest, inplace=True)

# Convert non-numeric columns to numeric using LabelEncoder
label_encoders = {}
for column in columns_of_interest:
    if data[column].dtype == 'object':
        label_encoders[column] = LabelEncoder()
        data[column] = label_encoders[column].fit_transform(data[column])

# Extract numeric values from selected columns
X = data[columns_of_interest].values

# Fit the KDE model to the data
kde = KernelDensity(kernel='gaussian')
kde.fit(X)

# Sample from the KDE model to obtain new features
new_samples = kde.sample(100)  # Adjust the number of samples as needed

print("New samples generated by KDE:")
print(new_samples)



New samples generated by KDE:
[[-8.20026827e-01  9.39581941e+00  1.89239661e+04]
 [ 1.75455063e-01  1.11319580e+01  3.34889031e+03]
 [ 1.36436351e+00  6.53375946e+00  9.01806476e+03]
 [ 1.45130417e+00  1.90343313e+00  3.21153629e+02]
 [ 1.34734074e+00  6.31406346e-01  4.72938853e+03]
 [ 2.49852035e-01  9.91905342e+00  8.32857588e+03]
 [ 1.31818053e+00  1.38594635e+01  2.70659446e+03]
 [ 8.23520212e-01  6.39052763e+00  1.06427542e+04]
 [ 8.07509025e-01  9.21846334e+00  2.42513720e+04]
 [ 5.84760021e-02  1.24981451e+01  1.07730919e+04]
 [-6.52156366e-01  1.21525662e+01  1.00178476e+04]
 [-6.73476623e-01  2.50647238e+00  2.85520250e+03]
 [ 1.42503096e+00  6.82704440e+00  6.21474748e+03]
 [-2.57177315e-01  6.52230624e+00  1.19360020e+04]
 [ 4.63414547e-02  7.18200854e+00  2.19541883e+03]
 [ 1.28427718e+00  6.98187712e+00  1.12506214e+04]
 [-4.24091423e-01  6.13454301e+00  4.44028392e+03]
 [ 1.31324122e+00  4.08086004e+00  1.53796427e+04]
 [-7.63643779e-01 -6.05005546e-02  3.25348007e+01]
 

In [40]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.svm import OneClassSVM

# Read data from Excel file
data = pd.read_excel('Dummy Customer Compliant Data.xlsx', header=1)

# Select columns of interest
columns_of_interest = ['PIMOF', 'MIS', 'MILEAGE (KM)']

# Convert non-numeric columns to numeric using LabelEncoder
label_encoders = {}
for column in columns_of_interest:
    if data[column].dtype == 'object':
        label_encoders[column] = LabelEncoder()
        data[column] = label_encoders[column].fit_transform(data[column])

# Handle missing values
imputer = SimpleImputer(strategy='mean')
data[columns_of_interest] = imputer.fit_transform(data[columns_of_interest])

# Standardize the features
scaler = StandardScaler()
X = scaler.fit_transform(data[columns_of_interest])

# Fit One-Class SVM model
ocsvm = OneClassSVM(kernel='rbf')
ocsvm.fit(X)
# Obtain scores for each sample
scores = ocsvm.decision_function(X)

# Print the scores
print("Scores:")
print(scores)



Scores:
[  19.02836001 -402.84881176 -268.82811488 ... -181.43678002 -951.73076845
  -90.80589951]


In [41]:
# Obtain binary predictions (1 for inliers, -1 for outliers)
predictions = ocsvm.predict(X)

# Print the predictions
print("Predictions:")
print(predictions)


Predictions:
[ 1 -1 -1 ... -1 -1 -1]


In [43]:
import pandas as pd
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Read data from Excel file
data = pd.read_excel('Dummy Customer Compliant Data.xlsx', header=1)

# Select columns of interest
columns_of_interest = ['PIMOF', 'MIS', 'MILEAGE (KM)']

# Drop rows with missing values in any of the selected columns
data.dropna(subset=columns_of_interest, inplace=True)

# Convert non-numeric columns to numeric using LabelEncoder
label_encoders = {}
for column in columns_of_interest:
    if data[column].dtype == 'object':
        label_encoders[column] = LabelEncoder()
        data[column] = label_encoders[column].fit_transform(data[column])

# Preprocessing: Standardize the features
scaler = StandardScaler()
X = scaler.fit_transform(data[columns_of_interest])

# Fit Gaussian Mixture Model (GMM)
gmm = GaussianMixture(n_components=3, random_state=42)  # You can adjust the number of components as needed
gmm.fit(X)

# Get cluster assignments for each sample
labels = gmm.predict(X)

# Print the cluster labels
print("Cluster Labels:")
print(labels)


Cluster Labels:
[0 0 0 ... 1 2 1]


In [44]:
from sklearn.metrics import silhouette_score

# Compute the silhouette score
silhouette_avg = silhouette_score(X, labels)
print("Silhouette Score:", silhouette_avg)


Silhouette Score: 0.4132425300037363


In [61]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

class VariationalAutoencoder(BaseEstimator, TransformerMixin):
    def __init__(self, latent_dim=2):
        self.latent_dim = latent_dim
        self.scaler = StandardScaler()
        self.imputer = SimpleImputer(strategy='mean')
        self.pca = PCA(n_components=latent_dim)
        
    def fit(self, X_train):
        # Preprocess the data
        X_train_imputed = self.imputer.fit_transform(X_train)
        X_train_scaled = self.scaler.fit_transform(X_train_imputed)
        
        # Fit PCA
        self.pca.fit(X_train_scaled)
        
    def transform(self, X):
        # Preprocess the data
        X_imputed = self.imputer.transform(X)
        X_scaled = self.scaler.transform(X_imputed)
        
        # Encode the data
        X_encoded = self.pca.transform(X_scaled)
        
        return X_encoded

# Load data
data = pd.read_excel('Dummy Customer Compliant Data.xlsx', header=1)

# Select columns of interest
columns_of_interest = ['PIMOF', 'MIS', 'MILEAGE (KM)']

# Filter data
data_filtered = data[columns_of_interest]

# Handle non-numeric data
label_encoders = {}
for col in columns_of_interest:
    if data_filtered[col].dtype == 'object':
        label_encoders[col] = LabelEncoder()
        data_filtered[col] = label_encoders[col].fit_transform(data_filtered[col])

# Initialize and fit VAE model
vae = VariationalAutoencoder(latent_dim=2)
vae.fit(data_filtered)

# Transform data
data_encoded = vae.transform(data_filtered)

print("Encoded Data Shape:", data_encoded.shape)


Encoded Data Shape: (10670, 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered[col] = label_encoders[col].fit_transform(data_filtered[col])


In [65]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, MeanShift
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score
from sklearn.mixture import GaussianMixture
from sklearn.svm import OneClassSVM
from sklearn.neighbors import KernelDensity

# Load the data
data = pd.read_excel('Dummy Customer Compliant Data.xlsx', header=1)

# Select columns of interest
columns_of_interest = ['PIMOF', 'MIS', 'MILEAGE (KM)']

# Define preprocessing steps for numeric and non-numeric columns
numeric_features = data[columns_of_interest].select_dtypes(include=['int64', 'float64']).columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_features = data[columns_of_interest].select_dtypes(include=['object']).columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Choose a different strategy for non-numeric data
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Apply preprocessing and imputation
processed_data = preprocessor.fit_transform(data[columns_of_interest])

# Initialize clustering algorithms and dimensionality reduction techniques
algorithms = {
    'K-means': KMeans(n_clusters=3),
    'Hierarchical Clustering': AgglomerativeClustering(n_clusters=3),
    'DBSCAN': DBSCAN(eps=0.5, min_samples=5),
    'Mean Shift Clustering': MeanShift(),
    'Gaussian Mixture Models (GMM)': GaussianMixture(n_components=3),
}

dimensionality_reduction = {
    'PCA': PCA(n_components=2),
    't-SNE': TSNE(n_components=2),
}

# Evaluate clustering algorithms
print("Clustering Algorithms:")
for alg_name, alg in algorithms.items():
    alg.fit(processed_data)
    if hasattr(alg, 'labels_'):
        labels = alg.labels_
    else:
        labels = alg.predict(processed_data)
    score = silhouette_score(processed_data, labels, metric='euclidean')
    print(f"{alg_name} Score:", score)

# Evaluate dimensionality reduction techniques
print("\nDimensionality Reduction Techniques:")
for dim_red_name, dim_red in dimensionality_reduction.items():
    reduced_data = dim_red.fit_transform(processed_data)
    kmeans = KMeans(n_clusters=3)
    kmeans.fit(reduced_data)
    score = silhouette_score(reduced_data, kmeans.labels_, metric='euclidean')
    print(f"{dim_red_name} + K-means Score:", score)

# One-Class SVM
print("\nOne-Class SVM:")
oneclass_svm = OneClassSVM(gamma='auto')
oneclass_svm.fit(processed_data)
print("One-Class SVM Score:", silhouette_score(processed_data, oneclass_svm.predict(processed_data)))

# KDE
print("\nKernel Density Estimation (KDE):")
kde = KernelDensity()
kde.fit(processed_data)
print("KDE Score:", silhouette_score(processed_data, kde.sample(processed_data)))


Clustering Algorithms:
K-means Score: 0.42966827689911663
Hierarchical Clustering Score: 0.3132061510996466
DBSCAN Score: 0.2710561739105903
Mean Shift Clustering Score: 0.26251989513072405
Gaussian Mixture Models (GMM) Score: 0.2763788352221043

Dimensionality Reduction Techniques:
PCA + K-means Score: 0.47302785983313894




t-SNE + K-means Score: 0.35715732

One-Class SVM:
One-Class SVM Score: 0.13126448677130953

Kernel Density Estimation (KDE):


ValueError: maximum supported dimension for an ndarray is 32, found 10670

In [66]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, MeanShift
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score
from sklearn.mixture import GaussianMixture
from sklearn.svm import OneClassSVM
from sklearn.neighbors import KernelDensity

# Load the data
data = pd.read_excel('Dummy Customer Compliant Data.xlsx', header=1)

# Select columns of interest
columns_of_interest = ['PIMOF', 'MIS', 'MILEAGE (KM)']

# Define preprocessing steps for numeric and non-numeric columns
numeric_features = data[columns_of_interest].select_dtypes(include=['int64', 'float64']).columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_features = data[columns_of_interest].select_dtypes(include=['object']).columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Choose a different strategy for non-numeric data
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Apply preprocessing and imputation
processed_data = preprocessor.fit_transform(data[columns_of_interest])

# Initialize clustering algorithms and dimensionality reduction techniques
algorithms = {
    'K-means': KMeans(n_clusters=3),
    'Hierarchical Clustering': AgglomerativeClustering(n_clusters=3),
    'DBSCAN': DBSCAN(eps=0.5, min_samples=5),
    'Mean Shift Clustering': MeanShift(),
    'Gaussian Mixture Models (GMM)': GaussianMixture(n_components=3),
}

dimensionality_reduction = {
    'PCA': PCA(n_components=2),
    't-SNE': TSNE(n_components=2),
}

# Evaluate clustering algorithms
print("Clustering Algorithms:")
for alg_name, alg in algorithms.items():
    alg.fit(processed_data)
    if hasattr(alg, 'labels_'):
        labels = alg.labels_
    else:
        labels = alg.predict(processed_data)
    score = silhouette_score(processed_data, labels, metric='euclidean')
    print(f"{alg_name} Score:", score)

# Evaluate dimensionality reduction techniques
print("\nDimensionality Reduction Techniques:")
for dim_red_name, dim_red in dimensionality_reduction.items():
    reduced_data = dim_red.fit_transform(processed_data)
    kmeans = KMeans(n_clusters=3)
    kmeans.fit(reduced_data)
    score = silhouette_score(reduced_data, kmeans.labels_, metric='euclidean')
    print(f"{dim_red_name} + K-means Score:", score)

# One-Class SVM
print("\nOne-Class SVM:")
oneclass_svm = OneClassSVM(gamma='auto')
oneclass_svm.fit(processed_data)
print("One-Class SVM Score:", silhouette_score(processed_data, oneclass_svm.predict(processed_data)))

# Kernel Density Estimation (KDE) after dimensionality reduction
print("\nKernel Density Estimation (KDE) after Dimensionality Reduction:")
pca_reduced_data = PCA(n_components=2).fit_transform(processed_data)
kde = KernelDensity()
kde.fit(pca_reduced_data)
print("KDE Score:", silhouette_score(pca_reduced_data, kde.sample(pca_reduced_data)))


Clustering Algorithms:
K-means Score: 0.42904010086077227
Hierarchical Clustering Score: 0.3132061510996466
DBSCAN Score: 0.2710561739105903
Mean Shift Clustering Score: 0.26251989513072405
Gaussian Mixture Models (GMM) Score: 0.2763788352221043

Dimensionality Reduction Techniques:
PCA + K-means Score: 0.47304642748656206




t-SNE + K-means Score: 0.365226

One-Class SVM:
One-Class SVM Score: 0.13126448677130953

Kernel Density Estimation (KDE) after Dimensionality Reduction:


ValueError: maximum supported dimension for an ndarray is 32, found 10670

In [68]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

# Load the data
data = pd.read_excel('Dummy Customer Compliant Data.xlsx', header=1)

# Select columns of interest
columns_of_interest = ['PIMOF', 'MIS', 'MILEAGE (KM)']

# Filter out non-numeric columns
numeric_columns = data[columns_of_interest].select_dtypes(include=['number']).columns
data_filtered = data[numeric_columns]

# Preprocessing: Handle missing values
imputer = SimpleImputer(strategy='mean')
data_filtered = imputer.fit_transform(data_filtered)

# Preprocessing: Standardize the features
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_filtered)

# Split data into train and test sets
X_train, X_test = train_test_split(data_scaled, test_size=0.2, random_state=42)

# Implement and evaluate clustering algorithms
algorithms = {
    'K-means': KMeans(n_clusters=2),
    'Hierarchical Clustering': AgglomerativeClustering(n_clusters=2),
    'DBSCAN': DBSCAN(),
}

# Store silhouette scores for each algorithm
silhouette_scores = {}

for alg_name, alg in algorithms.items():
    alg.fit(X_train)
    score = silhouette_score(X_train, alg.labels_, metric='euclidean')
    silhouette_scores[alg_name] = score
    print(f"{alg_name} Score:", score)

# Determine sensitivity
sensitive_algorithms = [alg_name for alg_name, score in silhouette_scores.items() if score > 0.5]

if sensitive_algorithms:
    print("The following algorithms are sensitive:")
    for alg_name in sensitive_algorithms:
        print(alg_name)
else:
    print("No sensitive algorithms found.")


K-means Score: 0.46114702789401185
Hierarchical Clustering Score: 0.42596289756187583
DBSCAN Score: 0.7427003845774999
The following algorithms are sensitive:
DBSCAN


In [69]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score

def check_sensitivity(data):
    # Select columns of interest
    columns_of_interest = ['PIMOF', 'MIS', 'MILEAGE (KM)']

    # Filter out non-numeric columns
    numeric_columns = data[columns_of_interest].select_dtypes(include=['number']).columns
    data_filtered = data[numeric_columns]

    # Preprocessing: Handle missing values
    imputer = SimpleImputer(strategy='mean')
    data_filtered = imputer.fit_transform(data_filtered)

    # Preprocessing: Standardize the features
    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(data_filtered)

    # Implement and evaluate clustering algorithms
    algorithms = {
        'K-means': KMeans(n_clusters=2),
        'Hierarchical Clustering': AgglomerativeClustering(n_clusters=2),
        'DBSCAN': DBSCAN(),
    }

    # Store silhouette scores for each algorithm
    silhouette_scores = {}

    for alg_name, alg in algorithms.items():
        alg.fit(data_scaled)
        score = silhouette_score(data_scaled, alg.labels_, metric='euclidean')
        silhouette_scores[alg_name] = score

    # Determine sensitivity
    sensitive_algorithms = [alg_name for alg_name, score in silhouette_scores.items() if score > 0.5]

    if sensitive_algorithms:
        print("The data is sensitive.")
        print("Sensitive algorithms:", sensitive_algorithms)
    else:
        print("The data is not sensitive.")


data = pd.read_excel('Dummy Customer Compliant Data.xlsx', header=1)
check_sensitivity(data)


The data is sensitive.
Sensitive algorithms: ['DBSCAN']


In [74]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score

def check_sensitivity(data):
    # Preprocessing: Handle missing values
    imputer = SimpleImputer(strategy='constant', fill_value=0)
    data_imputed = imputer.fit_transform(data)

    # Preprocessing: Encode non-numeric values
    label_encoders = {}
    for col_index, col_name in enumerate(data.columns):
        # Convert column to string
        col_data = data_imputed[:, col_index].astype(str)
        # Encode non-numeric values
        label_encoders[col_name] = LabelEncoder()
        data_imputed[:, col_index] = label_encoders[col_name].fit_transform(col_data)

    # Preprocessing: Standardize the features
    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(data_imputed)

    # Implement and evaluate clustering algorithms
    algorithms = {
        'K-means': KMeans(n_clusters=2),
        'Hierarchical Clustering': AgglomerativeClustering(n_clusters=2),
        'DBSCAN': DBSCAN(),
    }

    # Store silhouette scores for each algorithm
    silhouette_scores = {}

    for alg_name, alg in algorithms.items():
        alg.fit(data_scaled)
        score = silhouette_score(data_scaled, alg.labels_, metric='euclidean')
        silhouette_scores[alg_name] = score

    # Determine sensitivity
    sensitive_algorithms = [alg_name for alg_name, score in silhouette_scores.items() if score > 0.5]

    if sensitive_algorithms:
        print("The data is sensitive.")
        print("Sensitive algorithms:", sensitive_algorithms)
    else:
        print("The data is not sensitive.")


file_path = 'Dummy Customer Compliant Data.xlsx'
data = pd.read_excel(file_path, header=1)
check_sensitivity(data)


The data is not sensitive.
