In [None]:
import pandas as pd

# Load the customer dataset from a comma-separated file in the working directory.
# The codec name is spelled canonically ("ISO-8859-1"); the previous spelling
# contained a stray space and only resolved thanks to lenient codec-name
# normalisation.
data = pd.read_csv('MailCustomerCustom.csv', sep=',', encoding='ISO-8859-1')
data

Unnamed: 0,GeographyKey,MaritalStatus,Gender,YearlyIncome,TotalChildren,NumberChildrenAtHome,EnglishEducation,EnglishOccupation,HouseOwnerFlag,NumberCarsOwned,DateFirstPurchase,CommuteDistance,Region,Age,BikeBuyer,AgeCategory
0,26,M,M,90000,2,0,Bachelors,Professional,1,0,2005,1-2 Miles,Pacific,58,1,40-60
1,37,S,M,60000,3,3,Bachelors,Professional,0,1,2005,0-1 Miles,Pacific,59,1,40-60
2,31,M,M,60000,3,3,Bachelors,Professional,1,1,2005,2-5 Miles,Pacific,59,1,40-60
3,11,S,F,70000,0,0,Bachelors,Professional,0,1,2005,5-10 Miles,Pacific,56,1,40-60
4,19,S,F,80000,5,5,Bachelors,Professional,1,4,2005,1-2 Miles,Pacific,56,1,40-60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18404,209,M,M,30000,1,0,Graduate Degree,Clerical,1,0,2007,0-1 Miles,Europe,66,1,61-80
18405,248,S,F,30000,3,0,Graduate Degree,Clerical,1,0,2008,0-1 Miles,Europe,64,1,61-80
18406,120,S,M,30000,3,0,Graduate Degree,Clerical,0,0,2006,0-1 Miles,Europe,64,1,61-80
18407,179,M,M,30000,3,0,Bachelors,Clerical,1,0,2007,0-1 Miles,Europe,65,1,61-80


# Análisis de componentes principales

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns that feed the principal component analysis.
# NOTE(review): the list includes the target column "BikeBuyer" -- confirm
# that having the label inside the PCA input is intentional.
numeric_columns = [
    "YearlyIncome",
    "TotalChildren",
    "NumberChildrenAtHome",
    "HouseOwnerFlag",
    "NumberCarsOwned",
    "DateFirstPurchase",
    "Age",
    "BikeBuyer",
]
numerical_data = data[numeric_columns]

# Standardise the selected columns with a scaler fitted on those same columns.
scaler = StandardScaler()
standardized_data = scaler.fit_transform(numerical_data)

# Wrap the scaled array in a DataFrame that keeps the original column names.
standardized_df = pd.DataFrame(standardized_data, columns=numerical_data.columns)
standardized_df.head()

Unnamed: 0,YearlyIncome,TotalChildren,NumberChildrenAtHome,HouseOwnerFlag,NumberCarsOwned,DateFirstPurchase,Age,BikeBuyer
0,1.009924,0.098491,-0.661275,0.690547,-1.318798,-2.487477,-0.387747,1.008565
1,0.080138,0.718573,1.30694,-1.448127,-0.44149,-2.487477,-0.296814,1.008565
2,0.080138,0.718573,1.30694,0.690547,-0.44149,-2.487477,-0.296814,1.008565
3,0.390067,-1.141673,-0.661275,-1.448127,-0.44149,-2.487477,-0.569612,1.008565
4,0.699995,1.958737,2.619084,0.690547,2.190435,-2.487477,-0.569612,1.008565


In [None]:
from sklearn.decomposition import PCA

# Fit a PCA that keeps every component and project the standardised data.
pca = PCA()
pca_components = pca.fit_transform(standardized_df)

# Share of variance explained by each component, and its running total.
explained_variance = pca.explained_variance_ratio_
explained_variance_cumsum = explained_variance.cumsum()

# Projected data as a DataFrame with columns PC1, PC2, ...
component_names = [f'PC{k}' for k in range(1, len(explained_variance) + 1)]
pca_df = pd.DataFrame(data=pca_components, columns=component_names)

explained_variance, explained_variance_cumsum

(array([0.29697078, 0.18486904, 0.1527259 , 0.12006655, 0.085057  ,
        0.07186956, 0.05582567, 0.0326155 ]),
 array([0.29697078, 0.48183982, 0.63456572, 0.75463227, 0.83968927,
        0.91155883, 0.9673845 , 1.        ]))

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace every object-dtype column of `data` with integer codes, in place.
# A single LabelEncoder instance is re-fitted for each column, so after the
# loop it only holds the mapping of the last column processed; the mappings
# of the other columns are not kept anywhere.
label_encoder = LabelEncoder()
# Columns currently stored with object dtype. Once this cell has run they are
# integer-coded, so running the cell again finds no columns and changes nothing.
categorical_cols = data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    # Overwrites the original values of the column with their codes.
    data[col] = label_encoder.fit_transform(data[col])

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

# Predictor columns, in the order they appear in the feature matrix.
feature_columns = [
    'GeographyKey', 'MaritalStatus', 'Gender', 'YearlyIncome',
    'TotalChildren', 'NumberChildrenAtHome', 'EnglishEducation',
    'EnglishOccupation', 'HouseOwnerFlag', 'NumberCarsOwned',
    'DateFirstPurchase', 'CommuteDistance', 'Region', 'Age', 'AgeCategory',
]

# Feature matrix (one column per predictor) and target vector.
X = np.column_stack([data[name] for name in feature_columns])
y = data['BikeBuyer']

# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# K-Means

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# K-Means is distance based, so features are standardised first. A dedicated
# scaler (fitted on the training rows only) is used and the shared
# X_train / X_test arrays are left untouched.
kmeans_scaler = StandardScaler()
X_train_kmeans = kmeans_scaler.fit_transform(X_train)
X_test_kmeans = kmeans_scaler.transform(X_test)

kmeans = KMeans(n_clusters=3, random_state=0, n_init=10)
kmeans.fit(X_train_kmeans)

# Cluster ids are arbitrary and there are more clusters than classes, so they
# cannot be compared with the class labels directly. Each cluster is mapped to
# the class that is most frequent among its training rows, and test rows
# inherit the class of the cluster they are assigned to.
cluster_to_class = pd.crosstab(kmeans.labels_, np.asarray(y_train)).idxmax(axis=1)
kmeans_y_pred = cluster_to_class.loc[kmeans.predict(X_test_kmeans)].to_numpy()

kmeans_accuracy = accuracy_score(y_test, kmeans_y_pred)

# Cluster analysis
print(f"Accuracy: {kmeans_accuracy:.3f}")
print(classification_report(y_test, kmeans_y_pred))
print(confusion_matrix(y_test, kmeans_y_pred))

              precision    recall  f1-score   support

           0       0.48      0.44      0.46      1841
           1       0.48      0.40      0.44      1841
           2       0.00      0.00      0.00         0

    accuracy                           0.42      3682
   macro avg       0.32      0.28      0.30      3682
weighted avg       0.48      0.42      0.45      3682

[[807 794 240]
 [867 738 236]
 [  0   0   0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# KNN

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# NOTE(review): X is rebuilt here but is not used by the classifier below,
# which is trained on the X_train / X_test split created earlier. The
# assignment is kept only so that the name X keeps the value it had before;
# confirm whether it can be dropped.
X = data.drop('BikeBuyer', axis=1)
X = pd.get_dummies(X, drop_first=True)

# KNN relies on distances between rows, so every feature is standardised
# first; otherwise the columns with the largest magnitudes dominate the
# neighbour search. The scaler is fitted on the training rows only and the
# shared X_train / X_test arrays are left untouched.
knn_scaler = StandardScaler()
X_train_knn = knn_scaler.fit_transform(X_train)
X_test_knn = knn_scaler.transform(X_test)

# Initialize and train the KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_knn, y_train)

y_pred = knn.predict(X_test_knn)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.69      0.71      0.70      1841
           1       0.70      0.68      0.69      1841

    accuracy                           0.69      3682
   macro avg       0.69      0.69      0.69      3682
weighted avg       0.69      0.69      0.69      3682

[[1306  535]
 [ 589 1252]]


# Apriori

In [None]:
!pip install apyori

from apyori import apriori

columns_of_interest = ['EnglishEducation', 'EnglishOccupation',
                        'CommuteDistance', 'AgeCategory']
data_filtered = data[columns_of_interest]

transactions = data_filtered.apply(lambda x: list(x), axis=1).tolist()

results = list(apriori(transactions, min_support=0.05, min_confidence=0.2, min_lift=1.0))

# Resumir los resultados
results_summary = []
for relation in results:
    for rule in relation.ordered_statistics:
        results_summary.append({
            'Base': list(rule.items_base),
            'Add': list(rule.items_add),
            'Support': relation.support,
            'Confidence': rule.confidence,
            'Lift': rule.lift
        })

# Convertir los resultados en un DataFrame para analizar
results_df = pd.DataFrame(results_summary)

# Mostrar los resultados
print(results_df.head())

  Base  Add   Support  Confidence  Lift
0   []  [0]  0.753219    0.753219   1.0
1   []  [1]  0.659406    0.659406   1.0
2   []  [2]  0.408387    0.408387   1.0
3   []  [3]  0.551361    0.551361   1.0
4   []  [4]  0.409093    0.409093   1.0


# RN (perceptrón multicapa)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Standardise the features with a scaler that belongs to this model only.
# The shared `scaler` object and the X_train / X_test arrays are not modified,
# so re-running earlier cells after this one gives the same results as before.
mlp_scaler = StandardScaler()
X_train_scaled = mlp_scaler.fit_transform(X_train)
X_test_scaled = mlp_scaler.transform(X_test)

# Initialize and train the MLP classifier
mlp = MLPClassifier(hidden_layer_sizes=(100, 50),
                    max_iter=500,
                    activation='relu',  # Activation function
                    solver='adam',  # Optimization algorithm
                    random_state=42)
mlp.fit(X_train_scaled, y_train)

y_pred = mlp.predict(X_test_scaled)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.84      0.83      1841
           1       0.83      0.81      0.82      1841

    accuracy                           0.82      3682
   macro avg       0.82      0.82      0.82      3682
weighted avg       0.82      0.82      0.82      3682

[[1539  302]
 [ 345 1496]]


# Árbol de decisión

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Build the decision tree with a fixed seed and fit it on the training split.
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out rows and report per-class metrics and the confusion matrix.
y_pred = dt.predict(X_test)

print(
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    sep="\n",
)

              precision    recall  f1-score   support

           0       0.82      0.82      0.82      1841
           1       0.82      0.82      0.82      1841

    accuracy                           0.82      3682
   macro avg       0.82      0.82      0.82      3682
weighted avg       0.82      0.82      0.82      3682

[[1505  336]
 [ 328 1513]]
