In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

## Leemos dataset y tipos de datos

In [2]:
df = pd.read_csv('kaggle_ecommerce1.csv',encoding='latin1')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32671 entries, 0 to 32670
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   _unit_id             32671 non-null  int64  
 1   relevance            20571 non-null  float64
 2   relevance:variance   20571 non-null  float64
 3   product_image        32671 non-null  object 
 4   product_link         32671 non-null  object 
 5   product_price        32671 non-null  object 
 6   product_title        32671 non-null  object 
 7   query                32671 non-null  object 
 8   rank                 32671 non-null  int64  
 9   source               32671 non-null  object 
 10  url                  32671 non-null  object 
 11  product_description  24800 non-null  object 
 12  invoice              32671 non-null  int64  
dtypes: float64(2), int64(3), object(8)
memory usage: 3.2+ MB


### Eliminamos columnas innecesarias

In [3]:
df.drop(['product_image', 'product_link', 'product_title', 'product_description', 'url'], axis=1, inplace=True)
df

Unnamed: 0,_unit_id,relevance,relevance:variance,product_price,query,rank,source,invoice
0,711158459,3.67,0.471,$329.98,playstation 4,1,eBay,579
1,711158460,4.00,0.000,$324.84,playstation 4,2,eBay,703
2,711158461,4.00,0.000,$324.83,playstation 4,3,eBay,862
3,711158462,3.67,0.471,$350.00,playstation 4,4,eBay,307
4,711158463,3.33,0.471,$308.00,playstation 4,5,eBay,120
...,...,...,...,...,...,...,...,...
32666,713196633,,,$109.99,snow boots,30,Shop.com,397
32667,713196634,,,$175.00,snow boots,31,Shop.com,674
32668,713196635,,,$175.00,snow boots,32,Shop.com,475
32669,713196636,,,$169.00,snow boots,33,Shop.com,856


### La columna product_price tiene muchos tipos de datos

In [4]:
df['product_price'].value_counts()

$19.99                503
$29.99                424
$39.99                386
$24.99                352
$49.99                330
                     ... 
$154.99 - $398.00       1
$24.90,Sale $19.67      1
$37.18                  1
$8.90                   1
$80.99 - $93.99         1
Name: product_price, Length: 11240, dtype: int64

### Limpiamos y convertimos el tipo de dato de "object" a "float"

In [5]:
df["price"]=df['product_price'].str[1:6]
df

Unnamed: 0,_unit_id,relevance,relevance:variance,product_price,query,rank,source,invoice,price
0,711158459,3.67,0.471,$329.98,playstation 4,1,eBay,579,329.9
1,711158460,4.00,0.000,$324.84,playstation 4,2,eBay,703,324.8
2,711158461,4.00,0.000,$324.83,playstation 4,3,eBay,862,324.8
3,711158462,3.67,0.471,$350.00,playstation 4,4,eBay,307,350.0
4,711158463,3.33,0.471,$308.00,playstation 4,5,eBay,120,308.0
...,...,...,...,...,...,...,...,...,...
32666,713196633,,,$109.99,snow boots,30,Shop.com,397,109.9
32667,713196634,,,$175.00,snow boots,31,Shop.com,674,175.0
32668,713196635,,,$175.00,snow boots,32,Shop.com,475,175.0
32669,713196636,,,$169.00,snow boots,33,Shop.com,856,169.0


In [6]:
df.drop(['product_price'], axis=1, inplace=True)
df

Unnamed: 0,_unit_id,relevance,relevance:variance,query,rank,source,invoice,price
0,711158459,3.67,0.471,playstation 4,1,eBay,579,329.9
1,711158460,4.00,0.000,playstation 4,2,eBay,703,324.8
2,711158461,4.00,0.000,playstation 4,3,eBay,862,324.8
3,711158462,3.67,0.471,playstation 4,4,eBay,307,350.0
4,711158463,3.33,0.471,playstation 4,5,eBay,120,308.0
...,...,...,...,...,...,...,...,...
32666,713196633,,,snow boots,30,Shop.com,397,109.9
32667,713196634,,,snow boots,31,Shop.com,674,175.0
32668,713196635,,,snow boots,32,Shop.com,475,175.0
32669,713196636,,,snow boots,33,Shop.com,856,169.0


In [7]:
df1=df[df["price"] == "ale $"].index
df1=df.drop(df1)
df2=df1[df1["price"] == "eg:\np"].index
df2=df1.drop(df2)
df3=df2[df2["price"] == "ist:\n"].index
df3=df2.drop(df3)
df3

Unnamed: 0,_unit_id,relevance,relevance:variance,query,rank,source,invoice,price
0,711158459,3.67,0.471,playstation 4,1,eBay,579,329.9
1,711158460,4.00,0.000,playstation 4,2,eBay,703,324.8
2,711158461,4.00,0.000,playstation 4,3,eBay,862,324.8
3,711158462,3.67,0.471,playstation 4,4,eBay,307,350.0
4,711158463,3.33,0.471,playstation 4,5,eBay,120,308.0
...,...,...,...,...,...,...,...,...
32666,713196633,,,snow boots,30,Shop.com,397,109.9
32667,713196634,,,snow boots,31,Shop.com,674,175.0
32668,713196635,,,snow boots,32,Shop.com,475,175.0
32669,713196636,,,snow boots,33,Shop.com,856,169.0


In [8]:
df3['price'] = df3['price'].str.replace(',','')
df3['price']
df3['price'] = df3['price'].astype('float64')
df3['price'].dtype

dtype('float64')

In [9]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32103 entries, 0 to 32670
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   _unit_id            32103 non-null  int64  
 1   relevance           20229 non-null  float64
 2   relevance:variance  20229 non-null  float64
 3   query               32103 non-null  object 
 4   rank                32103 non-null  int64  
 5   source              32103 non-null  object 
 6   invoice             32103 non-null  int64  
 7   price               32103 non-null  float64
dtypes: float64(3), int64(3), object(2)
memory usage: 2.2+ MB


In [10]:
data = [df3["invoice"],df3["query"]]
headers = ["invoice","product"]
dfventas = pd.concat(data, axis=1, keys=headers)
dfventas.head(2)

Unnamed: 0,invoice,product
0,579,playstation 4
1,703,playstation 4


In [11]:
dfventas = dfventas.groupby(['invoice', 'product']).size()
aux = dfventas.unstack(level=-1)
productos = aux.reset_index().fillna(0).set_index('invoice')

def codificar(x):
    if x <= 0:
        return 0
    if x >= 1:
        return 1
productos1 = productos.applymap(codificar)
productos1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 999 entries, 1 to 999
Columns: 264 entries, 16 gb memory card to zippo hand warmer
dtypes: int64(264)
memory usage: 2.0 MB


In [20]:
#productos1.to_csv("borrar.csv", index=False)
#frequent_itemsets = apriori(productos1, min_support=0.009,use_colnames=True)
#reglas = association_rules(frequent_itemsets, metric="lift", min_threshold=3)
#reglas.sort_values(by = "lift", ascending = False).head(3)

MemoryError: Unable to allocate 45.8 GiB for an array with shape (2050357, 3, 999) and data type int64