In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Column labels to apply to the export in place of the header row stored in the file.
columns = [
    'Catalog_ID',
    'Artist',
    'Title',
    'Label',
    'Format',
    'Vacio',
    'Released',
    'Release_id',
    'Notes',
    'Inserted_Date',
]

# Load the wantlist export; skiprows=1 discards the file's own first line so the
# labels above are used as the column names.
wantlist_csv = '../Discogs_Exports/VannaBe-wantlist-20230420-0857.csv'
df = pd.read_csv(wantlist_csv, skiprows=1, names=columns)

In [3]:
# Preview the first two rows to confirm the column labels line up with the data.
df.head(n=2)

Unnamed: 0,Catalog_ID,Artist,Title,Label,Format,Vacio,Released,Release_id,Notes,Inserted_Date
0,88.122-l,Supertramp,Crime Of The Century,A&M Records,"LP, Album",,1974,1460333,Lowest:\t€4.10\nMedian:\t€7.50\nHighest:\t€20.00,2023-04-20 03:04:12
1,88674 E,Rick Wakeman,The Myths And Legends Of King Arthur And The K...,A&M Records,"LP, Album, Gat",,1975,5278224,Lowest:\t€2.00\nMedian:\t€3.50\nHighest:\t€5.39,2023-04-20 08:01:25


In [4]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(67, 10)

In [5]:
# Dtype pandas assigned to each column when reading the CSV.
df.dtypes

Catalog_ID        object
Artist            object
Title             object
Label             object
Format            object
Vacio            float64
Released           int64
Release_id         int64
Notes             object
Inserted_Date     object
dtype: object

In [6]:
# Keep only the first 10 characters of the timestamp string (the YYYY-MM-DD part),
# dropping the time of day.
inserted = df['Inserted_Date']
df['Inserted_Date'] = inserted.str.slice(stop=10)

In [7]:
# Spot-check two random rows after trimming the date column. A fixed random_state
# keeps the displayed rows identical across Restart Kernel -> Run All.
df.sample(2, random_state=42)

Unnamed: 0,Catalog_ID,Artist,Title,Label,Format,Vacio,Released,Release_id,Notes,Inserted_Date
24,89.791-IE,Jethro Tull,Benefit,"Chrysalis, Chrysalis","LP, Album",,1970,19248337,,2023-04-20
20,28.438-I,Jethro Tull,Songs From The Wood,Chrysalis,"LP, Album",,1977,1727012,Lowest:\t€5.90\nMedian:\t€9.99\nHighest:\t€30.00,2023-04-20


In [8]:
# Count the missing values in every column.
df.isna().sum()

Catalog_ID        0
Artist            0
Title             0
Label             0
Format            0
Vacio            67
Released          0
Release_id        0
Notes             2
Inserted_Date     0
dtype: int64

In [9]:
# 'Vacio' is null in all 67 rows (see the null counts above) and was never tracked
# in the wantlist, so it carries no information and can be removed.
# Re-binding df (instead of inplace=True) together with errors='ignore' lets this
# cell be re-run without raising a KeyError once the column is already gone.
df = df.drop(columns='Vacio', errors='ignore')

In [10]:
# List the remaining column labels to confirm 'Vacio' is no longer present.
df.columns

Index(['Catalog_ID', 'Artist', 'Title', 'Label', 'Format', 'Released',
       'Release_id', 'Notes', 'Inserted_Date'],
      dtype='object')

In [11]:
# Summary statistics of the numeric columns, rounded to whole numbers for readability.
numeric_summary = df.describe()
numeric_summary.round(decimals=0)

Unnamed: 0,Released,Release_id
count,67.0,67.0
mean,1982.0,5770953.0
std,15.0,5391246.0
min,1968.0,409894.0
25%,1974.0,1872016.0
50%,1977.0,4198115.0
75%,1980.0,7424028.0
max,2021.0,20929789.0


In [12]:
# Summary of the object-dtype (text) columns: count, number of unique values,
# most frequent value and its frequency.
df.describe(include=['O'])

Unnamed: 0,Catalog_ID,Artist,Title,Label,Format,Notes,Inserted_Date
count,67,67,67,67,67,65,67
unique,67,24,67,44,12,64,1
top,88.122-l,Mike Oldfield,Crime Of The Century,A&M Records,"LP, Album",Lowest:\t€2.00\nMedian:\t€3.50\nHighest:\t€5.39,2023-04-20
freq,1,8,1,4,32,2,67


In [13]:
# Alphabetical list of the distinct artist names, to eyeball entries needing clean-up.
artists = list(df['Artist'].unique())
artists.sort()
artists

['Cai (4)',
 'Canarios',
 'Electric Light Orchestra',
 'Emerson, Lake & Palmer',
 'Frank Zappa And The Mothers',
 'Genesis',
 'Gotic',
 'Guadalquivir',
 'Iceberg (2)',
 'Imán, Califato Independiente',
 'Jarabe De Palo',
 'Jethro Tull',
 'La Máquina De Hacer Pájaros',
 'Marillion',
 'Mike Oldfield',
 'New Trolls',
 'Premiata Forneria Marconi',
 'Procol Harum',
 'Rick Wakeman',
 'Supertramp',
 'The Alan Parsons Project',
 'Triana (2)',
 'Triumvirat',
 'Yes']

In [14]:
# Discogs appends a numeric disambiguation suffix to same-named artists, e.g. "Cai (4)".
# Strip any " (N)" marker that ends a name (either at the end of the string or just
# before the comma of a multi-artist credit) instead of listing affected artists by
# hand, so a new export with other suffixed names needs no manual edits.
# On the current data this yields the same result as the explicit mapping:
# 'Cai (4)' -> 'Cai', 'Iceberg (2)' -> 'Iceberg', 'Triana (2)' -> 'Triana'.
df['Artist'] = df['Artist'].str.replace(r'\s+\(\d+\)(?=\s*,|$)', '', regex=True)

In [15]:
# Rebuild the alphabetical list of distinct artists to verify the clean-up.
distinct_artists = df['Artist'].drop_duplicates()
artists = sorted(distinct_artists)
artists

['Cai',
 'Canarios',
 'Electric Light Orchestra',
 'Emerson, Lake & Palmer',
 'Frank Zappa And The Mothers',
 'Genesis',
 'Gotic',
 'Guadalquivir',
 'Iceberg',
 'Imán, Califato Independiente',
 'Jarabe De Palo',
 'Jethro Tull',
 'La Máquina De Hacer Pájaros',
 'Marillion',
 'Mike Oldfield',
 'New Trolls',
 'Premiata Forneria Marconi',
 'Procol Harum',
 'Rick Wakeman',
 'Supertramp',
 'The Alan Parsons Project',
 'Triana',
 'Triumvirat',
 'Yes']

In [16]:
# Order the rows alphabetically by artist; the original index labels are kept.
df = df.sort_values(by=['Artist'], ascending=True)

In [17]:
# Show the first ten rows of the artist-sorted frame.
df.head(n=10)

Unnamed: 0,Catalog_ID,Artist,Title,Label,Format,Released,Release_id,Notes,Inserted_Date
37,LC-001,Cai,Mas Allá De Nuestras Mentes Diminutas,"Lacochu, Trova Records","LP, Album",1978,1819813,Lowest:\t€120.00\nMedian:\t€120.00\nHighest:\t...,2023-04-20
28,EPC 85054,Cai,Canción De La Primavera,Epic,"LP, Album",1981,4007248,Lowest:\t€18.00\nMedian:\t€24.95\nHighest:\t€5...,2023-04-20
27,EPC 84218,Cai,Noche Abierta,Epic,"LP, Album, Gat",1980,1549999,Lowest:\t€14.00\nMedian:\t€28.99\nHighest:\t€5...,2023-04-20
5,"87905 XD, 87905-XD",Canarios,Ciclos,"Ariola, Ariola","2xLP, Album",1974,745518,Lowest:\t€8.00\nMedian:\t€30.91\nHighest:\t€70.00,2023-04-20
36,JETLX 500,Electric Light Orchestra,Discovery,Jet Records,"LP, Album, Gat",1979,1284470,Lowest:\t€2.00\nMedian:\t€5.75\nHighest:\t€10.00,2023-04-20
52,25.144-XD,Electric Light Orchestra,Out Of The Blue,"United Artists Records, Jet Records","2xLP, Album",1978,2843993,Lowest:\t€1.50\nMedian:\t€9.99\nHighest:\t€17.99,2023-04-20
53,"28 273-I, 28273-I",Electric Light Orchestra,A New World Record,"United Artists Records, Jet Records, United Ar...","LP, Album, Smo",1976,1924220,Lowest:\t€1.40\nMedian:\t€4.50\nHighest:\t€8.00,2023-04-20
33,85.382-SE,"Emerson, Lake & Palmer","Emerson, Lake & Palmer",Island Records,"LP, Album",1971,4928694,Lowest:\t€3.99\nMedian:\t€9.00\nHighest:\t€14.00,2023-04-20
35,"86.230-I, 86230.I","Emerson, Lake & Palmer",Trilogy,"Island Records, Island Records","LP, Album, Gat",1972,3582008,Lowest:\t€2.00\nMedian:\t€6.92\nHighest:\t€15.00,2023-04-20
34,85.527-SE,"Emerson, Lake & Palmer",Tarkus,Island Records,"LP, Album, Gat",1971,3289667,Lowest:\t€5.99\nMedian:\t€10.00\nHighest:\t€16.50,2023-04-20


In [18]:
# Regex for the price summary stored in 'Notes', e.g.
# "Lowest:\t€4.10\nMedian:\t€7.50\nHighest:\t€20.00".
# Each group captures the rest of its line after the label and any whitespace.
pattern = r"Lowest:\s*([^\n]+)\nMedian:\s*([^\n]+)\nHighest:\s*([^\n]+)"
price_columns = ['Lowest_Price', 'Median_Price', 'Highest_Price']

# Vectorised extraction of the first match in every row. Unlike a
# `for i in range(len(df))` loop with `df.loc[i, ...]`, this does not depend on the
# index labels being exactly 0..n-1, so it keeps working after rows are filtered
# or the index is reset/re-labelled.
# Rows whose 'Notes' is missing or does not match get NaN, and the three columns
# are always created, so the exported schema is stable.
# astype(object) keeps the .str accessor usable even if every note is missing.
prices = df['Notes'].astype(object).str.extract(pattern)
prices.columns = price_columns

# Values stay as the raw captured text (currency symbol included), as before.
for column in price_columns:
    df[column] = prices[column]

In [19]:
# Spot-check three random rows to verify the extracted price columns. A fixed
# random_state keeps the displayed rows identical across Restart Kernel -> Run All.
df.sample(3, random_state=42)

Unnamed: 0,Catalog_ID,Artist,Title,Label,Format,Released,Release_id,Notes,Inserted_Date,Lowest_Price,Median_Price,Highest_Price
57,"V 3166, 00602557256703",Mike Oldfield,Return To Ommadawn,"Virgin EMI Records, Virgin EMI Records","LP, Album, 180",2017,9679634,Lowest:\t€17.05\nMedian:\t€32.49\nHighest:\t€8...,2023-04-20,€17.05,€32.49,€88.00
43,17.1302/6,Gotic,Escenes,Movieplay,"LP, Album",1978,3241016,Lowest:\t€24.90\nMedian:\t€73.94\nHighest:\t€9...,2023-04-20,€24.90,€73.94,€95.00
41,SUP-853,La Máquina De Hacer Pájaros,Películas,Microfon,"LP, Album",1977,3506667,Lowest:\t€37.31\nMedian:\t€55.00\nHighest:\t€1...,2023-04-20,€37.31,€55.00,€113.75


In [20]:
# Write the cleaned wantlist to disk; index=False leaves the row labels out of the file.
df.to_csv('../Discogs_Exports/Wannalist_20_04_23.csv', index=False)