# The Purge

## Libraries and Settings

In [14]:
# Libraries
import re
import numpy as np
import pandas as pd

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

## Importing Data

In [15]:
# Read the data to a pandas data frame
df_complete = pd.read_csv('raw_data/coop_rot.csv', sep=',', encoding='utf-8')
df_weiss = pd.read_csv('raw_data/coop_weiss.csv', sep=',', encoding='utf-8')
df_complete = df_complete.append(df_weiss)
df_champagner = pd.read_csv('raw_data/coop_champagner.csv', sep=',', encoding='utf-8')
df_complete = df_complete.append(df_champagner)
df_dessert = pd.read_csv('raw_data/coop_dessert.csv', sep=',', encoding='utf-8')
df_complete = df_complete.append(df_dessert)
df_rose = pd.read_csv('raw_data/coop_rose.csv', sep=',', encoding='utf-8')
df_complete = df_complete.append(df_rose)
df_schaumweine = pd.read_csv('raw_data/coop_schaumweine.csv', sep=',', encoding='utf-8')
df_complete = df_complete.append(df_schaumweine)
df_complete

Unnamed: 0,web-scraper-order,typ,href,wein_name_raw,jahrgang_raw,genussreife_raw,land_raw,region_raw,produzent_raw,rebsorte_raw,alkohol_raw,inhalt_raw,preis_chf_raw
0,1671138041-4172,rot,https://www.coop.ch/de/weine/wein-sortiment/ro...,Apothic Inferno Red Blend Wine with a Whiskey ...,2018.0,2022-2023,USA,,E. & J. Gallo Winery,,16.00%,75cl,19.95
1,1671138044-4173,rot,https://www.coop.ch/de/weine/wein-sortiment/ro...,Apothic Dark California Red Blend,2016.0,2022-2024,USA,,diverse Sorten,,14.00%,75cl,12.95
2,1671138046-4174,rot,https://www.coop.ch/de/weine/wein-sortiment/ro...,Antonini Monte Chiara Montepulciano d'Abruzzo DOC,2020.0,2023-2024,Italien,Abruzzen,Montepulciano,,12.50%,75cl,3.75
3,1671138049-4175,rot,https://www.coop.ch/de/weine/wein-sortiment/ro...,Angelin Langhe Nebbiolo DOC,2021.0,2022-2026,Italien,Piemont,Angelo Negro,Nebbiolo,14.00%,75cl,17.5
4,1671138052-4176,rot,https://www.coop.ch/de/weine/wein-sortiment/ro...,Amarone della Valpolicella DOCG Vigne Alte Zeni,2019.0,2024-2028,Italien,Venetien,Cantina Zeni,Molinara,15.50%,75cl,32.95
...,...,...,...,...,...,...,...,...,...,...,...,...,...
168,1671145310-6985,schaumwein,https://www.coop.ch/de/weine/wein-sortiment/sc...,Raphael Dal Bo Prosecco Superiore DOCG Millesi...,2021,2022-2025,Italien,Venetien,Raphael dal Bo,Glera,11.00%,75cl,11.95
169,1671145313-6987,schaumwein,https://www.coop.ch/de/weine/wein-sortiment/sc...,Prosecco Superiore DOCG Brut 1924 Carpène Malv...,2022-2023,Italien,Venetien,,"diverse Sorten, \n ...",,11.00%,75cl,17.5
170,1671145316-6989,schaumwein,https://www.coop.ch/de/weine/wein-sortiment/sc...,Prosecco Rosé 0.15 Millesimato Brut,2021,2022-2024,Italien,Venetien,De Stefani,Pinot Noir,11.50%,75cl,16.5
171,1671145318-6991,schaumwein,https://www.coop.ch/de/weine/wein-sortiment/sc...,Prosecco Frizzante DOC Canti 75CL,,,,,,,0.00%,75cl,5.95


### Getting Data Types

In [16]:
df_complete.dtypes

web-scraper-order    object
typ                  object
href                 object
wein_name_raw        object
jahrgang_raw         object
genussreife_raw      object
land_raw             object
region_raw           object
produzent_raw        object
rebsorte_raw         object
alkohol_raw          object
inhalt_raw           object
preis_chf_raw        object
dtype: object

# Extract and save relevant information from raw data using regular expressions (regex)

## Extract alcohol value

In [17]:
# Extract values from 'alkohol' strings
alkohol = []
for i in df_complete['alkohol_raw']:
    d1 = re.findall('(.*)\%', str(i))
    try:
        d2 = d1[0].strip().replace(',', '.')
    except:
        d2 = None
    alkohol.append(d2)

# Save as new variable in the pandas data frame
df_complete['alkohol'] = pd.Series(alkohol, dtype="float64")
    
# Print first 5 values
print(df_complete['alkohol_raw'].head(5), '\n')
print(df_complete['alkohol'].head(5))

0    16.00%
1    14.00%
2    12.50%
3    14.00%
4    15.50%
Name: alkohol_raw, dtype: object 

0    16.0
1    14.0
2    12.5
3    14.0
4    15.5
Name: alkohol, dtype: float64


## Extract content

In [18]:
#split string into chunks
inhalt_cl = []
for i in df_complete['inhalt_raw']:
    chunks = re.split(' ', str(i))
    length = len(chunks)
    d1 = re.findall('(.*)cl', chunks[length-1])
    try:
        d2 = d1[0].strip().replace(',', '.')
    except:
        d2 = None
    inhalt_cl.append(d2)

# Save as new variable in the pandas data frame
df_complete['inhalt_cl'] = pd.Series(inhalt_cl, dtype="float64")
    
# Print first 5 values
print(df_complete['inhalt_raw'].head(5), '\n')
print(df_complete['inhalt_cl'].head(5))

0    75cl
1    75cl
2    75cl
3    75cl
4    75cl
Name: inhalt_raw, dtype: object 

0    75.0
1    75.0
2    75.0
3    75.0
4    75.0
Name: inhalt_cl, dtype: float64


In [19]:
# Extract amount of containers in a set from inhalt_raw
anzahl_fl = []
for i in df_complete['inhalt_raw']:
    d1 = re.findall('(.*)x', str(i))
    try:
        d2 = d1[0].strip().replace(',', '.')
    except:
        d2 = None
    if d2 == None:
        d2 = 1
    anzahl_fl.append(d2)

# Save as new variable in the pandas data frame
df_complete['anzahl_fl'] = pd.Series(anzahl_fl, dtype="Int64")
    
# Print first 5 values
print(df_complete['inhalt_raw'].head(5), '\n')
print(df_complete['anzahl_fl'].head(5))

0    75cl
1    75cl
2    75cl
3    75cl
4    75cl
Name: inhalt_raw, dtype: object 

0    1
1    1
2    1
3    1
4    1
Name: anzahl_fl, dtype: Int64
