##### Standardizing my data sources in accordance with a unified, agreed-upon format for easy merging with other data sources

In [1]:
import pandas as pd
import numpy as np
from urllib.parse import urlparse
import os
from pathlib import Path

# Notebook-wide pandas display settings: preview at most 20 rows,
# never truncate the column list (None = no limit), and render 140 characters wide.
display_settings = {
    "display.max_rows": 20,
    "display.max_columns": None,
    "display.width": 140,
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)


In [2]:
# Path to the cleaned Wikidata CSV written by the data-cleaning step.
# The project root defaults to the Codespaces checkout this notebook was written in;
# set the DSI_PROJECT_ROOT environment variable to run the notebook from another location.
project_root = Path(
    os.environ.get(
        "DSI_PROJECT_ROOT",
        "/workspaces/dsi-ws2025-project-grpab-weigl-mds1ab-awp-proj2",
    )
)
# Kept as a plain string so anything that expects `file_path` to be a str still works.
file_path = str(project_root / "2. Data Cleaning" / "data" / "wikidata_clean_eu_companies.csv")

# dtype=str loads every column as text, so pandas does not infer numeric or datetime types.
df = pd.read_csv(file_path, dtype=str)
print("Loaded shape:", df.shape)
df.head(5)


Loaded shape: (6384, 12)


Unnamed: 0,item,itemLabel,website,industryLabel,hqLabel,ownedByLabel,memberOfLabel,inception,dissolved,revenue,revenue_date_parsed,country
0,http://www.wikidata.org/entity/Q1000428,MySQL AB,mysql.com,Software Industry,Solna Municipality,Oracle America,,2001-01-01 00:00:00+00:00,,,,Sweden
1,http://www.wikidata.org/entity/Q1000752,Ferrovie della Calabria,ferroviedellacalabria.it,Public Transport,Catanzaro,,,2001-01-01 00:00:00+00:00,,,,Italy
2,http://www.wikidata.org/entity/Q100094013,Telia Danmark,telia.dk,Telecommunications,Copenhagen,Norlys Energi,,1995-05-01 00:00:00+00:00,,,,Denmark
3,http://www.wikidata.org/entity/Q100142778,Brownies&downieS,browniesanddownies.nl,Horeca,Veghel,,,2010-01-01 00:00:00+00:00,,,,Netherlands
4,http://www.wikidata.org/entity/Q100166679,librerie.coop,librerie.coop,Book Retail Industry,Villanova,Coop Alleanza 3.0,,2006-01-01 00:00:00+00:00,,,,Italy


In [None]:
# The final merged dataset has funding and status fields that this source does not
# provide, so add both as all-missing (NaN) placeholder columns.
df = df.assign(funding_amount=np.nan, status=np.nan)

In [4]:
# Wikidata column labels -> shared schema names. Columns whose name is already the
# schema name (country, funding_amount, status, website) need no entry.
WIKIDATA_TO_STANDARD = {
    "itemLabel": "company_name",
    "hqLabel": "city",
    "industryLabel": "industry_tags",
    "inception": "start_year",
}

# Columns kept in the standardized output, in this order.
STANDARD_COLUMNS = [
    "country",
    "company_name",
    "city",
    "funding_amount",
    "industry_tags",
    "start_year",
    "status",
    "website",
]

# NOTE(review): `inception` is only renamed, so `start_year` still holds full timestamp
# strings (e.g. "2001-01-01 00:00:00+00:00") rather than a year — confirm the merge
# step expects that, or extract the year before export.
df = df.rename(columns=WIKIDATA_TO_STANDARD)[STANDARD_COLUMNS]
df.head()

Unnamed: 0,country,company_name,city,funding_amount,industry_tags,start_year,status,website
0,Sweden,MySQL AB,Solna Municipality,,Software Industry,2001-01-01 00:00:00+00:00,,mysql.com
1,Italy,Ferrovie della Calabria,Catanzaro,,Public Transport,2001-01-01 00:00:00+00:00,,ferroviedellacalabria.it
2,Denmark,Telia Danmark,Copenhagen,,Telecommunications,1995-05-01 00:00:00+00:00,,telia.dk
3,Netherlands,Brownies&downieS,Veghel,,Horeca,2010-01-01 00:00:00+00:00,,browniesanddownies.nl
4,Italy,librerie.coop,Villanova,,Book Retail Industry,2006-01-01 00:00:00+00:00,,librerie.coop


In [5]:
# Save the standardized frame as a CSV for the merging step; this is the notebook's last cell.
# The project root defaults to the Codespaces checkout this notebook was written in;
# set the DSI_PROJECT_ROOT environment variable to write somewhere else.
project_root = Path(
    os.environ.get(
        "DSI_PROJECT_ROOT",
        "/workspaces/dsi-ws2025-project-grpab-weigl-mds1ab-awp-proj2",
    )
)
# Kept as a plain string so the printed message and any later use of `output_path` are unchanged.
output_path = str(project_root / "3. Data Merging" / "data" / "wikidata_standardized_eu_companies.csv")

# to_csv does not create missing folders, so make sure the target directory exists first.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the positional row index out of the exported file.
df.to_csv(output_path, index=False, encoding="utf-8")

print(f"Exported cleaned dataset to: {output_path}")
print(df.shape)
df.head()


Exported cleaned dataset to: /workspaces/dsi-ws2025-project-grpab-weigl-mds1ab-awp-proj2/3. Data Merging/data/wikidata_standardized_eu_companies.csv
(6384, 8)


Unnamed: 0,country,company_name,city,funding_amount,industry_tags,start_year,status,website
0,Sweden,MySQL AB,Solna Municipality,,Software Industry,2001-01-01 00:00:00+00:00,,mysql.com
1,Italy,Ferrovie della Calabria,Catanzaro,,Public Transport,2001-01-01 00:00:00+00:00,,ferroviedellacalabria.it
2,Denmark,Telia Danmark,Copenhagen,,Telecommunications,1995-05-01 00:00:00+00:00,,telia.dk
3,Netherlands,Brownies&downieS,Veghel,,Horeca,2010-01-01 00:00:00+00:00,,browniesanddownies.nl
4,Italy,librerie.coop,Villanova,,Book Retail Industry,2006-01-01 00:00:00+00:00,,librerie.coop
