In [1]:
import chardet

# Sniff the file's encoding from a leading sample instead of the whole file.
with open("datasets/best_cities.csv", "rb") as f:
    raw_data = f.read(100000)  # sample size in bytes; enlarge if detection looks unreliable

# Detection only needs the sampled bytes, so it runs once the file handle is closed.
result = chardet.detect(raw_data)

print(f"Detected encoding: {result['encoding']}")

Detected encoding: utf-8


In [2]:
# import pandas as pd

# df = pd.read_csv("datasets/best_cities.csv", encoding="UTF-8", encoding_errors="replace") 

# print(df.head(10))

In [3]:
import pandas as pd

# Load the city rankings as UTF-8; bytes that fail to decode are substituted
# with a replacement character instead of raising.
df = pd.read_csv(
    "datasets/best_cities.csv",
    encoding="utf-8",
    encoding_errors="replace",
)

# Dump the whole frame as a Markdown table.
print(df.to_markdown())

|     |   Ranking | City              | Country              |   Remote connection: Average WiFi speed (Mbps per second) |   Co-working spaces: Number of co-working spaces |   Caffeine: Average price of buying a coffee |   Travel: Average price of taxi (per km) |   After-work drinks: Average price for 2 beers in a bar |   Accommodation: Average price of 1 bedroom apartment per month |   Food: Average cost of a meal at a local, mid-level restaurant |   Climate: Average number of sunshine hours |   Tourist attractions: Number of ‘Things to do’ on Tripadvisor |   Instagramability: Number of photos with # |
|----:|----------:|:------------------|:---------------------|----------------------------------------------------------:|-------------------------------------------------:|---------------------------------------------:|-----------------------------------------:|--------------------------------------------------------:|----------------------------------------------------------------:|--

In [4]:
# Flag every missing cell once; the total and the row filter both derive from this mask
na_mask = df.isna()

# Grand total of missing cells across all columns
missing_values = na_mask.sum().sum()
print(f"Total missing values: {missing_values}")

# Keep just the rows in which one or more cells are missing, and show them in full
null_rows = df[na_mask.any(axis=1)]
print(null_rows.to_markdown())

Total missing values: 0
| Ranking   | City   | Country   | Remote connection: Average WiFi speed (Mbps per second)   | Co-working spaces: Number of co-working spaces   | Caffeine: Average price of buying a coffee   | Travel: Average price of taxi (per km)   | After-work drinks: Average price for 2 beers in a bar   | Accommodation: Average price of 1 bedroom apartment per month   | Food: Average cost of a meal at a local, mid-level restaurant   | Climate: Average number of sunshine hours   | Tourist attractions: Number of ‘Things to do’ on Tripadvisor   | Instagramability: Number of photos with #   |
|-----------|--------|-----------|-----------------------------------------------------------|--------------------------------------------------|----------------------------------------------|------------------------------------------|---------------------------------------------------------|-----------------------------------------------------------------|----------------------------------

In [5]:
# Show the current dtype of every column in df.
print(df.dtypes)

Ranking                                                            int64
City                                                              object
Country                                                           object
Remote connection: Average WiFi speed (Mbps per second)            int64
Co-working spaces: Number of co-working spaces                     int64
Caffeine: Average price of buying a coffee                       float64
Travel: Average price of taxi (per km)                           float64
After-work drinks: Average price for 2 beers in a bar            float64
Accommodation: Average price of 1 bedroom apartment per month    float64
Food: Average cost of a meal at a local, mid-level restaurant    float64
Climate: Average number of sunshine hours                          int64
Tourist attractions: Number of ‘Things to do’ on Tripadvisor       int64
Instagramability: Number of photos with #                          int64
dtype: object


In [6]:
# Cast every object-dtype column to pandas' 'string' dtype in a single astype call;
# columns of any other dtype are not listed in the mapping and stay as they are
df = df.astype(
    {name: "string" for name, kind in df.dtypes.items() if kind == "object"}
)

# Print the dtypes again to verify the cast
print(df.dtypes)

Ranking                                                                   int64
City                                                             string[python]
Country                                                          string[python]
Remote connection: Average WiFi speed (Mbps per second)                   int64
Co-working spaces: Number of co-working spaces                            int64
Caffeine: Average price of buying a coffee                              float64
Travel: Average price of taxi (per km)                                  float64
After-work drinks: Average price for 2 beers in a bar                   float64
Accommodation: Average price of 1 bedroom apartment per month           float64
Food: Average cost of a meal at a local, mid-level restaurant           float64
Climate: Average number of sunshine hours                                 int64
Tourist attractions: Number of ‘Things to do’ on Tripadvisor              int64
Instagramability: Number of photos with #                                 int64
dtype: object

In [7]:
# Alphabetical list of the distinct, non-missing country labels.
# NOTE(review): the recorded output lists 'Hawaii' next to sovereign countries — confirm that label is intended.
print(sorted(set(df["Country"].dropna())))

['Argentina', 'Australia', 'Austria', 'Belarus', 'Belgium', 'Bolivia', 'Brazil', 'Bulgaria', 'Burma/Myanmar', 'Cambodia', 'Canada', 'Chile', 'China', 'Colombia', 'Costa Rica', 'Croatia', 'Czech Republic', 'Denmark', 'Ecuador', 'Estonia', 'Finland', 'France', 'Germany', 'Greece', 'Hawaii', 'Hong Kong', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Ireland', 'Israel', 'Italy', 'Japan', 'Kuwait', 'Laos', 'Latvia', 'Lebanon', 'Lithuania', 'Malaysia', 'Malta', 'Mexico', 'Morocco', 'Nepal', 'Netherlands', 'New Zealand', 'Norway', 'Oman', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Qatar', 'Romania', 'Russia', 'Senegal', 'Serbia', 'Singapore', 'Slovenia', 'South Africa', 'South Korea', 'Spain', 'Sri Lanka', 'Sweden', 'Switzerland', 'Taiwan', 'Thailand', 'Turkey', 'Ukraine', 'United Arab Emirates', 'United Kingdom', 'United States', 'Uruguay', 'Vietnam']


In [8]:
from currency_converter import CurrencyConverter

# Initialize CurrencyConverter
# NOTE(review): no rate source or date is passed, so the rate depends on the
# library's defaults and may differ between environments — pin it if the
# converted figures must be reproducible.
c = CurrencyConverter()

# Get the exchange rate from USD to EUR
exchange_rate = c.convert(1, 'USD', 'EUR')

# Price columns to convert
# NOTE(review): assumes the source prices are denominated in USD — TODO confirm against the dataset's provenance.
columns_to_convert = [
    'Caffeine: Average price of buying a coffee',
    'Travel: Average price of taxi (per km)',
    'After-work drinks: Average price for 2 beers in a bar',
    'Accommodation: Average price of 1 bedroom apartment per month',
    'Food: Average cost of a meal at a local, mid-level restaurant'
]

# Convert the price columns in place, exactly once. The frame is tagged after
# the conversion so that re-running this cell does not multiply the already
# converted values by the exchange rate a second time. Reloading df from the
# CSV yields a fresh, untagged frame, which is then converted again as expected.
if df.attrs.get("price_currency") != "EUR":
    df[columns_to_convert] = (df[columns_to_convert] * exchange_rate).round(2)
    df.attrs["price_currency"] = "EUR"

# Print the updated DataFrame
print(df.to_markdown())

|     |   Ranking | City              | Country              |   Remote connection: Average WiFi speed (Mbps per second) |   Co-working spaces: Number of co-working spaces |   Caffeine: Average price of buying a coffee |   Travel: Average price of taxi (per km) |   After-work drinks: Average price for 2 beers in a bar |   Accommodation: Average price of 1 bedroom apartment per month |   Food: Average cost of a meal at a local, mid-level restaurant |   Climate: Average number of sunshine hours |   Tourist attractions: Number of ‘Things to do’ on Tripadvisor |   Instagramability: Number of photos with # |
|----:|----------:|:------------------|:---------------------|----------------------------------------------------------:|-------------------------------------------------:|---------------------------------------------:|-----------------------------------------:|--------------------------------------------------------:|----------------------------------------------------------------:|--

In [9]:
import os

# Output location: <folder_path>/cities_new.csv
folder_path = "converted_datasets"
file_path = os.path.join(folder_path, "cities_new.csv")

# Create the target directory when it is missing (no error if it already exists)
os.makedirs(folder_path, exist_ok=True)

# Write the frame as UTF-8 CSV, leaving out the index column
df.to_csv(file_path, encoding="utf-8", index=False)

print(f"CSV file saved successfully at: {file_path}")


CSV file saved successfully at: converted_datasets\cities_new.csv
