In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm

def get_city_info(city):
    website_url = requests.get('https://en.wikipedia.org/wiki/' + city).text
    soup = BeautifulSoup(website_url, 'lxml')
    flag = 0
    s1 = 'Country'
    s2 = 'Nation'

    # Creating empty data with relevant population fields. Note that Wiki has City, Metro and Urban columns for population count
    city_data = {'City_name': [], 'Urban': [], 'Metro': [], 'City': [], 'Total': []}

    for i in range(0, 35):
        try:
            table = soup.find_all('tr', {'class': ['mergedtoprow', 'mergedrow']})[i]
            key = table.find('th').text
            try:
                value = table.find('td').text
            except:
                value = ""
            if flag == 0:
                if s1 in key or s2 in key:
                    flag = 1
                    continue
            if flag == 0:
                continue

            # Remove square brackets from key and value
            key = ''.join(c for c in key if c not in '[]')
            value = ''.join(c for c in value if c not in '[]')

            # Remove leading bullet point and whitespace
            value = value.strip().lstrip('•')

            # Getting every possible name for "population"
            city_data['City_name'].append(city)
            city_data['Urban'].append(value if 'urban' in key.lower() else "")
            city_data['Metro'].append(value if 'metro' in key.lower() else "")
            city_data['City'].append(value if 'city' in key.lower() else "")
            city_data['Total'].append(value if 'total' in key.lower() else "")

        except:
            continue

    return pd.DataFrame(city_data)

# Example usage with a list of cities
city_names = ['astana',	'kokshetau',	'stepnogorsk',	'atbasar',	'shchuchinsk',	'almaty',	'esik',	'kapshagai-kapchagai',	'kaskelen',	'talgar',	'taldykorgan',	'aktobe',	'khromtau',	'atyrau',	'Kulsary',	'semei',	'Oskemen',	'taraz',	'aksai',	'Uralsk',	'balkhash',	'zhezkazgan',	'karaganda',	'satpaev',	'temirtau',	'kostanai',	'rudnyi-1',	'aral-sk',	'kyzylorda',	'aktau',	'zhanaozen',	'pavlodar',	'ekibastuz',	'petropavlovsk-1',	'turkestan',	'shymkent-chimkent',	'beineu',	'aiteke-bi',	'shamalgan-stantsiia',	'baiserke',	'sao-paulo',	'mexico-city',	'boa-vista',	'salvador',	'fortaleza',	'monterrey',	'saltillo',	'merida-1',	'cucuta',	'bogota',	'lima',	'santiago-1',	'guadalajara',	'puebla',	'ciudad-juarez',	'tijuana',	'leon-de-los-aldama',	'chihuahua',	'san-luis-potosi',	'aguascalientes',	'santiago-de-queretaro',	'hermosillo',	'medellin',	'barranquilla',	'cartagena-1',	'popayan',	'valledupar',	'monteria',	'pasto',	'pereira',	'bucaramanga',	'sincelejo',	'cali',	'trujillo-1',	'arequipa',	'ibague',	'shieli',	'recife',	'joao-pessoa',	'rio-de-janeiro',	'curitiba',	'quito',	'puerto-vallarta',	'villavicencio',	'manaus',	'brasilia',	'goiania',	'campo-grande',	'belo-horizonte',	'belem',	'campinas',	'porto-alegre',	'barrancabermeja',	'Chapaev',	'alexandria-lskndry',	'Cairo',	'Damietta',	'Tanta',	'ismailia-lsm-yly',	'mansoura-lmnswr',]
result_df = pd.DataFrame(columns=['City_name', 'Urban', 'Metro', 'City', 'Total'])

for city in tqdm(city_names):
    city_info = get_city_info(city)
    if not city_info.empty:
        result_df = pd.concat([result_df, city_info], ignore_index=True)

# Bad solution, it will remove every string value
# result_df['Urban'] = result_df['Urban'].astype(object).replace(',','.').apply(lambda x: pd.to_numeric(x, errors='ignore'))
# result_df['Metro'] = result_df['Metro'].astype(object).replace(',','.').apply(lambda x: pd.to_numeric(x, errors='ignore'))

# Remove duplicates and drop null rows
result_df_cleaned = result_df.drop_duplicates().dropna()

# Sort and display the DataFrame
result_df_sorted = result_df_cleaned.sort_values(by=['City_name', 'Urban', 'Metro', 'City', 'Total'], ascending=False)
result_df_sorted

  0%|          | 0/100 [00:00<?, ?it/s]  1%|          | 1/100 [00:01<02:02,  1.23s/it]  2%|▏         | 2/100 [00:02<01:57,  1.20s/it]  3%|▎         | 3/100 [00:02<01:28,  1.10it/s]  4%|▍         | 4/100 [00:03<01:13,  1.30it/s]  5%|▌         | 5/100 [00:04<01:14,  1.28it/s]  6%|▌         | 6/100 [00:05<01:20,  1.16it/s]  7%|▋         | 7/100 [00:06<01:19,  1.17it/s]  8%|▊         | 8/100 [00:07<01:20,  1.14it/s]  9%|▉         | 9/100 [00:07<01:14,  1.22it/s] 10%|█         | 10/100 [00:08<01:08,  1.32it/s] 11%|█         | 11/100 [00:09<01:07,  1.33it/s] 12%|█▏        | 12/100 [00:10<01:22,  1.06it/s] 13%|█▎        | 13/100 [00:11<01:24,  1.03it/s] 14%|█▍        | 14/100 [00:12<01:14,  1.15it/s] 15%|█▌        | 15/100 [00:12<01:02,  1.36it/s] 16%|█▌        | 16/100 [00:13<00:58,  1.43it/s] 17%|█▋        | 17/100 [00:13<00:54,  1.53it/s] 18%|█▊        | 18/100 [00:14<00:54,  1.49it/s] 19%|█▉        | 19/100 [00:15<00:56,  1.42it/s] 20%|██        | 20/100 [00:15<00:47,

Unnamed: 0,City_name,Urban,Metro,City,Total
208,zhezkazgan,90661,,,
207,zhezkazgan,,,86227,
201,zhezkazgan,,,,
299,zhanaozen,,,,516 km2 (199 sq mi)
302,zhanaozen,,,,147962
...,...,...,...,...,...
1081,Damietta,,,,
1068,Cairo,,22183000,,
1064,Cairo,,"2,734 km2 (1,056 sq mi)",,
1067,Cairo,,,101001661,
