In [19]:
import pandas as pd
import numpy as np

In [7]:
# Read the cell-phone totals CSV from its GitHub-hosted raw URL.
df = pd.read_csv("https://raw.githubusercontent.com/su-mt4007/data/refs/heads/main/cell_phones_total.csv")

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
df.info()
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214 entries, 0 to 213
Data columns (total 57 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   iso-3   214 non-null    object 
 1   1960    200 non-null    float64
 2   1965    200 non-null    float64
 3   1966    0 non-null      float64
 4   1967    0 non-null      float64
 5   1968    0 non-null      float64
 6   1969    0 non-null      float64
 7   1970    200 non-null    float64
 8   1971    0 non-null      float64
 9   1972    0 non-null      float64
 10  1973    0 non-null      float64
 11  1974    0 non-null      float64
 12  1975    200 non-null    float64
 13  1976    200 non-null    float64
 14  1977    200 non-null    float64
 15  1978    200 non-null    float64
 16  1979    200 non-null    float64
 17  1980    200 non-null    object 
 18  1981    199 non-null    object 
 19  1982    199 non-null    object 
 20  1983    198 non-null    object 
 21  1984    196 non-null    object 
 22  19

In [21]:
# Check which distinct values the "1967" column holds.
df.loc[:, "1967"].unique()

array([nan])

In [23]:
# List the distinct raw values of the "2018" column to see how they are written.
df.loc[:, "2018"].unique()

array([nan, '22M', '13.3M', '2.71M', '82.6k', '20.1M', '58.6M', '3.58M',
       '27.6M', '11M', '10.3M', '6.32M', '11.4M', '9.46M', '19.3M',
       '162M', '8.39M', '2.09M', '389k', '3.46M', '11.6M', '246k', '207M',
       '329k', '566k', '704k', '3.38M', '1.28M', '33.2M', '10.8M',
       '25.2M', '1.65B', '33.8M', '18.5M', '36.5M', '5M', '64.5M', '499k',
       '610k', '8.5M', '5.37M', '186k', '1.2M', '12.7M', '108M', '395k',
       '75.8k', '7.22M', '8.94M', '47.2M', '15.8M', '93.8M', '54.2M',
       '1.92M', '7.15M', '70.4M', '56.8k', '2.93M', '79.5M', '5.46M',
       '40.9M', '41k', '12M', '3.18M', '1.48M', '591k', '12.2M', '116k',
       '62.5k', '20.5M', '19.9M', '7.59M', '4.39M', '6.4M', '10M', '319M',
       '1.18B', '4.97M', '88.7M', '411k', '10.7M', '83.3M', '2.96M',
       '8.73M', '180M', '26.1M', '49.5M', '8.74M', '19.4M', '53.3k',
       '66.4M', '7.1M', '3.66M', '4.42M', '185k', '47.3k', '30.3M',
       '4.59M', '799k', '2.07M', '2.18M', '44.7M', '32.7k', '3.64M',
      

## In the dataset, temporarily converting missing (blank) values to 0 is a reasonable first step, since all other values are numerical.
## For the remaining values we follow the common convention for magnitude suffixes: "k" stands for 1 000, "M" for 1 000 000, and "B" for 1 000 000 000.

In [11]:
def convert_to_number(s, missing=0):
    """Convert a value such as ``'82.6k'``, ``'22M'`` or ``'1.65B'`` to a number.

    Parameters
    ----------
    s : str, int, float, NumPy scalar or None
        The raw cell value. Strings may end in a magnitude suffix:
        ``k`` (thousand), ``M`` (million) or ``B`` (billion). Surrounding
        whitespace is ignored.
    missing : scalar, default 0
        Value returned for missing input (``None``/``NaN``) and for blank
        strings. The default of 0 keeps the historical behaviour; pass
        ``float("nan")`` to keep missing data distinguishable from real zeros.

    Returns
    -------
    number
        ``s`` itself when it is already numeric, ``missing`` for missing or
        blank input, otherwise the parsed ``float``.

    Raises
    ------
    ValueError
        If ``s`` is a non-blank string that cannot be parsed as a number.
    """
    suffix_multipliers = {"k": 1_000, "M": 1_000_000, "B": 1_000_000_000}

    if pd.isna(s):
        return missing
    # np.number also covers NumPy scalars such as np.int64, which are not
    # subclasses of the built-in int and would otherwise be treated as text.
    if isinstance(s, (int, float, np.number)):
        return s  # Already a number: return it unchanged.

    text = s.strip()
    if not text:
        return missing

    # Only the final character can be a magnitude suffix.
    multiplier = suffix_multipliers.get(text[-1])
    if multiplier is None:
        return float(text)
    return float(text[:-1]) * multiplier
    
# NOTE: this cell overwrites df, so re-running it shows already-converted data
# under "Original data preview".
print("Original data preview:")
print(df.head())

# Column 0 is the "iso-3" code; the remaining columns (named by year) are converted.
for column in df.columns[1:]:
    df[column] = df[column].apply(convert_to_number)

print("\nData after conversion:")
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
df.info()
print(df.head())

Original data preview:
  iso-3  1960  1965  1966  1967  1968  1969  1970  1971  1972  ...  \
0   ABW   0.0   0.0     0     0     0     0   0.0     0     0  ...   
1   AFG   0.0   0.0     0     0     0     0   0.0     0     0  ...   
2   AGO   0.0   0.0     0     0     0     0   0.0     0     0  ...   
3   ALB   0.0   0.0     0     0     0     0   0.0     0     0  ...   
4   AND   0.0   0.0     0     0     0     0   0.0     0     0  ...   

         2010        2011        2012        2013        2014        2015  \
0    132000.0         0.0    135000.0    139000.0    140000.0    141000.0   
1  10200000.0  13800000.0  15300000.0  16800000.0  18400000.0  19700000.0   
2   9400000.0  12100000.0  12800000.0  13300000.0  14100000.0  13900000.0   
3   2690000.0   3100000.0   3500000.0   3690000.0   3360000.0   3400000.0   
4     65500.0     65000.0     63900.0     63900.0     66200.0     71300.0   

         2016        2017        2018        2019  
0         0.0         0.0         0.0    

## Now we can replace all the 0 values with NaN again.

In [21]:
# Turn the placeholder zeros back into NaN. The result is assigned rather than
# using inplace=True, which keeps the step explicit and chainable.
# NOTE(review): this also turns zeros that were real values in the CSV into NaN
# (column "1960" goes from 200 non-null values to 0) — confirm this is intended.
df = df.replace(0, np.nan)

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
df.info()
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214 entries, 0 to 213
Data columns (total 57 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   iso-3   214 non-null    object 
 1   1960    0 non-null      float64
 2   1965    0 non-null      float64
 3   1966    0 non-null      float64
 4   1967    0 non-null      float64
 5   1968    0 non-null      float64
 6   1969    0 non-null      float64
 7   1970    0 non-null      float64
 8   1971    0 non-null      float64
 9   1972    0 non-null      float64
 10  1973    0 non-null      float64
 11  1974    0 non-null      float64
 12  1975    0 non-null      float64
 13  1976    0 non-null      float64
 14  1977    0 non-null      float64
 15  1978    0 non-null      float64
 16  1979    0 non-null      float64
 17  1980    1 non-null      float64
 18  1981    4 non-null      float64
 19  1982    6 non-null      float64
 20  1983    6 non-null      float64
 21  1984    9 non-null      float64
 22  19

## Finally, we express the remaining numerical values in scientific notation.

In [27]:
# Format numeric cells as compact strings; everything else is passed through.
def format_scientific_notation(x):
    """Format a non-missing number with Python's general (``g``) format.

    The ``g`` presentation type keeps up to six significant digits and only
    switches to exponent notation for sufficiently large or small magnitudes
    (e.g. ``10200000.0`` -> ``'1.02e+07'`` but ``132000.0`` -> ``'132000'``).

    Parameters
    ----------
    x : object
        A single cell value.

    Returns
    -------
    str or object
        The formatted string for non-missing ``int``/``float`` values;
        any other value (text, NaN, None) is returned unchanged.
    """
    is_number = isinstance(x, (int, float))
    if not is_number or pd.isna(x):
        return x
    return f"{x:g}"
    
# DataFrame.applymap is deprecated in recent pandas in favour of DataFrame.map;
# fall back to applymap on older versions that do not provide DataFrame.map.
apply_elementwise = df.map if hasattr(df, "map") else df.applymap
# The formatted values are strings, so columns that contain data become object dtype.
df = apply_elementwise(format_scientific_notation)

# Print the transformed dataframe information.
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
df.info()
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214 entries, 0 to 213
Data columns (total 57 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   iso-3   214 non-null    object 
 1   1960    0 non-null      float64
 2   1965    0 non-null      float64
 3   1966    0 non-null      float64
 4   1967    0 non-null      float64
 5   1968    0 non-null      float64
 6   1969    0 non-null      float64
 7   1970    0 non-null      float64
 8   1971    0 non-null      float64
 9   1972    0 non-null      float64
 10  1973    0 non-null      float64
 11  1974    0 non-null      float64
 12  1975    0 non-null      float64
 13  1976    0 non-null      float64
 14  1977    0 non-null      float64
 15  1978    0 non-null      float64
 16  1979    0 non-null      float64
 17  1980    1 non-null      object 
 18  1981    4 non-null      object 
 19  1982    6 non-null      object 
 20  1983    6 non-null      object 
 21  1984    9 non-null      object 
 22  19

  df = df.applymap(format_scientific_notation)
