In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

# Joining Data

First, we load the raw cellphone data and convert it to numeric values. This uses the same code I wrote for the last homework, where NaNs are forward-filled.

In [58]:
# Load the raw cell-phone table and index its rows by ISO-3 country code.
cell = pd.read_csv("https://raw.githubusercontent.com/su-mt4007/data/refs/heads/main/cell_phones_total.csv")
cell = cell.set_index("iso-3")

# Columns with at most two distinct values (NaN counts as one of them) get
# their gaps filled with 0 -- presumably these are years in which nothing
# but 0 was ever recorded; confirm against the data.
for col in cell:
    if len(cell[col].unique()) <= 2:
        # Assign the result back: fillna(inplace=True) on the `cell[col]`
        # selection is chained assignment, which warns on recent pandas and
        # does not update `cell` at all under copy-on-write.
        cell[col] = cell[col].fillna(0)
        
def str_to_number(x, sizes = {"k" : 10**3, "K" : 10**3
                     , "m" : 10**6, "M": 10**6
                     , "b": 10**9, "B" : 10**9}):
    """Convert a value written as (number)(letter), e.g. "1.5k", to a float.

    Parameters
    ----------
    x : object
        Value to convert. It is turned into text with ``str`` first, so
        numbers and numeric strings are accepted as well.
    sizes : dict
        Maps a one-character suffix to the factor the number is multiplied
        by. The default dictionary is only read, never modified.

    Returns
    -------
    float
        The parsed number, scaled by the suffix factor when the suffix is a
        key of ``sizes``. An unknown suffix is ignored.

    Raises
    ------
    ValueError, TypeError
        Propagated from ``float`` when no number can be extracted.
    """
    text = str(x).strip()

    # Plain numbers parse directly. This also covers signed values and
    # scientific notation ("1e+16" is how str() prints large floats), which
    # a leading-digits-only match would truncate to "1".
    try:
        return float(text)
    except ValueError:
        pass

    # Otherwise read the last character as the magnitude suffix; a suffix
    # that is not in `sizes` leaves the number unscaled.
    factor = sizes.get(text[-1:], 1)
    try:
        return float(text[:-1]) * factor
    except ValueError:
        pass

    # Lenient fallback: use only the leading run of digits and dots, so
    # inputs with extra characters before the suffix still convert.
    leading = re.match(r"[0-9\.]+", text)
    if leading:
        return float(leading.group(0)) * factor

    # Nothing numeric found: let float() decide (and raise if it must).
    return float(x)
        
def df_to_numbers(df):
    """Return a new frame with each non-missing cell of ``df`` passed through ``str_to_number``."""
    # na_action="ignore" skips NaN cells instead of handing them to the converter.
    converted = df.map(str_to_number, na_action = "ignore")
    return converted
    
# Convert every entry to a number, then forward-fill along each row
# (axis = 1) so a gap takes the value of the preceding column.
cell = df_to_numbers(cell).ffill(axis = 1)


Now let's load the population data and inspect it.

In [59]:
# Read the population table, discard the leftover "Unnamed: 0" column and
# index the rows by ISO-3 code.
pop = (
    pd.read_csv("https://raw.githubusercontent.com/su-mt4007/data/refs/heads/main/pop_data.csv")
    .drop(columns = "Unnamed: 0")
    .set_index("iso-3")
)
pop

Unnamed: 0_level_0,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
iso-3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ABW,54608.0,55811.0,56682.0,57475.0,58178.0,58782.0,59291.0,59522.0,59471.0,59330.0,...,101288.0,102112.0,102880.0,103594.0,104257.0,104874.0,105439.0,105962.0,106442.0,106585.0
AFE,130692579.0,134169237.0,137835590.0,141630546.0,145605995.0,149742351.0,153955516.0,158313235.0,162875171.0,167596160.0,...,537792950.0,552530654.0,567892149.0,583651101.0,600008424.0,616377605.0,632746570.0,649757148.0,667242986.0,685112979.0
AFG,8622466.0,8790140.0,8969047.0,9157465.0,9355514.0,9565147.0,9783147.0,10010030.0,10247780.0,10494489.0,...,29249157.0,30466479.0,31541209.0,32716210.0,33753499.0,34636207.0,35643418.0,36686784.0,37769499.0,38972230.0
AFW,97256290.0,99314028.0,101445032.0,103667517.0,105959979.0,108336203.0,110798486.0,113319950.0,115921723.0,118615741.0,...,366489204.0,376797999.0,387204553.0,397855507.0,408690375.0,419778384.0,431138704.0,442646825.0,454306063.0,466189102.0
AGO,5357195.0,5441333.0,5521400.0,5599827.0,5673199.0,5736582.0,5787044.0,5827503.0,5868203.0,5928386.0,...,24259111.0,25188292.0,26147002.0,27128337.0,28127721.0,29154746.0,30208628.0,31273533.0,32353588.0,33428486.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
XKX,947000.0,966000.0,994000.0,1022000.0,1050000.0,1078000.0,1106000.0,1135000.0,1163000.0,1191000.0,...,1791000.0,1807106.0,1818117.0,1812771.0,1788196.0,1777557.0,1791003.0,1797085.0,1788878.0,1790133.0
YEM,5542459.0,5646668.0,5753386.0,5860197.0,5973803.0,6097298.0,6228430.0,6368014.0,6515904.0,6673981.0,...,25475610.0,26223391.0,26984002.0,27753304.0,28516545.0,29274002.0,30034389.0,30790513.0,31546691.0,32284046.0
ZAF,16520441.0,16989464.0,17503133.0,18042215.0,18603097.0,19187194.0,19789771.0,20410677.0,21050540.0,21704214.0,...,52443325.0,53145033.0,53873616.0,54729551.0,55876504.0,56422274.0,56641209.0,57339635.0,58087055.0,58801927.0
ZMB,3119430.0,3219451.0,3323427.0,3431381.0,3542764.0,3658024.0,3777680.0,3901288.0,4029173.0,4159007.0,...,14265814.0,14744658.0,15234976.0,15737793.0,16248230.0,16767761.0,17298054.0,17835893.0,18380477.0,18927715.0


There are no large swathes of missing values like in the cell phone data, so we'll have to take a closer look. In the following code I pick all rows that have at least one missing value.

In [60]:
# Keep only the rows in which at least one column is missing.
pop.loc[pop.isnull().any(axis = "columns")]

Unnamed: 0_level_0,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
iso-3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
INX,,,,,,,,,,,...,,,,,,,,,,
PSE,,,,,,,,,,,...,3882986.0,3979998.0,4076708.0,4173398.0,4270092.0,4367088.0,4454805.0,4569087.0,4685306.0,4803269.0


Only two rows have any missing values, meaning all others already have workable data. But first we have to decide what to do with these two rows. If we take a closer look at `INX` we can see that there is not a single non-`NaN` value in it. As such, we have no information to interpolate or otherwise fix the data.

In [61]:
# Pull out the INX row as a Series, print it, and list its distinct values.
temp = pop[pop.index == "INX"].squeeze()
print(temp)
temp.unique()

1960   NaN
1961   NaN
1962   NaN
1963   NaN
1964   NaN
        ..
2016   NaN
2017   NaN
2018   NaN
2019   NaN
2020   NaN
Name: INX, Length: 61, dtype: float64


array([nan])

Next we'll take a look at the data for `PSE`. This is the country code for Palestine, which according to Wikipedia declared its independence in 1988. This explains what we see if we remove the `NaN` values.

In [62]:
# The PSE row as a Series, with its missing years removed.
pop[pop.index == "PSE"].squeeze().dropna()

1990    1978248.0
1991    2068845.0
1992    2163591.0
1993    2262676.0
1994    2366298.0
1995    2474666.0
1996    2587997.0
1997    2706518.0
1998    2776568.0
1999    2848431.0
2000    2922153.0
2001    2997784.0
2002    3075373.0
2003    3154969.0
2004    3236626.0
2005    3320396.0
2006    3406334.0
2007    3494496.0
2008    3591977.0
2009    3689099.0
2010    3786161.0
2011    3882986.0
2012    3979998.0
2013    4076708.0
2014    4173398.0
2015    4270092.0
2016    4367088.0
2017    4454805.0
2018    4569087.0
2019    4685306.0
2020    4803269.0
Name: PSE, dtype: float64

The `NaN` values end in 1990, just after Palestine declared independence. Thus, it is reasonable to set these values to 0, as there was no state of Palestine before then. Keep in mind that a population of 0 makes any per-capita ratio for those years undefined.

In [63]:
# Remove INX (it holds no data at all) and fill the remaining NaNs, which
# all belong to PSE, with 0.
# errors = "ignore" keeps this cell re-runnable: `pop` is overwritten here,
# so a second run would otherwise raise KeyError because INX is already gone.
pop = pop.drop(index = "INX", errors = "ignore").fillna(0)

In [64]:
# Name the column axis "year" so that the index level created by stack()
# (and the column produced by a later reset_index()) carries that name.
pop.columns.name = "year"

In [65]:
# Preview the long format: one population value per (iso-3, year) pair.
pop.stack()

iso-3  year
ABW    1960       54608.0
       1961       55811.0
       1962       56682.0
       1963       57475.0
       1964       58178.0
                  ...    
ZWE    2016    14452704.0
       2017    14751101.0
       2018    15052184.0
       2019    15354608.0
       2020    15669666.0
Length: 16165, dtype: float64

In [66]:
# Keep only the country name and its ISO-3 code, renamed so the code column
# matches the "iso-3" key used by the other tables.
# The rename is chained rather than done with `inplace = 1`: that passed an
# integer where a boolean is expected and mutated a temporary column selection.
count = (
    pd.read_csv("https://raw.githubusercontent.com/su-mt4007/data/refs/heads/main/country_data.csv")[["name", "alpha-3"]]
    .rename(columns = {"alpha-3": "iso-3", "name": "country"})
)

In [73]:
cell.columns.name = "year"

# Long-format versions of both tables: one row per (iso-3, year) pair.
cell_long = cell.stack().reset_index().rename(columns = {0: "n_cellphones"})
pop_long = pop.stack().reset_index().rename(columns = {0: "population"})

# Without `on`, pd.merge joins on the columns the two frames share and keeps
# only the keys present in both (inner join).
newdf = pd.merge(cell_long, pop_long)

# Phones per inhabitant.
# NOTE(review): rows whose population was filled with 0 earlier give NaN/inf here.
newdf["new"] = newdf["n_cellphones"] / newdf["population"]

# Attach the country names.
newdf = pd.merge(newdf, count)

In [74]:
# Only the year, the ratio and the country name are needed from here on.
newdf = newdf.drop(columns = ["n_cellphones", "population", "iso-3"])
newdf


Unnamed: 0,year,new,country
0,1960,0.000000,Aruba
1,1965,0.000000,Aruba
2,1966,0.000000,Aruba
3,1967,0.000000,Aruba
4,1968,0.000000,Aruba
...,...,...,...
11923,2015,0.904278,Zimbabwe
11924,2016,0.892567,Zimbabwe
11925,2017,0.955861,Zimbabwe
11926,2018,0.857018,Zimbabwe


In [75]:
# Index the rows by (country, year).
newdf = newdf.set_index(["country", "year"])

In [76]:
# Show the frame now that it is indexed by (country, year).
newdf

Unnamed: 0_level_0,Unnamed: 1_level_0,new
country,year,Unnamed: 2_level_1
Aruba,1960,0.000000
Aruba,1965,0.000000
Aruba,1966,0.000000
Aruba,1967,0.000000
Aruba,1968,0.000000
...,...,...
Zimbabwe,2015,0.904278
Zimbabwe,2016,0.892567
Zimbabwe,2017,0.955861
Zimbabwe,2018,0.857018


In [83]:
# Pivot the innermost index level (year) into columns: one row per country.
newdf.unstack(level = "year")

Unnamed: 0_level_0,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new,new
year,1960,1965,1966,1967,1968,1969,1970,1971,1972,1973,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Afghanistan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.361835,0.471808,0.502191,0.532637,0.562412,0.583643,0.623625,0.670531,0.599671,0.598366
Albania,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.923440,1.067054,1.206730,1.274571,1.162990,1.180267,1.171725,1.263287,0.945445,0.921452
Algeria,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.914761,0.974180,1.006426,1.039457,1.117126,1.092477,1.165116,1.113365,1.125766,1.102906
American Samoa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.041022,0.041429,0.041906,0.042457,0.043089,0.043802,0.044600,0.045489,0.046465,0.047548
Andorra,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.915841,0.921110,0.899835,0.895372,0.924310,0.993784,1.049076,1.087531,1.101142,1.151383
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Virgin Islands (British),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.723763,1.666547,1.713522,1.863419,1.891547,1.947831,1.334947,1.463739,1.318609,1.306762
Virgin Islands (U.S.),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.741069,0.741527,0.742226,0.743236,0.744332,0.745507,0.746866,0.748502,0.750460,0.752796
Yemen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.448595,0.459263,0.530061,0.622591,0.616143,0.526010,0.560224,0.512746,0.496906,0.484995
Zambia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.395154,0.571997,0.712122,0.682640,0.641767,0.713924,0.715659,0.774654,0.869034,0.935775


# Regex

First we load the text document into a list with one entry per line.

In [59]:
# Read the whole file and split it on newlines into a list of lines.
with open("comments.txt", "r") as handle:
    comments = handle.read().split("\n")

To find all instances of a hashtag in a string we use the `re.findall` function with the regular expression `r"#\w+"`. This matches any part of the string that has a hashtag followed by one or more word characters. As the character `+` is "greedy" it matches as many characters as possible, and so only matches the full hashtag.

In [60]:
# All hashtags ("#" followed by one or more word characters) in the first comment.
re.compile(r"#\w+").findall(comments[0])

['#programming', '#tips']

To find a full string that has both `#programming` and `#python` in it I use `re.fullmatch`, which checks if the whole string matches the expression (you could also use `^` at the beginning and `$` at the end of the expression to get the same result). The actual expression I used simply checks for either "#python" followed by "#programming" or "#programming" followed by "#python". There is definitely a more elegant and generalizable solution to this problem, but this works.

In [63]:
# Case-insensitive check that the whole comment contains both hashtags, in
# either order. The two alternatives must differ: one for "#python" before
# "#programming", the other for "#programming" before "#python".
re.fullmatch(r"(.*#python.*#programming.*)|(.*#programming.*#python.*)", comments[1], re.I)

In this case no match is made since comment 2 does not contain both of those hashtags. To show that it works, we try it on another comment.

In [73]:
# Run the same two-order pattern on a comment that does contain both hashtags.
both_tags = re.fullmatch(r"(.*#python.*#programming.*)|(.*#programming.*#python.*)", comments[5], re.I)

print("comment: " + comments[5])
# group(0) is the full matched text, which for fullmatch is the entire comment.
print("regex: " + both_tags.group(0))

comment: 6. "I learned a lot. #programming #python #tips"
regex: 6. "I learned a lot. #programming #python #tips"


To extract all hashtags from our text I simply loop through all lines of text and use our previously made hashtag-finder, and then add each hashtag to a list if it is not already there.

In [62]:
# Collect every distinct hashtag across all comments, in order of first
# appearance. dict.fromkeys drops repeats while preserving insertion order,
# which replaces the `(m in hashtags) == False` test and its repeated
# linear scans of the list.
hashtags = list(dict.fromkeys(
    tag
    for s in comments
    for tag in re.findall(r"#\w+", s)
))

hashtags

['#programming',
 '#tips',
 '#coding',
 '#python',
 '#tech',
 '#data',
 '#analysis',
 '#innovation',
 '#analytics',
 '#insights',
 '#research']