In [98]:
import os
import pickle

import bs4
import pandas as pd
from bs4 import BeautifulSoup

In [99]:
class Tweet():
    
    def __init__(self, author, permalink, time, text):
        self.author = author
        self.permalink = permalink
        self.time = time
        self.text = text

def emoji_converter(emoji):
    return {
        "Emoji: Croix" : lambda : ":x:",
        "Emoji: Coche blanche en gras" : lambda : ":white_check_mark:",
        "Emoji: Triangle pointant vers la droite" : lambda : ":arrow_right:",
        "Emoji: Panneau chantier " : lambda : ":construction:",
        "Emoji: Index pointant vers la droite" : lambda : ":point_right:",
        "Emoji: Clé" : lambda : ":wrench:"
    }.get(emoji,lambda: None)()

def tweet_converter(tweet):
    s = ""
    for e in tweet.contents:
        # Image
        if e.name == "img" and "Emoji" in e.attrs.get("class"):
            emoji = emoji_converter(e.attrs.get("aria-label"))
            if emoji:
                s += emoji
        # Add text
        if type(e) is bs4.element.NavigableString:
            s += e
        # Transform @mention as text
        if e.name == "a" and "twitter-atreply" in e.attrs.get("class"):
            s += e.text
        # Add #hashtag as text
        if e.name == "a" and "twitter-hashtag" in e.attrs.get("class"):
            s += e.text
        # Add link as text
        if e.name == "a" and "twitter-timeline-link" in e.attrs.get("class"):
            s += " " + e.attrs.get("href")
    return s


In [117]:
FILES = !ls pages
ratp_tweets = {}
for file in FILES:
    print(file)
    with open(f"pages/{file}", "r") as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
        
        author = os.path.splitext(file)[0]
        
        tweets = []
        
        tweets_div = soup.find_all("div", class_="tweet")
        for tweet in tweets_div:
            text = tweet.find("p", class_="tweet-text")
            text = tweet_converter(text)
            time = tweet.find("a", class_="tweet-timestamp js-permalink js-nav js-tooltip").attrs.get("title") 
            url  = tweet.attrs.get("data-permalink-path")
            tweets.append(Tweet(author, url, time, text))
            
        ratp_tweets.update({f"{author}": tweets})

Ligne10_RATP.html
Ligne11_RATP.html
Ligne12_RATP.html
Ligne13_RATP.html
Ligne14_RATP.html
Ligne1_RATP.html
Ligne2_RATP.html
Ligne3_RATP.html
Ligne4_RATP.html
Ligne5_RATP.html
Ligne6_RATP.html
Ligne7_RATP.html
Ligne8_RATP.html
Ligne9_RATP.html
RER_A.html
RERB.html
RERC_SNCF.html
RERD_SNCF.html
T1_RATP.html
T2_RATP.html
T3a_RATP.html
T3b_RATP.html
T6_RATP.html
T7_RATP.html
T8_RATP.html


In [101]:
with open("tweets.pickle", "wb") as f:
    pickle.dump(ratp_tweets, f)

## Dataset creation

In [126]:
columns = ["author", "time", "permalink", "text"]
df = pd.DataFrame(columns=columns)

In [127]:
for ligne, tweets in ratp_tweets.items():
    data = [[tweet.author, tweet.time, tweet.permalink, tweet.text] for tweet in tweets]
    df = pd.concat([
        df,
        pd.DataFrame(data=data, columns=columns)
    ], ignore_index=True)

In [128]:
# Sanity check: (number of rows, number of columns) of the assembled frame.
df.shape

(19513, 4)

In [129]:
# Bare last expression so the notebook renders the frame for inspection.
df

Unnamed: 0,author,time,permalink,text
0,Ligne10_RATP,08:53 - 5 déc. 2019,/Ligne10_RATP/status/1202632010122043393,[Mouvement Social] Prévisions de trafic pour ...
1,Ligne10_RATP,22:06 - 5 déc. 2019,/RATPgroup/status/1202831581876015105,[Mouvement Social] 1 bus sur 3 circule aujour...
2,Ligne10_RATP,08:29 - 4 déc. 2019,/Ligne10_RATP/status/1202263793025331201,"[Mouvement Social] A partir du 5 décembre, l..."
3,Ligne10_RATP,02:56 - 4 déc. 2019,/Ligne10_RATP/status/1202179755845529601,"11:55, la rame repart de Vaneau en dir. de Pt ..."
4,Ligne10_RATP,02:51 - 4 déc. 2019,/Ligne10_RATP/status/1202178491241902080,"11:50, la rame stationne à Vaneau en dir. de P..."
...,...,...,...,...
19508,T8_RATP,11:40 - 2 mai 2016,/T8_RATP/status/727206268578967553,Retour à un trafic régulier sur l'ensemble de ...
19509,T8_RATP,11:21 - 2 mai 2016,/T8_RATP/status/727201486585913344,"20:18, le trafic est interrompu entre EPINAY -..."
19510,T8_RATP,06:41 - 28 avr. 2016,/T2_RATP/status/725681308429606913,#ConcoursPhoto #photogRATPhie : toutes les inf...
19511,T8_RATP,14:52 - 27 avr. 2016,/T8_RATP/status/725442647800815616,Retour à un trafic régulier sur l'ensemble de ...


In [130]:
with open("df.pickle", "wb") as f:
    pickle.dump(df, f)