# Recommender System

In [63]:
#!pip install -U pip setuptools wheel
#!pip install -U spacy
#!python -m spacy download en_core_web_lg
#!python -m spacy download de_core_news_lg

import pandas as pd
import spacy
import os as os

nlp = spacy.load("en_core_web_lg")
#nlp = spacy.load("de_core_news_lg")


### Load prepared Dataset

In [64]:
filename = "all_toots.csv"
path= "../scraper/datasets"
data = pd.read_csv(os.path.join(path, filename), sep=";")
data.head()

Unnamed: 0,toot_id,content,reblogs_count,favourites_count,replies_count,mentions,tags,language,created_at,edited_at,instance
0,110322104651328999,Study makes troubling revelation about the bot...,3,0,0,[],"[{'name': 'ocean', 'url': 'https://mastodon.so...",en,2023-05-06 14:02:02+00:00,,mastodon.social
1,110322072260430195,ふわふわじゃないのに高いの…最悪じゃん〜\n\n,0,0,0,[],[],ja,2023-05-06 13:53:49+00:00,,mastodon.social
2,110322107441942062,\n\n,0,0,0,[],[],en,2023-05-06 14:02:46.388000+00:00,,mastodon.social
3,110322074699247900,2週間ぶり\n\n,1,0,0,[],[],,2023-05-06 13:54:26.666000+00:00,,mastodon.social
4,110322069624451350,せ、生命活動・・・\n[#おうどんラジオ](https://social.vivaldi.n...,0,0,0,[],"[{'name': 'おうどんラジオ', 'url': 'https://mastodon....",ja,2023-05-06 13:53:08+00:00,,mastodon.social


### Select toots in english and german

In [65]:
mask_language = (data["language"] == "en") | (data["language"] == "de")
data = data[mask_language]
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60050 entries, 0 to 99999
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   toot_id           60050 non-null  int64 
 1   content           60050 non-null  object
 2   reblogs_count     60050 non-null  int64 
 3   favourites_count  60050 non-null  int64 
 4   replies_count     60050 non-null  int64 
 5   mentions          60050 non-null  object
 6   tags              60050 non-null  object
 7   language          60050 non-null  object
 8   created_at        60050 non-null  object
 9   edited_at         1880 non-null   object
 10  instance          60050 non-null  object
dtypes: int64(4), object(7)
memory usage: 5.5+ MB


In [66]:
test_toot_df = data
#delete entries with same toot_id
test_toot_df = test_toot_df.drop_duplicates(subset="toot_id", keep="first")
test_toot_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1524 entries, 0 to 56662
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   toot_id           1524 non-null   int64 
 1   content           1524 non-null   object
 2   reblogs_count     1524 non-null   int64 
 3   favourites_count  1524 non-null   int64 
 4   replies_count     1524 non-null   int64 
 5   mentions          1524 non-null   object
 6   tags              1524 non-null   object
 7   language          1524 non-null   object
 8   created_at        1524 non-null   object
 9   edited_at         47 non-null     object
 10  instance          1524 non-null   object
dtypes: int64(4), object(7)
memory usage: 142.9+ KB


### Recommender System for local timeline

- Step 1. Get relevant toots depending on content after selecting the interests (after registration) from people in local timeline
- Step 2. Get toots from people you follow 
- Step 3. Get persons with simular interests (who to follow)
- Step 4. Get toots by hashtags (filter hashtags by interests)
- Step 5. Mix data
- Step 6. Rank the toots in a ranking system and sort them descending

##### Initial problems on setup: 
- missing toots in local timeline
- missing persons with simular interests
- missing toots from peope you follow

##### Solutions:

- Create initial content in local timeline bot content 
- ....

#### Step 1: Get relevant toots depending on content after selecting the interests (after registration) from people in local timeline

In [67]:
interests = ["Klettern", "Gaming", "Datascience", "Politik", "Roboter"] #create list of interests after login/registration

##### Simularity Check with spacy

In [68]:
def lemmatize_text(text):
    """Function to lemmatize text data and remove the stopwords."""
    doc = nlp(text)
    
    # Lemmatization and removal of stop words
    processed_tokens = [token.lemma_ for token in doc if not token.is_stop]
    
    # Return the formatted text as a string
    processed_text = ' '.join(processed_tokens)
    
    return processed_text

In [69]:
# Create new column with lemmatized text
test_toot_df["content_lemma"] = test_toot_df["content"].apply(lemmatize_text)
test_toot_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_toot_df["content_lemma"] = test_toot_df["content"].apply(lemmatize_text)


Unnamed: 0,toot_id,content,reblogs_count,favourites_count,replies_count,mentions,tags,language,created_at,edited_at,instance,content_lemma
0,110322104651328999,Study makes troubling revelation about the bot...,3,0,0,[],"[{'name': 'ocean', 'url': 'https://mastodon.so...",en,2023-05-06 14:02:02+00:00,,mastodon.social,study make troubling revelation ocean : ' \n t...
2,110322107441942062,\n\n,0,0,0,[],[],en,2023-05-06 14:02:46.388000+00:00,,mastodon.social,\n\n
6,110322056600760455,It’s like a sketch. I can’t quite believe it’s...,0,0,0,[],[],en,2023-05-06 13:49:46+00:00,,mastodon.social,like sketch . believe real . \n\n
7,110322106189475983,Catching up with Dwellings? Add all four back ...,0,0,0,"[{'id': 109297712949714412, 'username': 'jstep...","[{'name': 'crowdfunding', 'url': 'https://mast...",en,2023-05-06 14:02:27.281000+00:00,,mastodon.social,catch dwelling ? add issue \n [ @jstephenscomi...
11,110322085673775812,[#KBOS](https://mastodon.social/tags/KBOS) /\n...,0,0,0,[],"[{'name': 'kbos', 'url': 'https://mastodon.soc...",en,2023-05-06 13:57:14.245000+00:00,,mastodon.social,[ # kbos](https://mastodon.social / tag / KBOS...


In [70]:
def calculate_content_similarity_score(interests, toot_dataframe, sort_dataframe_by_content_similarity=True):
    """Function to calculate the similarity score between the interests and the toot content."""
    
    # Create a list of tuples (similarity, toot) for the most similar toots
    similarity_scores = []
    for _, toot in toot_dataframe.iterrows():
        toot_content = toot['content_lemma']
        toot_doc = nlp(toot_content)
        
        # Calculate the average similarity between the interests and the toot content
        similarity_scores_sum = 0
        for interest in interests:
            interest_doc = nlp(interest)
            similarity_scores_sum += toot_doc.similarity(interest_doc)
        
        # Calculate the average similarity score
        similarity_score = similarity_scores_sum / len(interests)
        
        similarity_scores.append((similarity_score, toot))
    
    # Create a new DataFrame with the additional column similarity_score
    result_dataframe = toot_dataframe.copy()
    result_dataframe['content_similarity_score'] = [score for score, _ in similarity_scores]
    
    if sort_dataframe_by_content_similarity:
        # Sort the DataFrame by the column similarity_score (descending) and reset the index
        result_dataframe.sort_values('content_similarity_score', ascending=False, inplace=True)
        result_dataframe.reset_index(drop=True, inplace=True)
    
    
    return result_dataframe

In [71]:
similar_toots = calculate_content_similarity_score(interests, test_toot_df)
similar_toots.head()

  similarity_scores_sum += toot_doc.similarity(interest_doc)


Unnamed: 0,toot_id,content,reblogs_count,favourites_count,replies_count,mentions,tags,language,created_at,edited_at,instance,content_lemma,content_similarity_score
0,110322104757107418,Posted by Post Growth Innovation Lab in\n[http...,0,0,0,[],[],en,2023-05-06 14:01:31+00:00,,mastodon.social,post Post Growth Innovation Lab \n [ https://t...,0.175365
1,110322099419002639,European Political Science Review \n[https://...,0,0,0,[],[],en,2023-05-06 14:00:11+00:00,,mastodon.social,European Political Science Review \n [ https:...,0.169615
2,110322092296070139,Michigan Democrats:\n\nGov Whitmer answering q...,0,0,0,[],[],en,2023-05-06 13:58:54+00:00,2023-05-06 13:59:50+00:00,mastodon.social,Michigan Democrats : \n\n Gov Whitmer answer q...,0.155854
3,110322104419492754,Posted by Matthias Schmelzer in\n[https://twit...,0,0,0,[],[],en,2023-05-06 14:01:31+00:00,,mastodon.social,post Matthias Schmelzer \n [ https://twitter.c...,0.154557
4,110322097753001501,1973 :mkcastle: Lillian Disney helps dedicate ...,0,0,0,[],[],en,2023-05-06 14:00:12+00:00,,mastodon.social,1973 : mkcastle : Lillian Disney help dedicate...,0.152425


### Interaction Score
Im folgenden Abschnitt wird ein Interaktion Score berechnet der sich aus der Summe der Interaktionen (favourites_count, replies_count, reblogs_count) zusammensetzt. Dieser Score wird anschließend auf 0-1 nomiert. 

In [72]:
def calculate_interaction_score(toot_df, sort_by_interaction_score=False):
    """Function to calculate the interaction score of a toot."""
    
    # Calculate the interaction score
    toot_df['interaction_score'] = toot_df['favourites_count'] + toot_df['replies_count'] + toot_df['reblogs_count']
    
    # Normalize the interaction score to the value range [0, 1]
    max_interaction_score = toot_df['interaction_score'].max()
    toot_df['interaction_score'] = toot_df['interaction_score'] / max_interaction_score
    
    if sort_by_interaction_score:
        # Sort the DataFrame according to the interaction score (descending)
        toot_df.sort_values('interaction_score', ascending=False, inplace=True)
        toot_df.reset_index(drop=True, inplace=True)
    
    return toot_df

In [73]:
similar_toots = calculate_interaction_score(similar_toots)
similar_toots.head()

Unnamed: 0,toot_id,content,reblogs_count,favourites_count,replies_count,mentions,tags,language,created_at,edited_at,instance,content_lemma,content_similarity_score,interaction_score
0,110322104757107418,Posted by Post Growth Innovation Lab in\n[http...,0,0,0,[],[],en,2023-05-06 14:01:31+00:00,,mastodon.social,post Post Growth Innovation Lab \n [ https://t...,0.175365,0.0
1,110322099419002639,European Political Science Review \n[https://...,0,0,0,[],[],en,2023-05-06 14:00:11+00:00,,mastodon.social,European Political Science Review \n [ https:...,0.169615,0.0
2,110322092296070139,Michigan Democrats:\n\nGov Whitmer answering q...,0,0,0,[],[],en,2023-05-06 13:58:54+00:00,2023-05-06 13:59:50+00:00,mastodon.social,Michigan Democrats : \n\n Gov Whitmer answer q...,0.155854,0.0
3,110322104419492754,Posted by Matthias Schmelzer in\n[https://twit...,0,0,0,[],[],en,2023-05-06 14:01:31+00:00,,mastodon.social,post Matthias Schmelzer \n [ https://twitter.c...,0.154557,0.0
4,110322097753001501,1973 :mkcastle: Lillian Disney helps dedicate ...,0,0,0,[],[],en,2023-05-06 14:00:12+00:00,,mastodon.social,1973 : mkcastle : Lillian Disney help dedicate...,0.152425,0.0


### Vorübergehender Ranking Score
Im folgenden Abschnitt wird ein Ranking Score berechnet der sich aus der Summe der der gewichteten Scores zusammensetzt. Das Dataframe wird nach dem Ranking Score definiert.

In [74]:
def calculate_ranking_score(toot_df, similarity_weight, interaction_weight):
    """Function to calculate the ranking score of a toot."""
    
    # Calculate the ranking score
    toot_df['ranking_score'] = (similarity_weight * toot_df['content_similarity_score']) + (interaction_weight * toot_df['interaction_score']) 
    
    # Sort the DataFrame according to the ranking score (descending)
    toot_df.sort_values('ranking_score', ascending=False, inplace=True)
    toot_df.reset_index(drop=True, inplace=True)
    
    return toot_df

In [75]:
# Set the weights for Similarity score and Interaction score
similarity_weight = 0.9
interaction_weight = 0.1

# Calculate the ranking score and expand the DataFrame 
toot_df_with_ranking = calculate_ranking_score(similar_toots, similarity_weight, interaction_weight)
toot_df_with_ranking.head()

Unnamed: 0,toot_id,content,reblogs_count,favourites_count,replies_count,mentions,tags,language,created_at,edited_at,instance,content_lemma,content_similarity_score,interaction_score,ranking_score
0,110322104757107418,Posted by Post Growth Innovation Lab in\n[http...,0,0,0,[],[],en,2023-05-06 14:01:31+00:00,,mastodon.social,post Post Growth Innovation Lab \n [ https://t...,0.175365,0.0,0.157829
1,110322099419002639,European Political Science Review \n[https://...,0,0,0,[],[],en,2023-05-06 14:00:11+00:00,,mastodon.social,European Political Science Review \n [ https:...,0.169615,0.0,0.152654
2,110322095551120299,"""Come out,"" the knight called, ""and marry me!""...",163,37,4,[],"[{'name': 'smallstories', 'url': 'https://mast...",en,2023-05-06 13:59:41+00:00,,mastodon.social,""" come , "" knight call , "" marry ! "" \n drago...",0.096065,0.616314,0.14809
3,110322092296070139,Michigan Democrats:\n\nGov Whitmer answering q...,0,0,0,[],[],en,2023-05-06 13:58:54+00:00,2023-05-06 13:59:50+00:00,mastodon.social,Michigan Democrats : \n\n Gov Whitmer answer q...,0.155854,0.0,0.140269
4,110322104419492754,Posted by Matthias Schmelzer in\n[https://twit...,0,0,0,[],[],en,2023-05-06 14:01:31+00:00,,mastodon.social,post Matthias Schmelzer \n [ https://twitter.c...,0.154557,0.0,0.139102


In [76]:
for toot_content in toot_df_with_ranking[:10].content:
    print(toot_content)

Posted by Post Growth Innovation Lab in
[https://twitter.com/post_growth/status/1654610658669785088](https://twitter.com/post_growth/status/1654610658669785088):  
Michael chats with the University of Vigo’s Mario Pansera about innovation's
role in rethinking growth. They discuss his Post-Growth Innovation Lab's...


European Political Science Review  
[https://en.wikipedia.org/wiki/European_Political_Science_Review](https://en.wikipedia.org/wiki/European_Political_Science_Review)


"Come out," the knight called, "and marry me!"  
The dragon emerged from his lair. "Pardon?"  
"You wrote a proclamation, saying that whoever rid the kingdom of its king
would gain your, er, paw in marriage and half the kingdom."  
"That was a jo- Wait, you did?"  
"Yes!"  
"Very well."  
[#MicroFiction](https://mastodon.art/tags/MicroFiction)
[#TootFic](https://mastodon.art/tags/TootFic)
[#SmallStories](https://mastodon.art/tags/SmallStories)


Michigan Democrats:

Gov Whitmer answering question Taylor Mac

## Probleme:
- Performance
    - Der Similarity Check dauert relativ lange
    - Das Lemmatizen dauert relativ lange
    -> Kann beim Öffnen der Timeline zu langer Ladezeit führen.
    
    Mögliche Lösung: 
    - Kategorisierung des Toot Contents nach dem Veröffentlichen, Persistierung in DB, Mustererkennung mit Regex (Abgleich der Interessen mit Kategorien) 
    - Topic Modelling um, Interessen mit Themen abzugleichen https://towardsdatascience.com/short-text-topic-modelling-lda-vs-gsdmm-20f1db742e14.
    - Anzahl der Toots beschränken -> Nur Toots der letzten Stunden/Tage laden
    - Vorverarbeitung: Lemmatizierung und Entfernen der Stopwords nach Veröffentlichung durchführen und speichern

- Content beinhaltet selten Interessen Keywords. Ähnlichkeitsvergleich eher schwierig

## Weitere Schritte:
- Step 2. Get toots from people you follow 
- Step 3. Get persons with similar interests (who to follow)
- Step 4. Get toots by hashtags (filter hashtags by interests) -> Keyword Corrolation https://ourcodingclub.github.io/tutorials/topic-modelling-python/
- Step 5. Mix data
- Step 6. Rank the toots in a ranking system with content similarity, hashtag similarity, interactions, actuality (how new they are), weight for toots from people you follow or persons with similar interests and sort them descending

# Topic Modeling

In [95]:
from sklearn.feature_extraction.text import CountVectorizer


In [101]:
def add_topic_labels(df, text_column):
    # Initialize CountVectorizer
    vectorizer = CountVectorizer(max_df=0.9, min_df=25, token_pattern=r'\w+|\$[\d\.]+|\S+')

    # Apply transformation
    tf = vectorizer.fit_transform(df[text_column]).toarray()

    # Get feature names
    tf_feature_names = vectorizer.get_feature_names_out()

    # Add topic labels as a new feature in the DataFrame
    df['topic_label'] = tf.argmax(axis=1)
    
    # Convert topic labels to text
    df['topic_text'] = df['topic_label'].map(lambda label: tf_feature_names[label])

    return df

In [102]:
df_with_topics= add_topic_labels(toot_df_with_ranking, 'content_lemma')

In [103]:
df_with_topics.head()

Unnamed: 0,toot_id,content,reblogs_count,favourites_count,replies_count,mentions,tags,language,created_at,edited_at,instance,content_lemma,content_similarity_score,interaction_score,ranking_score,topic_label,topic_text
0,110322104757107418,Posted by Post Growth Innovation Lab in\n[http...,0,0,0,[],[],en,2023-05-06 14:01:31+00:00,,mastodon.social,post Post Growth Innovation Lab \n [ https://t...,0.175365,0.0,0.157829,71,post
1,110322099419002639,European Political Science Review \n[https://...,0,0,0,[],[],en,2023-05-06 14:00:11+00:00,,mastodon.social,European Political Science Review \n [ https:...,0.169615,0.0,0.152654,7,)
2,110322095551120299,"""Come out,"" the knight called, ""and marry me!""...",163,37,4,[],"[{'name': 'smallstories', 'url': 'https://mast...",en,2023-05-06 13:59:41+00:00,,mastodon.social,""" come , "" knight call , "" marry ! "" \n drago...",0.096065,0.616314,0.14809,1,""""
3,110322092296070139,Michigan Democrats:\n\nGov Whitmer answering q...,0,0,0,[],[],en,2023-05-06 13:58:54+00:00,2023-05-06 13:59:50+00:00,mastodon.social,Michigan Democrats : \n\n Gov Whitmer answer q...,0.155854,0.0,0.140269,7,)
4,110322104419492754,Posted by Matthias Schmelzer in\n[https://twit...,0,0,0,[],[],en,2023-05-06 14:01:31+00:00,,mastodon.social,post Matthias Schmelzer \n [ https://twitter.c...,0.154557,0.0,0.139102,1,""""
