# NLP Polarisatie Zelftest obv embeddings

## Mount G-Drive

In [None]:
# Mount Google Drive under /content/drive so the notebook can read/write project files.
from google.colab import drive
drive.mount('/content/drive')
# Project root on Drive; other cells append 'data/...' and 'output/...' to it.
path = '/content/drive/My Drive/Post-X/NLPtool/'

## Get Politician Twitter Handles

In [None]:
# Authenticate the Colab user, then hand the default Google credentials to gspread.
from google.colab import auth
auth.authenticate_user()

import gspread
from google.auth import default
creds, _ = default()

gc = gspread.authorize(creds)

# First worksheet of the 'politician-twitter-handles' spreadsheet.
worksheet = gc.open('politician-twitter-handles').sheet1

# get_all_values gives a list of rows.
rows = worksheet.get_all_values()

# Convert to a DataFrame; no header is passed, so columns are labelled 0, 1, 2, ...
import pandas as pd
df = pd.DataFrame.from_records(rows)

# Select each column as a pandas Series (not a list), skipping row 0
# (presumably the sheet's header row -- verify against the spreadsheet).
# Assumed column layout: 0 = politician, 1 = Twitter handle,
# 2 = party acronym, 3 = party name -- TODO confirm.
politicians = df.loc[1:, 0]
twitter_handles = df.loc[1:, 1]
party_acros = df.loc[1:,2]
party_names = df.loc[1:,3]


## Get Tweets

In [None]:
import tweepy
import csv
from datetime import datetime

# Twitter API credentials
# NOTE(review): these are empty placeholders; the API calls need real values.
# Keep real keys out of version control.
consumer_key = ""
consumer_secret = ""
access_key = ""
access_secret = ""

# Authorization to consumer key and consumer secret
# NOTE: this rebinds the name `auth`, which the sheet-loading cell imported
# from google.colab.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)

# Access to user's access key and access secret
auth.set_access_token(access_key, access_secret)

# Calling API
api = tweepy.API(auth)

# Open/Create a file to append data.
# A dedicated name is used instead of rebinding the notebook-wide `path`:
# other cells build 'data/...' and 'output/...' paths from the project root
# stored in `path`, and would break if it pointed at the data directory.
data_dir = '/content/drive/My Drive/Post-X/NLPtool/data/'
raw_tweets_file = data_dir + datetime.today().strftime('%Y%m%d') + '-raw-tweets.csv'

# newline='' is what the csv module expects (tweets can contain line breaks);
# the explicit encoding keeps the file UTF-8 regardless of the platform default.
csvFile = open(raw_tweets_file, 'a', newline='', encoding='utf-8')

# Use csv Writer
csvWriter = csv.writer(csvFile)

# Write the header only when the file is still empty, so re-running the cell
# on the same day does not insert a second header row in the middle of the data.
if csvFile.tell() == 0:
    csvWriter.writerow(['politician_name', 'party_acro', 'twitter_handle', 'created_at', 'id_str',
                        'in_reply_to_status_id_str', 'in_reply_to_user_id_str',
                        'in_reply_to_screen_name', 'retweeted_status', 'retweet_count', 'full_text'])


# Get Politician Tweets function
def get_politician_tweet(username, number_of_tweets, politician_name, party_acro):
    """Write one CSV row per tweet of `username` to the module-level `csvWriter`.

    Args:
        username: Twitter screen name whose timeline is read.
        number_of_tweets: limit passed to the tweepy cursor's ``items()``.
        politician_name: value stored in the 'politician_name' column.
        party_acro: value stored in the 'party_acro' column.
    """
    timeline = tweepy.Cursor(api.user_timeline, screen_name=username, tweet_mode="extended")
    for tweet in timeline.items(number_of_tweets):
        # The 'retweeted_status' column is a boolean: True when the status
        # object carries a `retweeted_status` attribute.
        retweeted_status = hasattr(tweet, 'retweeted_status')
        # The text is written as str. Encoding it to bytes first would store
        # the bytes repr (b'...' with \xNN escapes) instead of the real
        # characters.
        csvWriter.writerow([politician_name, party_acro, username, tweet.created_at, tweet.id_str,
                            tweet.in_reply_to_status_id_str, tweet.in_reply_to_user_id_str,
                            tweet.in_reply_to_screen_name, retweeted_status,
                            tweet.retweet_count, tweet.full_text])


# Get Politicians and get 500 tweets for each of them.
# try/finally guarantees the file is flushed and closed even if an API call fails.
try:
    for politician_name, twitter_handle, party_acro in zip(politicians, twitter_handles, party_acros):
        get_politician_tweet(twitter_handle, 500, politician_name, party_acro)

    print("Tweets fetched and stored in csv")
finally:
    csvFile.close()

## Get Tweets - Deprecated

In [None]:
# NOTE(review): empty placeholders; fill in before running and keep real
# keys out of version control.
consumer_key = ""
consumer_secret = ""
_access_token = ""
_access_token_secret = ""

import tweepy
from tweepy.auth import OAuthHandler
import numpy as np

def get_politican_tweet(username, number_of_tweets):
    """Return the `text` of the tweets `api.user_timeline` yields for `username`.

    Args:
        username: Twitter screen name to query.
        number_of_tweets: forwarded as the `count` argument of `user_timeline`.

    Returns:
        A list of tweet texts (one string per returned status).
    """
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(_access_token, _access_token_secret)
    # Calling api
    api = tweepy.API(auth)
    tweets = api.user_timeline(screen_name=username, count=number_of_tweets)
    return [tweet.text for tweet in tweets]

# Collect one frame per politician and concatenate once at the end;
# DataFrame.append was removed in pandas 2.0 and copied the data on every call.
frames = []

# The handle/party series are named `twitter_handles` and `party_acros`
# where they are created from the spreadsheet.
for name, twitterhandle, partyacro in zip(politicians, twitter_handles, party_acros):
    tweet = get_politican_tweet(twitterhandle, 200)
    temp_df = pd.DataFrame({'tweet': tweet})
    temp_df['politican'] = name
    temp_df['partyacro'] = partyacro
    frames.append(temp_df)

# pd.concat raises on an empty list, so fall back to an empty frame.
final_df = pd.concat(frames) if frames else pd.DataFrame()

from datetime import datetime

path = '/content/drive/My Drive/Post-X/NLPtool/'

with open(path+'data/'+datetime.today().strftime('%Y%m%d')+'-raw-tweets.csv', 'w', encoding = 'utf-8-sig') as f:
    final_df.to_csv(f)

## Turn CSV into Pandas dataframe

In [None]:
import pandas as pd

# Specify the file path or name
# NOTE(review): hard-coded to the 2023-05-29 export. The lookup relies on
# `path` holding the project root; any cell that rebinds `path` to a
# sub-directory would break it -- confirm before running.
file = path+'/data/20230529-raw-tweets.csv'

# Read the csv file using pandas (this replaces whatever `df` held before)
df = pd.read_csv(file)

# Print the DataFrame to verify the result
print(df)

## New Embeddings Code

In [None]:
!pip install nltk

import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Fetch the NLTK stopword corpus that provides the Dutch list used below
nltk.download('stopwords')

# Dutch stopword set and Snowball stemmer shared by preprocess_text
stop_words = set(stopwords.words('dutch'))
stemmer = SnowballStemmer('dutch')

def preprocess_text(text):
    """Normalise one tweet for further processing.

    Lowercases `text`, deletes URLs and ASCII punctuation, splits on
    whitespace, drops Dutch stopwords, stems the remaining tokens and
    returns them joined by single spaces.
    """
    lowered = text.lower()
    without_urls = re.sub(r"http\S+|www\S+|https\S+", '', lowered, flags=re.MULTILINE)
    without_punctuation = without_urls.translate(str.maketrans('', '', string.punctuation))

    stems = []
    for word in without_punctuation.split():
        if word in stop_words:
            continue
        stems.append(stemmer.stem(word))

    return ' '.join(stems)

# Store the normalised version of every tweet next to the original text
df['processed_text'] = df['full_text'].apply(preprocess_text)

print(df)


In [None]:
import os
import ipywidgets as widgets
from IPython.display import display, clear_output

# Resume from an earlier labeling session when one was saved; otherwise keep
# working on the DataFrame that is already loaded.
if os.path.exists(path+'labeling.csv'):
    df = pd.read_csv(path+'labeling.csv')
else:
    print('No existing labeling file found.')

# Define the widgets
start_idx = widgets.IntText(value=0, description='Start Index:', continuous_update=False)
button_next = widgets.Button(description="Next")
button_prev = widgets.Button(description="Back")
button_save = widgets.Button(description="Save Progress")
extreme_words = widgets.Checkbox(value=False, description='Extreme Words')
dichotomous_lang = widgets.Checkbox(value=False, description='Dichotomous Language')
emotional_appeals = widgets.Checkbox(value=False, description='Emotional Appeals')
dehumanizing_lang = widgets.Checkbox(value=False, description='Dehumanizing Language')
negative_assumptions = widgets.Checkbox(value=False, description='Negative Assumptions')
ignoring_counterarguments = widgets.Checkbox(value=False, description='Ignoring Counterarguments')
overgeneralization = widgets.Checkbox(value=False, description='Overgeneralization')
unverifiable_claims = widgets.Checkbox(value=False, description='Unverifiable Claims')
output = widgets.Output()

# Label column in `df` -> checkbox that edits it. Without this mapping the
# checkbox states were never written anywhere, so saving stored no labels.
label_boxes = {
    'extreme_words': extreme_words,
    'dichotomous_lang': dichotomous_lang,
    'emotional_appeals': emotional_appeals,
    'dehumanizing_lang': dehumanizing_lang,
    'negative_assumptions': negative_assumptions,
    'ignoring_counterarguments': ignoring_counterarguments,
    'overgeneralization': overgeneralization,
    'unverifiable_claims': unverifiable_claims,
}

# Make sure every label column exists (first session, or an older labeling file).
for column in label_boxes:
    if column not in df.columns:
        df[column] = False

# Set the current tweet index
tweet_idx = start_idx.value

def _store_labels():
    """Copy the checkbox states into the row of the tweet currently shown."""
    for column, box in label_boxes.items():
        df.at[tweet_idx, column] = box.value

def _load_labels():
    """Set the checkboxes from the stored labels of the current tweet."""
    for column, box in label_boxes.items():
        stored = df.at[tweet_idx, column]
        # Missing values (NaN) count as "not labelled"; bool(NaN) would be True.
        box.value = bool(stored) if pd.notna(stored) else False

# Define the update function
def update(change):
    """Show the current tweet and its stored labels; `change` is unused."""
    _load_labels()
    with output:
        clear_output()
        print(f'Tweet {tweet_idx+1}/{len(df)}: {df.loc[tweet_idx, "full_text"]}')

# Define the button click event handlers
def on_button_next_clicked(b):
    """Store the labels of the current tweet and move one tweet forward."""
    global tweet_idx
    _store_labels()
    if tweet_idx < len(df)-1:
        tweet_idx += 1
    update(None)

def on_button_prev_clicked(b):
    """Store the labels of the current tweet and move one tweet back."""
    global tweet_idx
    _store_labels()
    if tweet_idx > 0:
        tweet_idx -= 1
    update(None)

def on_start_idx_changed(change):
    """Jump to the index typed into the 'Start Index' box (clamped to the data)."""
    global tweet_idx
    _store_labels()
    tweet_idx = max(0, min(change['new'], len(df) - 1))
    update(change)

def on_button_save_clicked(b):
    """Store the labels of the current tweet and write `df` to labeling.csv."""
    _store_labels()
    df.to_csv(path+'labeling.csv', index=False)
    # Print inside the output widget so the confirmation shows up in the cell.
    with output:
        print("Progress saved.")

# Attach the event handlers to the buttons
button_next.on_click(on_button_next_clicked)
button_prev.on_click(on_button_prev_clicked)
button_save.on_click(on_button_save_clicked)
# The index box was displayed but never observed, so editing it had no effect.
start_idx.observe(on_start_idx_changed, names='value')

# Display the widgets
display(start_idx, button_next, button_prev, button_save,
        extreme_words, dichotomous_lang, emotional_appeals,
        dehumanizing_lang, negative_assumptions, ignoring_counterarguments,
        overgeneralization, unverifiable_claims, output)

# Trigger the initial update
update(None)


# Additional code

## Remove URLs

In [None]:
import re

# Function to remove URLs from a string
def remove_url(text):
    """Return `text` with every 'http...' URL and every 'www.' address removed.

    The dot after 'www' is escaped so that only real 'www.' addresses match;
    unescaped it matched any character and deleted ordinary words that merely
    contain 'www'.
    """
    return re.sub(r'http\S+|www\.\S+', '', text, flags=re.MULTILINE)

# Strip URLs from every tweet, overwriting the 'full_text' column in place
df['full_text'] = df['full_text'].apply(remove_url)

## Calculate readability



In [None]:
!pip install textstat

import textstat

# Set the language to Dutch. textstat supports Dutch since version 0.7.0.
# NOTE: set_lang changes textstat's module-wide setting, so it also applies
# to any later textstat call in this session.
textstat.set_lang("nl")

# Apply the function to the 'full_text' column and store the results in a new column
# (one Flesch reading-ease score per tweet).
df['flesch_reading_ease'] = df['full_text'].apply(textstat.flesch_reading_ease)

# Print the DataFrame to verify the result
print(df)

## Create histogram of readability scores

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Generate histogram
# `party_acros` / `party_names` come from the politician sheet and hold one
# entry per politician, so the same party occurs many times. Reduce them to
# one acronym -> name pair per party (first-seen order) so every party is
# drawn, and listed in the legend, exactly once.
unique_parties = dict(zip(party_acros, party_names))
for party_acro, party_name in unique_parties.items():
    party_scores = df.loc[df['party_acro'] == party_acro, 'flesch_reading_ease']
    sns.histplot(party_scores, kde=True, label=party_name)

plt.title('Readability Scores')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xlim(-10,110)

path = '/content/drive/My Drive/Post-X/NLPtool/'
# bbox_inches='tight' keeps the legend, which sits outside the axes, inside
# the saved image.
plt.savefig(path + '/output/20230529_readability_scores', bbox_inches='tight')

## Word Cloud Generation

In [None]:
from wordcloud import WordCloud

def remove_hex_codes(text):
    r"""Remove literal ``\xNN`` escape sequences (backslash, 'x', two hex digits).

    Exactly two hex digits are matched. An open-ended count would also
    swallow any following letters a-f / digits that belong to the next word
    (e.g. the "ee" of "een").
    """
    return re.sub(r'\\x[a-fA-F0-9]{2}', '', text)

# Create the 'cleaned_text' column from 'full_text' with the \xNN escape
# sequences removed; 'full_text' itself is left untouched.
df['cleaned_text'] = df['full_text'].apply(remove_hex_codes)

def remove_byte_indicator(text):
    """Strip the wrapper of a bytes repr (b'...' or b"...") from `text`.

    Only the exact two-character prefix and the matching closing quote are
    removed. Text that does not start with such a prefix is returned
    unchanged. (``str.lstrip("b'")`` treats its argument as a set of
    characters and would also eat leading "b" letters of the tweet itself.)
    """
    for quote in ("'", '"'):
        if text.startswith('b' + quote):
            text = text[2:]
            if text.endswith(quote):
                text = text[:-1]
            break
    return text

# Remove the bytes-repr wrapper from every entry of 'cleaned_text' (in place)
df['cleaned_text'] = df['cleaned_text'].apply(remove_byte_indicator)

import nltk
# Fetch the NLTK stopword corpus that provides the Dutch list used below
nltk.download('stopwords')

import string
from nltk.corpus import stopwords

# Define Dutch stopwords (as a set, for fast membership tests)
dutch_stopwords = set(stopwords.words('dutch'))

def remove_punctuation_and_stopwords(text):
    """Return `text` lowercased, without ASCII punctuation and Dutch stopwords.

    Punctuation is deleted first, the result is lowercased and split on
    whitespace, words found in `dutch_stopwords` are dropped and the rest is
    joined with single spaces.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    lowered = text.translate(punctuation_table).lower()

    kept_words = filter(lambda word: word not in dutch_stopwords, lowered.split())
    return ' '.join(kept_words)

# Clean every entry of the 'cleaned_text' column in place
df['cleaned_text'] = df['cleaned_text'].apply(remove_punctuation_and_stopwords)

# Create a word cloud for each politician
for politician in politicians:
    # Get all tweets from this politician (retweets excluded)
    own_tweets = (df['politician_name'] == politician) & (df['retweeted_status'] != True)
    tweets = df.loc[own_tweets, 'cleaned_text'].values

    # Create one big string from all tweets
    text = ' '.join(tweets)

    # WordCloud.generate() raises ValueError when it gets no words at all,
    # which would abort the loop for everyone after this politician.
    if not text.strip():
        print(f'No tweets available for {politician}; skipping word cloud.')
        continue

    plt.figure(figsize=(10,8))
    plt.title(politician)

    # Create and generate a word cloud image
    wordcloud = WordCloud(background_color="white").generate(text)

    # Display the generated image
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")

    # Save BEFORE show(): in a notebook show() finalises the figure, so a
    # savefig() issued afterwards writes an empty canvas.
    plt.savefig(path + '/output/20230529_'+politician+' wordcloud')
    plt.show()

## Analyze dataset

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Assuming that df is your DataFrame and 'created_at' is the column with the tweet timestamp

# Convert 'created_at' column to datetime if it's not
df['created_at'] = pd.to_datetime(df['created_at'])

# Split once into retweets and own tweets. `~is_retweet` selects the same rows
# as `!= True`, including rows where the flag is missing.
is_retweet = df['retweeted_status'] == True
own_tweets = df[~is_retweet]

# Get the number of tweets per politician (not a retweet)
tweet_counts = own_tweets['politician_name'].value_counts()

# Get the number of retweets per politician
retweet_counts = df[is_retweet]['politician_name'].value_counts()

# Get the date of the oldest tweet per politician.
# Computed from `df` directly: `df_no_retweets` is only created in a later
# cell, so referring to it here fails on a fresh run.
oldest_tweets = own_tweets.groupby('politician_name')['created_at'].min()

# Create a new DataFrame for the plot.
# rename_axis pins the name of the column produced by reset_index(); without
# it the name depends on the index name the pandas version assigns.
plot_data = pd.DataFrame({
    'Number_of_tweets': tweet_counts,
    'Number_of_retweets': retweet_counts,
    'Oldest_tweet': oldest_tweets
}).rename_axis('index').reset_index()

# A politician with only tweets (or only retweets) has no entry in the other
# count; treat that as zero so the stacked bars still get a valid height.
count_columns = ['Number_of_tweets', 'Number_of_retweets']
plot_data[count_columns] = plot_data[count_columns].fillna(0)

# Create a bar plot for the number of tweets
fig, ax1 = plt.subplots(figsize=(10, 8))

ax1.bar(plot_data['index'], plot_data['Number_of_tweets'], color='blue', label='Tweets')
ax1.bar(plot_data['index'], plot_data['Number_of_retweets'], color='green', bottom=plot_data['Number_of_tweets'], label='Retweets')
ax1.set_ylabel('Number of Tweets', color='blue')
ax1.set_xlabel('Politician Name')
ax1.tick_params(axis='y', labelcolor='blue')

# Rotate the politician names so they do not overlap
ax1.tick_params(axis='x', labelrotation=90)

# Create a second y-axis for the date of the oldest tweet
ax2 = ax1.twinx()
ax2.plot(plot_data['index'], plot_data['Oldest_tweet'], color='red')
ax2.set_ylabel('Date of Oldest Tweet', color='red')
ax2.tick_params(axis='y', labelcolor='red')

# Add a legend
ax1.legend()

plt.title('Number of Tweets, Retweets, and Date of Oldest Tweet per Politician')
plt.tight_layout()
plt.show()


## Stemming and Lemmatization

In [None]:
!python -m spacy download nl_core_news_lg
import spacy
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer

# Tokenizer data needed by word_tokenize
nltk.download('punkt')

# Initialize the Dutch stemmer
stemmer = SnowballStemmer("dutch")

# Load SpaCy's Dutch model
nlp = spacy.load('nl_core_news_lg')

# Filter out retweets.
# .copy() makes this an independent DataFrame; adding columns to a filtered
# slice of `df` triggers SettingWithCopyWarning and is not guaranteed to work.
df_no_retweets = df[df['retweeted_status'] != True].copy()

# Tokenize and stem each tweet
df_no_retweets['tokens'] = df_no_retweets['cleaned_text'].apply(word_tokenize)
df_no_retweets['stemmed'] = df_no_retweets['tokens'].apply(lambda tokens: [stemmer.stem(token) for token in tokens])

# Lemmatize each tweet using SpaCy (one list of lemma strings per tweet)
df_no_retweets['lemmatized'] = df_no_retweets['cleaned_text'].apply(lambda text: [token.lemma_ for token in nlp(text)])


## Store in CSV

In [None]:
# Persist the processed tweets without the index column.
# NOTE: list-valued columns end up as their string representation in the CSV;
# the loading cell parses 'lemmatized' back with ast.literal_eval.
df_no_retweets.to_csv(path+'/data/20230529-processed-no-retweets.csv', index=False)

# Notebook display of the result
df_no_retweets

## Load CSV to dataframe

In [None]:
# Specify the file path or name
# NOTE(review): hard-coded to the 2023-05-29 export.
file = path+'/data/20230529-processed-no-retweets.csv'

# Read the csv file using pandas (replaces the in-memory df_no_retweets)
df_no_retweets = pd.read_csv(file)

import ast

# Function to convert string to list
def convert_string_to_list(s):
    """Parse the string form of a Python list back into a list.

    Args:
        s: cell value read from the CSV, e.g. "['de', 'kamer']".

    Returns:
        The parsed list, or an empty list when `s` cannot be parsed (including
        non-string values such as NaN) or parses to something that is not a
        list. Callers join the result with ' '.join, so anything but a list
        would fail there.
    """
    try:
        parsed = ast.literal_eval(s)
    except (ValueError, SyntaxError, TypeError):
        # In case of any parsing problem, return an empty list
        return []
    return parsed if isinstance(parsed, list) else []

# Turn the 'lemmatized' column back into Python lists.
# NOTE: only 'lemmatized' is converted; other list-like columns read from the
# CSV remain plain strings.
df_no_retweets['lemmatized'] = df_no_retweets['lemmatized'].apply(convert_string_to_list)

# Print the DataFrame to verify the result
print(df_no_retweets)

## Drop unnecessary columns

In [None]:
# Drop the raw text and the token column.
# NOTE: drop() raises KeyError if the cell is run a second time, because the
# columns are already gone.
df_no_retweets = df_no_retweets.drop(['full_text', 'tokens'], axis=1)
df_no_retweets

## Sentiment Score

In [None]:
!pip install pattern

from pattern.nl import sentiment

def calculate_sentiment(text):
    """Return the first of the two values pattern.nl's sentiment() gives for `text`.

    The second value is ignored (presumably polarity and subjectivity --
    confirm against the pattern documentation).
    """
    first_value, _second_value = sentiment(text)
    return first_value

# Score every cleaned tweet and keep the result in 'sentiment_score'
df_no_retweets['sentiment_score'] = df_no_retweets['cleaned_text'].apply(calculate_sentiment)

df_no_retweets

## Analyze sentiment scores

In [None]:
def sentiment_grouping(sentiment_compound):
    """Map a sentiment score to 'Positive', 'Neutral' or 'Negative'.

    Scores of 0.05 and above are positive, scores of -0.05 and below are
    negative, everything strictly in between is neutral. A value that
    satisfies none of the comparisons (e.g. NaN) yields None.
    """
    if sentiment_compound >= 0.05:
        return 'Positive'
    if sentiment_compound <= -0.05:
        return 'Negative'
    if -0.05 < sentiment_compound < 0.05:
        return 'Neutral'
    return None

# Label every tweet with its sentiment category
df_no_retweets['sentiment'] = df_no_retweets['sentiment_score'].apply(sentiment_grouping)
# Number of tweets per (politician, sentiment category)
df_no_retweets.groupby(['politician_name', 'sentiment']).size()


## Visualize sentiment

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Count the sentiment categories for each politician
sentiment_counts = df_no_retweets.groupby('politician_name')['sentiment'].value_counts()

# Get the list of politicians
# NOTE: this rebinds the notebook-wide name `politicians` to the names found
# in df_no_retweets.
politicians = df_no_retweets['politician_name'].unique()

# Calculate the number of rows needed for the subplots (four charts per row)
num_rows = int(np.ceil(len(politicians) / 4))

# Create the subplots
fig, axs = plt.subplots(num_rows, 4, figsize=(20, 6*num_rows))

# Flatten the axis array for easier indexing
axs = axs.flatten()

# Define a color map for the sentiment categories
color_map = {'Positive': 'green', 'Neutral': 'blue', 'Negative': 'red'}

for ax, politician in zip(axs, politicians):
    # Skip politicians without any categorised tweet
    if politician not in sentiment_counts:
        continue

    # Get the sentiment counts for this politician
    counts = sentiment_counts[politician]

    # Create a list of colors for the pie chart slices based on the sentiment categories
    colors = [color_map[label] for label in counts.index]

    # Create a pie chart in this politician's subplot
    ax.pie(counts, labels=counts.index, autopct='%1.1f%%', startangle=140, colors=colors)
    ax.set_title(f'Sentiment Distribution for {politician}')

# Remove empty subplots
for ax in axs[len(politicians):]:
    fig.delaxes(ax)

plt.tight_layout()
# Save BEFORE show(): in a notebook show() finalises the figure, so a
# savefig() issued afterwards writes an empty canvas.
plt.savefig(path + '/output/20230529-sentiment-charts')
plt.show()


## Bag of Words - Feature engineering

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# One CountVectorizer instance, re-fitted for every politician below
vectorizer = CountVectorizer()

# Build and print a separate bag of words per politician
for politician in df_no_retweets['politician_name'].unique():
    # Lemma lists of this politician's tweets
    is_own_tweet = df_no_retweets['politician_name'] == politician
    tweets = df_no_retweets.loc[is_own_tweet, 'lemmatized']

    # CountVectorizer expects strings, so join every lemma list with spaces
    sentences = tweets.apply(' '.join)

    # Fit on this politician's sentences only and count the words
    bow = vectorizer.fit_transform(sentences)

    # Dense table: one row per tweet, one column per vocabulary word
    bow_df = pd.DataFrame(bow.toarray(), columns=vectorizer.get_feature_names_out())

    # Print the table followed by an empty line
    print(f'Bag of Words for {politician}:')
    print(bow_df)
    print("\n")

## BoW Score

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# One CountVectorizer instance, re-fitted for every politician below
vectorizer = CountVectorizer()

# politician name -> set of words in that politician's vocabulary
politician_bow = {}

for politician in df_no_retweets['politician_name'].unique():
    # Lemma lists of this politician's tweets
    is_own_tweet = df_no_retweets['politician_name'] == politician
    tweets = df_no_retweets.loc[is_own_tweet, 'lemmatized']

    # CountVectorizer expects strings, so join every lemma list with spaces
    sentences = tweets.apply(' '.join)

    # Fit on this politician's sentences only
    bow = vectorizer.fit_transform(sentences)

    # Only the vocabulary is kept; the counts in `bow` are not stored
    politician_bow[politician] = set(vectorizer.get_feature_names_out())

# Function to count the number of words that appear in the BoW
def count_bow_words(tokens, bow):
    """Return how many entries of `tokens` are contained in `bow`.

    Every occurrence counts, so a token that appears twice adds two.
    """
    hits = 0
    for token in tokens:
        if token in bow:
            hits += 1
    return hits

# Create a new column for each politician: for every tweet (of anyone), the
# number of its lemmas that occur in that politician's vocabulary.
for politician, bow in politician_bow.items():
    df_no_retweets[f'{politician}_bow_count'] = df_no_retweets['lemmatized'].apply(count_bow_words, bow=bow)

df_no_retweets


## Show BoW Scores

In [None]:
import pandas as pd

# Names of all per-politician count columns
bow_columns = list(filter(lambda col: col.endswith('_bow_count'), df_no_retweets.columns))

# politician name -> rows of df_no_retweets written by that politician
filtered_data = {}
for politician in df_no_retweets['politician_name'].unique():
    filtered_data[politician] = df_no_retweets[df_no_retweets['politician_name'] == politician]

# Empty result table: one row per politician, one column per count column
summary_pd = pd.DataFrame(columns=bow_columns, index=filtered_data.keys())

# Fill every row with the column totals over that politician's tweets
for politician, data in filtered_data.items():
    column_totals = data[bow_columns].sum()
    summary_pd.loc[politician] = column_totals.to_dict()

print(summary_pd)


## Visualize the interactions

In [None]:
import seaborn as sns

# Convert all columns to numeric type (the summary table is created with
# object columns); values that cannot be converted become NaN
summary_pd = summary_pd.apply(pd.to_numeric, errors='coerce')

# Create a heatmap
sns.heatmap(summary_pd, cmap="YlGnBu")

plt.title('Politician Interaction Heatmap based on BoW')
# Save BEFORE show(): in a notebook show() finalises the figure, so a
# savefig() issued afterwards writes an empty canvas.
plt.savefig(path + '/output/20230529-bow-heatmap')
plt.show()



## No common words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# One CountVectorizer instance, re-fitted for every politician below
vectorizer = CountVectorizer()

# politician name -> bag-of-words table of that politician's tweets
bows = {}

for politician in df_no_retweets['politician_name'].unique():
    # Lemma lists of this politician's tweets
    is_own_tweet = df_no_retweets['politician_name'] == politician
    tweets = df_no_retweets.loc[is_own_tweet, 'lemmatized']

    # CountVectorizer expects strings, so join every lemma list with spaces
    sentences = tweets.apply(' '.join)

    # Fit on this politician's sentences only and count the words
    bow = vectorizer.fit_transform(sentences)

    # Dense table: one row per tweet, one column per vocabulary word
    bow_df = pd.DataFrame(bow.toarray(), columns=vectorizer.get_feature_names_out())

    # Keep the table for the common-word filtering
    bows[politician] = bow_df

# Function to remove common words across all BoWs
def remove_common_words(bows):
    """Drop, in place, every column that occurs in all tables of `bows`.

    Args:
        bows: dict mapping a name to a DataFrame whose columns are words.

    Returns:
        The same `bows` dict; its DataFrames have been modified in place.
    """
    # Words shared by every table = intersection of all column sets
    column_sets = [set(frame.columns) for frame in bows.values()]
    shared_words = set.intersection(*column_sets) if column_sets else set()

    # Remove the shared words from each table
    to_drop = list(shared_words)
    for frame in bows.values():
        frame.drop(columns=to_drop, inplace=True, errors='ignore')
    return bows

# Remove common words (the tables inside `bows` are modified in place)
bows = remove_common_words(bows)

# Show the bag of words that is left for each politician
for politician, bow in bows.items():
    print(f'Bag of Words for {politician}:')
    print(bow)
    print("\n")


In [None]:
# Re-calculate _bow_count values against the vocabularies without common words
for politician, bow_df in bows.items():
    unique_words = bow_df.columns
    df_no_retweets[f'{politician}_bow_count'] = df_no_retweets['lemmatized'].apply(lambda words: len([word for word in words if word in unique_words]))

# Prepare the summary dataframe: one row per politician, one column per
# '<politician>_bow_count' column
summary_df = pd.DataFrame(index=df_no_retweets['politician_name'].unique())
bow_columns = [col for col in df_no_retweets.columns if col.endswith('_bow_count')]

for politician in summary_df.index:
    # Filter once per politician instead of once per (politician, column) pair
    own_tweets = df_no_retweets[df_no_retweets['politician_name'] == politician]
    for bow_column in bow_columns:
        summary_df.loc[politician, bow_column] = own_tweets[bow_column].sum()

# Ensure that the data is of numeric type
summary_df = summary_df.astype(float)

# Generate heatmap
plt.figure(figsize=(10,10))
sns.heatmap(summary_df, cmap="YlGnBu")
plt.title('Politician Interaction Heatmap based on BoW w/o common words')
# Save BEFORE show(): in a notebook show() finalises the figure, so a
# savefig() issued afterwards writes an empty canvas.
plt.savefig(path + '/output/20230529-bow-heatmap-no-common')
plt.show()


## TF IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

# Initialize a TfidfVectorizer
vectorizer = TfidfVectorizer()

# Convert list of lemmatized tokens back to sentences for each tweet
sentences = df_no_retweets['lemmatized'].apply(' '.join)

# Generate the TF-IDF feature matrix (fitted on all tweets at once)
tfidf_matrix = vectorizer.fit_transform(sentences)

# Create a DataFrame for better visualization
# NOTE: toarray() turns the sparse matrix into a dense
# (tweets x vocabulary) table, which can use a lot of memory.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Save the fitted TfidfVectorizer to a file so the prediction cell can load it
joblib.dump(vectorizer, path+'output/tfidf_vectorizer.pkl')

print(tfidf_df)


## Random Forest model training

In [None]:
!pip install joblib

import joblib

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

# Add readability and sentiment scores to our feature matrix.
# The values are assigned by position (.to_numpy()): `tfidf_df` has a fresh
# 0..n-1 index, while `df_no_retweets` may still carry the index of the
# unfiltered data. Index-aligned assignment would then mismatch rows and
# fill the gaps with NaN.
tfidf_df['flesch_reading_ease'] = df_no_retweets['flesch_reading_ease'].to_numpy()
tfidf_df['sentiment'] = df_no_retweets['sentiment_score'].to_numpy()

# Create a label encoder for the target variable (politician_name)
le = LabelEncoder()
y = le.fit_transform(df_no_retweets['politician_name'])

# Perform a train-test split
X_train, X_test, y_train, y_test = train_test_split(tfidf_df, y, test_size=0.2, random_state=42)

# Initialize a RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier
clf.fit(X_train, y_train)

# Save the model
joblib.dump(clf, path+'output/230529_rf_model.pkl')

# Save the label encoder next to the model: the classifier only knows the
# integer class ids, so the encoder is needed to turn predictions back into
# politician names in a later session.
joblib.dump(le, path+'output/230529_label_encoder.pkl')

# Predict the labels for the test set
y_pred = clf.predict(X_test)

# Print a classification report.
# `labels` lists every encoded class explicitly; otherwise the report fails
# when a politician happens to be absent from the test split and the number
# of classes no longer matches `target_names`.
all_labels = list(range(len(le.classes_)))
print(classification_report(y_test, y_pred, labels=all_labels, target_names=le.classes_, zero_division=0))

## Predict a tweet

In [None]:
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

import nltk
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import re

!python -m spacy download nl_core_news_lg
import os

import pandas as pd
import spacy
import textstat
from pattern.nl import sentiment

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Define Dutch stopwords
dutch_stopwords = set(stopwords.words('dutch'))

# Initialize stemmer (not used for prediction: the TF-IDF vocabulary is
# fitted on spaCy lemmas, not on stems)
stemmer = SnowballStemmer('dutch')

# Load SpaCy's Dutch model for lemmatization. The training data is lemmatized
# with this pipeline, so prediction has to use it as well. The former
# `spacy.lemmatizer.Lemmatizer` import does not exist in spaCy 3.
nlp = spacy.load('nl_core_news_lg')

# Readability scores are computed with the Dutch settings during training
textstat.set_lang("nl")

def remove_urls(text):
    """Return `text` with every 'http...' URL and every 'www.' address removed."""
    return re.sub(r'http\S+|www\.\S+', '', text, flags=re.MULTILINE)

def remove_hex_codes(text):
    r"""Remove literal ``\xNN`` escape sequences (exactly two hex digits each)."""
    return re.sub(r'\\x[a-fA-F0-9]{2}', '', text)

def remove_byte_indicator(text):
    """Strip the wrapper of a bytes repr (b'...' or b"...") from `text`.

    Only the exact prefix and the matching closing quote are removed;
    lstrip("b'") would also eat leading "b" letters of the tweet itself.
    """
    for quote in ("'", '"'):
        if text.startswith('b' + quote):
            text = text[2:]
            if text.endswith(quote):
                text = text[:-1]
            break
    return text

def remove_punctuation_and_stopwords(text):
    """Return `text` lowercased, without ASCII punctuation and Dutch stopwords."""
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Lowercase the words
    text = text.lower()

    # Remove stopwords
    words = text.split()
    words = [word for word in words if word not in dutch_stopwords]

    # Reconstruct the sentence
    return ' '.join(words)

# Load the model
rf_model = joblib.load(path + 'output/230529_rf_model.pkl')

# Load the TfidfVectorizer
tfidf_vectorizer = joblib.load(path + 'output/tfidf_vectorizer.pkl')

def _load_label_encoder():
    """Return the LabelEncoder that maps class ids to names, or None.

    A saved encoder file is preferred; otherwise the encoder `le` of the
    current session is used when the training cell has been run.
    """
    encoder_file = path + 'output/230529_label_encoder.pkl'
    if os.path.exists(encoder_file):
        return joblib.load(encoder_file)
    try:
        return le
    except NameError:
        return None

def _clean_tweet(tweet):
    """Apply the same cleaning steps that produce 'cleaned_text' in training."""
    tweet = remove_urls(tweet)
    tweet = remove_hex_codes(tweet)
    tweet = remove_byte_indicator(tweet)
    return remove_punctuation_and_stopwords(tweet)

# Function to preprocess the tweet
def preprocess_tweet(tweet):
    """Clean `tweet` and return its spaCy lemmas joined by spaces."""
    cleaned = _clean_tweet(tweet)
    return ' '.join(token.lemma_ for token in nlp(cleaned))

# Function to vectorize the tweet
def vectorize_tweet(tweet):
    """Return the TF-IDF vector (1 x vocabulary, sparse) of the preprocessed tweet."""
    return tfidf_vectorizer.transform([preprocess_tweet(tweet)])

# Function to predict the politician
def predict_politician(tweet):
    """Return the model's prediction (encoded class label) for `tweet`.

    The classifier is trained on the TF-IDF columns plus a readability and
    a sentiment column. The feature row is assembled the same way here, so
    column names, order and count match the training matrix.
    """
    tweet_vector = vectorize_tweet(tweet)
    features = pd.DataFrame(tweet_vector.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
    features['flesch_reading_ease'] = textstat.flesch_reading_ease(remove_urls(tweet))
    features['sentiment'] = sentiment(_clean_tweet(tweet))[0]

    # Predict the politician using the Random Forest model
    prediction = rf_model.predict(features)

    return prediction[0]

# Ask the user to enter a tweet
tweet = input("Please enter a tweet: ")

# Predict the politician and print the result
predicted_politician = predict_politician(tweet)

# The model outputs an integer class id; translate it to the politician's
# name when the label encoder is available.
label_encoder = _load_label_encoder()
if label_encoder is not None:
    predicted_politician = label_encoder.inverse_transform([predicted_politician])[0]

print(f"The tweet is predicted to be written by: {predicted_politician}")


## Unmount G-Drive



In [None]:
# Flush pending writes and unmount Google Drive; run this last, after all
# files under `path` have been written.
drive.flush_and_unmount()