In [None]:
from bs4 import BeautifulSoup
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')

from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
# nltk.download('all')
import pandas as pd
import ast
import spacy
from tqdm import tqdm

In [None]:
def remove_html_tags(text):
    """Return the text content of `text` with HTML markup removed.

    The input is parsed with BeautifulSoup using the 'html.parser' backend,
    and the text of the parsed document is returned.
    """
    return BeautifulSoup(text, 'html.parser').get_text()

In [None]:
def handle_urls_and_mentions(text):
    """Strip URLs and Reddit-style user mentions from `text`.

    Args:
        text: The comment string to clean.

    Returns:
        `text` with every run of non-whitespace characters beginning at
        ``http`` removed, and then every ``u/name`` or ``/u/name`` mention
        removed. Surrounding whitespace is left as it was.
    """
    # Remove URLs: everything from "http" up to the next whitespace.
    text = re.sub(r'http\S+', '', text)
    # Remove user mentions. The negative lookbehind requires the mention to
    # start a token, so ordinary text such as "you/me" or "menu/settings" is
    # no longer truncated at an embedded "u/".
    text = re.sub(r'(?<![\w/])/?u/\S+', '', text)
    return text

In [None]:
def remove_special_characters(text):
    """Keep only alphanumeric and whitespace characters of `text`.

    Every other character (punctuation, symbols, ...) is dropped without a
    replacement, so the characters on either side of it become adjacent.
    """
    kept = []
    for symbol in text:
        if not (symbol.isalnum() or symbol.isspace()):
            continue
        kept.append(symbol)
    return ''.join(kept)

In [None]:
def convert_to_lowercase(text):
    """Return a lower-cased copy of `text`, as produced by `text.lower()`."""
    lowered = text.lower()
    return lowered

In [None]:
# English stop words from NLTK, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Drop English stop words from `text`.

    The text is tokenized with NLTK's `word_tokenize`; a token is discarded
    when its lower-cased form is in `stop_words`. The surviving tokens keep
    their original casing and are joined with single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [None]:
# Shared WordNet lemmatizer instance used by lemmatize_text().
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Lemmatize `text` token by token with NLTK's WordNet lemmatizer.

    The text is tokenized with `word_tokenize` and each token is passed to
    `lemmatizer.lemmatize` without a part-of-speech argument, so the
    lemmatizer's default applies. The results are joined with single spaces.
    """
    return ' '.join(map(lemmatizer.lemmatize, word_tokenize(text)))

In [None]:
# spaCy pipeline loaded once and reused for every call below.
nlp = spacy.load("en_core_web_sm")

def lemmatize_text_spacy(text):
    """Lemmatize `text` with the spaCy pipeline held in `nlp`.

    The text is run through `nlp`, and the `lemma_` of every token in the
    resulting document is joined with single spaces.
    """
    return ' '.join(token.lemma_ for token in nlp(text))

In [None]:
# Load the raw, manually labelled comments from the Kaggle input directory.
df = pd.read_csv(filepath_or_buffer='/kaggle/input/final-project/cananda_manual_raw.csv')

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

In [None]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [None]:
# Select the text column that the cleaning pipeline operates on.
comments = df.loc[:, 'comments']

In [None]:
# Display the `comments` column selected from df.
comments

In [None]:
# Run the cleaning pipeline over every comment. Exactly one entry is appended
# per input comment, so cleaned_comments keeps the same length and order as
# `comments`. A comment whose cleaning fails is recorded as None instead of
# being skipped; skipping would shift every later entry relative to the rows
# of df and break any pairing with per-row data such as labels.
cleaned_comments = []
for raw_comment in tqdm(comments, total=len(comments), desc='Cleaning'):
    try:
        cleaned = remove_html_tags(raw_comment)
        cleaned = handle_urls_and_mentions(cleaned)
        cleaned = remove_special_characters(cleaned)
        cleaned = convert_to_lowercase(cleaned)
        # remove_stopwords() and lemmatize_text() are not applied here;
        # lemmatization is done by the spaCy-based helper only.
        cleaned = lemmatize_text_spacy(cleaned)
    except Exception as e:
        # Best effort: report the failure and keep a placeholder for this row.
        print(f"Error cleaning comment {raw_comment!r} : {e}")
        cleaned = None

    cleaned_comments.append(cleaned)
    

In [None]:
# Display the list built by the cleaning loop.
cleaned_comments

In [None]:
# Number of input comments.
len(comments)

In [None]:
# Number of entries in cleaned_comments, for comparison with len(comments).
len(cleaned_comments)

In [None]:
# Pair each cleaned comment with its manual label. cleaned_comments is a
# plain list, so it needs one entry per row of df for this to line up.
cleaned_comments_df = pd.DataFrame(
    dict(
        comments=cleaned_comments,
        manual_label=df['manual_label'],
    )
)

In [None]:
# Preview the first five rows of the cleaned frame.
cleaned_comments_df.head(n=5)

In [None]:
# Write the cleaned comments and labels to the working directory, without the index column.
cleaned_comments_df.to_csv(path_or_buf='cananda_manual_cleaned.csv', index=False)

In [None]:
# Example: run remove_html_tags on a comment containing HTML entities and a stray closing tag.
remove_html_tags(text="Ugh, here we go again with the Israel-Palestine saga. Frankly, I'm tired of the endless debates. It's crystal clear to me – Israel has the right to defend itself, and anyone saying otherwise is just blinded by biased narratives. &lt;</b>&gt;🇮🇱💪")

In [None]:
# Example: run handle_urls_and_mentions on a multi-line comment that contains a URL.
handle_urls_and_mentions(text='''I'm not? I haven't spoken one piece of propaganda. What I'm talking about is well documented truth done by the United Nations. 

https://www.unrwa.org/2014-gaza-conflict

Here is the link. You can read how many civilians died in Gaza as a result of Israeli bombing. 

"2,251 Palestinians were killed; 1,462 of them are believed to be civilians, including 551 children and 299 women. 66 Israeli soldiers and five civilians, including one child, were also killed. Overall, 11,231 Palestinians were injured during the conflict, including 3,540 women and 3,436 children. Roughly one third of these children will have to cope with disabilities lasting throughout life as a result of their injuries."

Now if you can't read and will call this also propaganda, that's on you.
''')

In [None]:
# Example: run remove_special_characters on a comment with punctuation and mis-encoded characters.
remove_special_characters(text="So whose fault was it for the millions of Palestinians that have gotten killed over decades and nobody took a stand ? Why the hypocrisy is what Iâ€™m asking ?")

In [None]:
# Example: run convert_to_lowercase on a comment with punctuation already stripped.
convert_to_lowercase(text="So whose fault was it for the millions of Palestinians that have gotten killed over decades and nobody took a stand  Why the hypocrisy is what Iâm asking ")

In [None]:
# Example: run lemmatize_text_spacy on a lower-cased comment.
lemmatize_text_spacy(text='so whose fault was it for the millions of palestinians that have gotten killed over decades and nobody took a stand  why the hypocrisy is what iâm asking')

In [None]:
# Example: run remove_stopwords on a lemmatized comment.
remove_stopwords(text="so whose fault be it for the million of palestinians that have gotten kill over decade and nobody take a stand   why the hypocrisy be what iâm ask")