In [2]:
# Imports
import pandas as pd
import nltk
import re
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download the NLTK resources used further down in this notebook:
#   stopwords -> stopwords.words('english')
#   punkt     -> tokenizer models read by word_tokenize on older NLTK releases
#   punkt_tab -> tokenizer tables read by word_tokenize on recent NLTK releases
#                (without it a fresh environment fails with a LookupError)
#   wordnet   -> WordNetLemmatizer
# nltk.download() skips packages that are already up to date.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to C:\Users\Thales
[nltk_data]     Mustafa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Thales
[nltk_data]     Mustafa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Thales
[nltk_data]     Mustafa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
# Load dataset
df = pd.read_csv(filepath_or_buffer='argsme_dataset.csv')

# Data Preprocess: quick quality report on the freshly loaded frame
# (missing values per column, fully duplicated rows, overall dimensions)
null_counts = df.isna().sum()
duplicate_count = df.duplicated().sum()
print('is null ? : \n', null_counts)
print('\n is duplicated :  ', duplicate_count)
print('\ndata shape :', df.shape)

AttributeError: module 'pandas' has no attribute 'read'

In [7]:
# Show the first three rows as a quick look at the columns
df.head(n=3)

Unnamed: 0,doc_id,conclusion,premises_texts,source_title,topic,acquisition,combined_text
0,c67482ba-2019-04-18T13:32:05Z-00000-000,contracept form high school student,oppon forfeit everi round none argument answer...,debat argument contracept form high school stu...,contracept form high school student,2019-04-18 13:32:05+00:00,oppon forfeit everi round none argument answer...
1,c67482ba-2019-04-18T13:32:05Z-00001-000,contracept form high school student,propos school fund program condom cost money c...,debat argument contracept form high school stu...,contracept form high school student,2019-04-18 13:32:05+00:00,propos school fund program condom cost money c...
2,c67482ba-2019-04-18T13:32:05Z-00002-000,contracept form high school student,school compel interest provid contracept stude...,debat argument contracept form high school stu...,contracept form high school student,2019-04-18 13:32:05+00:00,school compel interest provid contracept stude...


In [None]:
# Columns that are removed before the text-cleaning step.
UNUSED_COLUMNS = [
    'premises', 'aspects', 'aspects_names', 'source_domain', 'source_text', 'source_text_conclusion_start',
    'source_text_conclusion_end', 'source_text_premise_start', 'source_text_premise_end', 'date', 'author',
    'source_id', 'source_next_argument_id', 'source_previous_argument_id', 'source_url', 'author_image_url',
    'author_organization', 'author_role', 'mode',
]

# Same three steps as before (exact-duplicate rows -> unused columns -> rows
# with any missing value), but built as one chained expression so `df` is
# rebound only after every step succeeded. With inplace=True, a KeyError from
# drop() (for example when this cell is re-run) left `df` partially processed.
df = (
    df.drop_duplicates()
    .drop(columns=UNUSED_COLUMNS)
    .dropna()
)

In [None]:
# Shared NLP helpers for the cleaning step: the English stop-word set (a set
# gives fast membership checks), a WordNet lemmatizer and a Porter stemmer
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = {*stopwords.words('english')}

In [None]:
# Preprocessing functions
def preprocess_text(text, apply_remove_p=True):
    """Normalize one raw text field into a space-joined string of stemmed tokens.

    Steps, in order: strip URLs and bare domain names, squeeze elongated
    character runs, delete everything except ASCII letters and whitespace,
    lowercase, tokenize, remove English stop words, lemmatize, then stem.

    Relies on the notebook-level ``stop_words``, ``lemmatizer`` and ``stemmer``
    objects and on NLTK's ``word_tokenize``.

    Args:
        text: The raw text. ``None`` and float NaN yield ``''``; any other
            non-string value is converted with ``str()`` first.
        apply_remove_p: When true, run the separator-normalization pass that
            replaces non-word runs between two word characters with one space.

    Returns:
        The surviving tokens joined by single spaces (``''`` if none survive).
    """
    # Missing values (None / NaN coming from pandas) have no text to clean;
    # previously they crashed re.sub with a TypeError.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Strip URLs (http/https/ftp/tinyurl) and bare domain names.
    text = re.sub(r'(https?://|ftp://|tinyurl\.com/)\S+(\s|$)|(([a-zA-Z0-9-]+\.)+(com|org|net|gov|edu|co\.uk|co\.in|co\.[a-z]+))', ' ', text)
    text = re.sub(r'(https?:\/\/)?([a-zA-Z0-9-]+\.)+(com|org|net|gov|edu|co\.uk|co\.in|co\.[a-z]+)', '', text)

    # Squeeze elongated runs ("soooo" -> "soo") but keep ordinary double
    # letters. The previous patterns reduced every run to a single character,
    # which also rewrote normal spellings ('school' -> 'schol', 'been' -> 'ben')
    # before stop-word removal and stemming ever saw them.
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)

    # Keep only ASCII letters, digits and whitespace; then normalize whitespace
    # and remove the digits.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\d+', '', text)

    if apply_remove_p:
        # NOTE(review): punctuation was already deleted above, so the only
        # non-word characters left here are spaces; this pass appears to do no
        # more than collapse space runs. If it was meant to split words joined
        # by punctuation ("end.Start"), it has to run before the strip - confirm.
        text = re.sub(r'(\w)(\W+)(\w)', lambda match: match.group(1) + ' ' + match.group(3).lower(), text)
    text = text.lower()

    # NOTE(review): same ordering caveat - none of these punctuation characters
    # can still be present at this point. The literal is a raw string now
    # (identical value) to avoid invalid-escape-sequence warnings.
    text = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,،-./:;<=>؟?@[\]^_`{|}~|\•|\)|\("""), '  ', text)

    # Tokenize, drop stop words (text is already lowercase), lemmatize, stem.
    tokens = [token for token in word_tokenize(text) if token not in stop_words]
    tokens = [stemmer.stem(lemmatizer.lemmatize(token)) for token in tokens]
    return ' '.join(tokens)


In [None]:
# Clean each text column in place; the flag is forwarded to preprocess_text
# as apply_remove_p (only 'source_title' is processed with it switched off).
column_remove_p = {
    'premises_texts': True,
    'conclusion': True,
    'source_title': False,
    'topic': True,
}
for column_name, remove_p in column_remove_p.items():
    df[column_name] = df[column_name].apply(preprocess_text, args=(remove_p,))

In [None]:
# Persist the cleaned frame to disk; the row index is not written
df.to_csv(path_or_buf='argsme_dataset_cleaned.csv', index=False)

In [None]:
# TF-IDF

# Load the preprocessed data
csv_file_path = 'argsme_dataset_cleaned.csv'
df = pd.read_csv(csv_file_path)

# Combine the text from 'premises_texts', 'conclusion', 'source_title', and 'topic' columns.
# A field that preprocessing reduced to an empty string is stored as an empty
# CSV cell and read back as NaN. Fill those with '' before converting to str;
# otherwise astype(str) turns each one into the literal token 'nan', which then
# ends up in the TF-IDF vocabulary.
text_columns = ['premises_texts', 'conclusion', 'source_title', 'topic']
text_parts = df[text_columns].fillna('').astype(str)
df['combined_text'] = (
    text_parts['premises_texts'] + ' ' + text_parts['conclusion'] + ' '
    + text_parts['source_title'] + ' ' + text_parts['topic']
)

# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the combined text
tfidf_matrix = vectorizer.fit_transform(df['combined_text'])

# Save the TF-IDF model and matrix
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(tfidf_matrix, 'tfidf_matrix.pkl')

In [None]:
# Clustering

