In [1]:
!pip install emoji num2words

Collecting typing-extensions>=4.7.0
  Using cached typing_extensions-4.12.2-py3-none-any.whl (37 kB)
Installing collected packages: typing-extensions
  Attempting uninstall: typing-extensions
    Found existing installation: typing_extensions 4.5.0
    Uninstalling typing_extensions-4.5.0:
      Successfully uninstalled typing_extensions-4.5.0
Successfully installed typing-extensions-4.12.2


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.3.0 requires mkl<=2021.4.0,>=2021.1.1, which is not installed.
tensorflow 2.10.1 requires protobuf<3.20,>=3.9.2, but you have protobuf 3.20.3 which is incompatible.
spacy 3.3.3 requires typing-extensions<4.6.0,>=3.7.4.1, but you have typing-extensions 4.12.2 which is incompatible.
cached-path 1.1.6 requires huggingface-hub<0.11.0,>=0.8.1, but you have huggingface-hub 0.24.5 which is incompatible.
allennlp 2.10.1 requires torch<1.13.0,>=1.10.0, but you have torch 2.3.0 which is incompatible.
allennlp 2.10.1 requires torchvision<0.14.0,>=0.8.1, but you have torchvision 0.18.0 which is incompatible.


### Check of Null values

In [2]:
import pandas as pd
import re
import emoji
from num2words import num2words
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

# Load your dataset
raw_dataset = pd.read_csv('data_huang_devansh.csv')
pd.isnull(raw_dataset).sum()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ppt59\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Content    8
Label      0
dtype: int64

In [3]:
raw_dataset.dropna(inplace=True)
pd.isnull(raw_dataset).sum()

Content    0
Label      0
dtype: int64

In [4]:
raw_dataset['Label'].value_counts()

0    708633
1    133694
Name: Label, dtype: int64

### Preprocessing the data

In [5]:
profanities_df = pd.read_csv('profanity_en.csv')

profanities_dict = dict(zip(profanities_df['text'], profanities_df['canonical_form_1']))

def replace_misspelled_profanities(text):
    # Replace misspelled profanities with correct ones
    words = text.split()
    for i in range(len(words)):
        word = words[i]
        if word in profanities_dict:
            words[i] = profanities_dict[word]
    return ' '.join(words)

raw_dataset['Content'] = raw_dataset['Content'].apply(replace_misspelled_profanities)

In [6]:
print('Before preprocessing')
print(raw_dataset)
emoticons = {
    ":)": "smile",
    ":-)": "smile",
    ":(": "sad",
    ":-(": "sad",
    ";)": "wink",
    ";-)": "wink",
    ":D": "laugh",
    ":-D": "laugh",
    ":P": "playful",
    ":-P": "playful",
    ":-O": "surprised",
    ":O": "surprised",
    ":-*": "kiss",
    ":*": "kiss",
    ":'(": "crying",
    ":-/": "skeptical",
    ":/": "skeptical",
    ":-|": "neutral",
    ":|": "neutral"
}
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    # Convert all text to lowercase
    text = str(text)
    text = text.lower()
    
    # Remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    
    # Remove hyperlinks
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Remove user mentions
    text = re.sub(r'@\w+', '', text)
    
    # Convert emojis to text
    text = emoji.demojize(text, delimiters=("", " "))
    
    # Convert emoticons to text
    for emoticon, description in emoticons.items():
        text = text.replace(emoticon, description)
    
    # Remove date and time values
    text = re.sub(r'\b\d{1,2}[:/.-]\d{1,2}[:/.-]\d{2,4}\b', '', text)  # Dates like 12/12/2022
    text = re.sub(r'\b\d{1,2}:\d{2}(?:[:]\d{2})?(?:\s?[APMapm]{2})?\b', '', text)  # Times like 12:00, 12:00 PM
    
    # Remove accented numbers and characters
    text = re.sub(r'\^(\w+|\d+)', '', text)
    
    # Convert remaining numbers to words
    def convert_number(match):
        number = int(match.group())
        try:
            # Only convert numbers smaller than a certain threshold
            if abs(number) < 1e18:
                return num2words(number)
            else:
                return str(number)  # Leave large numbers as they are
        except OverflowError:
            return str(number)  # Fallback for any unexpected OverflowError
    
    text = re.sub(r'\b\d+\b', convert_number, text)
    
    # Remove ampersands from the beginning of words
    text = re.sub(r'\b&(\w+)', r'\1', text)
    
    # Remove specific characters
    text = re.sub(r'[_"<>`\-;%()|+&=*%.,!?:#$@[\]/]', '', text)
    
    # Remove multiple spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Remove newline characters
    text = text.replace('\n', ' ')
    text = ' '.join([word for word in text.split() if word not in stop_words])
    
    return text

# Apply preprocessing to the content column
raw_dataset['Content'] = raw_dataset['Content'].apply(preprocess_text)
print('After preprocessing')
print(raw_dataset)

Before preprocessing
                                                  Content  Label
0       `- This is not ``creative``. Those are the dic...      0
1       ` :: the term ``standard model`` is itself les...      0
2       True or false, the situation as of March 2002 ...      0
3       Next, maybe you could work on being less conde...      0
4                     This page will need disambiguation.      0
...                                                   ...    ...
842330  "Never really gave it much thought. I just fig...      0
842331                 "Nadie se salva de la regla 34 xd"      0
842332               "Question: Are you a boy or a girl?"      0
842333  "Leave your email or phone number and maybe yo...      1
842334  "From the scenarios you present, I see you bel...      0

[842327 rows x 2 columns]
After preprocessing
                                                  Content  Label
0       creative dictionary definitions terms insuranc...      0
1       term standard 

In [7]:
raw_dataset

Unnamed: 0,Content,Label
0,creative dictionary definitions terms insuranc...,0
1,term standard model less npov think we'd prefe...,0
2,true false situation march two thousand two sa...,0
3,next maybe could work less condescending sugge...,0
4,page need disambiguation,0
...,...,...
842330,never really gave much thought figured back door,0
842331,nadie se salva de la regla thirtyfour xd,0
842332,question boy girl,0
842333,leave email phone number maybe two twits meet ...,1


### Check for Duplicates

In [8]:
raw_dataset[raw_dataset.duplicated()]

Unnamed: 0,Content,Label
632,keep,0
647,keep,0
648,keep,0
649,keep,0
727,article licensing hi i've get users multilicen...,0
...,...,...
842330,never really gave much thought figured back door,0
842331,nadie se salva de la regla thirtyfour xd,0
842332,question boy girl,0
842333,leave email phone number maybe two twits meet ...,1


In [9]:
raw_dataset.drop_duplicates(inplace=True)

In [10]:
raw_dataset.to_csv('final_cleaned_dataset.csv', index=False)

In [11]:
raw_dataset.dropna(inplace=True)
pd.isnull(raw_dataset).sum()

Content    0
Label      0
dtype: int64